In [104]:
# Put the directory one level above the working directory on the import path
# (presumably so the local `texstat` package imported below resolves).
import os
import sys

parent_dir = os.path.abspath(os.pardir)
# Append only once so re-running this cell does not pile up duplicates.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# Import loss function
from texstat.functions import *
import texstat.torch_filterbanks.filterbanks as fb

# Import extra packages
import numpy as np
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
import torch
import torchaudio
import time

# Pick device: use CUDA when PyTorch reports it as available, else the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [105]:
# texture generation (for testing)
def texture_generator(sr, duration, num_sounds):
    """Synthesize a random test texture from amplitude-modulated sinusoids.

    Each of the ``num_sounds`` partials is a sine tone with a random frequency
    (``110 * 2**(7*u + 1)`` Hz for ``u`` uniform in [0, 1), i.e. 220 Hz up to
    28160 Hz) and a random phase, multiplied by a sinusoidal LFO with a random
    rate below 5 Hz and a random phase. The sum is peak-normalized and then
    faded in and out with the two halves of a ``sr``-sample Hann window.

    Random numbers come from NumPy's global generator (``np.random.rand``), so
    call ``np.random.seed`` beforehand for reproducible output.

    Args:
        sr: Sampling rate in Hz (integer >= 1).
        duration: Length of the texture in whole seconds (integer >= 1).
        num_sounds: Number of modulated sinusoids to sum; ``0`` yields silence.

    Returns:
        1-D ``numpy.ndarray`` with ``duration * sr`` samples whose absolute
        values do not exceed 1.

    Raises:
        ValueError: If ``sr`` or ``duration`` is smaller than 1.
    """
    if sr < 1 or duration < 1:
        raise ValueError("sr and duration must both be at least 1")

    n_samples = duration * sr
    # Sample instants n / sr. np.linspace(0, duration, n_samples) would include
    # the endpoint, giving a step of duration / (n_samples - 1) and slightly
    # detuning every tone. Built once: it does not depend on the partial.
    t = np.arange(n_samples) / sr
    two_pi = 2 * np.pi

    # NOTE(review): partials above sr / 2 alias (e.g. above 22050 Hz at
    # sr = 44100); kept as in the original since this is only a test signal.
    sound = np.zeros(n_samples)
    for _ in range(num_sounds):
        # Draw order (frequency, phase, LFO rate, LFO phase) is kept stable so
        # that a given seed keeps selecting the same partial parameters.
        frequency = 110 * 2**(7 * np.random.rand(1) + 1)
        sinusoid = np.sin(two_pi * frequency * t + np.random.rand(1) * two_pi)
        frequency_lfo = 5 * np.random.rand(1)
        lfo = np.sin(two_pi * frequency_lfo * t + np.random.rand(1) * two_pi)
        sound += sinusoid * lfo

    # Peak-normalize; skip when the signal is silent (e.g. num_sounds == 0),
    # where dividing by the zero peak would turn every sample into NaN.
    peak = np.max(np.abs(sound))
    if peak > 0:
        sound = sound / peak

    # Half-Hann fade-in, flat middle, half-Hann fade-out: duration * sr samples.
    fade = np.hanning(sr)
    window = np.concatenate((fade[:sr // 2], np.ones((duration - 1) * sr), fade[sr // 2:]))
    return sound * window  # Apply windowing to the sound

# Generate two sounds (5 seconds, 150 modulated sinusoids each).
# texture_generator draws from NumPy's global RNG, so seed it first; without a
# fixed seed every kernel restart produces different textures and therefore
# different statistics and loss values in the cells below.
np.random.seed(0)
sr = 44100
sound_1 = texture_generator(sr, 5, 150)
sound_2 = texture_generator(sr, 5, 150)

# Display an audio player for each texture
display(Audio(sound_1, rate=sr)) # sound_1
display(Audio(sound_2, rate=sr)) # sound_2

# Analysis parameters passed to texstat_wrapper
frame_size = 1 << 16  # 65536 samples per analysed frame
N_filter_bank, M_filter_bank = 16, 6
N_moments = 4
# alpha has N_moments entries and beta has five -- presumably one weight per
# moment and one per statistic group returned by texstat.stats; confirm
# against the texstat documentation.
alpha = torch.tensor([10.0, 1.0, 0.1, 0.01])
beta = torch.tensor([1] * 5)

# Take the first frame_size samples of each sound as a tensor on `device`.
# torch.tensor keeps the NumPy dtype of the input array.
sound_1_segment, sound_2_segment = (
    torch.tensor(sound[:frame_size], device=device) for sound in (sound_1, sound_2)
)

# Build a batch from batch_size identical copies of each segment.
# torch.stack already returns a tensor on the segments' device, so no extra
# transfer is needed.
batch_size = 8
sound_1_batch, sound_2_batch = (
    torch.stack([segment] * batch_size)
    for segment in (sound_1_segment, sound_2_segment)
)

# Initialize texstat
from texstat.functions import texstat_wrapper

# Gather the constructor arguments in one place, then build the wrapper.
texstat_config = dict(
    frame_size=frame_size,
    N_filter_bank=N_filter_bank,
    M_filter_bank=M_filter_bank,
    N_moments=N_moments,
    sampling_rate=sr,
    downsampling_factor=4,
    alpha=alpha,
    beta=beta,
    spectrum_lower_bound=20,
    spectrum_higher_bound=22050,  # equals sr / 2 for sr = 44100
    device=device,
)
texstat = texstat_wrapper(**texstat_config)

In [106]:
# Summary statistics of the single (unbatched) segment of sound_1.
# The result unpacks into exactly five statistic groups.
stats_sound_1 = texstat.stats(sound_1_segment)
(
    stats_1_sound_1,
    stats_2_sound_1,
    stats_3_sound_1,
    stats_4_sound_1,
    stats_5_sound_1,
) = stats_sound_1
print("Sound 1 summary statistics: ")
for stat_index, stat_value in enumerate(
    (stats_1_sound_1, stats_2_sound_1, stats_3_sound_1, stats_4_sound_1, stats_5_sound_1),
    start=1,
):
    print(f"Stats_{stat_index}_sound_1:", stat_value)

Sound 1 summary statistics: 
Stats_1_sound_1: tensor([[5.7134e-04, 2.1949e+01, 9.6678e-02, 1.8290e-03],
        [3.6681e-01, 4.8181e-01, 9.9997e-02, 6.3773e-03],
        [5.9374e-01, 4.5104e-01, 9.9998e-02, 6.6072e-03],
        [8.4948e-01, 4.1696e-01, 9.9998e-02, 4.1140e-03],
        [6.2726e-01, 4.5705e-01, 9.9998e-02, 6.4956e-03],
        [7.2978e-01, 4.4239e-01, 9.9998e-02, 5.7196e-03],
        [4.8434e-01, 4.3205e-01, 9.9997e-02, 6.1393e-03],
        [5.0004e-01, 4.3071e-01, 9.9998e-02, 5.8687e-03],
        [5.0345e-01, 4.2591e-01, 9.9998e-02, 4.8921e-03],
        [7.6736e-01, 4.6579e-01, 9.9998e-02, 7.0465e-03],
        [5.8931e-01, 4.6018e-01, 9.9998e-02, 6.7186e-03],
        [7.6335e-01, 4.4365e-01, 9.9998e-02, 5.9488e-03],
        [5.9404e-01, 4.5976e-01, 9.9998e-02, 6.0894e-03],
        [4.5679e-01, 4.4873e-01, 9.9997e-02, 5.9828e-03],
        [4.9891e-01, 4.3604e-01, 9.9998e-02, 4.1890e-03],
        [6.8397e-01, 4.2783e-01, 9.9998e-02, 5.0498e-03]], device='cuda:0')
Stats_2_

In [107]:
# Loss between the two single (unbatched) segments of sound_1 and sound_2.
loss = texstat.loss(sound_1_segment, sound_2_segment)
print("Loss:", loss)

Loss: tensor(2.4151, device='cuda:0', dtype=torch.float64)


In [108]:
# Summary statistics for the batch built from sound_1.
# The result unpacks into exactly five statistic groups.
stats_sound_1_batch = texstat.stats(sound_1_batch)
(
    stats_1_sound_1_batch,
    stats_2_sound_1_batch,
    stats_3_sound_1_batch,
    stats_4_sound_1_batch,
    stats_5_sound_1_batch,
) = stats_sound_1_batch
print("Sound 1 summary statistics: ")
for stat_index, stat_value in enumerate(
    (
        stats_1_sound_1_batch,
        stats_2_sound_1_batch,
        stats_3_sound_1_batch,
        stats_4_sound_1_batch,
        stats_5_sound_1_batch,
    ),
    start=1,
):
    print(f"Stats_{stat_index}_sound_1_batch:", stat_value)

Sound 1 summary statistics: 
Stats_1_sound_1_batch: tensor([[[5.7134e-04, 2.1949e+01, 9.6678e-02, 1.8290e-03],
         [3.6681e-01, 4.8181e-01, 9.9997e-02, 6.3773e-03],
         [5.9374e-01, 4.5104e-01, 9.9998e-02, 6.6072e-03],
         [8.4948e-01, 4.1696e-01, 9.9998e-02, 4.1140e-03],
         [6.2726e-01, 4.5705e-01, 9.9998e-02, 6.4956e-03],
         [7.2978e-01, 4.4239e-01, 9.9998e-02, 5.7196e-03],
         [4.8434e-01, 4.3205e-01, 9.9997e-02, 6.1393e-03],
         [5.0004e-01, 4.3071e-01, 9.9998e-02, 5.8687e-03],
         [5.0345e-01, 4.2591e-01, 9.9998e-02, 4.8921e-03],
         [7.6736e-01, 4.6579e-01, 9.9998e-02, 7.0465e-03],
         [5.8931e-01, 4.6018e-01, 9.9998e-02, 6.7186e-03],
         [7.6335e-01, 4.4365e-01, 9.9998e-02, 5.9488e-03],
         [5.9404e-01, 4.5976e-01, 9.9998e-02, 6.0894e-03],
         [4.5679e-01, 4.4873e-01, 9.9997e-02, 5.9828e-03],
         [4.9891e-01, 4.3604e-01, 9.9998e-02, 4.1890e-03],
         [6.8397e-01, 4.2783e-01, 9.9998e-02, 5.0498e-03]],

  

In [109]:
# Compute loss function between batches of sound_1 and sound_2.
# Each batch holds batch_size identical copies of one segment, and the recorded
# output is a single scalar equal to the unbatched loss -- presumably the loss
# is averaged over the batch dimension (confirm in texstat).
loss_batch = texstat.loss(sound_1_batch, sound_2_batch)
print("Loss batch:", loss_batch)

Loss batch: tensor(2.4151, device='cuda:0', dtype=torch.float64)
