# Imports

In [1]:
import sys
sys.path.append('/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/scripts/')

import os
import librosa
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# Functions

# Settings

In [None]:
# Input locations. The defaults are the original cluster paths; set the
# environment variables below to run the notebook on another machine
# without editing this cell.
audio_paths_file_path = os.environ.get(
    'VOXCELEB2_AUDIO_PATHS_FILE',
    '/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/feature_extractor/voxceleb_2_dev_feature_extractor_paths.lst',
)
prepend_directory = os.environ.get(
    'VOXCELEB2_DEV_DIR',
    '/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCeleb2/dev/',
)

# Rate passed to librosa.load as `sr` and used to convert the *_secs settings to samples.
sampling_rate = 16000

# Coefficient of the first-order pre-emphasis filter applied before the STFT.
pre_emph_coef = 0.97

# STFT framing, expressed in seconds; converted with int(secs * sampling_rate).
# NOTE(review): 0.25 s is 4000 samples at 16000 Hz, ten times a 25 ms window.
# Confirm that 0.25 (and not 0.025) is intended.
n_fft_secs = 0.25
win_length_secs = 0.25
hop_length_secs = 0.01
window = 'hamming'

# Number of mel bands requested from librosa.feature.melspectrogram.
n_mels = 80

# Analysis

In [None]:
# Walk a single utterance through the feature-extraction steps, keeping every
# intermediate array at top level so the cells below can inspect it.
with open(audio_paths_file_path, 'r') as file:
    # Only the first non-blank entry of the list is analysed.
    line = next((entry for entry in file if entry.strip()), None)

if line is None:
    # Fail here with a clear message rather than with a NameError in a later cell.
    raise ValueError(f"No audio paths found in {audio_paths_file_path}")

# strip() also removes a trailing '\r' left by Windows line endings.
audio_path = line.strip()
# Note: os.path.join discards prepend_directory when audio_path is absolute.
load_audio_path = os.path.join(prepend_directory, audio_path)

print(load_audio_path)

# The returned rate is bound to its own name so that this cell does not
# overwrite the `sampling_rate` setting.
samples, loaded_sampling_rate = librosa.load(
    load_audio_path,
    sr = sampling_rate,
    mono = True, # converts to mono channel
    )

# Scale the loaded samples by 32768 (2**15).
# NOTE(review): presumably to move librosa's float samples to a 16-bit
# integer range; confirm against the training feature extractor.
samples_mult = samples * 32768

# Pre-emphasis: y[n] = x[n] - coef * x[n-1]. The first sample has no
# predecessor and is scaled by (1 - coef) instead.
samples_pre_emph = samples_mult.copy()
samples_pre_emph[1:] = samples_mult[1:] - pre_emph_coef * samples_mult[:-1]
samples_pre_emph[0] *= (1 - pre_emph_coef)

# Frame sizes converted from seconds to samples.
n_fft = int(n_fft_secs * sampling_rate)
hop_length = int(hop_length_secs * sampling_rate)
win_length = int(win_length_secs * sampling_rate)

D = librosa.stft(
    samples_pre_emph,
    n_fft = n_fft,
    hop_length = hop_length,
    win_length = win_length,
    window = window,
    center = False,
    )

# Magnitude (not power) spectrogram is fed to the mel filter bank.
magnitudes = np.abs(D)
low_freq = 0
high_freq = sampling_rate / 2

mel_spectrogram = librosa.feature.melspectrogram(
    S = magnitudes,
    sr = sampling_rate,
    n_mels = n_mels,
    fmin = low_freq,
    fmax = high_freq,
    norm = None,
    )

# TODO: this array has to be transposed by later steps; consider transposing it here.
# Values below 1 are clamped to 1 before the log, so the result is never negative.
log_mel_spectrogram = np.log(np.maximum(1, mel_spectrogram))

In [None]:
# One column per processing stage of the waveform: as loaded, after the
# 32768 scaling, and after pre-emphasis.
stage_columns = dict(
    samples = samples,
    samples_mult = samples_mult,
    samples_pre_emph = samples_pre_emph,
)
df = pd.DataFrame(stage_columns)

In [None]:
# Summary statistics for each waveform stage held in `df`.
df.describe()

In [None]:
# One box per column of `df`, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.boxplot(data = df, ax = ax)

plt.show()

In [None]:
# Keep only the first 15 columns of the log-mel spectrogram.
# NOTE(review): the array is transposed later to get mel bands as columns,
# so these 15 columns look like frames, not mel bands. Confirm this is intended.
first_columns = log_mel_spectrogram[:, :15]
df_mel = pd.DataFrame(first_columns)

In [None]:
# Summary statistics for each of the columns kept in `df_mel`.
df_mel.describe()

In [None]:
# One box per column of `df_mel`, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.boxplot(data = df_mel, ax = ax)

plt.show()

In [None]:
# Transpose the log-mel spectrogram, then subtract the mean taken along
# axis 0 of the transposed array so that every column is centred on zero.
transposed_log_mel = log_mel_spectrogram.T
mean = transposed_log_mel.mean(axis = 0)
features = transposed_log_mel - mean

In [None]:
# Standard deviation of each column of the mean-centred features.
column_std = features.std(axis = 0)
# Columns whose deviation is not above 0.01 are divided by 1.0 instead,
# which leaves them unscaled and avoids dividing by a near-zero value.
std = np.where(column_std > 0.01, column_std, 1.0)
features_std = features / std

In [None]:
# Scale every column of `features_std` by its largest absolute value so the
# result lies in [-1, 1].
max_abs = np.max(np.abs(features_std), axis = 0)
# An all-zero column has max_abs == 0 and would produce 0/0 = NaN; divide it
# by 1 instead, mirroring the guard applied to `std` in the previous cell.
max_abs[max_abs == 0] = 1.0
features_norm = features_std / max_abs

In [None]:
# Mean-centred features: one box per column (at most the first 90 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features[:, :90]), ax = ax)
ax.set_xlabel("mel_band")

plt.show()

In [None]:
# Std-scaled features: one box per column (at most the first 90 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features_std[:, :90]), ax = ax)
ax.set_xlabel("mel_band")

plt.show()

In [None]:
# Max-abs-normalised features: one box per column (at most the first 90 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features_norm[:, :90]), ax = ax)
ax.set_xlabel("mel_band")

plt.show()

In [None]:
# Mean-centred features, transposed: one box per column of `features.T`
# (at most the first 180 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features.T[:, :180]), ax = ax)
ax.set_xlabel("frame")

plt.show()

In [None]:
# Std-scaled features, transposed: one box per column of `features_std.T`
# (at most the first 180 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features_std.T[:, :180]), ax = ax)
ax.set_xlabel("frame")

plt.show()

In [None]:
# Max-abs-normalised features, transposed: one box per column of
# `features_norm.T` (at most the first 180 columns).
fig, ax = plt.subplots(figsize = (18, 7))
sns.boxplot(data = pd.DataFrame(features_norm.T[:, :180]), ax = ax)
ax.set_xlabel("frame")

plt.show()