# 1 - Imports

In [1]:
import argparse
import os
import pickle
import sys

# Must run before the project-local imports below so that they can be resolved.
sys.path.append('/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/scripts/')

import librosa
import librosa.display as ld
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from feature_extractor import FeatureExtractor
from settings import FEATURE_EXTRACTOR_DEFAULT_SETTINGS

# 2 - Feature extractor class

In [None]:
# Build the feature extractor from the project defaults plus notebook-specific overrides.
# Copy the defaults first: assigning into FEATURE_EXTRACTOR_DEFAULT_SETTINGS directly would
# mutate the shared module-level dict for everything else running in this kernel.
default_params_dict = dict(FEATURE_EXTRACTOR_DEFAULT_SETTINGS)

# Notebook-specific overrides.
default_params_dict['audio_paths_file_folder'] = '/home/usuaris/veu/federico.costa/git_repositories/DoubleAttentionSpeakerVerification/feature_extractor/'
default_params_dict['n_mels'] = 80

# FeatureExtractor is handed an argparse-style namespace, not a dict.
default_params = argparse.Namespace(**default_params_dict)

feature_extractor = FeatureExtractor(default_params)

# 3 - Tests

In [None]:
# Step-by-step log-mel feature extraction for the FIRST usable entry of the audio paths file.
# Only one file is processed (see the `break` at the end); the intermediate results
# (`samples`, `sampling_rate`, `mel_spectrogram`, `log_mel_spectrogram`) are left in the
# notebook namespace so the cells below can inspect and plot them.
feature_extractor.count_input_lines()

fe_params = feature_extractor.params

# 32768 == 2**15. Presumably this rescales librosa's floating-point samples to the 16-bit PCM
# amplitude range, so that the floor of 1 applied before the log below only clips
# near-silent bins -- TODO confirm against the pipeline that consumes these features.
AMPLITUDE_SCALE = 32768

with open(fe_params.audio_paths_file_path, 'r') as paths_file:

    for line in paths_file:

        audio_path = line.rstrip("\r\n")

        # Skip blank lines instead of handing an empty path to librosa.
        if not audio_path.strip():
            continue

        print(f"[Feature Extractor] Processing file {audio_path}...")

        # Same location as the audio file, with the extension swapped for ".pickle".
        file_dump_path = os.path.splitext(audio_path)[0] + ".pickle"

        if fe_params.overwrite or not os.path.exists(file_dump_path):

            samples, sampling_rate = librosa.load(
                audio_path,
                sr = fe_params.sampling_rate,
                mono = True, # converts to mono channel
                )

            assert int(sampling_rate) == int(fe_params.sampling_rate)

            # Pre-emphasis: y[t] = x[t] - coef * x[t-1].
            # The right-hand side is fully evaluated before the assignment, so every
            # sample is filtered against the ORIGINAL previous sample.
            samples *= AMPLITUDE_SCALE
            samples[1:] = samples[1:] - fe_params.pre_emph_coef * samples[:-1]
            samples[0] *= (1 - fe_params.pre_emph_coef)

            # Short time Fourier Transform (window sizes are configured in seconds,
            # librosa expects samples).
            D = librosa.stft(
                samples,
                n_fft = int(fe_params.n_fft_secs * sampling_rate),
                hop_length = int(fe_params.hop_length_secs * sampling_rate),
                win_length = int(fe_params.win_length_secs * sampling_rate),
                window = fe_params.window,
                center = False,
                )

            magnitudes = np.abs(D)
            low_freq = 0
            high_freq = sampling_rate / 2

            # Mel filterbank applied to the magnitude spectrogram, without filter normalization.
            mel_spectrogram = librosa.feature.melspectrogram(
                S = magnitudes,
                sr = sampling_rate,
                n_mels = fe_params.n_mels,
                fmin = low_freq,
                fmax = high_freq,
                norm = None,
                )

            # Natural log with values floored at 1, so the result is never negative.
            log_mel_spectrogram = np.log(np.maximum(1, mel_spectrogram))

        else:
            # Nothing was computed: the cells below need `mel_spectrogram` and
            # `log_mel_spectrogram`, so make the reason visible instead of failing later.
            print(f"[Feature Extractor] Skipping {audio_path}: {file_dump_path} already exists and overwrite is disabled.")

        break

In [None]:
# Dimensions of the log-mel matrix computed in the extraction cell above
# (expected to be (n_mels, n_frames) per librosa's spectrogram layout -- confirm on the output).
log_mel_spectrogram.shape

In [None]:
# Plot the linear-scale mel spectrogram computed in the extraction cell.
plt.figure(figsize=(25, 10))

librosa.display.specshow(
    mel_spectrogram, 
    x_axis = "s",
    y_axis = "mel", 
    sr = sampling_rate,
    # Must match the hop used for the STFT: specshow otherwise assumes its default of
    # 512 samples per frame and the time axis comes out mis-scaled.
    hop_length = int(feature_extractor.params.hop_length_secs * sampling_rate),
    )

plt.title("Mel spectrogram (linear magnitude)")
# The values are mel-filtered STFT magnitudes, not decibels, so no "dB" suffix.
plt.colorbar(format="%+2.f", label="mel magnitude")
plt.show()

In [None]:
# Plot the log-compressed mel spectrogram computed in the extraction cell.
plt.figure(figsize=(25, 10))

librosa.display.specshow(
    log_mel_spectrogram, 
    x_axis = "s",
    y_axis = "mel", 
    sr = sampling_rate,
    # Must match the hop used for the STFT: specshow otherwise assumes its default of
    # 512 samples per frame and the time axis comes out mis-scaled.
    hop_length = int(feature_extractor.params.hop_length_secs * sampling_rate),
    )

plt.title("Log-mel spectrogram")
# The values are natural logs of mel magnitudes (np.log), not decibels, so no "dB" suffix.
plt.colorbar(format="%+2.f", label="log mel magnitude (natural log)")
plt.show()

In [6]:
# Load a stored feature file (presumably written by the feature extractor -- the displayed
# content below holds a 'features' array and the 'settings' used to compute it).
# NOTE: pickle.load can execute arbitrary code while deserializing; only open pickle
# files that come from a trusted pipeline.
features_pickle_path = "/home/usuaris/veussd/DATABASES/VoxCeleb/VoxCeleb1/test/id10270/5r0dWxy17C8/00001.pickle"

with open(features_pickle_path, 'rb') as pickle_file:
    features = pickle.load(pickle_file)

In [7]:
# Display the unpickled object: per the output below, a dict holding the feature matrix
# under 'features' (float32 array) and the extraction settings under 'settings' (Namespace).
features

{'features': array([[4.4989624, 3.799679 , 4.1704392, ..., 4.0843544, 4.1892056,
         5.627227 ],
        [5.3594484, 5.061476 , 3.7799945, ..., 5.258099 , 5.163255 ,
         5.126963 ],
        [5.2957215, 5.913182 , 4.7764177, ..., 7.3524   , 6.997459 ,
         6.7028084],
        ...,
        [8.388144 , 8.067397 , 8.272516 , ..., 8.258151 , 8.407291 ,
         8.436576 ],
        [7.914051 , 7.999135 , 7.889348 , ..., 8.384672 , 7.8605113,
         7.8729672],
        [7.6988573, 7.117542 , 7.7052364, ..., 8.167991 , 7.3849   ,
         7.6992416]], dtype=float32),
 'settings': Namespace(audio_paths_file_folder='./feature_extractor/', audio_paths_file_name='feature_extractor_paths.lst', sampling_rate=16000, n_fft_secs=0.023, window='hamming', win_length_secs=0.023, hop_length_secs=0.01, pre_emph_coef=0.97, n_mels=80, overwrite=True, verbose=False, audio_paths_file_path='./feature_extractor/feature_extractor_paths.lst')}

In [11]:
# Sampling rate recorded in the settings namespace stored alongside the features.
features["settings"].sampling_rate

16000