In [1]:
import os

# Process-wide environment settings, applied before the remaining imports
# in this cell run (notably torchaudio).
os.environ.update({
    # locale: force a UTF-8 "C" locale for this process
    'LC_ALL': 'C.UTF-8',
    'LANG': 'C.UTF-8',
    # NOTE(review): presumably set so CUDA errors surface at the failing call
    # while debugging; confirm it is still wanted, since it slows GPU work.
    'CUDA_LAUNCH_BLOCKING': '1',
})

import random
import numpy as np
import pandas as pd
import time

from pathlib import Path
from tqdm import tqdm

import torchaudio



In [2]:
import numpy as np
import librosa, librosa.display

def get_waveforms(file, sample_rate=16000, duration=3, offset=0.5):
    """Load one audio file as a fixed-length, zero-padded waveform.

    Parameters
    ----------
    file
        Anything ``librosa.load`` accepts as its first argument.
    sample_rate : int
        Rate passed to ``librosa.load`` as ``sr``.
    duration : float
        Seconds of audio to read. Also fixes the output length at
        ``int(sample_rate * duration)`` samples.
    offset : float
        Seconds skipped at the start of the file before reading (the
        notebook's default of 0.5 is meant to drop leading silence).

    Returns
    -------
    numpy.ndarray
        1-D float64 array of exactly ``int(sample_rate * duration)`` samples.
        Shorter clips are zero-padded at the end; anything beyond the target
        length is dropped.
    """
    # librosa.load also returns the sample rate, which is not needed here
    waveform, _ = librosa.load(file, duration=duration, offset=offset, sr=sample_rate)

    # every clip is copied into a buffer of one fixed size so the results
    # can later be stacked into a single homogeneous array
    target_len = int(sample_rate * duration)
    waveform_homo = np.zeros(target_len)

    # clip the copy so a loader that returns a few extra samples cannot
    # trigger a broadcast error on assignment
    n_copy = min(len(waveform), target_len)
    waveform_homo[:n_copy] = waveform[:n_copy]

    return waveform_homo

In [3]:
def load_data(path, emotion):
    """Load every audio path and collect the labels into plain lists.

    Parameters
    ----------
    path : iterable
        Items are passed one by one to ``get_waveforms``.
    emotion : iterable
        Labels; copied into a new list unchanged.

    Returns
    -------
    tuple(list, list)
        ``(waveforms, emotions)``. The two inputs are consumed independently;
        nothing here checks that they have the same length.
    """
    # all waveforms are loaded first, then the labels are copied
    loaded = [get_waveforms(audio_path) for audio_path in path]
    labels = list(emotion)
    return loaded, labels

In [4]:
def feature_mfcc(
    waveform, 
    sample_rate,
    n_mfcc = 40,
    fft = 1024,
    winlen = 512,
    window='hamming',
    mels=128
    ):
    """Compute MFCCs for one waveform with ``librosa.feature.mfcc``.

    Parameters
    ----------
    waveform
        Audio samples, forwarded as ``y``.
    sample_rate
        Sampling rate, forwarded as ``sr``; ``fmax`` is set to half of it.
    n_mfcc
        Number of cepstral coefficients to return.
    fft
        Forwarded as ``n_fft``.
    winlen
        Forwarded as ``win_length``.
    window
        Window function name, forwarded as ``window``.
    mels
        Number of mel bands, forwarded as ``n_mels``.

    Returns
    -------
    Whatever ``librosa.feature.mfcc`` returns for these settings.

    Notes
    -----
    ``hop_length`` is intentionally not passed, so librosa's default applies.
    An earlier experiment with hop=256 produced more time steps and was noted
    as not helpful.
    """
    mfcc_options = {
        'n_mfcc': n_mfcc,
        'n_fft': fft,
        'win_length': winlen,
        'window': window,
        'n_mels': mels,
        # upper frequency bound: half the sampling rate
        'fmax': sample_rate/2,
    }
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, **mfcc_options)

def get_features(waveforms, sample_rate=16000):
    """Compute MFCC features for every waveform, printing in-place progress.

    Parameters
    ----------
    waveforms : sized iterable
        Each item is passed to ``feature_mfcc``; ``len()`` is used for the
        progress total.
    sample_rate : int
        Forwarded to ``feature_mfcc`` for every waveform.

    Returns
    -------
    list
        One ``feature_mfcc`` result per input waveform, in input order.
    """
    features = []
    total = len(waveforms)

    # process each waveform individually to get its MFCCs
    for file_count, waveform in enumerate(waveforms, start=1):
        features.append(feature_mfcc(waveform, sample_rate))
        # '\r' rewinds to the start of the line so the counter updates in place
        print('\r'+f' Processed {file_count}/{total} waveforms', end='')

    if features:
        # Terminate the progress line. Without this, a later (shorter) progress
        # line only partly overwrites it and the caller's next print is glued
        # onto the same line.
        print()

    return features

In [5]:
from sklearn.preprocessing import StandardScaler

def feature_scaling(X_train, X_test, y_train, y_test):
    """Standardize 4-D feature arrays using statistics fitted on the training set.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        4-D arrays; the shape is unpacked as ``(N, C, H, W)``, so any other
        rank raises ``ValueError``.
    y_train, y_test
        Returned untouched; accepted only so the call site can unpack all four.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature arrays scaled
        and restored to their original 4-D shapes.

    Notes
    -----
    Each array is flattened to 2-D ``(N, C*H*W)`` before scaling, so every
    flattened position is treated as its own feature column. The scaler is
    fitted on the training data only and then reused for the test data.
    """
    scaler = StandardScaler()

    def _scale_4d(batch, scale_fn):
        # remember the 4-D layout so it can be restored after scaling
        n_samples, channels, height, width = batch.shape
        # one row per sample; -1 lets numpy infer the flattened width
        flat = np.reshape(batch, (n_samples, -1))
        flat = scale_fn(flat)
        return np.reshape(flat, (n_samples, channels, height, width))

    # fit on the training set, then apply the same fitted statistics to the test set
    X_train = _scale_4d(X_train, scaler.fit_transform)
    X_test = _scale_4d(X_test, scaler.transform)

    return X_train, X_test, y_train, y_test

In [7]:
import numpy as np
from datasets import load_dataset, load_metric
import os

# Per-fold preprocessing: read each fold's arrays from `save_dir`, turn the
# inputs into scaled MFCC features, and write the result to `preprocess_dir`.
save_dir = 'via_wav2vec'
preprocess_dir = 'via_wav2vec_preprocess'
os.makedirs(preprocess_dir, exist_ok=True)

dataset_1d = []
for fold in range(5):
    filename = os.path.join(save_dir, f'{fold}.npy')

    # four arrays are read back-to-back from the same handle, in this order
    with open(filename, 'rb') as f:
        X_train, y_train, X_test, y_test = [np.load(f) for _slot in range(4)]

    # the loaded X arrays are handed to get_features as waveforms
    features_train = get_features(X_train)
    features_test = get_features(X_test)

    # insert a singleton axis at position 1 (4-D input expected by feature_scaling)
    X_train, X_test = np.expand_dims(features_train, 1), np.expand_dims(features_test, 1)
    y_train, y_test = np.array(y_train), np.array(y_test)

    X_train, X_test, y_train, y_test = feature_scaling(X_train, X_test, y_train, y_test)
    print(f'X_train scaled:{X_train.shape}, y_train:{y_train.shape}')
    print(f'X_test scaled:{X_test.shape}, y_test:{y_test.shape}')

    # variable name (sic) is kept as-is in case later cells reference it
    prepcoess_filename = os.path.join(preprocess_dir, f'{fold}.npy')
    # written in the same order the arrays were read
    with open(prepcoess_filename, 'wb') as f:
        for fold_array in (X_train, y_train, X_test, y_test):
            np.save(f, fold_array)

    print(f'\nsaved to {prepcoess_filename}')

 Processed 600/600 waveformsmsX_train scaled:(2280, 1, 40, 98), y_train:(2280,)
X_test scaled:(600, 1, 40, 98), y_test:(600,)

saved to via_wav2vec_preprocess\0.npy
 Processed 600/600 waveformsmsX_train scaled:(2280, 1, 40, 98), y_train:(2280,)
X_test scaled:(600, 1, 40, 98), y_test:(600,)

saved to via_wav2vec_preprocess\1.npy
 Processed 600/600 waveformsmsX_train scaled:(2280, 1, 40, 98), y_train:(2280,)
X_test scaled:(600, 1, 40, 98), y_test:(600,)

saved to via_wav2vec_preprocess\2.npy
 Processed 600/600 waveformsmsX_train scaled:(2280, 1, 40, 98), y_train:(2280,)
X_test scaled:(600, 1, 40, 98), y_test:(600,)

saved to via_wav2vec_preprocess\3.npy
 Processed 480/480 waveformsmsX_train scaled:(2400, 1, 40, 98), y_train:(2400,)
X_test scaled:(480, 1, 40, 98), y_test:(480,)

saved to via_wav2vec_preprocess\4.npy
