In [1]:
# Get the critical imports out of the way
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import librosa.display
import soundfile
import os
# matplotlib complains about the behaviour of librosa.display, so we'll ignore those warnings:
import warnings; warnings.filterwarnings('ignore')
from IPython.core.display import HTML 

In [2]:
import librosa

def feature_chromagram(waveform, sample_rate):
    # STFT computed here explicitly; mel spectrogram and MFCC functions do this under the hood
    stft_spectrogram=np.abs(librosa.stft(waveform))
    # Produce the chromagram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    chromagram=np.mean(librosa.feature.chroma_stft(S=stft_spectrogram, sr=sample_rate).T,axis=0)
    return chromagram

def feature_melspectrogram(waveform, sample_rate):
    # Produce the mel spectrogram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    # Using 8khz as upper frequency bound should be enough for most speech classification tasks
    melspectrogram=np.mean(librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128, fmax=8000).T,axis=0)
    return melspectrogram

def feature_mfcc(waveform, sample_rate):
    # Compute the MFCCs for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    # 40 filterbanks = 40 coefficients
    mfc_coefficients=np.mean(librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40).T, axis=0)
    return mfc_coefficients

In [7]:
def get_features(file):
    # load an individual soundfile
     with soundfile.SoundFile(file) as audio:
        waveform, sample_rate = librosa.load(file, duration=3, offset=0.5, sr=48000)
        # compute features of soundfile
        chromagram = feature_chromagram(waveform, sample_rate)
        melspectrogram = feature_melspectrogram(waveform, sample_rate)
        mfc_coefficients = feature_mfcc(waveform, sample_rate)
        
        feature_matrix=np.array([])
        # use np.hstack to stack our feature arrays horizontally to create a feature matrix
        feature_matrix = np.hstack((chromagram, melspectrogram, mfc_coefficients))
        
        return feature_matrix

In [15]:
def speech_file_to_array_fn(path):
    return get_features(path)

def label_to_id(label):
    label_list = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fear', 'Disgust', 'Surprise']

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples, input_column = "path", output_column = "emotion"):
    """
    Load the recordings with their labels.
    :param examples:[DataFrame]  with the samples of the training or test sets.
    :param input_column:[str]  Column that contain the paths to the recordings
    :param output_column:[str]  Column that contain the emotion associated to each recording
    :param target_sampling_rate:[int] Global variable with the expected sampling rate of the model
    """
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label) for label in examples[output_column]]

    result = {
        'input_values': speech_list,
        'labels': target_list
    }

    return result

In [16]:
import numpy as np
from datasets import load_dataset

save_dir = 'via_IliaZenkov'

dataset_1d = []
for fold in range(5):
    save_path = os.path.join('audio_48k', "fold"+str(fold))
    os.makedirs(save_path, exist_ok=True)
    
    data_files = {
        "train": os.path.join(save_path, "train.csv"),
        "validation": os.path.join(save_path, "test.csv"),
    }
    
    #Load data
    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    
    train = preprocess_function(train_dataset)
    test = preprocess_function(eval_dataset)
    
    X_train = np.array(train["input_values"])
    y_train = np.array(train['labels'])
    X_test = np.array(test["input_values"])
    y_test = np.array(test['labels'])
    
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    numpy_name = os.path.join(save_dir, str(fold) + '.npy')
    os.makedirs(save_dir, exist_ok=True)
    
    with open(numpy_name, 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

Using custom data configuration default-b8bcc55f28dc144d
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-b8bcc55f28dc144d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(1140, 180) (1140,) (300, 180) (300,)


Using custom data configuration default-6481803daa7c2721
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-6481803daa7c2721/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(1140, 180) (1140,) (300, 180) (300,)


Using custom data configuration default-551db4a7f964cc4f
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-551db4a7f964cc4f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(1140, 180) (1140,) (300, 180) (300,)


Using custom data configuration default-10664dd408e3abb0
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-10664dd408e3abb0/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(1140, 180) (1140,) (300, 180) (300,)


Using custom data configuration default-325a78e528c91f5d
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-325a78e528c91f5d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(1200, 180) (1200,) (240, 180) (240,)


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

npy_path = 'via_IliaZenkov'
all_npy = os.listdir(npy_path)

scaler = StandardScaler()

for fold in all_npy:
    npy = os.path.join(npy_path, fold)
    
    with open(npy, 'rb') as f:
        X_train = np.load(f)
        y_train = np.load(f)
        X_test = np.load(f)
        y_test = np.load(f)
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    
    pkl_name = f'./model/MLPClassifier'+fold+'.pkl'
    
    model = MLPClassifier(
        activation='logistic', 
        solver='adam', 
        alpha=0.001, 
        beta_1=0.9,
        beta_2=0.999,
        batch_size=256, 
        epsilon=1e-08, 
        hidden_layer_sizes=(300,), 
        learning_rate='adaptive',
        max_iter=1000, # I've found for this task, loss converges at ~1000 iterations
        random_state=69,
    )

    model.fit(X_train, y_train)

    print(f'MLP Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
    print(f'MLP Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

MLP Model's accuracy on training set is 100.00%
MLP Model's accuracy on test set is 46.00%
MLP Model's accuracy on training set is 100.00%
MLP Model's accuracy on test set is 42.67%
MLP Model's accuracy on training set is 100.00%
MLP Model's accuracy on test set is 41.00%
MLP Model's accuracy on training set is 100.00%
MLP Model's accuracy on test set is 40.33%
MLP Model's accuracy on training set is 100.00%
MLP Model's accuracy on test set is 33.33%
