# Baseline1 - Model Training (Prosodic Features) - PT-BR-SER (Coraa V6)

This is a baseline model trained with prosodic features. We use a simple MLP implementation of scikit-learn.

We recommend that participants explore different classification models, feature selection, data balancing, data augmentation and other techniques such as classifier ensemble.

# Prosodic Features

Related methods.

> Luengo, I., Navas, E., Hernáez, I., & Sánchez, J. (2005). Automatic emotion recognition using prosodic parameters. In Ninth European conference on speech communication and technology.

> Rao, K. S., Koolagudi, S. G., & Vempada, R. R. (2013). Emotion recognition from speech using global and local prosodic features. International journal of speech technology, 16(2), 143-160.

In [1]:
import pandas as pd


df_prosodic = pd.read_csv('prosodic_features.csv').sort_values(by='sound_filepath')
df_prosodic

Unnamed: 0,sound_filepath,local_jitter,local_shimmer,min_intensity,relative_min_intensity_time,max_intensity,relative_max_intensity_time,mean_intensity,stddev_intensity,q1_intensity,...,f2_median,f3_median,f4_median,formant_dispersion,average_formant,mff,fitch_vtl,delta_f,vtl_delta_f,label
176,train/bfamcv01_segment163_neutral.wav,0.028242,0.142011,39.708540,0.265683,77.218046,0.939734,54.882812,11.290841,42.222614,...,1668.704638,2780.747781,3864.331639,1111.428274,2210.957718,1755.829831,15.955502,1106.910026,15.809776,neutral
318,train/bfamcv01_segment168_non-neutral-male.wav,0.023441,0.151924,40.001809,0.510550,80.188905,0.915009,65.918816,10.749796,59.660497,...,1495.071504,2665.935573,3759.576021,1065.739244,2120.735347,1703.789338,16.454911,1064.149591,16.445056,non-neutral-male
243,train/bfamcv01_segment170_non-neutral-male.wav,0.025107,0.155780,54.215080,0.021131,82.647111,0.222413,73.301110,5.615585,69.633575,...,1518.161653,2617.090499,3685.031540,1056.800377,2083.728525,1656.793388,16.907869,1046.423539,16.723630,non-neutral-male
29,train/bfamcv01_segment173_neutral.wav,0.024952,0.137728,41.921520,0.916337,80.672787,0.713576,65.547409,9.102396,60.707451,...,1855.881674,2745.636280,3758.225874,1062.169591,2232.865232,1819.027804,15.420238,1099.407730,15.917661,neutral
108,train/bfamcv01_segment177_neutral.wav,0.033259,0.166011,40.513441,0.025059,79.816216,0.661608,61.535998,10.995404,53.425930,...,1428.677716,2676.218905,3688.047035,1042.658269,2088.253971,1676.386283,16.738004,1048.655456,16.688036,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,train/bpubmn14_segment87_neutral.wav,0.041100,0.141838,42.126983,0.462094,74.910851,0.526314,61.839477,8.295658,55.210783,...,1678.011206,2708.243626,4119.592452,1205.076034,2252.552908,1752.953134,16.003611,1140.875316,15.339100,neutral
166,train/bpubmn14_segment89_neutral.wav,0.024239,0.089185,32.775147,0.191657,83.519998,0.127268,62.643103,13.508191,52.108283,...,1401.257606,2570.854063,4121.385381,1225.602858,2134.518464,1602.862834,17.573502,1103.626610,15.856812,neutral
446,train/bpubmn14_segment92_neutral.wav,0.036892,0.147278,46.896159,0.396711,75.489070,0.258380,62.336150,9.449077,51.194764,...,1599.102467,2719.948762,4049.753725,1195.993764,2207.644347,1688.781287,16.643356,1123.978565,15.569692,neutral
405,train/bpubmn14_segment95_neutral.wav,0.030630,0.131234,35.496753,0.008154,79.249207,0.606163,63.141842,10.564453,57.478244,...,1469.688653,2648.861557,4008.300379,1188.653000,2142.297992,1620.879044,17.359837,1098.900423,15.925010,neutral


# Selecting Prosodic Features

In [2]:
features = ['local_jitter', 'local_shimmer', 'min_intensity',
       'relative_min_intensity_time', 'max_intensity',
       'relative_max_intensity_time', 'mean_intensity', 'stddev_intensity',
       'q1_intensity', 'median_intensity', 'q3_intensity', 'voiced_fraction',
       'min_pitch', 'relative_min_pitch_time', 'max_pitch',
       'relative_max_pitch_time', 'mean_pitch', 'stddev_pitch', 'q1_pitch',
       'q3_pitch', 'mean_absolute_pitch_slope',
       'pitch_slope_without_octave_jumps', 'min_hnr', 'relative_min_hnr_time',
       'max_hnr', 'relative_max_hnr_time', 'mean_hnr', 'stddev_hnr', 'min_gne',
       'max_gne', 'mean_gne', 'stddev_gne', 'sum_gne', 'band_energy',
       'band_density', 'band_energy_difference', 'band_density_difference',
       'center_of_gravity_spectrum', 'stddev_spectrum', 'skewness_spectrum',
       'kurtosis_spectrum', 'central_moment_spectrum', 'f1_mean', 'f2_mean',
       'f3_mean', 'f4_mean', 'f1_median', 'f2_median', 'f3_median',
       'f4_median', 'formant_dispersion', 'average_formant', 'mff',
       'fitch_vtl', 'delta_f', 'vtl_delta_f']

X = df_prosodic[features]
y = df_prosodic.label.to_list()

In [3]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[ 0.15466298,  0.03759922,  0.00903401, ..., -0.08195452,
         0.09025836, -0.13605919],
       [-0.37707385,  0.28815011,  0.03791452, ...,  0.41144312,
        -0.75567687,  0.74055292],
       [-0.19249342,  0.38560712,  1.43760817, ...,  0.85894865,
        -1.10635364,  1.12495186],
       ...,
       [ 1.11269049,  0.17071896,  0.71685586, ...,  0.59761982,
         0.42792746, -0.46734766],
       [ 0.41917904, -0.23481335, -0.40573416, ...,  1.30547642,
        -0.06819665,  0.02294944],
       [ 0.00737131,  0.33837638, -0.42113221, ...,  0.98984362,
        -0.09233041,  0.04737115]])

# Model Selection

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.dummy import DummyClassifier

import numpy as np


classifiers = [MLPClassifier(activation='logistic',random_state=1,max_iter=3000),
               MLPClassifier(activation='tanh',random_state=1,max_iter=3000),
               MLPClassifier(activation='relu',random_state=1,max_iter=3000),
               DummyClassifier(strategy="most_frequent",random_state=1),
               DummyClassifier(strategy="stratified",random_state=1),
               DummyClassifier(strategy="uniform",random_state=1)]

In [9]:
for classifier in classifiers:
    print('Running ',classifier)
    kf=StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    scores=cross_val_score(classifier, X, y, cv=kf, scoring='f1_macro')
    print(scores,'f1_macro=',np.mean(scores))
    print('-------')

Running  MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=3000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
[0.5715551  0.553338   0.57689003 0.43440978 0.54970933] f1_macro= 0.537180449941826
-------
Running  MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=3000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
       

# Model Training

In [12]:
clf = MLPClassifier(activation='logistic',random_state=1,max_iter=3000)
clf.fit(X,y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=3000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [13]:
import pickle

with open('model.prosodic.pkl', 'wb') as fid_model:
    pickle.dump(clf,fid_model)

with open('scaler.prosodic.pkl', 'wb') as fid_scaler:
    pickle.dump(sc,fid_scaler)
