In [31]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a dark background
sns.set_style("whitegrid")

In [32]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [33]:
# Read in your csv file that has the mid features. 

#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('midFeaturesTrainFinal.csv')


data.head()

print(data.shape)

(4876, 140)


In [34]:
# Split the data into train and test set, stratified by Emotion.


data_train, data_test = train_test_split(data.copy(),
                                   shuffle=True,
                                   random_state=608,
                                   stratify=data.Emotion,
                                   test_size=0.2
                                   )

In [35]:
data.sample(5)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
1498,1026_MTI_ANG_XX,1026,ANG,MTI,0.074113,0.047351,3.034524,0.161432,0.192393,0.409144,...,0.017268,0.033351,0.021844,0.013262,0.003805,0.027313,0.013283,0.017246,0.009878,0.010973
3249,1062_TAI_HAP_XX,1062,HAP,TAI,0.104725,0.024153,2.871963,0.171479,0.183264,0.526493,...,0.022742,0.022484,0.014104,0.019203,0.005016,0.0126,0.017749,0.023536,0.015686,0.008539
4578,1086_MTI_ANG_XX,1086,ANG,MTI,0.129002,0.018246,2.919363,0.225928,0.207958,0.982822,...,0.012703,0.016238,0.016446,0.013761,0.01218,0.012992,0.019996,0.036453,0.030947,0.011454
4273,1082_DFA_ANG_XX,1082,ANG,DFA,0.22505,0.005334,2.879165,0.323732,0.225722,1.85013,...,0.015552,0.019404,0.01363,0.023788,0.010338,0.009185,0.008111,0.009636,0.00803,0.008875
789,1015_TSI_NEU_XX,1015,NEU,TSI,0.267703,0.017095,2.330817,0.32123,0.205741,0.730224,...,0.014442,0.027422,0.02094,0.014684,0.001602,0.007227,0.007098,0.011852,0.013485,0.0083


In [36]:
# Check the percentages of the different emotion categories in the training set

data_train.Emotion.value_counts(normalize=True)

NEU    0.178718
HAP    0.164359
FEA    0.164359
ANG    0.164359
SAD    0.164103
DIS    0.164103
Name: Emotion, dtype: float64

In [37]:
# Check the percentages of the different emotion categories in the test set

data_test.Emotion.value_counts(normalize=True)

NEU    0.179303
DIS    0.164959
ANG    0.163934
SAD    0.163934
FEA    0.163934
HAP    0.163934
Name: Emotion, dtype: float64

In [38]:
y_train = data_train[['Emotion']]
y_test  = data_test[['Emotion']]

X_train = data_train.drop(columns  = ['FileID','actorID', 'Emotion', 'SentenceID'])
X_test  = data_test.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID'])

In [39]:
X_train

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
837,0.083640,0.008714,2.907248,0.154361,0.169981,0.577404,0.010233,0.114029,-29.271150,2.186646,...,0.019546,0.028316,0.014390,0.012027,0.009329,0.017190,0.008330,0.008765,0.013606,0.009506
2490,0.151667,0.005787,2.860109,0.249394,0.251518,0.661992,0.007877,0.190744,-30.414050,3.020177,...,0.023129,0.021614,0.032285,0.012211,0.004090,0.012739,0.007861,0.010684,0.006603,0.010285
3885,0.081441,0.042436,3.050415,0.161888,0.187664,0.368531,0.015429,0.106367,-25.875142,2.615701,...,0.003848,0.006165,0.037184,0.027610,0.003999,0.026874,0.024719,0.020995,0.012547,0.015755
845,0.150406,0.010609,2.928441,0.227848,0.218879,0.627350,0.010261,0.211001,-29.907287,2.702372,...,0.029635,0.032077,0.014713,0.012294,0.004989,0.013047,0.008941,0.010526,0.008577,0.010364
1890,0.087894,0.021971,3.021540,0.146429,0.162692,0.525216,0.010559,0.115676,-28.654314,2.266108,...,0.036706,0.020806,0.005194,0.009952,0.025127,0.020504,0.016022,0.008138,0.002314,0.012838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,0.119205,0.016565,2.871881,0.213218,0.202107,0.942230,0.007878,0.192428,-25.248729,1.557695,...,0.011450,0.007873,0.011765,0.012555,0.014883,0.008454,0.021950,0.024942,0.023826,0.008226
3271,0.138713,0.015348,2.759433,0.218334,0.191528,1.045024,0.009202,0.207948,-28.248249,0.985611,...,0.022454,0.016348,0.030786,0.028595,0.020358,0.004879,0.009590,0.031912,0.014395,0.013876
3148,0.049920,0.047684,2.882915,0.130239,0.187099,0.211214,0.014590,0.072257,-26.456010,3.645616,...,0.031786,0.044909,0.045720,0.032407,0.000848,0.009181,0.015284,0.004561,0.001602,0.013600
4337,0.135008,0.016936,2.900700,0.222885,0.202980,0.792664,0.008293,0.174590,-26.584649,1.267600,...,0.016742,0.032236,0.036552,0.001970,0.000609,0.001014,0.015637,0.009353,0.003242,0.009500


In [40]:
## PCA is stored in decomposition
from sklearn.decomposition import PCA

In [42]:
pca = PCA(n_components=.9)
pca.fit(X_train)
pca.explained_variance_ratio_.shape

(8,)

In [43]:
#pca1 = PCA(n_components=.9)
#pca1.fit(X_test)
#pca1.explained_variance_ratio_.shape

In [44]:
#X_train_transform = pca.transform(X_train)
#X_test_transform  = pca1.transform(X_test)


#X_train_transform


In [45]:
X_train_transform = pca.transform(X_train)
X_test_transform  = pca.transform(X_test)


In [46]:
print(X_train_transform.shape)
print(X_test_transform.shape)

(3900, 8)
(976, 8)


In [47]:
X_train["comp_1"] = X_train_transform[:,0]
X_train["comp_2"] = X_train_transform[:,1]
X_train["comp_3"] = X_train_transform[:,2]
X_train["comp_4"] = X_train_transform[:,3]
X_train["comp_5"] = X_train_transform[:,4]
X_train["comp_6"] = X_train_transform[:,5]
X_train["comp_7"] = X_train_transform[:,6]
X_train["comp_8"] = X_train_transform[:,7]


In [48]:
X_test["comp_1"] = X_test_transform[:,0]
X_test["comp_2"] = X_test_transform[:,1]
X_test["comp_3"] = X_test_transform[:,2]
X_test["comp_4"] = X_test_transform[:,3]
X_test["comp_5"] = X_test_transform[:,4]
X_test["comp_6"] = X_test_transform[:,5]
X_test["comp_7"] = X_test_transform[:,6]
X_test["comp_8"] = X_test_transform[:,7]

In [49]:
X_train

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_12_std,delta chroma_std_std,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8
837,0.083640,0.008714,2.907248,0.154361,0.169981,0.577404,0.010233,0.114029,-29.271150,2.186646,...,0.013606,0.009506,1.318566,-0.455743,0.670038,-0.597643,-0.364124,0.116550,0.101812,0.180327
2490,0.151667,0.005787,2.860109,0.249394,0.251518,0.661992,0.007877,0.190744,-30.414050,3.020177,...,0.006603,0.010285,2.180750,-1.705006,-0.402540,-0.904357,-0.085299,-0.509824,0.194898,0.335195
3885,0.081441,0.042436,3.050415,0.161888,0.187664,0.368531,0.015429,0.106367,-25.875142,2.615701,...,0.012547,0.015755,-2.131629,-0.502212,0.045954,-0.021329,-0.707757,0.432451,0.216758,0.019772
845,0.150406,0.010609,2.928441,0.227848,0.218879,0.627350,0.010261,0.211001,-29.907287,2.702372,...,0.008577,0.010364,2.049486,-0.981337,0.150813,0.074958,-0.509233,-0.028316,0.247039,0.055626
1890,0.087894,0.021971,3.021540,0.146429,0.162692,0.525216,0.010559,0.115676,-28.654314,2.266108,...,0.002314,0.012838,0.798167,-0.506154,0.654587,-0.229511,0.919215,-0.427422,0.195149,-0.238851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,0.119205,0.016565,2.871881,0.213218,0.202107,0.942230,0.007878,0.192428,-25.248729,1.557695,...,0.023826,0.008226,-2.444884,0.782536,-0.501821,0.177648,0.429589,-0.116166,-0.233305,-0.010822
3271,0.138713,0.015348,2.759433,0.218334,0.191528,1.045024,0.009202,0.207948,-28.248249,0.985611,...,0.014395,0.013876,0.728534,0.960575,-0.283309,0.167878,-0.479614,-0.157639,0.256635,0.202741
3148,0.049920,0.047684,2.882915,0.130239,0.187099,0.211214,0.014590,0.072257,-26.456010,3.645616,...,0.001602,0.013600,-1.584057,-1.827134,-1.472068,0.542388,0.375684,-0.129403,0.042295,-0.019760
4337,0.135008,0.016936,2.900700,0.222885,0.202980,0.792664,0.008293,0.174590,-26.584649,1.267600,...,0.003242,0.009500,-1.286763,0.569853,-0.046516,-0.636519,0.223718,-0.182746,0.558956,-0.257396


In [50]:
X_test

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_12_std,delta chroma_std_std,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8
2589,0.235680,0.009760,2.952210,0.303350,0.216137,0.999395,0.010363,0.320692,-31.379200,1.765553,...,0.012844,0.018662,3.829446,-0.167321,-0.302246,0.371035,-0.812932,-0.244435,-0.096149,0.200345
628,0.124846,0.017471,2.810887,0.192076,0.194807,0.554240,0.013059,0.157980,-27.567072,2.093900,...,0.015214,0.011894,-0.222104,0.020181,0.964901,0.037393,-0.277781,-0.093168,0.062259,0.115177
3624,0.141802,0.013829,2.703969,0.214140,0.213496,0.630885,0.009897,0.200681,-28.366409,2.738096,...,0.012243,0.014548,0.683355,-0.576224,0.780313,0.772934,-0.321026,-0.232795,0.010974,0.007398
334,0.188074,0.031745,2.936792,0.285766,0.225586,1.338883,0.008230,0.319117,-26.487695,0.816579,...,0.013910,0.012083,-1.037803,1.428396,-0.609474,0.139821,-0.436549,0.041268,0.308474,-0.104781
2033,0.130984,0.032302,2.887379,0.195649,0.194125,0.643476,0.010679,0.157980,-26.356976,2.375391,...,0.024539,0.011692,-1.453639,-0.092132,0.596180,0.212327,0.028412,-0.050615,0.226806,-0.240500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4030,0.104088,0.016065,2.913445,0.184305,0.180524,0.951585,0.007368,0.163007,-26.615970,1.705458,...,0.016384,0.008215,-1.140757,0.569806,0.584158,-0.121291,0.044102,0.098803,0.181383,-0.027434
614,0.126715,0.018467,2.849393,0.206451,0.206298,0.759920,0.010864,0.161459,-28.774425,1.332989,...,0.010146,0.013961,0.906947,0.404402,0.392257,-0.868512,-0.086728,-0.290278,-0.207272,0.214023
2150,0.174011,0.014187,2.826650,0.243340,0.223314,0.847289,0.010034,0.246514,-28.961452,2.029162,...,0.008611,0.010318,1.398766,-0.050877,0.253441,0.652273,-0.363940,0.032580,0.091492,-0.060725
4016,0.056667,0.012202,2.869117,0.133142,0.174424,0.280502,0.012428,0.071459,-29.508417,3.197245,...,0.025126,0.010533,1.446964,-1.695385,0.060733,-0.165305,-0.159421,-0.189272,0.265207,-0.210041


In [51]:
# Add a column to the y vectors encoding each of the emotions.

y_train_dummies = pd.get_dummies(y_train)
y_train         = pd.concat([y_train, y_train_dummies], axis=1)

y_test_dummies  = pd.get_dummies(y_test)
y_test          = pd.concat([y_test, y_test_dummies], axis=1)


In [52]:
# Example: try to train a support vector machine with Gaussian radial kernel to distinguish between instances
# where emotion is 'HAP' and instances where emotion is 'NEU'. 

# Get the rows of X_train, X_test corresponding to just the emotions 'HAP' and 'NEU'

X_train_hap = X_train.loc[(y_train.Emotion == 'HAP') | (y_train.Emotion == 'NEU')]
X_test_hap  = X_test.loc[(y_test.Emotion == 'HAP') | (y_test.Emotion == 'NEU')]


# Get the Emotion_ANG column of the ys, with only the rows corresponding to 'ANG' and 'HAP'

y_train_hap = y_train.loc[(y_train.Emotion == 'HAP') | (y_train.Emotion == 'NEU')].Emotion_HAP
y_test_hap  = y_test.loc[(y_test.Emotion == 'HAP') | (y_test.Emotion == 'NEU')].Emotion_HAP

In [53]:
X_train_hap

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_12_std,delta chroma_std_std,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8
837,0.083640,0.008714,2.907248,0.154361,0.169981,0.577404,0.010233,0.114029,-29.271150,2.186646,...,0.013606,0.009506,1.318566,-0.455743,0.670038,-0.597643,-0.364124,0.116550,0.101812,0.180327
2865,0.149429,0.035009,2.791704,0.216961,0.189781,0.844939,0.010507,0.199283,-25.511441,1.502688,...,0.013935,0.011268,-2.016468,1.053339,0.807712,0.592642,0.006880,0.013533,0.037840,-0.073384
1520,0.256905,0.010561,2.896105,0.320240,0.218912,1.293098,0.007860,0.386708,-28.240995,1.338310,...,0.019076,0.011167,0.629027,0.857819,-0.130092,-0.082963,-0.457664,0.188952,-0.514057,0.265090
4387,0.141181,0.020960,2.860288,0.215679,0.210968,0.697310,0.009616,0.206195,-26.825397,2.904020,...,0.006555,0.008717,-1.013425,-0.670661,-0.377458,0.394598,0.272703,0.323641,-0.030125,0.073485
2446,0.138815,0.029166,2.925574,0.215789,0.202667,0.859799,0.011595,0.203428,-26.520580,2.123277,...,0.015622,0.014593,-1.120963,0.307861,0.534230,0.523530,0.273403,-0.138975,-0.131245,0.201737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3792,0.097672,0.018550,2.866305,0.177943,0.190071,0.676182,0.009996,0.131030,-26.738595,2.163153,...,0.029265,0.010182,-1.013123,0.026828,0.138536,0.312861,-0.133802,0.058443,-0.103318,0.057635
3536,0.150066,0.022885,2.881374,0.228971,0.203044,0.778530,0.010062,0.243123,-27.778914,2.489779,...,0.006413,0.012818,-0.131901,-0.674997,-0.979576,-0.216214,0.580227,0.088088,-0.178403,-0.373388
3817,0.109860,0.021370,2.953298,0.183701,0.179931,0.604015,0.014405,0.177944,-26.924905,2.437443,...,0.013362,0.009623,-1.064992,-0.471699,-0.327182,-0.199891,-0.603175,0.215551,0.189528,0.098006
4337,0.135008,0.016936,2.900700,0.222885,0.202980,0.792664,0.008293,0.174590,-26.584649,1.267600,...,0.003242,0.009500,-1.286763,0.569853,-0.046516,-0.636519,0.223718,-0.182746,0.558956,-0.257396


In [54]:
X_train_sub = X_train_hap [["comp_1", "comp_2", "comp_3", "comp_4", "comp_5", "comp_6", "comp_7", "comp_8"]]
X_test_sub  = X_test_hap [["comp_1", "comp_2", "comp_3", "comp_4", "comp_5", "comp_6", "comp_7", "comp_8"]]

y_train_sub = y_train_hap
y_test_sub  = y_test_hap

In [55]:
X_train_sub

Unnamed: 0,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8
837,1.318566,-0.455743,0.670038,-0.597643,-0.364124,0.116550,0.101812,0.180327
2865,-2.016468,1.053339,0.807712,0.592642,0.006880,0.013533,0.037840,-0.073384
1520,0.629027,0.857819,-0.130092,-0.082963,-0.457664,0.188952,-0.514057,0.265090
4387,-1.013425,-0.670661,-0.377458,0.394598,0.272703,0.323641,-0.030125,0.073485
2446,-1.120963,0.307861,0.534230,0.523530,0.273403,-0.138975,-0.131245,0.201737
...,...,...,...,...,...,...,...,...
3792,-1.013123,0.026828,0.138536,0.312861,-0.133802,0.058443,-0.103318,0.057635
3536,-0.131901,-0.674997,-0.979576,-0.216214,0.580227,0.088088,-0.178403,-0.373388
3817,-1.064992,-0.471699,-0.327182,-0.199891,-0.603175,0.215551,0.189528,0.098006
4337,-1.286763,0.569853,-0.046516,-0.636519,0.223718,-0.182746,0.558956,-0.257396


In [56]:
X_train_sub

Unnamed: 0,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8
837,1.318566,-0.455743,0.670038,-0.597643,-0.364124,0.116550,0.101812,0.180327
2865,-2.016468,1.053339,0.807712,0.592642,0.006880,0.013533,0.037840,-0.073384
1520,0.629027,0.857819,-0.130092,-0.082963,-0.457664,0.188952,-0.514057,0.265090
4387,-1.013425,-0.670661,-0.377458,0.394598,0.272703,0.323641,-0.030125,0.073485
2446,-1.120963,0.307861,0.534230,0.523530,0.273403,-0.138975,-0.131245,0.201737
...,...,...,...,...,...,...,...,...
3792,-1.013123,0.026828,0.138536,0.312861,-0.133802,0.058443,-0.103318,0.057635
3536,-0.131901,-0.674997,-0.979576,-0.216214,0.580227,0.088088,-0.178403,-0.373388
3817,-1.064992,-0.471699,-0.327182,-0.199891,-0.603175,0.215551,0.189528,0.098006
4337,-1.286763,0.569853,-0.046516,-0.636519,0.223718,-0.182746,0.558956,-0.257396


In [57]:
print("X_train_sub",  X_train_hap.shape)
print("y_train_sub",  y_train_hap.shape)

X_train_sub (1338, 144)
y_train_sub (1338,)


In [58]:
np.unique(y_test_hap)

array([0, 1], dtype=uint8)

In [59]:
# Build pipeline to first scale the mid feature data, then apply the SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model to the training data
pipe.fit(X_train_sub, y_train_sub)
    
# Get the model's prediction on the test data
pred_sub = pipe.predict(X_test_sub)


In [60]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the happy/neutral test set is:")
print(confusion_matrix(y_test_sub, pred_sub))
print()

# Look at the confusion matrix for the training data:
pred_train_sub = pipe.predict(X_train_sub)
print("confusion matrix for the happy/netural train set is:")
print(confusion_matrix(y_train_sub, pred_train_sub))


confusion matrix for the happy/neutral test set is:
[[124  51]
 [ 63  97]]

confusion matrix for the happy/netural train set is:
[[578 119]
 [199 442]]
