In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a dark background
sns.set_style("whitegrid")

In [2]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [3]:
# Read in your csv file that has the mid features. 

#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('midFeaturesTrainFinalWithChars.csv')


data.head()

print(data.shape)

(4876, 144)


In [4]:
# Split the data into train and test set, stratified by Emotion.


data_train, data_test = train_test_split(data.copy(),
                                   shuffle=True,
                                   random_state=608,
                                   stratify=data.Emotion,
                                   test_size=0.2
                                   )

In [5]:
data.sample(5)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,Age,Sex,Race,Ethnicity,zcr_mean,energy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
4194,1079_TSI_ANG_XX,1079,ANG,TSI,21,Female,Caucasian,Hispanic,0.303728,0.015018,...,0.022552,0.030333,0.017159,0.02724,0.01108,0.013277,0.012315,0.006473,0.002274,0.013206
60,1001_TSI_SAD_XX,1001,SAD,TSI,51,Male,Caucasian,Not Hispanic,0.309678,0.017705,...,0.017711,0.008719,0.015291,0.015983,0.001156,0.003163,0.003789,0.009982,0.010965,0.008639
596,1012_DFA_ANG_XX,1012,ANG,DFA,23,Female,Caucasian,Not Hispanic,0.185668,0.011214,...,0.024365,0.031473,0.009979,0.00877,0.012936,0.012354,0.00825,0.005689,0.003887,0.010146
4219,1081_ITH_ANG_XX,1081,ANG,ITH,30,Male,Asian,Not Hispanic,0.153446,0.015864,...,0.015763,0.014054,0.011688,0.012647,0.015345,0.011847,0.015263,0.030384,0.018206,0.010738
3150,1061_ITH_DIS_XX,1061,DIS,ITH,51,Female,African American,Not Hispanic,0.169124,0.007343,...,0.030847,0.026699,0.027932,0.033733,0.017686,0.010072,0.013512,0.015875,0.003901,0.012618


In [6]:
# Check the percentages of the different emotion categories in the training set

data_train.Emotion.value_counts(normalize=True)

NEU    0.178718
HAP    0.164359
FEA    0.164359
ANG    0.164359
SAD    0.164103
DIS    0.164103
Name: Emotion, dtype: float64

In [7]:
# Check the percentages of the different emotion categories in the test set

data_test.Emotion.value_counts(normalize=True)

NEU    0.179303
DIS    0.164959
ANG    0.163934
SAD    0.163934
FEA    0.163934
HAP    0.163934
Name: Emotion, dtype: float64

In [8]:
y_train = data_train[['Emotion']]
y_test  = data_test[['Emotion']]

X_train = data_train.drop(columns  = ['FileID','actorID', 'Emotion', 'SentenceID', 'Age', 'Race', 'Ethnicity'])
X_test  = data_test.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID', 'Age', 'Race', 'Ethnicity'])

In [9]:
X_test

Unnamed: 0,Sex,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
2589,Male,0.235680,0.009760,2.952210,0.303350,0.216137,0.999395,0.010363,0.320692,-31.379200,...,0.017679,0.015012,0.014373,0.056667,0.006126,0.012145,0.010345,0.022572,0.012844,0.018662
628,Female,0.124846,0.017471,2.810887,0.192076,0.194807,0.554240,0.013059,0.157980,-27.567072,...,0.017899,0.035160,0.038657,0.027456,0.017228,0.007149,0.011994,0.012888,0.015214,0.011894
3624,Male,0.141802,0.013829,2.703969,0.214140,0.213496,0.630885,0.009897,0.200681,-28.366409,...,0.026149,0.023505,0.020730,0.034072,0.004667,0.002825,0.017856,0.014437,0.012243,0.014548
334,Female,0.188074,0.031745,2.936792,0.285766,0.225586,1.338883,0.008230,0.319117,-26.487695,...,0.027717,0.021225,0.019300,0.012368,0.010234,0.019737,0.008871,0.020423,0.013910,0.012083
2033,Male,0.130984,0.032302,2.887379,0.195649,0.194125,0.643476,0.010679,0.157980,-26.356976,...,0.009061,0.014462,0.025784,0.020488,0.025813,0.009139,0.023292,0.024041,0.024539,0.011692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4030,Male,0.104088,0.016065,2.913445,0.184305,0.180524,0.951585,0.007368,0.163007,-26.615970,...,0.016347,0.023897,0.026896,0.012517,0.007997,0.016621,0.019528,0.013201,0.016384,0.008215
614,Female,0.126715,0.018467,2.849393,0.206451,0.206298,0.759920,0.010864,0.161459,-28.774425,...,0.017063,0.027947,0.031472,0.024461,0.018151,0.011833,0.007591,0.017712,0.010146,0.013961
2150,Male,0.174011,0.014187,2.826650,0.243340,0.223314,0.847289,0.010034,0.246514,-28.961452,...,0.022357,0.017457,0.024332,0.017196,0.011724,0.016242,0.010276,0.013545,0.008611,0.010318
4016,Male,0.056667,0.012202,2.869117,0.133142,0.174424,0.280502,0.012428,0.071459,-29.508417,...,0.024749,0.016428,0.024495,0.041709,0.004705,0.023955,0.020503,0.025600,0.025126,0.010533


In [10]:
# Add a column to the y vectors encoding each of the emotions.

y_train_dummies = pd.get_dummies(y_train)
y_train         = pd.concat([y_train, y_train_dummies], axis=1)

y_test_dummies  = pd.get_dummies(y_test)
y_test          = pd.concat([y_test, y_test_dummies], axis=1)


In [11]:
X_train_temp = X_train.loc[(X_train.Sex == 'Male')]
X_test_temp  = X_test.loc[(X_test.Sex == 'Male')]

X_train_sex = X_train_temp.drop(columns  = ['Sex'])
X_test_sex  = X_test_temp.drop(columns   = ['Sex'])

y_train_sex = y_train.loc[(X_train.Sex == 'Male')]
y_test_sex  = y_test.loc[(X_test.Sex == 'Male')]




In [12]:
# Example: try to train a support vector machine with Gaussian radial kernel to distinguish between instances
# where emotion is 'HAP' and instances where emotion is 'NEU'. 

# Get the rows of X_train, X_test corresponding to just the emotions 'HAP' and 'NEU'

X_train_hap = X_train_sex.loc[(y_train_sex.Emotion == 'HAP') | (y_train_sex.Emotion == 'NEU')]
X_test_hap  = X_test_sex.loc[(y_test_sex.Emotion == 'HAP') | (y_test_sex.Emotion == 'NEU')]


# Get the Emotion_ANG column of the ys, with only the rows corresponding to 'HAP' and 'NEU'

y_train_hap = y_train_sex.loc[(y_train_sex.Emotion == 'HAP') | (y_train_sex.Emotion == 'NEU')].Emotion_HAP
y_test_hap  = y_test_sex.loc[(y_test_sex.Emotion == 'HAP') | (y_test_sex.Emotion == 'NEU')].Emotion_HAP


In [13]:
X_train_ang = X_train_sex.loc[(y_train_sex.Emotion == 'ANG') | (y_train_sex.Emotion == 'NEU')]
X_test_ang  = X_test_sex.loc[(y_test_sex.Emotion == 'ANG') | (y_test_sex.Emotion == 'NEU')]

y_train_ang = y_train_sex.loc[(y_train_sex.Emotion == 'ANG') | (y_train_sex.Emotion == 'NEU')].Emotion_ANG
y_test_ang = y_test_sex.loc[(y_test_sex.Emotion == 'ANG') | (y_test_sex.Emotion == 'NEU')].Emotion_ANG

In [14]:
X_train_sad = X_train_sex.loc[(y_train_sex.Emotion == 'SAD') | (y_train_sex.Emotion == 'NEU')]
X_test_sad  = X_test_sex.loc[(y_test_sex.Emotion == 'SAD') | (y_test_sex.Emotion == 'NEU')]

y_train_sad = y_train_sex.loc[(y_train_sex.Emotion == 'SAD') | (y_train_sex.Emotion == 'NEU')].Emotion_SAD
y_test_sad  = y_test_sex.loc[(y_test_sex.Emotion == 'SAD') | (y_test_sex.Emotion == 'NEU')].Emotion_SAD

In [15]:
X_train_dis = X_train_sex.loc[(y_train_sex.Emotion == 'DIS') | (y_train_sex.Emotion == 'NEU')]
X_test_dis  = X_test_sex.loc[(y_test_sex.Emotion == 'DIS') | (y_test_sex.Emotion == 'NEU')]

y_train_dis = y_train_sex.loc[(y_train_sex.Emotion == 'DIS') | (y_train_sex.Emotion == 'NEU')].Emotion_DIS
y_test_dis  = y_test_sex.loc[(y_test_sex.Emotion == 'DIS') | (y_test_sex.Emotion == 'NEU')].Emotion_DIS

In [16]:
X_train_fea = X_train_sex.loc[(y_train_sex.Emotion == 'FEA') | (y_train_sex.Emotion == 'NEU')]
X_test_fea  = X_test_sex.loc[(y_test_sex.Emotion == 'FEA') | (y_test_sex.Emotion == 'NEU')]

y_train_fea = y_train_sex.loc[(y_train_sex.Emotion == 'FEA') | (y_train_sex.Emotion == 'NEU')].Emotion_FEA
y_test_fea  = y_test_sex.loc[(y_test_sex.Emotion == 'FEA') | (y_test_sex.Emotion == 'NEU')].Emotion_FEA

In [17]:
#X_train_sub = X_train
#X_test_sub  = X_test

#y_train_sub = y_train
#y_test_sub  = y_test

X_train_hap

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
837,0.083640,0.008714,2.907248,0.154361,0.169981,0.577404,0.010233,0.114029,-29.271150,2.186646,...,0.019546,0.028316,0.014390,0.012027,0.009329,0.017190,0.008330,0.008765,0.013606,0.009506
1520,0.256905,0.010561,2.896105,0.320240,0.218912,1.293098,0.007860,0.386708,-28.240995,1.338310,...,0.031973,0.020404,0.017673,0.012071,0.003121,0.005221,0.007172,0.013510,0.019076,0.011167
4387,0.141181,0.020960,2.860288,0.215679,0.210968,0.697310,0.009616,0.206195,-26.825397,2.904020,...,0.021346,0.027417,0.027228,0.020983,0.001063,0.012922,0.014918,0.013080,0.006555,0.008717
2037,0.069602,0.021352,2.940212,0.142247,0.175371,0.351410,0.012173,0.089776,-27.947142,3.050918,...,0.010065,0.003914,0.015036,0.023425,0.015738,0.012535,0.033597,0.026068,0.025076,0.013006
3341,0.084889,0.024041,2.696711,0.170332,0.191046,0.438256,0.011342,0.102234,-26.910799,2.322689,...,0.042847,0.017007,0.012116,0.015498,0.014017,0.012739,0.011837,0.018645,0.015275,0.013584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2184,0.095390,0.023484,2.797787,0.180074,0.194153,0.497705,0.011376,0.122153,-26.889665,2.028090,...,0.022023,0.024481,0.030945,0.042850,0.016385,0.023928,0.015026,0.018300,0.006639,0.015570
1300,0.073976,0.032121,3.022426,0.143086,0.177297,0.436937,0.013505,0.095648,-25.888176,2.923838,...,0.020277,0.021812,0.009861,0.009344,0.028006,0.044578,0.004752,0.008757,0.010958,0.013069
3584,0.083162,0.025444,2.882660,0.174315,0.205657,0.421808,0.010485,0.101602,-28.349770,2.906958,...,0.025261,0.029196,0.006812,0.015437,0.011145,0.019768,0.023299,0.018405,0.005539,0.010592
3536,0.150066,0.022885,2.881374,0.228971,0.203044,0.778530,0.010062,0.243123,-27.778914,2.489779,...,0.007250,0.011710,0.024362,0.015174,0.029199,0.018537,0.023584,0.034893,0.006413,0.012818


In [18]:
print("X_train_sub",  X_train_sad.shape)
print("y_train_sub",  y_train_sad.shape)

X_train_sub (700, 136)
y_train_sub (700,)


In [19]:
np.unique(y_test_ang)

array([0, 1], dtype=uint8)

In [20]:
# Build pipeline to first scale the mid feature data, then apply the SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model to the training data

pipe.fit(X_train_hap, y_train_hap)
pipe.fit(X_train_ang, y_train_ang)
pipe.fit(X_train_sad, y_train_sad)
pipe.fit(X_train_dis, y_train_dis)
pipe.fit(X_train_fea, y_train_fea)

    
# Get the model's prediction on the test data

pred_hap = pipe.predict(X_test_hap)
pred_ang = pipe.predict(X_test_ang)
pred_sad = pipe.predict(X_test_sad)
pred_dis = pipe.predict(X_test_dis)
pred_fea = pipe.predict(X_test_fea)



In [21]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the happy/neutral test set is:")
print(confusion_matrix(y_test_hap, pred_hap))
print()

# Look at the confusion matrix for the training data:
pred_train_hap = pipe.predict(X_train_hap)
print("confusion matrix for the happy/netural train set is:")
print(confusion_matrix(y_train_hap, pred_train_hap))


confusion matrix for the happy/neutral test set is:
[[96 14]
 [46 43]]

confusion matrix for the happy/netural train set is:
[[345  12]
 [157 182]]


In [22]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the angry/neutral test set is:")
print(confusion_matrix(y_test_ang, pred_ang))
print()

# Look at the confusion matrix for the training data:
pred_train_ang = pipe.predict(X_train_ang)
print("confusion matrix for the angry/netural train set is:")
print(confusion_matrix(y_train_ang, pred_train_ang))


confusion matrix for the angry/neutral test set is:
[[96 14]
 [38 45]]

confusion matrix for the angry/netural train set is:
[[345  12]
 [123 222]]


In [23]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the sad/neutral test set is:")
print(confusion_matrix(y_test_sad, pred_sad))
print()

# Look at the confusion matrix for the training data:
pred_train_sad = pipe.predict(X_train_sad)
print("confusion matrix for the sad/netural train set is:")
print(confusion_matrix(y_train_sad, pred_train_sad))


confusion matrix for the sad/neutral test set is:
[[96 14]
 [52 33]]

confusion matrix for the sad/netural train set is:
[[345  12]
 [195 148]]


In [24]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the disgust/neutral test set is:")
print(confusion_matrix(y_test_dis, pred_dis))
print()

# Look at the confusion matrix for the training data:
pred_train_dis = pipe.predict(X_train_dis)
print("confusion matrix for the disgust/netural train set is:")
print(confusion_matrix(y_train_dis, pred_train_dis))


confusion matrix for the disgust/neutral test set is:
[[96 14]
 [62 27]]

confusion matrix for the disgust/netural train set is:
[[345  12]
 [227 112]]


In [25]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the fear/neutral test set is:")
print(confusion_matrix(y_test_fea, pred_fea))
print()

# Look at the confusion matrix for the training data:
pred_train_fea = pipe.predict(X_train_fea)
print("confusion matrix for the fear/netural train set is:")
print(confusion_matrix(y_train_fea, pred_train_fea))


confusion matrix for the fear/neutral test set is:
[[96 14]
 [26 59]]

confusion matrix for the fear/netural train set is:
[[345  12]
 [ 26 317]]
