In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a dark background
sns.set_style("whitegrid")

In [2]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [3]:
# Read in your csv file that has the mid features. 

#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('midFeaturesTrainFinal.csv')


data.head()

print(data.shape)

(4876, 140)


In [4]:
# Split the data into train and test set, stratified by Emotion.


data_train, data_test = train_test_split(data.copy(),
                                   shuffle=True,
                                   random_state=608,
                                   stratify=data.Emotion,
                                   test_size=0.2
                                   )

In [5]:
data.sample(5)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
1132,1021_IEO_NEU_XX,1021,NEU,IEO,0.109845,0.022935,2.797759,0.175797,0.172812,0.818707,...,0.003497,0.005871,0.022289,0.011769,0.002003,0.016609,0.020889,0.026449,0.015487,0.009683
1594,1027_WSI_SAD_XX,1027,SAD,WSI,0.084358,0.034149,2.842085,0.155885,0.183968,0.574514,...,0.011631,0.036246,0.037054,0.017512,0.002321,0.005637,0.009315,0.00665,0.002548,0.009674
4233,1081_IWL_FEA_XX,1081,FEA,IWL,0.092638,0.010302,2.914245,0.155239,0.171098,0.683903,...,0.007142,0.011719,0.006387,0.015237,0.003943,0.005497,0.023178,0.036379,0.018944,0.008619
3851,1074_TAI_FEA_XX,1074,FEA,TAI,0.141583,0.029665,2.921624,0.211662,0.198441,0.764911,...,0.014176,0.024852,0.030233,0.022923,0.012922,0.021377,0.008239,0.01472,0.015086,0.011235
4847,1091_MTI_DIS_XX,1091,DIS,MTI,0.128833,0.02238,2.881908,0.225087,0.22069,0.747327,...,0.011378,0.01766,0.009061,0.01822,0.015127,0.01251,0.015253,0.036717,0.028056,0.012897


In [6]:
# Check the percentages of the different emotion categories in the training set

data_train.Emotion.value_counts(normalize=True)

NEU    0.178718
HAP    0.164359
FEA    0.164359
ANG    0.164359
SAD    0.164103
DIS    0.164103
Name: Emotion, dtype: float64

In [7]:
# Check the percentages of the different emotion categories in the test set

data_test.Emotion.value_counts(normalize=True)

NEU    0.179303
DIS    0.164959
ANG    0.163934
SAD    0.163934
FEA    0.163934
HAP    0.163934
Name: Emotion, dtype: float64

In [8]:
y_train = data_train[['Emotion']]
y_test  = data_test[['Emotion']]

X_train = data_train.drop(columns  = ['FileID','actorID', 'Emotion'])
X_test  = data_test.drop(columns   = ['FileID','actorID', 'Emotion'])

In [9]:
X_train

Unnamed: 0,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
837,MTI,0.083640,0.008714,2.907248,0.154361,0.169981,0.577404,0.010233,0.114029,-29.271150,...,0.019546,0.028316,0.014390,0.012027,0.009329,0.017190,0.008330,0.008765,0.013606,0.009506
2490,ITS,0.151667,0.005787,2.860109,0.249394,0.251518,0.661992,0.007877,0.190744,-30.414050,...,0.023129,0.021614,0.032285,0.012211,0.004090,0.012739,0.007861,0.010684,0.006603,0.010285
3885,IOM,0.081441,0.042436,3.050415,0.161888,0.187664,0.368531,0.015429,0.106367,-25.875142,...,0.003848,0.006165,0.037184,0.027610,0.003999,0.026874,0.024719,0.020995,0.012547,0.015755
845,TAI,0.150406,0.010609,2.928441,0.227848,0.218879,0.627350,0.010261,0.211001,-29.907287,...,0.029635,0.032077,0.014713,0.012294,0.004989,0.013047,0.008941,0.010526,0.008577,0.010364
1890,IWL,0.087894,0.021971,3.021540,0.146429,0.162692,0.525216,0.010559,0.115676,-28.654314,...,0.036706,0.020806,0.005194,0.009952,0.025127,0.020504,0.016022,0.008138,0.002314,0.012838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,TIE,0.119205,0.016565,2.871881,0.213218,0.202107,0.942230,0.007878,0.192428,-25.248729,...,0.011450,0.007873,0.011765,0.012555,0.014883,0.008454,0.021950,0.024942,0.023826,0.008226
3271,DFA,0.138713,0.015348,2.759433,0.218334,0.191528,1.045024,0.009202,0.207948,-28.248249,...,0.022454,0.016348,0.030786,0.028595,0.020358,0.004879,0.009590,0.031912,0.014395,0.013876
3148,IOM,0.049920,0.047684,2.882915,0.130239,0.187099,0.211214,0.014590,0.072257,-26.456010,...,0.031786,0.044909,0.045720,0.032407,0.000848,0.009181,0.015284,0.004561,0.001602,0.013600
4337,WSI,0.135008,0.016936,2.900700,0.222885,0.202980,0.792664,0.008293,0.174590,-26.584649,...,0.016742,0.032236,0.036552,0.001970,0.000609,0.001014,0.015637,0.009353,0.003242,0.009500


In [10]:
# Add a column to the y vectors encoding each of the emotions.

y_train_dummies = pd.get_dummies(y_train)
y_train         = pd.concat([y_train, y_train_dummies], axis=1)

y_test_dummies  = pd.get_dummies(y_test)
y_test          = pd.concat([y_test, y_test_dummies], axis=1)


In [92]:
# Fron Sentences

#X_train_temp = X_train.loc[(X_train.SentenceID == 'IOE') | (X_train.SentenceID == 'TIE') | (X_train.SentenceID == 'MTI') | (X_train.SentenceID == 'DFA')]
#X_test_temp  = X_test.loc[(X_test.SentenceID == 'IOE') | (X_test.SentenceID == 'TIE') | (X_test.SentenceID == 'MTI') | (X_train.SentenceID == 'DFA')]

#X_train_sen = X_train_temp.drop(columns  = ['SentenceID'])
#X_test_sen  = X_test_temp.drop(columns   = ['SentenceID'])

#y_train_sen = y_train.loc[(X_train.SentenceID == 'IOE') | (X_train.SentenceID == 'TIE') | (X_train.SentenceID == 'MTI') | (X_train.SentenceID == 'DFA')]
#y_test_sen  = y_test.loc[(X_test.SentenceID == 'IOE') | (X_test.SentenceID == 'TIE') | (X_test.SentenceID == 'MTI') | (X_train.SentenceID == 'DFA')]

# Emb Sentences

#X_train_temp = X_train.loc[(X_train.SentenceID == 'IWW') | (X_train.SentenceID == 'ITH') | (X_train.SentenceID == 'ITS')]
#X_test_temp  = X_test.loc[(X_test.SentenceID == 'IWW') | (X_test.SentenceID == 'ITH') | (X_test.SentenceID == 'ITS')]

#X_train_sen = X_train_temp.drop(columns  = ['SentenceID'])
#X_test_sen  = X_test_temp.drop(columns   = ['SentenceID'])

#y_train_sen = y_train.loc[(X_train.SentenceID == 'IWW') | (X_train.SentenceID == 'ITH') | (X_train.SentenceID == 'ITS')]
#y_test_sen  = y_test.loc[(X_test.SentenceID == 'IWW') | (X_test.SentenceID == 'ITH') | (X_test.SentenceID == 'ITS')]

# Fron Sentences

X_train_temp = X_train.loc[(X_train.SentenceID == 'TAI') | (X_train.SentenceID == 'IWL') | (X_train.SentenceID == 'IOM') | (X_train.SentenceID == 'TSI') | (X_train.SentenceID == 'WSI')]
X_test_temp  = X_test.loc[(X_test.SentenceID == 'TAI') | (X_test.SentenceID == 'IWL') | (X_test.SentenceID == 'IOM') | (X_train.SentenceID == 'TSI') | (X_train.SentenceID == 'WSI')]

X_train_sen = X_train_temp.drop(columns  = ['SentenceID'])
X_test_sen  = X_test_temp.drop(columns   = ['SentenceID'])

y_train_sen = y_train.loc[(X_train.SentenceID == 'TAI') | (X_train.SentenceID == 'IWL') | (X_train.SentenceID == 'IOM') | (X_train.SentenceID == 'TSI') | (X_train.SentenceID == 'WSI')]
y_test_sen  = y_test.loc[(X_test.SentenceID == 'TAI') | (X_test.SentenceID == 'IWL') | (X_test.SentenceID == 'IOM') | (X_train.SentenceID == 'TSI') | (X_train.SentenceID == 'WSI')]





In [121]:
# Example: try to train a support vector machine with Gaussian radial kernel to distinguish between instances
# where emotion is 'ANG' and instances where emotion is 'HAP'. 

# Get the rows of X_train, X_test corresponding to just the emotions 'ANG' and 'HAP'

X_train_sub = X_train_sen.loc[(y_train_sen.Emotion == 'DIS') | (y_train_sen.Emotion == 'NEU')]
X_test_sub  = X_test_sen.loc[(y_test_sen.Emotion == 'DIS') | (y_test_sen.Emotion == 'NEU')]


# Get the Emotion_ANG column of the ys, with only the rows corresponding to 'ANG' and 'HAP'

y_train_sub = y_train_sen.loc[(y_train_sen.Emotion == 'DIS') | (y_train_sen.Emotion == 'NEU')].Emotion_DIS
y_test_sub  = y_test_sen.loc[(y_test_sen.Emotion == 'DIS') | (y_test_sen.Emotion == 'NEU')].Emotion_DIS


In [122]:
#X_train_sub = X_train
#X_test_sub  = X_test

#y_train_sub = y_train
#y_test_sub  = y_test

In [123]:
print("X_train_sub",  X_train_sub.shape)
print("y_train_sub",  y_train_sub.shape)

X_train_sub (575, 136)
y_train_sub (575,)


In [124]:
np.unique(y_test_sub)

array([0, 1], dtype=uint8)

In [125]:
# Build pipeline to first scale the mid feature data, then apply the SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model to the training data

pipe.fit(X_train_sub, y_train_sub)

    
# Get the model's prediction on the test data

pred = pipe.predict(X_test_sub)



In [126]:
# Build pipeline to first scale the mid feature data, then apply the SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model t#o the training data

pipe.fit(X_train_sub, y_train_sub)

    
# Get the model's prediction on the test data

pred = pipe.predict(X_test_sub)


In [127]:
# Look at the confusion matrix for the test data :

print("confusion matrix for the test set is:")
print(confusion_matrix(y_test_sub, pred))
print()

# Look at the confusion matrix for the training data:
pred_train = pipe.predict(X_train_sub)
print("confusion matrix for the train set is:")
print(confusion_matrix(y_train_sub, pred_train))


confusion matrix for the test set is:
[[21 20]
 [23 24]]

confusion matrix for the train set is:
[[271  20]
 [ 21 263]]
