In [31]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [32]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [33]:
# Read in the csv file that has the mid features.

#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('../../midFeaturesTrainSetWithChars.csv')

# Print (rows, columns) as a quick sanity check on the load.
# (A bare `data.head()` used to sit here; because it was not the last
# expression of the cell its result was discarded, so it has been removed.)
print(data.shape)

(4877, 144)


In [34]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed and stratified by Emotion, so both parts keep the class proportions.

data_train, data_test = train_test_split(
    data.copy(),
    test_size=0.2,
    stratify=data["Emotion"],
    shuffle=True,
    random_state=608,
)

In [35]:
# Preview five random rows; the seed keeps the preview identical across re-runs.
data.sample(5, random_state=608)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,Age,Sex,Race,Ethnicity,zcr_mean,energy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
4558,1002_TSI_DIS_XX,1002,DIS,TSI,21,Female,Caucasian,Not Hispanic,0.063152,0.014035,...,0.019344,0.028552,0.00877,0.033243,0.006863,0.010172,0.01409,0.034345,0.007013,0.00799
837,1082_MTI_DIS_XX,1082,DIS,MTI,20,Female,Caucasian,Not Hispanic,0.053911,0.011049,...,0.026283,0.028846,0.014705,0.02464,0.00496,0.007395,0.011421,0.019587,0.008811,0.010601
1801,1074_TIE_HAP_XX,1074,HAP,TIE,31,Female,African American,Not Hispanic,0.068513,0.014595,...,0.02229,0.02009,0.02227,0.03669,0.012071,0.00429,0.019348,0.023656,0.015666,0.008969
2452,1017_IWW_SAD_XX,1017,SAD,IWW,42,Male,Caucasian,Not Hispanic,0.063355,0.021404,...,0.020207,0.016511,0.011392,0.029444,0.007853,0.002882,0.008727,0.02887,0.014876,0.008986
2494,1044_WSI_NEU_XX,1044,NEU,WSI,40,Male,Caucasian,Not Hispanic,0.070894,0.014864,...,0.027933,0.01613,0.012093,0.030013,0.004832,0.0051,0.008814,0.027979,0.004611,0.00957


In [36]:
# Relative frequency of each emotion category in the training set
# (should mirror the full data set because the split was stratified).

data_train["Emotion"].value_counts(normalize=True)

NEU    0.178672
SAD    0.164317
HAP    0.164317
DIS    0.164317
FEA    0.164317
ANG    0.164060
Name: Emotion, dtype: float64

In [37]:
# Relative frequency of each emotion category in the test set.

data_test["Emotion"].value_counts(normalize=True)

NEU    0.179303
ANG    0.164959
DIS    0.163934
SAD    0.163934
HAP    0.163934
FEA    0.163934
Name: Emotion, dtype: float64

In [38]:
# Split the train and test sets into labels (y) and features (X).

# Labels: single-column frames holding the emotion code of each sample.
y_train = data_train[['Emotion']]
y_test  = data_test[['Emotion']]

# Features: only `energy_mean` is used as a predictor in this notebook.
# `FileID` and `Sex` are carried along because the following cells filter on
# them; they are dropped again before the model is fitted.
#
# `.copy()` makes these frames independent of data_train / data_test, so the
# FileID rewrite in the next cell does not trigger SettingWithCopyWarning.
#
# (An earlier variant built X from every column except the metadata columns
#  'actorID', 'Emotion', 'SentenceID', 'Age', 'Race', 'Ethnicity'; that
#  assignment was immediately overwritten by the selection below, so it has
#  been removed.)
X_train = data_train[['FileID', 'energy_mean', 'Sex']].copy()
X_test  = data_test[['FileID', 'energy_mean', 'Sex']].copy()

In [18]:
# Separate the train and test (validation) sets based on the sentences.

# IEO - It's 11 o'clock.
# TIE - That is exactly what happened.
# IOM - I'm on my way to the meeting.
# IWW - I wonder what this is about.
# TAI - The airplane is almost full.
# MTI - Maybe tomorrow it will be cold.
# IWL - I would like a new alarm clock.
# ITH - I think I have a doctor's appointment.
# DFA - Don't forget a jacket.
# ITS - I think I've seen this before.
# TSI - The surface is slick.
# WSI - We'll stop in a couple of minutes.


# Replace the full file id (e.g. "1002_TSI_DIS_XX") by its three-letter
# sentence code (characters 5..7). The code is always derived from the
# untouched data_train / data_test frames, so re-running this cell gives the
# same result instead of slicing an already shortened string, and `.assign`
# returns a new frame rather than writing into a slice.
X_train = X_train.assign(FileID=data_train["FileID"].str[5:8])
X_test  = X_test.assign(FileID=data_test["FileID"].str[5:8])

################################################
# Optional sentence filtering.
#
# Earlier experiments restricted the data to a group of sentences by
# filtering on the three-letter code stored in FileID. Groupings tried:
#   * IEO, TIE, MTI, DFA
#   * IWW, ITH, ITS
#   * TAI, IWL, IOM, TSI, WSI
#
# To re-enable one of them, build each mask from the frame it is applied to:
#
#   sentences  = ['IWW', 'ITH', 'ITS']
#   keep_train = X_train.FileID.isin(sentences)
#   keep_test  = X_test.FileID.isin(sentences)
#   X_train_sen1, y_train_sen1 = X_train.loc[keep_train], y_train.loc[keep_train]
#   X_test_sen1,  y_test_sen1  = X_test.loc[keep_test],  y_test.loc[keep_test]
#
# Two mistakes in the previous commented-out versions are fixed in this recipe:
#   - the code 'IOE' does not exist (the sentence code is 'IEO'), so that
#     condition never matched any row;
#   - some test-set masks were built from X_train.FileID, whose index does not
#     belong to the test rows.
################################################

# Current run: no sentence filtering, every sentence is kept.
X_train_sen1, X_test_sen1 = X_train, X_test
y_train_sen1, y_test_sen1 = y_train, y_test

In [19]:
# Keep only the samples spoken by male actors, in features and labels alike.
is_male_train = X_train["Sex"] == 'Male'
is_male_test  = X_test["Sex"] == 'Male'

X_train_sen, y_train_sen = X_train_sen1.loc[is_male_train], y_train_sen1.loc[is_male_train]
X_test_sen,  y_test_sen  = X_test_sen1.loc[is_male_test],  y_test_sen1.loc[is_male_test]

In [20]:
# Display the emotion labels of the training rows kept by the Sex == 'Male' filter
y_train_sen

Unnamed: 0,Emotion
3831,SAD
4372,SAD
3061,DIS
1528,FEA
4037,SAD
...,...
233,DIS
333,FEA
4188,ANG
3857,NEU


In [21]:
# FileID and Sex were only needed for filtering; remove them so that the
# feature matrices contain nothing but model inputs.
helper_columns = ['FileID', 'Sex']
X_train_sen = X_train_sen.drop(columns=helper_columns)
X_test_sen  = X_test_sen.drop(columns=helper_columns)


In [22]:
# Add one indicator column per emotion (Emotion_ANG, Emotion_HAP, ...) next to
# the original Emotion column of the y frames.
#
# The dummies are always computed from the Emotion column alone, so re-running
# this cell does not one-hot encode the indicator columns a second time.

y_train_sen_dummies = pd.get_dummies(y_train_sen[['Emotion']])
y_train_sen         = pd.concat([y_train_sen[['Emotion']], y_train_sen_dummies], axis=1)

# Encode the *filtered* test labels (y_test_sen). Encoding the unfiltered
# y_test here made the concat below outer-join on the index and re-introduce
# the filtered-out rows with a missing Emotion.
y_test_sen_dummies  = pd.get_dummies(y_test_sen[['Emotion']])
y_test_sen          = pd.concat([y_test_sen[['Emotion']], y_test_sen_dummies], axis=1)


In [23]:
# Inspect the training feature matrix after FileID and Sex were dropped.
# Note: this displays the features (X), not the one-hot encoded labels;
# display y_train_sen instead to check the encoding.

X_train_sen

Unnamed: 0,energy_mean
3831,0.019216
4372,0.005302
3061,0.007254
1528,0.072522
4037,0.026553
...,...
233,0.023722
333,0.017539
4188,0.018437
3857,0.014280


In [24]:
# Emotion labels of the training rows whose emotion is 'NEU'
y_train_sen.loc[y_train_sen['Emotion'] == 'NEU', 'Emotion']

3674    NEU
3292    NEU
4808    NEU
418     NEU
649     NEU
       ... 
1423    NEU
3882    NEU
170     NEU
3816    NEU
3857    NEU
Name: Emotion, Length: 363, dtype: object

In [25]:
# Example: train a support vector machine with a Gaussian radial kernel to
# distinguish instances where the emotion is 'HAP' from instances where the
# emotion is 'NEU'.

binary_emotions = ['HAP', 'NEU']

# Row masks, computed once per split and reused for features and labels so
# that X and y are guaranteed to select the same rows.
train_mask = y_train_sen.Emotion.isin(binary_emotions)
test_mask  = y_test_sen.Emotion.isin(binary_emotions)

# Rows of X_train_sen / X_test_sen corresponding to just 'HAP' and 'NEU'
X_train_sub = X_train_sen.loc[train_mask]
X_test_sub  = X_test_sen.loc[test_mask]

# Binary target: the Emotion_HAP indicator of those rows (1 = HAP, 0 = NEU)
y_train_sub = y_train_sen.loc[train_mask].Emotion_HAP
y_test_sub  = y_test_sen.loc[test_mask].Emotion_HAP



In [26]:
# Features and target of the binary training subset must have the same number of rows.
for label, subset in (("X_train_sub", X_train_sub), ("y_train_sub", y_train_sub)):
    print(label, subset.shape)

X_train_sub (705, 1)
y_train_sub (705,)


In [27]:
# List the distinct target values present in the test subset
np.unique(y_test_sub)

array([0, 1], dtype=uint8)

In [28]:
# Two-stage model: standardise the mid feature data, then classify with a
# support vector machine using an RBF kernel.

model_steps = [
    ('scale', StandardScaler()),
    ('svc', SVC(kernel='rbf')),
]
pipe = Pipeline(model_steps)


# Fit on the training subset, then predict the test subset.
# (`fit` returns the fitted pipeline itself, so the calls can be chained.)

pred = pipe.fit(X_train_sub, y_train_sub).predict(X_test_sub)



In [29]:
# NOTE: this cell used to be a verbatim copy of the previous cell (build the
# StandardScaler + RBF-SVC pipeline, fit it on X_train_sub / y_train_sub and
# predict X_test_sub). It rebuilt and refitted the same pipeline on the same
# data, so the duplicate code has been removed.
# `pipe` and `pred` from the previous cell are used by the cells below.


In [30]:
# Compare held-out and in-sample performance through the confusion matrices
# (rows: true class, columns: predicted class).

# Predictions on the data the model was fitted on
pred_train = pipe.predict(X_train_sub)

cm_test  = confusion_matrix(y_test_sub, pred)
cm_train = confusion_matrix(y_train_sub, pred_train)

# Test matrix first, followed by an empty line, then the training matrix.
print("confusion matrix for the test set is:", cm_test, "", sep="\n")
print("confusion matrix for the train set is:", cm_train, sep="\n")


confusion matrix for the test set is:
[[77 27]
 [59 27]]

confusion matrix for the train set is:
[[256 107]
 [219 123]]
