In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the seaborn plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [2]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix


In [40]:
# Load the mid-feature tables for the pre-made train and test splits.
# Each CSV holds the audio features together with descriptive columns
# (FileID, ActorID, Emotion, SentenceID, Age, Sex, Race, Ethnicity).
data_train = pd.read_csv('../Data/midFeaturesTrainSetWithChars.csv')
data_test = pd.read_csv('../Data/midFeaturesTestSetWithChars.csv')

# Quick sanity check: (rows, columns) of the training table.
print(data_train.shape)

(4877, 144)


In [41]:
# Partition the train and test tables by speaker sex.
# Each boolean mask is built from the SAME frame it filters: a mask taken from
# a different frame is aligned by index label, which can select the wrong rows
# (or raise), and no frame named `data` is defined in this notebook.
fem_train_subset = data_train.loc[data_train["Sex"] == 'Female', :]
mal_train_subset = data_train.loc[data_train["Sex"] == 'Male', :]

fem_test_subset = data_test.loc[data_test["Sex"] == 'Female', :]
mal_test_subset = data_test.loc[data_test["Sex"] == 'Male', :]

# Quick sanity check: (rows, columns) of the Female training subset.
print(fem_train_subset.shape)

(2270, 144)


In [42]:
# Preview five random Female training rows; the fixed seed keeps the displayed
# rows the same across Restart & Run All.
fem_train_subset.sample(5, random_state=0)

Unnamed: 0,FileID,ActorID,Emotion,SentenceID,Age,zcr_mean,Sex,Race,Ethnicity,energy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
488,1053_WSI_SAD_XX,1053,SAD,WSI,35,0.05046,Female,Caucasian,Not Hispanic,0.01549,...,0.018879,0.021546,0.016572,0.029255,0.004145,0.002658,0.015428,0.017136,0.002221,0.01079
2346,1091_TIE_DIS_XX,1091,DIS,TIE,29,0.06466,Female,Asian,Not Hispanic,0.017737,...,0.017463,0.017178,0.009117,0.02371,0.007539,0.008483,0.014724,0.025008,0.003134,0.007261
3107,1056_TIE_DIS_XX,1056,DIS,TIE,52,0.079846,Female,African American,Not Hispanic,0.014319,...,0.028621,0.027631,0.011061,0.025581,0.003518,0.014594,0.012224,0.015586,0.013462,0.00819
4417,1024_TIE_SAD_XX,1024,SAD,TIE,59,0.044333,Female,Caucasian,Not Hispanic,0.020743,...,0.01032,0.018875,0.009142,0.042972,0.000404,0.013557,0.017132,0.033208,0.00197,0.009079
1146,1078_ITS_NEU_XX,1078,NEU,ITS,21,0.091733,Female,Caucasian,Not Hispanic,0.021334,...,0.011387,0.015087,0.010534,0.030826,0.000877,0.010174,0.021275,0.024919,0.005566,0.008069


In [39]:
# Class balance check: share of each emotion label among the Female training rows.
fem_train_subset["Emotion"].value_counts(normalize=True)

NEU    0.178414
HAP    0.164317
FEA    0.164317
ANG    0.164317
SAD    0.164317
DIS    0.164317
Name: Emotion, dtype: float64

In [44]:
# Class balance check: share of each emotion label among the Female test rows.
fem_test_subset["Emotion"].value_counts(normalize=True)

NEU    0.183124
HAP    0.172352
SAD    0.168761
FEA    0.168761
DIS    0.157989
ANG    0.149013
Name: Emotion, dtype: float64

In [83]:
# Columns that describe the recording or speaker rather than the audio signal;
# they are removed from every feature matrix below.
non_feature_columns = ['FileID', 'ActorID', 'Emotion', 'SentenceID',
                       'Age', 'Sex', 'Race', 'Ethnicity']

# Female subset: labels (y) are a one-column DataFrame holding 'Emotion',
# features (X) are everything that is left after dropping the columns above.
y_fem_train = fem_train_subset.loc[:, ['Emotion']]
X_fem_train = fem_train_subset.drop(columns=non_feature_columns)

y_fem_test = fem_test_subset.loc[:, ['Emotion']]
X_fem_test = fem_test_subset.drop(columns=non_feature_columns)

# Male subset: same treatment.
y_mal_train = mal_train_subset.loc[:, ['Emotion']]
X_mal_train = mal_train_subset.drop(columns=non_feature_columns)

y_mal_test = mal_test_subset.loc[:, ['Emotion']]
X_mal_test = mal_test_subset.drop(columns=non_feature_columns)



In [84]:
# Preview five random Male training labels; the fixed seed keeps the displayed
# rows the same across Restart & Run All.
y_mal_train.sample(5, random_state=0)

Unnamed: 0,Emotion
2240,DIS
2102,HAP
3525,FEA
2399,NEU
3016,NEU


In [85]:
# Append one indicator column per emotion (Emotion_ANG, Emotion_DIS, ...) to
# each label frame.
#
# The indicators are always derived from the 'Emotion' column alone, and the
# result is rebuilt from that column rather than from the whole current frame.
# This makes the cell idempotent: re-running it does not stack a second copy of
# the Emotion_* columns onto frames that were already encoded.
#
# NOTE(review): train and test are encoded independently, so the indicator
# columns only line up if both splits contain the same set of emotions.

# Female data subset.
y_fem_train_dummies = pd.get_dummies(y_fem_train[['Emotion']])
y_fem_train = pd.concat([y_fem_train[['Emotion']], y_fem_train_dummies], axis=1)

y_fem_test_dummies = pd.get_dummies(y_fem_test[['Emotion']])
y_fem_test = pd.concat([y_fem_test[['Emotion']], y_fem_test_dummies], axis=1)

# Male data subset.
y_mal_train_dummies = pd.get_dummies(y_mal_train[['Emotion']])
y_mal_train = pd.concat([y_mal_train[['Emotion']], y_mal_train_dummies], axis=1)

y_mal_test_dummies = pd.get_dummies(y_mal_test[['Emotion']])
y_mal_test = pd.concat([y_mal_test[['Emotion']], y_mal_test_dummies], axis=1)

In [86]:
# Eyeball the first rows: each row should have exactly one Emotion_* indicator
# set, matching its 'Emotion' label.
y_mal_train.head(n=5)

Unnamed: 0,Emotion,Emotion_ANG,Emotion_DIS,Emotion_FEA,Emotion_HAP,Emotion_NEU,Emotion_SAD
1,NEU,0,0,0,0,1,0
2,ANG,1,0,0,0,0,0
6,HAP,0,0,0,1,0,0
8,NEU,0,0,0,0,1,0
9,DIS,0,1,0,0,0,0


In [120]:
# Example binary task: train a support vector machine with a Gaussian radial
# kernel to tell 'ANG' utterances apart from 'HAP' utterances.

# The two emotions that take part in the binary task.
binary_emotions = ['ANG', 'HAP']

# One boolean row-selector per label frame, reused for both X and y so the
# feature rows and the target rows always stay in step.
fem_train_rows = y_fem_train['Emotion'].isin(binary_emotions)
fem_test_rows = y_fem_test['Emotion'].isin(binary_emotions)
mal_train_rows = y_mal_train['Emotion'].isin(binary_emotions)
mal_test_rows = y_mal_test['Emotion'].isin(binary_emotions)

# Female data subset: features restricted to the ANG/HAP rows, and the
# Emotion_ANG indicator as the target (1 for 'ANG', 0 for 'HAP').
X_fem_train_sub = X_fem_train.loc[fem_train_rows]
X_fem_test_sub = X_fem_test.loc[fem_test_rows]

y_fem_train_sub = y_fem_train.loc[fem_train_rows, 'Emotion_ANG']
y_fem_test_sub = y_fem_test.loc[fem_test_rows, 'Emotion_ANG']

###################

# Male data subset: same construction.
X_mal_train_sub = X_mal_train.loc[mal_train_rows]
X_mal_test_sub = X_mal_test.loc[mal_test_rows]

y_mal_train_sub = y_mal_train.loc[mal_train_rows, 'Emotion_ANG']
y_mal_test_sub = y_mal_test.loc[mal_test_rows, 'Emotion_ANG']

In [121]:
# Display the Male binary training target: the Emotion_ANG indicator for the
# rows whose emotion is 'ANG' or 'HAP'.
y_mal_train_sub

2       1
6       0
13      1
17      1
20      1
       ..
4837    0
4839    0
4849    0
4864    1
4870    1
Name: Emotion_ANG, Length: 856, dtype: uint8

In [128]:
def make_svc_pipeline():
    """Return a fresh, unfitted pipeline: standardise the features, then RBF-kernel SVC."""
    return Pipeline([('scale', StandardScaler()),
                     ('svc', SVC(kernel='rbf'))])


# Each subset gets its OWN pipeline object. Calling .fit() on a pipeline
# replaces whatever it learned before, so sharing one object between the Male
# and Female fits would leave only the last-fitted model available afterwards,
# and any later prediction on the other subset would silently use the wrong one.

# Fit on the Male training data and predict the Male test data.
pipe_mal = make_svc_pipeline()
pipe_mal.fit(X_mal_train_sub, y_mal_train_sub)
pred_mal = pipe_mal.predict(X_mal_test_sub)


###################


# Fit on the Female training data and predict the Female test data.
pipe_fem = make_svc_pipeline()
pipe_fem.fit(X_fem_train_sub, y_fem_train_sub)
pred_fem = pipe_fem.predict(X_fem_test_sub)


# Backward-compatible alias: this cell used to end with `pipe` fitted on the
# Female training data, and later cells call `pipe.predict(...)`.
pipe = pipe_fem




In [136]:
# Confusion matrix on the Female test data (left disabled, as before):
#confusion_matrix(y_fem_test_sub, pred_fem)

###################

# Confusion matrix on the Male test data.
# Rows are the true Emotion_ANG values, columns are the predicted values.
mal_test_cm = confusion_matrix(y_true=y_mal_test_sub, y_pred=pred_mal)
mal_test_cm

array([[55, 46],
       [61, 53]])

In [140]:
# Confusion matrix on the Female training data subset (in-sample fit).
# At this point `pipe` holds the model most recently fitted on the Female
# training rows, so these are predictions on data the model has already seen.
fem_pred_train = pipe.predict(X_fem_train_sub)

###################

# Confusion matrix on the Male training data subset (left disabled, as before).
# NOTE(review): as written, the lines below would score the Male training rows
# with `pipe`, which was last fitted on the FEMALE training data. That measures
# cross-sex transfer, not the Male model's training fit. To get the Male
# in-sample matrix, predict with a pipeline fitted on X_mal_train_sub.

#mal_pred_train = pipe.predict(X_mal_train_sub)

#confusion_matrix(y_mal_train_sub, mal_pred_train)

fem_train_cm = confusion_matrix(y_true=y_fem_train_sub, y_pred=fem_pred_train)
fem_train_cm

array([[316,  57],
       [ 53, 320]])