In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Plot styling: seaborn's "whitegrid" theme draws
## grid lines on a white background
sns.set_style(style="whitegrid")

In [2]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [24]:
# Read in the csv file that has the mid features together with the
# per-clip metadata columns (e.g. actorID, Emotion, Sex).
# Alternative input kept for reference:
#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('midFeaturesTrainFinalWithChars.csv')

# Report (n_rows, n_columns). Only the shape is shown here: a row preview
# such as data.head() is displayed only when it is the last expression of a
# cell, so it belongs in a cell of its own.
print(data.shape)

(4876, 144)


In [4]:
# Hold out 20% of the rows as a test set, stratified by Sex (the label this
# notebook goes on to predict) so both splits keep the same Female/Male ratio.
# The fixed random_state makes the shuffle reproducible.
# NOTE(review): rows are split individually, so clips that share an actorID
# can land in both splits -- confirm this is acceptable for a speaker-sex
# classifier, or switch to a group-aware split.
data_train, data_test = train_test_split(
    data.copy(),
    test_size=0.2,
    random_state=608,
    shuffle=True,
    stratify=data['Sex'],
)

In [25]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,Age,Sex,Race,Ethnicity,zcr_mean,energy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
0,1001_DFA_ANG_XX,1001,ANG,DFA,51,Male,Caucasian,Not Hispanic,0.159956,0.012981,...,0.024107,0.014803,0.017961,0.013412,0.008655,0.010352,0.009738,0.0106,0.004328,0.009167
1,1001_DFA_DIS_XX,1001,DIS,DFA,51,Male,Caucasian,Not Hispanic,0.175069,0.006502,...,0.022395,0.01551,0.008768,0.014533,0.009661,0.002533,0.004223,0.007513,0.003662,0.007296
2,1001_DFA_FEA_XX,1001,FEA,DFA,51,Male,Caucasian,Not Hispanic,0.199849,0.016796,...,0.007043,0.003129,0.006915,0.007791,0.013899,0.005247,0.003474,0.014306,0.005781,0.00825
3,1001_DFA_HAP_XX,1001,HAP,DFA,51,Male,Caucasian,Not Hispanic,0.148663,0.00796,...,0.021737,0.005675,0.009277,0.026797,0.010147,0.010658,0.017229,0.013203,0.010011,0.007488
4,1001_DFA_NEU_XX,1001,NEU,DFA,51,Male,Caucasian,Not Hispanic,0.174283,0.010704,...,0.03197,0.012929,0.017969,0.037496,0.013379,0.008354,0.005615,0.008907,0.007483,0.013592


In [6]:
# Fraction of training rows carrying each Emotion label
# (normalize=True returns proportions instead of raw counts)
data_train['Emotion'].value_counts(normalize=True)

NEU    0.178462
ANG    0.165897
SAD    0.165641
HAP    0.163846
FEA    0.163846
DIS    0.162308
Name: Emotion, dtype: float64

In [7]:
# Fraction of test rows carrying each Emotion label
# (normalize=True returns proportions instead of raw counts)
data_test['Emotion'].value_counts(normalize=True)

NEU    0.180328
DIS    0.172131
FEA    0.165984
HAP    0.165984
ANG    0.157787
SAD    0.157787
Name: Emotion, dtype: float64

In [8]:
# Targets: the Sex column of each split, kept as a one-column DataFrame
y_train, y_test = data_train[['Sex']], data_test[['Sex']]

# Features: everything except the metadata / label columns.
# The same column list is applied to both splits so they cannot drift apart.
X_train, X_test = (
    split.drop(columns=['Age', 'Sex', 'Race', 'Ethnicity',
                        'FileID', 'actorID', 'Emotion', 'SentenceID'])
    for split in (data_train, data_test)
)

In [9]:
# Inspect the test-set labels (a one-column frame holding the Sex values)
y_test

Unnamed: 0,Sex
1415,Female
2255,Male
3569,Male
4305,Female
844,Male
...,...
2048,Male
429,Female
2476,Male
4544,Male


In [10]:
# One-hot encode the Sex label: each distinct value of the 'Sex' column
# becomes its own indicator column (Sex_Female, Sex_Male).
y_train_dummies = pd.get_dummies(y_train)
y_test_dummies  = pd.get_dummies(y_test)

# The encoded frames replace the raw label frames. A plain copy is enough --
# concatenating a single frame with itself along axis=1 adds nothing.
y_train = y_train_dummies.copy()
y_test  = y_test_dummies.copy()


In [11]:
# Inspect the one-hot encoded training labels (Sex_Female / Sex_Male columns)
y_train

Unnamed: 0,Sex_Female,Sex_Male
1455,1,0
3824,1,0
3456,0,1
4511,0,1
952,1,0
...,...,...
3770,1,0
2358,1,0
2451,1,0
970,1,0


In [12]:
# (Disabled) Keep only the rows of X_train / X_test whose speaker is female (Sex_Female == 1)

#X_train_female = X_train.loc[(y_train.Sex_Female == 1)]
#X_test_female  = X_test.loc[(y_test.Sex_Female == 1)]


# (Disabled) Keep the matching rows of y_train / y_test, i.e. only the female speakers (Sex_Female == 1)

#y_train_female = y_train.loc[(y_train.Sex_Female == 1)]
#y_test_female  = y_test.loc[(y_test.Sex_Female == 1)]


In [14]:
# Reduce the labels to a single binary indicator column: 1 = Female, 0 = Male
# (still a one-column DataFrame, not a Series)
y_train = y_train.loc[:, ['Sex_Female']]
y_test  = y_test.loc[:, ['Sex_Female']]

In [20]:
# Inspect the training feature matrix (metadata and label columns already dropped)
X_train

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
1455,0.152276,0.014005,2.858160,0.225772,0.202198,0.915462,0.010300,0.201941,-26.199925,1.471775,...,0.009301,0.012673,0.023289,0.016034,0.010283,0.010652,0.029966,0.016329,0.013935,0.009539
3824,0.104086,0.022710,2.845487,0.174065,0.176616,0.802384,0.011002,0.166472,-26.374801,2.016346,...,0.014727,0.015466,0.017787,0.016991,0.006119,0.017857,0.020510,0.018147,0.006925,0.007298
3456,0.115666,0.016213,2.887324,0.208233,0.208960,0.831476,0.006363,0.176736,-25.677236,1.845666,...,0.030056,0.024633,0.020371,0.010850,0.020868,0.012656,0.007392,0.014988,0.006579,0.008956
4511,0.182331,0.021859,3.018190,0.272223,0.230758,1.262529,0.008269,0.300924,-25.452489,1.784276,...,0.009593,0.017992,0.042877,0.014460,0.006433,0.018741,0.019959,0.011406,0.006427,0.011830
952,0.130392,0.014650,2.956819,0.223849,0.217712,0.744362,0.009760,0.193515,-29.473907,1.861406,...,0.015567,0.013012,0.013075,0.039164,0.021264,0.017114,0.015570,0.015386,0.013727,0.013050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3770,0.128361,0.018541,2.797039,0.210737,0.201596,0.825257,0.009534,0.188901,-28.232758,1.855671,...,0.027056,0.015323,0.007610,0.010292,0.013818,0.002610,0.018192,0.012243,0.014435,0.008191
2358,0.106178,0.017545,3.031209,0.160384,0.157824,0.779150,0.008671,0.142498,-27.290574,1.837535,...,0.009497,0.009457,0.009011,0.012437,0.015811,0.014470,0.012311,0.024431,0.010589,0.008290
2451,0.140405,0.021262,2.903997,0.236286,0.209877,1.090516,0.008436,0.219737,-26.874934,0.655581,...,0.014410,0.009704,0.019837,0.012117,0.006510,0.032699,0.021756,0.019479,0.011979,0.010825
970,0.091021,0.033374,2.913303,0.168944,0.189730,0.548550,0.010607,0.123395,-25.501708,2.640155,...,0.023608,0.027502,0.022592,0.013719,0.010831,0.017146,0.009844,0.006500,0.010199,0.011332


In [21]:
# Build a pipeline that first standardizes the mid features and then
# applies a support vector classifier with an RBF kernel
pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model to the training data.
# y_train is a one-column DataFrame, but the classifier expects a 1-D label
# array; passing the column vector as-is triggers scikit-learn's
# "column-vector y was passed" warning, so flatten it explicitly.
pipe.fit(X_train, y_train.values.ravel())


# Get the model's prediction on the test data
pred = pipe.predict(X_test)





  return f(*args, **kwargs)


In [23]:
# Predictions on the data the model was fitted on, used below to compare
# training performance against held-out performance
pred_train = pipe.predict(X_train)

# Confusion matrix on the test data, followed by a blank separator line.
# Rows are the true labels and columns the predicted labels.
print(
    "confusion matrix for the male/female test set is:",
    confusion_matrix(y_test, pred),
    "",
    sep="\n",
)

# Confusion matrix on the training data
print(
    "confusion matrix for the male/female train set is:",
    confusion_matrix(y_train, pred_train),
    sep="\n",
)


confusion matrix for the male/female test set is:
[[468  54]
 [ 66 388]]

confusion matrix for the male/female train set is:
[[2012   73]
 [  97 1718]]
