In [34]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a dark background
sns.set_style("whitegrid")

In [35]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [36]:
# Read in your csv file that has the mid features. 

data = pd.read_csv('../Data/Categories_train.csv')
data.head()

print(data.shape)

(4877, 5)


In [37]:
# Split the data into train and test set, stratified by Emotion.


data_train, data_test = train_test_split(data.copy(),
                                   shuffle=True,
                                   random_state=608,
                                   stratify=data.Emotion,
                                   test_size=0.2
                                   )

In [38]:
data.sample(5)

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
1496,1026_IWW_NEU_XX,1026,IWW,NEU,XX
757,1015_IWL_FEA_XX,1015,IWL,FEA,XX
3422,1065_ITH_SAD_XX,1065,ITH,SAD,XX
3874,1075_DFA_DIS_XX,1075,DFA,DIS,XX
4360,1083_ITS_ANG_XX,1083,ITS,ANG,XX


In [14]:
# Check the percentages of the different emotion categories in the training set



data_train.Emotion.value_counts(normalize=True)

NEU    0.178672
SAD    0.164317
HAP    0.164317
DIS    0.164317
FEA    0.164317
ANG    0.164060
Name: Emotion, dtype: float64

In [15]:
# Check the percentages of the different emotion categories in the test set


data_test.Emotion.value_counts(normalize=True)

NEU    0.179303
ANG    0.164959
DIS    0.163934
SAD    0.163934
HAP    0.163934
FEA    0.163934
Name: Emotion, dtype: float64

In [27]:
# Split the train and test set into labels (y) and features (X)

y_train = data_train[['Emotion']]
y_test = data_test[['Emotion']]

X_train = data_train.drop(columns = ['FileID', 'ActorID', 'Emotion', 'SentenceID', 'Intensity' ])
X_test = data_test.drop(columns = ['FileID', 'ActorID', 'Emotion', 'SentenceID', 'Intensity' ])



In [28]:
X_train

3793
3642
404
4438
2997
...
3563
3630
2389
3754
2661


In [29]:
# Add a column to the y vectors encoding each of the emotions.


y_train_dummies = pd.get_dummies(y_train)

y_train = pd.concat([y_train, y_train_dummies], axis=1)

y_test_dummies = pd.get_dummies(y_test)

y_test = pd.concat([y_test, y_test_dummies], axis=1)



In [30]:
# Check that the encoding looks right

y_train.head()

Unnamed: 0,Emotion,Emotion_ANG,Emotion_DIS,Emotion_FEA,Emotion_HAP,Emotion_NEU,Emotion_SAD
3793,SAD,0,0,0,0,0,1
3642,ANG,1,0,0,0,0,0
404,HAP,0,0,0,1,0,0
4438,SAD,0,0,0,0,0,1
2997,DIS,0,1,0,0,0,0


In [31]:
# Example: try to train a support vector machine with Gaussian radial kernel to distinguish between instances
# where emotion is 'ANG' and instances where emotion is 'HAP'. 



# Get the rows of X_train, X_test corresponding to just the emotions 'ANG' and 'HAP'

X_train_sub = X_train.loc[(y_train.Emotion == 'ANG') | (y_train.Emotion == 'HAP')]
X_test_sub = X_test.loc[(y_test.Emotion == 'ANG') | (y_test.Emotion == 'HAP')]


# Get the Emotion_ANG column of the ys, with only the rows corresponding to 'ANG' and 'HAP'

y_train_sub = y_train.loc[(y_train.Emotion == 'ANG') | (y_train.Emotion == 'HAP')].Emotion_ANG
y_test_sub = y_test.loc[(y_test.Emotion == 'ANG') | (y_test.Emotion == 'HAP')].Emotion_ANG



In [32]:
X_train_sub

3642
404
2576
3937
313
...
3584
4220
3563
3630
2661


In [33]:
# Build pipeline to first scale the mid feature data, then apply the SVC

pipe = Pipeline([('scale', StandardScaler()),
                 ('svc', SVC(kernel='rbf'))])


# Fit the model t#o the training data

pipe.fit(X_train_sub, y_train_sub)

    
# Get the model's prediction on the test data

pred = pipe.predict(X_test_sub)


ValueError: at least one array or dtype is required

In [11]:
# Look at the confusion matrix for the test data :
confusion_matrix(y_test_sub, pred)


array([[126, 128],
       [129, 126]])

In [12]:
# Look at the confusion matrix for the training data:
pred_train = pipe.predict(X_train_sub)

confusion_matrix(y_train_sub, pred_train)

array([[859, 158],
       [188, 828]])