In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Apply seaborn's "whitegrid" theme to all subsequent plots:
## grid lines drawn on a white (not dark) background.
sns.set_style(style="whitegrid")

In [2]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix




In [4]:
# Load the training table from disk into `data`.
# NOTE(review): this file is meant to contain the mid-level features, but the
# preview recorded in this notebook shows only categorical columns -- confirm
# that the path points at the intended CSV.
data = pd.read_csv('../Data/Categories_train.csv')

# Peek at the first rows. Jupyter renders only a cell's final expression, so
# this mid-cell result is evaluated but not displayed.
data.head()

# (n_rows, n_columns) of the loaded table -- the value this cell displays.
data.shape

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
0,1001_DFA_ANG_XX,1001,DFA,ANG,XX
1,1001_DFA_DIS_XX,1001,DFA,DIS,XX
2,1001_DFA_FEA_XX,1001,DFA,FEA,XX
3,1001_DFA_HAP_XX,1001,DFA,HAP,XX
4,1001_DFA_NEU_XX,1001,DFA,NEU,XX


In [4]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed for reproducibility and stratified on Emotion so that both partitions
# keep the same class proportions. A copy of `data` is split so the original
# frame is left untouched.
data_train, data_test = train_test_split(
    data.copy(),
    test_size=0.2,
    stratify=data["Emotion"],
    shuffle=True,
    random_state=608,
)

In [5]:
# Sanity check on the stratified split: relative frequency of each emotion
# label in the training partition.
data_train["Emotion"].value_counts(normalize=True)

SAD    0.170838
DIS    0.170838
FEA    0.170838
HAP    0.170838
ANG    0.170670
NEU    0.145977
Name: Emotion, dtype: float64

In [6]:
# Same sanity check for the held-out partition: relative frequency of each
# emotion label in the test set (should mirror the training proportions).
data_test["Emotion"].value_counts(normalize=True)

ANG    0.171256
HAP    0.170584
DIS    0.170584
FEA    0.170584
SAD    0.170584
NEU    0.146407
Name: Emotion, dtype: float64

In [7]:
# Separate each partition into labels (y) and features (X).

# Columns that identify a recording or hold the target; everything else is
# treated as a feature.
# NOTE(review): the preview recorded earlier in this notebook lists the actor
# column as 'ActorID', while 'actorID' is dropped here. DataFrame.drop raises
# KeyError for labels that do not exist -- confirm the spelling against the CSV.
non_feature_columns = ['FileID', 'actorID', 'Emotion', 'SentenceID']

# Labels are kept as single-column DataFrames (not Series).
y_train = data_train.loc[:, ['Emotion']]
y_test = data_test.loc[:, ['Emotion']]

# Features: the partition minus the identifier/target columns.
X_train = data_train.drop(columns=non_feature_columns)
X_test = data_test.drop(columns=non_feature_columns)






In [8]:
# Append one indicator column per emotion (Emotion_ANG, Emotion_DIS, ...) to
# the label frames, keeping the original 'Emotion' column alongside them.
#
# The dummies are always derived from the 'Emotion' column alone, and the
# result is rebuilt from that same column. This keeps the cell idempotent:
# because y_train / y_test are reassigned here, encoding the whole frame would
# duplicate the Emotion_* columns every time the cell is re-run.
#
# dtype=int pins a 0/1 integer encoding; without it, pandas >= 2.0 produces
# boolean indicator columns instead.

y_train_dummies = pd.get_dummies(y_train[['Emotion']], dtype=int)

y_train = pd.concat([y_train[['Emotion']], y_train_dummies], axis=1)

y_test_dummies = pd.get_dummies(y_test[['Emotion']], dtype=int)

y_test = pd.concat([y_test[['Emotion']], y_test_dummies], axis=1)



In [9]:
# Eyeball the first five label rows: each should have exactly one Emotion_*
# indicator set, matching its 'Emotion' value.
y_train.head(n=5)

Unnamed: 0,Emotion,Emotion_ANG,Emotion_DIS,Emotion_FEA,Emotion_HAP,Emotion_NEU,Emotion_SAD
3771,SAD,0,0,0,0,0,1
2809,DIS,0,1,0,0,0,0
7041,FEA,0,0,1,0,0,0
2129,NEU,0,0,0,0,1,0
3342,SAD,0,0,0,0,0,1


In [10]:
# Example: binary classification of 'ANG' versus 'HAP' with a support vector
# machine using a Gaussian radial basis function kernel.

# Row masks selecting only the two emotions of interest in each partition.
binary_emotions = ['ANG', 'HAP']
train_mask = y_train.Emotion.isin(binary_emotions)
test_mask = y_test.Emotion.isin(binary_emotions)

# Feature rows restricted to 'ANG' / 'HAP'.
X_train_sub = X_train.loc[train_mask]
X_test_sub = X_test.loc[test_mask]

# Binary target for the same rows: the Emotion_ANG indicator
# (set for 'ANG', unset for 'HAP').
y_train_sub = y_train.loc[train_mask, 'Emotion_ANG']
y_test_sub = y_test.loc[test_mask, 'Emotion_ANG']

# Standardise the features, then classify with an RBF-kernel SVC. Wrapping both
# steps in a Pipeline means the scaler is fitted on the training rows only and
# then reused as-is when predicting.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('svc', SVC(kernel='rbf')),
])

# Fit scaler + classifier on the training subset.
pipe.fit(X_train_sub, y_train_sub)

# Predictions for the held-out subset.
pred = pipe.predict(X_test_sub)



In [11]:
# Confusion matrix on the held-out ANG/HAP rows
# (rows: true Emotion_ANG value, columns: predicted value).
# NOTE(review): the output recorded for this cell is close to evenly spread
# over all four entries, i.e. roughly chance-level accuracy -- worth
# investigating before building on this model.
confusion_matrix(y_true=y_test_sub, y_pred=pred)


array([[126, 128],
       [129, 126]])

In [12]:
# Confusion matrix on the ANG/HAP training rows the model was fitted on
# (rows: true Emotion_ANG value, columns: predicted value). Comparing this with
# the held-out matrix shows how much of the fit generalises.
pred_train = pipe.predict(X_train_sub)

confusion_matrix(y_true=y_train_sub, y_pred=pred_train)

array([[859, 158],
       [188, 828]])