In [1]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the seaborn plot style:
## grid lines on a white background ("whitegrid")
sns.set_style("whitegrid")

In [2]:

from sklearn.decomposition import PCA
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    precision_score,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# plot_confusion_matrix was removed in scikit-learn 1.2; keep the name importable
# on older releases without breaking this cell on newer ones.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


In [None]:
# Load the CSV file holding the mid features and report its (rows, columns) shape.
#
# NOTE(review): the path below ends in '/', i.e. it names a directory rather than
# a CSV file, so read_csv will presumably fail until a file name is appended.
# An earlier version of this cell read
# '../Feature Extraction/midFeaturesTrainSet.csv' -- confirm the intended file.
mid_features_path = '../Data/Mid_features/'
data = pd.read_csv(mid_features_path)
print(data.shape)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed (repeatable) and stratified on the Emotion column.
data_train, data_test = train_test_split(
    data.copy(),
    test_size=0.2,
    stratify=data["Emotion"],
    shuffle=True,
    random_state=608,
)

In [None]:
# Show five randomly chosen rows of the full table (no seed is passed here,
# so the displayed rows differ between runs).
data.sample(5)

In [None]:
# Relative frequency of each emotion category in the training set
# (normalize=True yields fractions that sum to 1, not percentages).
data_train["Emotion"].value_counts(normalize=True)

In [None]:
# Relative frequency of each emotion category in the test set
# (normalize=True yields fractions that sum to 1, not percentages).
data_test["Emotion"].value_counts(normalize=True)

In [None]:
# Columns that are not used as model inputs: the label itself plus the
# FileID / actorID / SentenceID columns.
non_feature_columns = ['FileID', 'actorID', 'Emotion', 'SentenceID']

# Labels are kept as one-column DataFrames (double brackets).
y_train, y_test = data_train[['Emotion']], data_test[['Emotion']]

# Every remaining column is treated as a feature.
X_train = data_train.drop(columns=non_feature_columns)
X_test = data_test.drop(columns=non_feature_columns)

In [None]:
# Display the test-set label frame.
y_test

In [None]:
# Replace the string emotion labels with one integer code per class.
# The y frames end up with a single column, "Emotion_ALL".

# Integer code assigned to each emotion label.
EMOTION_CODES = {"NEU": 1, "ANG": 2, "HAP": 3, "SAD": 4, "FEA": 5, "DIS": 6}


def encode_emotions(emotions):
    """Map a Series of emotion labels to a one-column DataFrame of integer codes.

    Parameters
    ----------
    emotions : pandas.Series
        Emotion labels; every value must be a key of EMOTION_CODES.

    Returns
    -------
    pandas.DataFrame
        Single column "Emotion_ALL" holding the integer codes, on the same
        index as `emotions`.

    Raises
    ------
    ValueError
        If a label is not a key of EMOTION_CODES.
    """
    codes = emotions.map(EMOTION_CODES)
    unknown = emotions[codes.isna()].unique()
    if len(unknown) > 0:
        # Fail loudly rather than silently producing an invalid class code.
        raise ValueError("Unknown emotion label(s): " + ", ".join(map(str, unknown)))
    return codes.astype(int).to_frame("Emotion_ALL")


# Encode straight from the split frames so this cell can be re-run safely and
# does not fail when one of the six classes is missing from a split.
y_train = encode_emotions(data_train["Emotion"])
y_test = encode_emotions(data_test["Emotion"])


In [None]:
# Baseline (no PCA): standardise the mid features, then fit an RBF-kernel SVC
# on all of them.
pipe100 = Pipeline([('scale', StandardScaler()),
                    ('svc', SVC(kernel='rbf'))])

# y_train is a one-column DataFrame; flatten it so scikit-learn receives a 1-D
# target instead of a column vector (which triggers a DataConversionWarning).
classifier100 = pipe100.fit(X_train, np.ravel(y_train))
pred100 = pipe100.predict(X_test)

# Confusion matrix for the test data:
cnf_matrix_test100 = confusion_matrix(y_test, pred100)

print("no PCA")
print("confusion matrix for all six emotions of the test set is:")
print(cnf_matrix_test100)
print()

# Confusion matrix for the training data:
pred_train100 = pipe100.predict(X_train)
cnf_matrix_train100 = confusion_matrix(y_train, pred_train100)

print("confusion matrix for all six emotions of the train set is:")
print(cnf_matrix_train100)

# Display order follows the integer codes 1..6 (NEU, ANG, HAP, SAD, FEA, DIS).
class_names = ["NEU", "ANG", "HAP", "SAD", "FEA", "DIS"]

# plot_confusion_matrix was removed in scikit-learn 1.2; draw the test matrix
# computed above with ConfusionMatrixDisplay instead.
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix_test100,
                              display_labels=class_names).plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Test set confusion matrix (no PCA)")
plt.show()

# Overall test accuracy = correctly classified samples (diagonal) / all samples.
acc100 = np.diag(cnf_matrix_test100).sum() / cnf_matrix_test100.sum()
print("accuracy =", acc100)




In [None]:
# PCA keeping enough components to explain 95% of the variance, fitted on the
# training features only.
# NOTE(review): PCA is fitted on X_train as-is; the StandardScaler below only
# rescales the resulting components. Consider scaling before PCA -- confirm intent.
pca95 = PCA(n_components=.95)
pca95.fit(X_train)

X_train_transform95 = pca95.transform(X_train)
X_test_transform95 = pca95.transform(X_test)

# Number of principal components that were kept.
n_components95 = X_train_transform95.shape[1]
print(n_components95)

# Put the components into frames of their own. The previous slice
# .iloc[:, -(k+1):] was off by one and also pulled in the last original feature.
component_names95 = ["comp_" + str(i + 1) for i in range(n_components95)]
X_train_sub95 = pd.DataFrame(X_train_transform95, index=X_train.index,
                             columns=component_names95)
X_test_sub95 = pd.DataFrame(X_test_transform95, index=X_test.index,
                            columns=component_names95)

# Original features plus components, kept under the names this cell always defined.
X_train_new95 = pd.concat([X_train, X_train_sub95], axis=1)
X_test_new95 = pd.concat([X_test, X_test_sub95], axis=1)

# Build pipeline to scale the principal components, then apply the SVC
pipe95 = Pipeline([('scale', StandardScaler()),
                   ('svc', SVC(kernel='rbf'))])

# Flatten the one-column label frame to the 1-D target scikit-learn expects.
classifier95 = pipe95.fit(X_train_sub95, np.ravel(y_train))
pred95 = pipe95.predict(X_test_sub95)

# Confusion matrix for the test data:
cnf_matrix_test95 = confusion_matrix(y_test, pred95)

print("for PCA n_components = 0.95")
print("confusion matrix for all six emotions of the test set is:")
print(cnf_matrix_test95)
print()

# Confusion matrix for the training data:
pred_train95 = pipe95.predict(X_train_sub95)
cnf_matrix_train95 = confusion_matrix(y_train, pred_train95)

print("confusion matrix for all six emotions of the train set is:")
print(cnf_matrix_train95)

# Display order follows the integer codes 1..6 (NEU, ANG, HAP, SAD, FEA, DIS).
class_names = ["NEU", "ANG", "HAP", "SAD", "FEA", "DIS"]

# plot_confusion_matrix was removed in scikit-learn 1.2; draw the test matrix
# computed above with ConfusionMatrixDisplay instead.
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix_test95,
                              display_labels=class_names).plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Test set confusion matrix (PCA, 95% variance)")
plt.show()

# Overall test accuracy = correctly classified samples (diagonal) / all samples.
acc95 = np.diag(cnf_matrix_test95).sum() / cnf_matrix_test95.sum()
print("accuracy =", acc95)

In [None]:
# PCA keeping enough components to explain 90% of the variance, fitted on the
# training features only.
# NOTE(review): PCA is fitted on X_train as-is; the StandardScaler below only
# rescales the resulting components. Consider scaling before PCA -- confirm intent.
pca90 = PCA(n_components=.9)
pca90.fit(X_train)

X_train_transform90 = pca90.transform(X_train)
X_test_transform90 = pca90.transform(X_test)

# Number of principal components that were kept.
n_components90 = X_train_transform90.shape[1]
print(n_components90)

# Put the components into frames of their own. The previous slice
# .iloc[:, -(k+1):] was off by one and also pulled in the last original feature.
component_names90 = ["comp_" + str(i + 1) for i in range(n_components90)]
X_train_sub90 = pd.DataFrame(X_train_transform90, index=X_train.index,
                             columns=component_names90)
X_test_sub90 = pd.DataFrame(X_test_transform90, index=X_test.index,
                            columns=component_names90)

# Original features plus components, kept under the names this cell always defined.
X_train_new90 = pd.concat([X_train, X_train_sub90], axis=1)
X_test_new90 = pd.concat([X_test, X_test_sub90], axis=1)

# Build pipeline to scale the principal components, then apply the SVC
pipe90 = Pipeline([('scale', StandardScaler()),
                   ('svc', SVC(kernel='rbf'))])

# Flatten the one-column label frame to the 1-D target scikit-learn expects.
classifier90 = pipe90.fit(X_train_sub90, np.ravel(y_train))
pred90 = pipe90.predict(X_test_sub90)

# Confusion matrix for the test data:
cnf_matrix_test90 = confusion_matrix(y_test, pred90)

print("for PCA n_components = 0.9")
print("confusion matrix for all six emotions of the test set is:")
print(cnf_matrix_test90)
print()

# Confusion matrix for the training data:
pred_train90 = pipe90.predict(X_train_sub90)
cnf_matrix_train90 = confusion_matrix(y_train, pred_train90)

print("confusion matrix for all six emotions of the train set is:")
print(cnf_matrix_train90)

# Display order follows the integer codes 1..6 (NEU, ANG, HAP, SAD, FEA, DIS).
class_names = ["NEU", "ANG", "HAP", "SAD", "FEA", "DIS"]

# plot_confusion_matrix was removed in scikit-learn 1.2; draw the test matrix
# computed above with ConfusionMatrixDisplay instead.
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix_test90,
                              display_labels=class_names).plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Test set confusion matrix (PCA, 90% variance)")
plt.show()

# Overall test accuracy = correctly classified samples (diagonal) / all samples.
acc90 = np.diag(cnf_matrix_test90).sum() / cnf_matrix_test90.sum()
print("accuracy =", acc90)

In [None]:
# PCA keeping enough components to explain 85% of the variance, fitted on the
# training features only.
# NOTE(review): PCA is fitted on X_train as-is; the StandardScaler below only
# rescales the resulting components. Consider scaling before PCA -- confirm intent.
pca85 = PCA(n_components=.85)
pca85.fit(X_train)

X_train_transform85 = pca85.transform(X_train)
X_test_transform85 = pca85.transform(X_test)

# Number of principal components that were kept.
n_components85 = X_train_transform85.shape[1]
print(n_components85)

# Put the components into frames of their own. The previous slice
# .iloc[:, -(k+1):] was off by one and also pulled in the last original feature.
component_names85 = ["comp_" + str(i + 1) for i in range(n_components85)]
X_train_sub85 = pd.DataFrame(X_train_transform85, index=X_train.index,
                             columns=component_names85)
X_test_sub85 = pd.DataFrame(X_test_transform85, index=X_test.index,
                            columns=component_names85)

# Original features plus components, kept under the names this cell always defined.
X_train_new85 = pd.concat([X_train, X_train_sub85], axis=1)
X_test_new85 = pd.concat([X_test, X_test_sub85], axis=1)

# Build pipeline to scale the principal components, then apply the SVC
pipe85 = Pipeline([('scale', StandardScaler()),
                   ('svc', SVC(kernel='rbf'))])

# Flatten the one-column label frame to the 1-D target scikit-learn expects.
classifier85 = pipe85.fit(X_train_sub85, np.ravel(y_train))
pred85 = pipe85.predict(X_test_sub85)

# Confusion matrix for the test data:
cnf_matrix_test85 = confusion_matrix(y_test, pred85)

print("for PCA n_components = 0.85")
print("confusion matrix for all six emotions of the test set is:")
print(cnf_matrix_test85)
print()

# Confusion matrix for the training data:
pred_train85 = pipe85.predict(X_train_sub85)
cnf_matrix_train85 = confusion_matrix(y_train, pred_train85)

print("confusion matrix for all six emotions of the train set is:")
print(cnf_matrix_train85)

# Display order follows the integer codes 1..6 (NEU, ANG, HAP, SAD, FEA, DIS).
class_names = ["NEU", "ANG", "HAP", "SAD", "FEA", "DIS"]

# plot_confusion_matrix was removed in scikit-learn 1.2; draw the test matrix
# computed above with ConfusionMatrixDisplay instead.
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix_test85,
                              display_labels=class_names).plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Test set confusion matrix (PCA, 85% variance)")
plt.show()

# Overall test accuracy = correctly classified samples (diagonal) / all samples.
acc85 = np.diag(cnf_matrix_test85).sum() / cnf_matrix_test85.sum()
print("accuracy =", acc85)

In [None]:

def _one_vs_rest_counts(cnf_matrix):
    """Return the per-class (FP, FN, TP, TN) count arrays of a confusion matrix."""
    true_pos = np.diag(cnf_matrix)
    false_pos = cnf_matrix.sum(axis=0) - true_pos   # column total minus diagonal
    false_neg = cnf_matrix.sum(axis=1) - true_pos   # row total minus diagonal
    true_neg = cnf_matrix.sum() - (false_pos + false_neg + true_pos)
    return false_pos, false_neg, true_pos, true_neg


# Per-class counts on the test set for each of the four models.
FP100, FN100, TP100, TN100 = _one_vs_rest_counts(cnf_matrix_test100)
FP95, FN95, TP95, TN95 = _one_vs_rest_counts(cnf_matrix_test95)
FP90, FN90, TP90, TN90 = _one_vs_rest_counts(cnf_matrix_test90)
FP85, FN85, TP85, TN85 = _one_vs_rest_counts(cnf_matrix_test85)

# Overall accuracy: diagonal total divided by the total number of samples.
acc100 = TP100.sum() / cnf_matrix_test100.sum()
acc95 = TP95.sum() / cnf_matrix_test95.sum()
acc90 = TP90.sum() / cnf_matrix_test90.sum()
acc85 = TP85.sum() / cnf_matrix_test85.sum()

# Per-class recall: TP / (TP + FN).
recall100 = TP100 / (TP100 + FN100)
recall95 = TP95 / (TP95 + FN95)
recall90 = TP90 / (TP90 + FN90)
recall85 = TP85 / (TP85 + FN85)

print("accuracy with 100% variance of data  =", acc100)
print("accuracy with 95% variance of data =", acc95)
print("accuracy with 90% variance of data =", acc90)
print("accuracy with 85% variance of data =", acc85)

print()

print("recall with 100% variance of data  =", recall100)
print("recall with 95% variance of data =", recall95)
print("recall with 90% variance of data =", recall90)
print("recall with 85% variance of data =", recall85)


In [None]:
# Per-class one-vs-rest rates derived from the FP/FN/TP/TN count arrays.
#
# This cell used to read FP, FN, TP and TN, which no cell defines (only the
# suffixed FP100/FP95/FP90/FP85 variants exist), so it raised NameError on a
# fresh kernel.
# NOTE(review): the no-PCA baseline counts are used here; swap the suffix
# (e.g. FP95) to inspect one of the PCA models -- confirm which was intended.
FP = FP100.astype(float)
FN = FN100.astype(float)
TP = TP100.astype(float)
TN = TN100.astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP)
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Overall accuracy for each class
ACC = (TP+TN)/(TP+FP+FN+TN)

In [None]:
# Display ACC, the per-class accuracy array.
ACC