In [4]:
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a dark background
sns.set_style("whitegrid")

In [5]:

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix


In [6]:
# Read in your csv file that has the mid features. 

#data = pd.read_csv('../Feature Extraction/midFeaturesTrainSet.csv')
data = pd.read_csv('midFeaturesTrainFinal.csv')
print(data.shape)

(4876, 140)


In [7]:
data.head()

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
0,1001_DFA_ANG_XX,1001,ANG,DFA,0.12174,0.010421,2.983526,0.216327,0.225219,0.932025,...,0.019988,0.018907,0.020462,0.016111,0.006406,0.00784,0.013986,0.014026,0.003441,0.00775
1,1001_DFA_DIS_XX,1001,DIS,DFA,0.161743,0.00653,2.940205,0.246532,0.212951,1.265724,...,0.02151,0.024405,0.024236,0.024355,0.00888,0.00261,0.004799,0.011605,0.003827,0.010198
2,1001_DFA_FEA_XX,1001,FEA,DFA,0.158708,0.015425,2.973619,0.252136,0.225729,1.245681,...,0.007116,0.003273,0.00796,0.058401,0.010373,0.00383,0.009172,0.025511,0.005837,0.017773
3,1001_DFA_HAP_XX,1001,HAP,DFA,0.159097,0.00576,2.937929,0.229749,0.208469,1.379728,...,0.014083,0.006455,0.007594,0.043598,0.007653,0.011884,0.015029,0.013349,0.014063,0.012297
4,1001_DFA_NEU_XX,1001,NEU,DFA,0.164732,0.008302,2.892321,0.264956,0.227461,1.380184,...,0.024043,0.014561,0.020798,0.051023,0.011482,0.004178,0.002889,0.015255,0.007335,0.016231


In [8]:
# Split the data into train and test set, stratified by Emotion.
data_train, data_test = train_test_split(data.copy(),
                                   shuffle=True,
                                   random_state=608,
                                   stratify=data.Emotion,
                                   test_size=0.2
                                   )

In [9]:
data.sample(5)

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
4381,1083_MTI_NEU_XX,1083,NEU,MTI,0.116958,0.016879,2.788513,0.208027,0.216303,0.6421,...,0.03183,0.030765,0.02812,0.018399,0.001263,0.012042,0.017294,0.024277,0.013567,0.012084
276,1005_IOM_FEA_XX,1005,FEA,IOM,0.15016,0.022103,3.05025,0.228797,0.205126,1.185003,...,0.012702,0.023818,0.023478,0.029286,0.007082,0.008151,0.011418,0.015805,0.011953,0.011054
1256,1022_WSI_FEA_XX,1022,FEA,WSI,0.187588,0.00604,2.839253,0.287019,0.229386,0.978975,...,0.00578,0.016451,0.023722,0.056643,0.004344,0.020056,0.01345,0.024273,0.005076,0.015962
3379,1064_MTI_SAD_XX,1064,SAD,MTI,0.087068,0.024371,2.804405,0.15856,0.188741,0.489511,...,0.038682,0.042925,0.028429,0.034153,0.008739,0.015172,0.011124,0.012655,0.009288,0.016317
4621,1087_ITH_ANG_XX,1087,ANG,ITH,0.177621,0.013839,2.738828,0.266778,0.226347,1.247138,...,0.004619,0.032639,0.017403,0.025345,0.012711,0.015971,0.022073,0.023607,0.009356,0.009774


In [10]:
# Check the percentages of the different emotion categories in the training set

data_train.Emotion.value_counts(normalize=True)

NEU    0.178718
HAP    0.164359
FEA    0.164359
ANG    0.164359
SAD    0.164103
DIS    0.164103
Name: Emotion, dtype: float64

In [11]:
# Check the percentages of the different emotion categories in the test set

data_test.Emotion.value_counts(normalize=True)

NEU    0.179303
DIS    0.164959
ANG    0.163934
SAD    0.163934
FEA    0.163934
HAP    0.163934
Name: Emotion, dtype: float64

In [12]:
y_train = data_train[['Emotion']]
y_test  = data_test[['Emotion']]

X_train = data_train.drop(columns  = ['FileID','actorID', 'Emotion', 'SentenceID'])
X_test  = data_test.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID'])

In [13]:
print(y_train)
print(np.unique(y_train))

     Emotion
837      HAP
2490     SAD
3885     SAD
845      SAD
1890     FEA
...      ...
2046     ANG
3271     DIS
3148     SAD
4337     HAP
3075     NEU

[3900 rows x 1 columns]
['ANG' 'DIS' 'FEA' 'HAP' 'NEU' 'SAD']


In [14]:
# Drop most emotions
# emotion_list = ['NEU', 'ANG', 'FEA']
emotion_list = ['NEU', 'ANG', 'HAP']
I_train = y_train.isin(emotion_list)
I_test = y_test.isin(emotion_list)

X_train = X_train[I_train.values]
X_test = X_test[I_test.values]

y_train = y_train.loc[y_train.Emotion.isin(emotion_list),:]
y_test = y_test.loc[y_test.Emotion.isin(emotion_list),:]

In [16]:
y_train

Unnamed: 0,Emotion,Emotion_ANG,Emotion_HAP,Emotion_NEU
837,HAP,0,1,0
2865,HAP,0,1,0
1520,NEU,0,0,1
4387,NEU,0,0,1
2446,NEU,0,0,1
...,...,...,...,...
3161,ANG,1,0,0
2399,ANG,1,0,0
2046,ANG,1,0,0
4337,HAP,0,1,0


In [17]:
X_test

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
3624,0.137966,0.009896,2.769996,0.244530,0.244659,0.618559,0.013420,0.184352,-31.924370,2.242827,...,0.019029,0.020770,0.018790,0.073024,0.003929,0.003633,0.016040,0.031665,0.010291,0.021890
334,0.149873,0.023765,2.856053,0.254200,0.228800,1.168319,0.009826,0.249018,-31.408660,0.987017,...,0.030689,0.019946,0.019648,0.059214,0.009834,0.018090,0.010856,0.026703,0.013541,0.020258
2245,0.153501,0.016649,2.972033,0.233179,0.221148,0.920822,0.008758,0.228564,-29.759123,1.590153,...,0.033473,0.015486,0.019813,0.028999,0.001472,0.012190,0.008530,0.015049,0.010237,0.013429
4155,0.142130,0.007567,2.815767,0.227720,0.205058,0.917199,0.008902,0.212332,-30.695278,1.472583,...,0.017955,0.025019,0.037004,0.014575,0.004067,0.013144,0.011602,0.017142,0.009222,0.010781
295,0.121234,0.009051,2.767798,0.208866,0.211853,0.917593,0.011500,0.174069,-31.293380,1.357021,...,0.021114,0.023511,0.029164,0.047461,0.007517,0.015945,0.011983,0.018790,0.009628,0.015865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3407,0.120261,0.016759,2.877384,0.220216,0.227230,0.925920,0.009321,0.176483,-31.919319,1.442480,...,0.019422,0.019011,0.008170,0.058998,0.014311,0.004996,0.018622,0.032497,0.007440,0.017963
3730,0.164841,0.011010,2.912963,0.274387,0.243619,0.862875,0.010072,0.254701,-33.127634,1.167642,...,0.021365,0.022253,0.020306,0.051793,0.005381,0.007973,0.019596,0.031670,0.011889,0.016010
4030,0.096376,0.016511,2.838537,0.176307,0.175633,0.874673,0.008177,0.155747,-26.496701,1.937831,...,0.012252,0.021411,0.028839,0.015110,0.008736,0.016788,0.020017,0.013822,0.013315,0.007347
2150,0.183114,0.013585,2.857590,0.251913,0.228171,0.932158,0.010111,0.272506,-29.628198,1.874462,...,0.021876,0.015537,0.022817,0.032361,0.013887,0.015866,0.011805,0.018935,0.007357,0.013090


In [18]:
np.unique(y_test)

array(['ANG', 'HAP', 'NEU'], dtype=object)

In [20]:
# Look at the confusion matrix for the test data :
cnf_matrix_test = confusion_matrix(y_test, pred)

print("confusion matrix for all six emotions of the test set is:")
print(cnf_matrix_test)
print()

# Look at the confusion matrix for the training data:
pred_train = pipe.predict(X_train)
cnf_matrix_train = confusion_matrix(y_train, pred_train)

print("confusion matrix for all six emotions of the train set is:")
print(cnf_matrix_train)


NameError: name 'pred' is not defined

In [None]:
from sklearn.metrics import plot_confusion_matrix
# class_names = ["NEU", "ANG", "HAP", "SAD", "FEA", "DIS"]
# class_names = ["NEU", "ANG", "FEA", "Others"]
class_names = emotion_list

In [None]:
disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues)
plt.show()

In [None]:
M = confusion_matrix(y_test, classifier.predict(X_test))
print('Overall accuracy')
print(np.sum(np.diag(M))/np.sum(M[:]))

In [None]:

FP = cnf_matrix_test.sum(axis=0) - np.diag(cnf_matrix_test) 
FN = cnf_matrix_test.sum(axis=1) - np.diag(cnf_matrix_test)
TP = np.diag(cnf_matrix_test)
TN = cnf_matrix_test.sum() - (FP + FN + TP)

In [None]:
FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)
# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Overall accuracy for each class
ACC = (TP+TN)/(TP+FP+FN+TN)

In [None]:
ACC

## Subsets of emotions


In [None]:
import itertools

In [None]:
emotions = np.unique(data_train[['Emotion']])
for emotion_list in itertools.combinations(emotions, 3):
    print(emotion_list)

In [None]:
models = []
accuracies = []
for n in range(2,7):
    acc_list = []
    mod_list = []
    for emotion_list in itertools.combinations(emotions, n):
        # Original dataset
        y_train = data_train[['Emotion']]
        y_test  = data_test[['Emotion']]

        X_train = data_train.drop(columns  = ['FileID','actorID', 'Emotion', 'SentenceID'])
        X_test  = data_test.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID'])

        # Drop most emotions
        I_train = y_train.isin(emotion_list)
        I_test = y_test.isin(emotion_list)

        X_train = X_train[I_train.values]
        X_test = X_test[I_test.values]

        y_train = y_train.loc[y_train.Emotion.isin(emotion_list),:]
        y_test = y_test.loc[y_test.Emotion.isin(emotion_list),:]

        # Build pipeline to first scale the mid feature data, then apply the SVC
        pipe = Pipeline([('scale', StandardScaler()),
                         ('svc', SVC(kernel='rbf'))])
        classifier = pipe.fit(X_train, y_train)
        pred = pipe.predict(X_test)

        # Look at the confusion matrix for the test data :
        cnf_matrix_test = confusion_matrix(y_test, pred)

        # Plot confusion matrix
        print(emotion_list)
        disp = plot_confusion_matrix(classifier, X_test, y_test,
                                         display_labels=emotion_list,
                                         cmap=plt.cm.Blues)
        plt.show()

        M = confusion_matrix(y_test, classifier.predict(X_test))
        print('Overall accuracy')
        acc = np.sum(np.diag(M))/np.sum(M[:])
        print(acc)

        acc_list.append((emotion_list, acc))
        mod_list.append((emotion_list, classifier))
    accuracies.append(acc_list)
    models.append(mod_list)

In [None]:
for n in range(2,7):
    acc_list = accuracies[n-2]
    print('Chance with n=%i' %n)
    print(round(1/n,5))
    print('Obtained:')
    print(np.mean([acc[1] for acc in acc_list]))

### Try this on the test set


In [None]:
# Read in your csv file that has the mid features. 

data_test_final = pd.read_csv('../Data/Mid_features/midFeaturesTestFinal.csv')
print(data_test_final.shape)

y_test_final  = data_test_final[['Emotion']]
X_test_final  = data_test_final.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID'])

In [None]:
models_test_final = []
accuracies_test_final = []
for n in range(2,7):
    acc_list = []
    mod_list = []
    for emotion_list in itertools.combinations(emotions, n):
        # Original dataset
        y_train = data[['Emotion']]
        y_test_final  = data_test_final[['Emotion']]

        X_train = data.drop(columns  = ['FileID','actorID', 'Emotion', 'SentenceID'])
        X_test_final  = data_test_final.drop(columns   = ['FileID','actorID', 'Emotion', 'SentenceID'])

        # Drop most emotions
        I_train = y_train.isin(emotion_list)
        I_test = y_test_final.isin(emotion_list)

        X_train = X_train[I_train.values]
        X_test_final = X_test_final[I_test.values]

        y_train = y_train.loc[y_train.Emotion.isin(emotion_list),:]
        y_test_final = y_test_final.loc[y_test_final.Emotion.isin(emotion_list),:]

        # Build pipeline to first scale the mid feature data, then apply the SVC
        pipe = Pipeline([('scale', StandardScaler()),
                         ('svc', SVC(kernel='rbf'))])
        classifier = pipe.fit(X_train, y_train)
        pred = pipe.predict(X_test_final)

        # Look at the confusion matrix for the test data :
        cnf_matrix_test = confusion_matrix(y_test_final, pred)

        # Plot confusion matrix
        print(emotion_list)
        disp = plot_confusion_matrix(classifier, X_test_final, y_test_final,
                                         display_labels=emotion_list,
                                         cmap=plt.cm.Blues)
        plt.show()

        M = confusion_matrix(y_test_final, classifier.predict(X_test_final))
        print('Overall accuracy')
        acc = np.sum(np.diag(M))/np.sum(M[:])
        print(acc)

        acc_list.append((emotion_list, acc))
        mod_list.append((emotion_list, classifier))
    accuracies_test_final.append(acc_list)
    models_test_final.append(mod_list)

In [None]:
for n in range(2,7):
    acc_list = accuracies_test_final[n-2]
    print('Chance with n=%i' %n)
    print(round(1/n,5))
    print('Obtained:')
    print(np.mean([acc[1] for acc in acc_list]))

In [None]:
# means = [np.mean([acc[1] for acc in acc_list]) for acc_list in accuracies]
# means_test_final = [np.mean([acc[1] for acc in acc_list]) for acc_list in accuracies_test_final]

means = []
for n in range(len(accuracies)):
    acc_list = accuracies[n]
    means.append(np.mean([acc[1] for acc in acc_list]))

means_test_final = []
for n in range(len(accuracies_test_final)):
    acc_list = accuracies_test_final[n]
    means_test_final.append(np.mean([acc[1] for acc in acc_list]))


In [None]:
plt.scatter(means, means_test_final)

tt = np.linspace(0.4,0.8)
plt.plot(tt,tt)

plt.title('Accuracies over sets of emotions')
plt.xlabel('Validation set')
plt.ylabel('Test set')

In [None]:
acc_vector = []
for n in range(len(accuracies)):
    row = accuracies[n]
    acc_row = []
    for i in range(len(row)):
        acc_row.append(row[i][1])
    acc_vector.append(acc_row)

acc_vector_final = []
for n in range(len(accuracies_test_final)):
    row = accuracies_test_final[n]
    acc_row = []
    for i in range(len(row)):
        acc_row.append(row[i][1])
    acc_vector_final.append(acc_row)

In [None]:
plt.figure()
for n in range(len(acc_vector)):
    vec_val = acc_vector[n]
    vec_test = acc_vector_final[n]
    
    plt.scatter(vec_val, vec_test)

plt.xlabel('Validation set')
plt.ylabel('Test set')
plt.title('Accuracies over subset of emotions')

tt = np.linspace(0.35,0.9)
plt.plot(tt,tt)

plt.savefig('Validation vs Test accuracy.png')