In [11]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import interp

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc

from sklearn.ensemble import VotingClassifier
import pickle

In [2]:
emotion_labels = {0:'angry', 1:'disgust', 2:'fear', 3:'happy', 4:'sad', 5:'surprise', 6:'neutral'}
target_labels = [val for key, val in emotion_labels.items()]
targets = [key for key, val in emotion_labels.items()]

In [3]:
def load_data(df, data_type):
    """Load data from DataFrame and return data to list 
    """
    image_data = []
    image_scaled = []
    
    for index, row in df.iterrows():
        if data_type == 1:
            field = row.pca
        elif data_type == 2:
            field = row.histogram
        elif data_type == 3:
            field = row.hog
        elif data_type == 4:
            field = row.lda
        else:
            field = row.pixels
        image = np.fromstring(field, sep=' ')
        image_data.append(image)
        if data_type == 0:
            scale = image / 255.0
            image_scaled.append(scale)
        
    return image_data, image_scaled

In [5]:
df = pd.read_csv('../../data/hog/hog_scaled2.csv')
df.head()

Unnamed: 0,usage,emotion,hog
0,train,3,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,train,3,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,train,3,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,train,3,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,train,3,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [7]:
df_train = df[df['usage']=='train']
df_test = df[df['usage']=='test']
train_data, train_scaled = load_data(df_train, 3)
train_target = list(df_train.emotion)
test_data, test_scaled = load_data(df_test, 3)
test_target = list(df_test.emotion)

In [8]:
X_train = np.array(train_data)
y_train = np.array(train_target)
X_test = np.array(test_data)
y_test = np.array(test_target)

In [12]:
rf = RandomForestClassifier(n_estimators=2000,max_features='sqrt',
                            random_state=101, n_jobs=-1,max_depth=60, min_samples_leaf=1, min_samples_split=2)

svm = SVC(C=10.0, random_state=1, kernel='rbf', gamma='auto', decision_function_shape='ovo')
knn = KNN(n_neighbors=2, weights='distance', metric='minkowski')

In [13]:
model = VotingClassifier(estimators=[('knn', knn), ('RF', rf),('SVM', svm)], voting='hard')
model.fit(X_train,y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(n_neighbors=2,
                                                   weights='distance')),
                             ('RF',
                              RandomForestClassifier(max_depth=60,
                                                     max_features='sqrt',
                                                     n_estimators=2000,
                                                     n_jobs=-1,
                                                     random_state=101)),
                             ('SVM',
                              SVC(C=10.0, decision_function_shape='ovo',
                                  gamma='auto', random_state=1))])

In [16]:
pickle.dump(model, open("../../models/voting.pkl", "wb"))

In [15]:
y_pred = model.predict(X_test)
print('Train Accuracy: %.3f' % model.score(X_train, y_train))
print('Test Accuracy: %.3f' % model.score(X_test, y_test))

Train Accuracy: 0.998
Test Accuracy: 0.560


In [17]:
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, average='macro'))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, average='macro'))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, average='macro'))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_labels))

Precision: 0.583
Recall: 0.556
F1: 0.556
Accuracy: 0.560
              precision    recall  f1-score   support

       angry       0.42      0.37      0.40       957
     disgust       0.85      1.00      0.92       830
        fear       0.53      0.38      0.44      1024
       happy       0.51      0.82      0.63      1774
         sad       0.45      0.30      0.36      1247
    surprise       0.84      0.59      0.70       831
     neutral       0.48      0.43      0.45      1233

    accuracy                           0.56      7896
   macro avg       0.58      0.56      0.56      7896
weighted avg       0.56      0.56      0.54      7896



In [18]:
def plot_cm(cm,testscore):
  '''provide cm item and test score'''
  cm_labels_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
  cm_labels_norm = cm_labels_norm * 100.00

  plt.figure(figsize=(9,9))
  sns.heatmap(cm_labels_norm,
              annot=True, 
              annot_kws={'size':16},
              fmt=".1f",
            cmap='Greens',
            xticklabels=[i for i in emotions.values()],
            yticklabels=[i for i in emotions.values()],
            square=True,
            cbar=False)
  plt.title('Test Accuracy Score: {0}%\n'.format("%.2f" % (testscore*100.0)), size=24)
  plt.tight_layout()
  plt.xticks(size=14)
  plt.yticks(size=14)
  plt.ylabel('Actual label', size=18)
  plt.xlabel('Predicted label', size=18)
  plt.show()

In [21]:
plot_cm(confusion_matrix(y_test, y_pred), model.score(X_test, y_test))

TypeError: plot_cm() missing 1 required positional argument: 'data'

In [None]:
voting = pickle.load(open("../../models/voting.pkl", "rb"))

In [None]:
train_sizes, train_scores, test_scores =\
                learning_curve(estimator=model,
                               X=X_train,
                               y=y_train,
                               train_sizes=np.linspace(0.1, 1.0, 10),
                               cv=10,
                               n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()