In [None]:
import matplotlib.pyplot as plt
import numpy as np
import copy

def plot_roc_curves(y, y_proba):
    """Plot one-vs-rest ROC curves for the ten labels 0-9 on a single figure.

    For every label ``i`` the true labels are binarised (1 where the label
    equals ``i``, 0 otherwise) and scored against column ``i`` of ``y_proba``.
    Each curve is drawn in its own colour with its AUC (rounded to two
    decimals) in the legend, together with the dashed chance diagonal.

    Parameters
    ----------
    y : iterable of labels
        True labels; each element is compared against the integers 0..9.
    y_proba : 2-D array
        Scores indexed as ``y_proba[:, i]``; column ``i`` is used as the
        score for label ``i``. Must provide at least ten columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    ``roc_curve`` and ``auc`` are not imported by this cell; they must be in
    scope (``sklearn.metrics``) by the time the function is called.
    """
    colors = ['darkorange', 'forestgreen', 'royalblue', 'firebrick', 'gold', 'deepskyblue', 'darkviolet', 'peru', 'deeppink', 'yellowgreen']
    n_labels = len(colors)  # one colour per label: 0..9
    fpr, tpr, roc_auc = {}, {}, {}

    # http://benalexkeen.com/scoring-classifier-models-using-scikit-learn/
    for i in range(n_labels):
        # One-vs-rest target for label i.
        is_label_i = [1 if label == i else 0 for label in y]
        fpr[i], tpr[i], _ = roc_curve(is_label_i, y_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure(figsize=(8, 6))
    for lbl in range(n_labels):
        plt.plot(fpr[lbl], tpr[lbl], color=colors[lbl], linewidth=2, label=f'Label {lbl} (area = {round(roc_auc[lbl], 2)})')
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves per label')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')

# Maps the spelled-out values of the 'label' column to integer class ids.
LABEL_TO_DIGIT = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9}

training_dataSet = pd.read_csv('DecisionTrees/src/main/resources/trainingData_top10.csv')
testing_dataSet = pd.read_csv('DecisionTrees/src/main/resources/testingData_top10.csv')

training_dataSet_4000 = pd.read_csv('DecisionTrees/src/main/resources/trainingData_top10_4000.csv')
testing_dataSet_4000 = pd.read_csv('DecisionTrees/src/main/resources/testingData_top10_4000.csv')

training_dataSet_9000 = pd.read_csv('DecisionTrees/src/main/resources/trainingData_top10_9000.csv')
testing_dataSet_9000 = pd.read_csv('DecisionTrees/src/main/resources/testingData_top10_9000.csv')

# Re-encode the 'label' column of every frame in place. Series.map turns any
# label that is not a key of LABEL_TO_DIGIT into NaN.
for _frame in (
    training_dataSet, testing_dataSet,
    training_dataSet_4000, testing_dataSet_4000,
    training_dataSet_9000, testing_dataSet_9000,
):
    _frame['label'] = _frame['label'].map(LABEL_TO_DIGIT)

In [None]:
# In every data set the 'label' column is the target (y) and all remaining
# columns are the features (X).

# Base data set.
y_train, y_test = training_dataSet['label'], testing_dataSet['label']
X_train = training_dataSet.drop(columns=["label"])
X_test = testing_dataSet.drop(columns=["label"])

# *_4000 variant.
y_train_4000, y_test_4000 = training_dataSet_4000['label'], testing_dataSet_4000['label']
X_train_4000 = training_dataSet_4000.drop(columns=["label"])
X_test_4000 = testing_dataSet_4000.drop(columns=["label"])

# *_9000 variant.
y_train_9000, y_test_9000 = training_dataSet_9000['label'], testing_dataSet_9000['label']
X_train_9000 = training_dataSet_9000.drop(columns=["label"])
X_test_9000 = testing_dataSet_9000.drop(columns=["label"])

In [1]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV

In [2]:
import pandas as pd

def _load_split(train_csv, test_csv):
    """Read one train/test CSV pair and split each frame into features and target.

    Both files are read with ``pd.read_csv``. The 'label' column of each frame
    is then mapped in place from its spelled-out form ('zero' .. 'nine') to
    the integers 0-9; a label that is not one of those ten keys becomes NaN.

    Parameters
    ----------
    train_csv, test_csv : str
        Paths of the training and testing CSV files.

    Returns
    -------
    tuple
        ``(train_frame, test_frame, X_train, y_train, X_test, y_test)`` where
        ``y_*`` is the mapped 'label' column and ``X_*`` is the frame with the
        'label' column dropped.
    """
    label_to_digit = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9}
    train_frame = pd.read_csv(train_csv)
    test_frame = pd.read_csv(test_csv)
    for frame in (train_frame, test_frame):
        frame['label'] = frame['label'].map(label_to_digit)
    return (
        train_frame,
        test_frame,
        train_frame.drop("label", axis=1),
        train_frame['label'],
        test_frame.drop("label", axis=1),
        test_frame['label'],
    )


# Base data set.
(training_dataSet, testing_dataSet,
 X_train, y_train, X_test, y_test) = _load_split(
    'data/trainingData_top10.csv', 'data/testingData_top10.csv')

# *_4000 variant.
(training_dataSet_4000, testing_dataSet_4000,
 X_train_4000, y_train_4000, X_test_4000, y_test_4000) = _load_split(
    'data/trainingData_top10_4000.csv', 'data/testingData_top10_4000.csv')

# *_9000 variant.
(training_dataSet_9000, testing_dataSet_9000,
 X_train_9000, y_train_9000, X_test_9000, y_test_9000) = _load_split(
    'data/trainingData_top10_9000.csv', 'data/testingData_top10_9000.csv')

In [None]:
# MULTILAYER PERCEPTRON: 10-FOLD CROSS VALIDATION

# cross_val_predict is run twice below: once for hard class predictions and
# once for class probabilities. Each run refits the network on every fold, so
# with an unseeded classifier the report and the ROC curves would come from
# differently initialised models. A fixed random_state makes both passes train
# the same per-fold networks and makes the cell reproducible.
clf = MLPClassifier(random_state=42)
y_pred_cross = cross_val_predict(clf, X_train, y_train, cv=10)

print(confusion_matrix(y_train, y_pred_cross))
print(classification_report(y_train, y_pred_cross))

# Out-of-fold class probabilities, one column per class, for the ROC curves.
y_pred_cross_proba = cross_val_predict(clf, X_train, y_train, cv=10, method='predict_proba')

plot_roc_curves(y_train, y_pred_cross_proba)

In [None]:
# MULTILAYER PERCEPTRON: TRAINING & TESTING DATA SETS

# Fit on the training split, then predict the test split.
clf = MLPClassifier()
clf.fit(X_train, y_train)
y_pred_MLP = clf.predict(X_test)

# Text summary of the hard predictions.
print(confusion_matrix(y_true=y_test, y_pred=y_pred_MLP))
print(classification_report(y_true=y_test, y_pred=y_pred_MLP))

# Class probabilities from the same fitted model feed the per-label ROC plot.
y_pred_MLP_proba = clf.predict_proba(X_test)
plot_roc_curves(y=y_test, y_proba=y_pred_MLP_proba)

In [None]:
# MULTILAYER PERCEPTRON: TRAINING & TESTING DATA SETS (4000 TRAINING -> TESTING)

# Fit on the *_4000 training split, then predict the *_4000 test split.
clf_4000 = MLPClassifier()
clf_4000.fit(X_train_4000, y_train_4000)
y_pred_MLP = clf_4000.predict(X_test_4000)

# Text summary of the hard predictions.
print(confusion_matrix(y_true=y_test_4000, y_pred=y_pred_MLP))
print(classification_report(y_true=y_test_4000, y_pred=y_pred_MLP))

# Class probabilities from the same fitted model feed the per-label ROC plot.
y_pred_MLP_proba = clf_4000.predict_proba(X_test_4000)
plot_roc_curves(y=y_test_4000, y_proba=y_pred_MLP_proba)

In [None]:
# MULTILAYER PERCEPTRON: TRAINING & TESTING DATA SETS (9000 TRAINING -> TESTING)

# Fit on the *_9000 training split, then predict the *_9000 test split.
clf_9000 = MLPClassifier()
clf_9000.fit(X_train_9000, y_train_9000)
y_pred_MLP = clf_9000.predict(X_test_9000)

# Text summary of the hard predictions.
print(confusion_matrix(y_true=y_test_9000, y_pred=y_pred_MLP))
print(classification_report(y_true=y_test_9000, y_pred=y_pred_MLP))

# Class probabilities from the same fitted model feed the per-label ROC plot.
y_pred_MLP_proba = clf_9000.predict_proba(X_test_9000)
plot_roc_curves(y=y_test_9000, y_proba=y_pred_MLP_proba)

In [3]:
# Hyper-parameter grid handed to GridSearchCV in the cells below:
# 2 * 2 * 2 * 2 * 2 * 2 * 3 = 192 candidate combinations.
possible_parameters = dict(
    hidden_layer_sizes=[(50, 20), (100,)],
    alpha=[0.0001, 0.00005],
    learning_rate=['constant', 'adaptive'],
    learning_rate_init=[0.001, 0.0005],
    beta_1=[0.9, 0.99],
    beta_2=[0.999, 0.5],
    epsilon=[1e-8, 5e-9, 2e-9],
)

In [8]:
import warnings; warnings.simplefilter('ignore')

# Exhaustive grid search over possible_parameters on the base training set.
clf = MLPClassifier(max_iter=1000)
# return_train_score=True is required for cv_results_['mean_train_score'],
# which is read below; newer scikit-learn releases default it to False and the
# key would be missing. cv=3 pins the fold count reported by the recorded run
# ("Fitting 3 folds ...") instead of relying on a version-dependent default.
clf = GridSearchCV(clf, possible_parameters, cv=3, return_train_score=True, n_jobs=4, verbose=10)
clf.fit(X_train, y_train)

# One row per candidate: rank, mean train/test score and the parameter values.
cv = clf.cv_results_
tab = pd.DataFrame({
    'rank_test_score': cv['rank_test_score'],
    'mean_train_score': cv['mean_train_score'],
    'mean_test_score': cv['mean_test_score'],
    'hidden_layer_sizes': cv['param_hidden_layer_sizes'],
    'alpha': cv['param_alpha'],
    'learning_rate': cv['param_learning_rate'],
    'learning_rate_init': cv['param_learning_rate_init'],
    'beta_1': cv['param_beta_1'],
    'beta_2': cv['param_beta_2'],
    'epsilon': cv['param_epsilon']
})
# Sort once; the same ranking is written to disk and previewed.
tab_ranked = tab.sort_values(['rank_test_score'])
tab_ranked.to_csv('results_all.csv', encoding='utf-8', sep=',', index=False)
tab_ranked.head(10)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   34.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   59.4s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  3.8min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  4.8min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  8.4min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  9.9min
[Parallel(

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,hidden_layer_sizes,alpha,learning_rate,learning_rate_init,beta_1,beta_2,epsilon
13,1,0.99613,0.855055,"(100,)",0.0001,constant,0.0005,0.9,0.999,5e-09
100,2,0.986888,0.854739,"(100,)",5e-05,constant,0.001,0.9,0.999,1e-08
20,3,0.98282,0.854107,"(100,)",0.0001,constant,0.001,0.9,0.999,2e-09
70,4,0.983886,0.852844,"(100,)",0.0001,adaptive,0.001,0.99,0.999,2e-09
52,5,0.985545,0.851817,"(100,)",0.0001,constant,0.001,0.99,0.999,1e-08
6,6,0.984044,0.85158,"(100,)",0.0001,adaptive,0.001,0.9,0.999,1e-08
150,7,0.987875,0.851106,"(100,)",5e-05,adaptive,0.001,0.99,0.999,1e-08
62,8,0.989139,0.850632,"(100,)",0.0001,adaptive,0.001,0.99,0.999,5e-09
149,9,0.993207,0.850316,"(100,)",5e-05,constant,0.0005,0.99,0.999,1e-08
60,10,0.9844,0.850079,"(100,)",0.0001,constant,0.001,0.99,0.999,5e-09


In [9]:
import warnings; warnings.simplefilter('ignore')

# Exhaustive grid search over possible_parameters on the *_4000 training set.
clf_4000 = MLPClassifier(max_iter=1000)
# return_train_score=True is required for cv_results_['mean_train_score'],
# which is read below; newer scikit-learn releases default it to False and the
# key would be missing. cv=3 pins the fold count reported by the recorded run
# ("Fitting 3 folds ...") instead of relying on a version-dependent default.
clf_4000 = GridSearchCV(clf_4000, possible_parameters, cv=3, return_train_score=True, n_jobs=4, verbose=10)
clf_4000.fit(X_train_4000, y_train_4000)

# One row per candidate: rank, mean train/test score and the parameter values.
cv = clf_4000.cv_results_
tab_4000 = pd.DataFrame({
    'rank_test_score': cv['rank_test_score'],
    'mean_train_score': cv['mean_train_score'],
    'mean_test_score': cv['mean_test_score'],
    'hidden_layer_sizes': cv['param_hidden_layer_sizes'],
    'alpha': cv['param_alpha'],
    'learning_rate': cv['param_learning_rate'],
    'learning_rate_init': cv['param_learning_rate_init'],
    'beta_1': cv['param_beta_1'],
    'beta_2': cv['param_beta_2'],
    'epsilon': cv['param_epsilon']
})
# Sort once; the same ranking is written to disk and previewed.
tab_4000_ranked = tab_4000.sort_values(['rank_test_score'])
tab_4000_ranked.to_csv('results_4000.csv', encoding='utf-8', sep=',', index=False)
tab_4000_ranked.head(10)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   26.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   47.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  8.2min
[Parallel(

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,hidden_layer_sizes,alpha,learning_rate,learning_rate_init,beta_1,beta_2,epsilon
150,1,0.998036,0.844111,"(100,)",5e-05,adaptive,0.001,0.99,0.999,1e-08
54,2,0.991858,0.839954,"(100,)",0.0001,adaptive,0.001,0.99,0.999,1e-08
68,3,0.996651,0.839838,"(100,)",0.0001,constant,0.001,0.99,0.999,2e-09
52,4,0.996477,0.839607,"(100,)",0.0001,constant,0.001,0.99,0.999,1e-08
61,5,0.999365,0.83776,"(100,)",0.0001,constant,0.0005,0.99,0.999,5e-09
108,5,0.981409,0.83776,"(100,)",5e-05,constant,0.001,0.9,0.999,5e-09
148,5,0.986431,0.83776,"(100,)",5e-05,constant,0.001,0.99,0.999,1e-08
156,8,0.986602,0.837067,"(100,)",5e-05,constant,0.001,0.99,0.999,5e-09
166,9,0.983195,0.836028,"(100,)",5e-05,adaptive,0.001,0.99,0.999,2e-09
101,10,0.996708,0.835335,"(100,)",5e-05,constant,0.0005,0.9,0.999,1e-08


In [7]:
import warnings; warnings.simplefilter('ignore')

# Exhaustive grid search over possible_parameters on the *_9000 training set.
clf_9000 = MLPClassifier(max_iter=1000)
# return_train_score=True is required for cv_results_['mean_train_score'],
# which is read below; newer scikit-learn releases default it to False and the
# key would be missing. cv=3 pins the fold count reported by the recorded run
# ("Fitting 3 folds ...") instead of relying on a version-dependent default.
clf_9000 = GridSearchCV(clf_9000, possible_parameters, cv=3, return_train_score=True, n_jobs=4, verbose=10)
clf_9000.fit(X_train_9000, y_train_9000)

# One row per candidate: rank, mean train/test score and the parameter values.
cv = clf_9000.cv_results_
tab_9000 = pd.DataFrame({
    'rank_test_score': cv['rank_test_score'],
    'mean_train_score': cv['mean_train_score'],
    'mean_test_score': cv['mean_test_score'],
    'hidden_layer_sizes': cv['param_hidden_layer_sizes'],
    'alpha': cv['param_alpha'],
    'learning_rate': cv['param_learning_rate'],
    'learning_rate_init': cv['param_learning_rate_init'],
    'beta_1': cv['param_beta_1'],
    'beta_2': cv['param_beta_2'],
    'epsilon': cv['param_epsilon']
})
# Sort once; the same ranking is written to disk and previewed.
tab_9000_ranked = tab_9000.sort_values(['rank_test_score'])
tab_9000_ranked.to_csv('results_9000.csv', encoding='utf-8', sep=',', index=False)
tab_9000_ranked.head(10)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   26.3s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   38.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   48.5s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  4.2min
[Parallel(

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,hidden_layer_sizes,alpha,learning_rate,learning_rate_init,beta_1,beta_2,epsilon
150,1,0.99959,0.80929,"(100,)",5e-05,adaptive,0.001,0.99,0.999,1e-08
54,2,0.99959,0.80765,"(100,)",0.0001,adaptive,0.001,0.99,0.999,1e-08
62,3,0.99959,0.806557,"(100,)",0.0001,adaptive,0.001,0.99,0.999,5e-09
156,4,0.99959,0.806284,"(100,)",5e-05,constant,0.001,0.99,0.999,5e-09
148,5,0.99959,0.805464,"(100,)",5e-05,constant,0.001,0.99,0.999,1e-08
52,6,0.99959,0.804918,"(100,)",0.0001,constant,0.001,0.99,0.999,1e-08
158,7,0.99959,0.804645,"(100,)",5e-05,adaptive,0.001,0.99,0.999,5e-09
164,8,0.99959,0.804372,"(100,)",5e-05,constant,0.001,0.99,0.999,2e-09
70,9,0.99959,0.802732,"(100,)",0.0001,adaptive,0.001,0.99,0.999,2e-09
63,10,0.99959,0.801639,"(100,)",0.0001,adaptive,0.0005,0.99,0.999,5e-09
