In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data = np.load('data.npz')
X = data['features']
y = data['labels']

In [3]:
#Before using GridSearch and CrossValidation split the data into trainval and test, creating a test set to 
#calculate the final accuracy (or other test according to imbalanced data) on
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, random_state = 0)

In [4]:
#Use a simple imputer to get rid of missing values
imp = SimpleImputer()
X_trainval = imp.fit_transform(X_trainval)
X_test = imp.transform(X_test)

In [5]:
#Work with the imbalance of the data
sm = SMOTE()
X_trainval_sm, y_trainval_sm = sm.fit_resample(X_trainval, y_trainval)
print("Previous number of samples:", X_trainval.shape)
print("New number of samples:", X_trainval_sm.shape)
print(np.unique(y_trainval_sm, return_counts=True))

Previous number of samples: (4233, 12)
New number of samples: (7544, 12)
(array([0, 1, 2, 3], dtype=int32), array([1886, 1886, 1886, 1886]))


In [6]:
#DicisionTree
param_grid_DT = {
    'max_depth' : [2, 6, 10, 14, 18, 22],
    'min_samples_split' : [2, 4, 6, 8, 10]
    }
    
grid_search_DT = GridSearchCV(DecisionTreeClassifier(), param_grid_DT, cv=5)

grid_search_DT.fit(X_trainval_sm, y_trainval_sm)

print("Best parameters: {}".format(grid_search_DT.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search_DT.best_score_))

Best parameters: {'max_depth': 10, 'min_samples_split': 2}
Best cross-validation score: 0.79


In [7]:
final_DT = DecisionTreeClassifier(max_depth = 14, min_samples_split = 2)
final_DT.fit(X_trainval_sm, y_trainval_sm)
print(final_DT.score(X_test, y_test))

0.6536827195467422


In [8]:
y_pred_DT = final_DT.fit(X_trainval_sm, y_trainval_sm).predict(X_test)
print(classification_report(y_test, y_pred_DT))
print(confusion_matrix(y_test, y_pred_DT))

              precision    recall  f1-score   support

           0       0.63      0.66      0.64       346
           1       0.71      0.91      0.80        77
           2       0.62      0.68      0.65       351
           3       0.70      0.62      0.66       638

    accuracy                           0.66      1412
   macro avg       0.67      0.72      0.69      1412
weighted avg       0.66      0.66      0.66      1412

[[227  12  30  77]
 [  2  70   0   5]
 [ 27   0 240  84]
 [107  16 119 396]]


In [9]:
#MLPClassifier
scaler = StandardScaler() 
scaler.fit(X_trainval_sm)

X_trainval_MLP = scaler.transform(X_trainval_sm)
X_test = scaler.transform(X_test)

param_grid_MLP = {
    'hidden_layer_sizes': [(12,), (12,12)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.05]
}

random_search_MLP = RandomizedSearchCV(MLPClassifier(max_iter = 10000), param_grid_MLP, n_iter = 5, cv=5, random_state = 0)

random_search_MLP.fit(X_trainval_MLP, y_trainval_sm)

print("Best parameters: {}".format(random_search_MLP.best_params_))
print("Best cross-validation score: {:.2f}".format(random_search_MLP.best_score_))

Best parameters: {'hidden_layer_sizes': (12, 12), 'alpha': 0.05, 'activation': 'tanh'}
Best cross-validation score: 0.82


In [12]:
final_MLP = MLPClassifier(max_iter = 10000, 
                          hidden_layer_sizes = (12, 12), 
                          alpha = 0.05, 
                          activation = 'tanh')
final_MLP.fit(X_trainval_MLP, y_trainval_sm)
print(final_MLP.score(X_test, y_test))

0.6997167138810199


In [13]:
y_pred_MLP = final_MLP.fit(X_trainval_MLP, y_trainval_sm).predict(X_test)
print(classification_report(y_test, y_pred_MLP))

print(confusion_matrix(y_test, y_pred_MLP))

              precision    recall  f1-score   support

           0       0.66      0.75      0.70       346
           1       0.73      0.92      0.82        77
           2       0.71      0.76      0.73       351
           3       0.78      0.66      0.71       638

    accuracy                           0.72      1412
   macro avg       0.72      0.77      0.74      1412
weighted avg       0.73      0.72      0.72      1412

[[260  10  14  62]
 [  1  71   0   5]
 [ 30   0 267  54]
 [105  16  95 422]]
