In [8]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
import MajorityClassifier as MCL
import numpy as np
import pandas as pd
import generate_results as gr
from python.dataset_info import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
import os  # local import: only needed for the results-file bookkeeping below

# Column layout of the results CSV; the tuning loop appends one row per
# (model, dataset, scoring) run in exactly this order.
header = "model,dataset,scoring,P1_best,P2_best,cv_acc,val_acc"
output = "results/params_tuning.csv"

# Create the results directory on first use so opening the file cannot fail
# just because the directory is missing.
output_dir = os.path.dirname(output)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# The file is opened in append mode so earlier results are kept. Write the
# header only when the file is new or empty; writing it unconditionally would
# add another header line on every re-run of this cell, and those lines would
# later be parsed as data rows.
if not os.path.isfile(output) or os.path.getsize(output) == 0:
    with open(output, "a") as f:
        f.write(header + '\n')

# Hyper-parameter grids, keyed by a short model label. Every grid lists the
# candidate values of two hyper-parameters, except 'mnb', whose grid is empty.
parameters = dict(
    svm=dict(kernel=['linear', 'poly', 'rbf', 'sigmoid'],
             C=[1, 10, 100, 1000]),
    rf=dict(n_estimators=[10, 30, 50, 100],
            max_depth=[None, 1, 3]),
    knn=dict(n_neighbors=[1, 3, 5, 8, 10, 30],
             weights=['uniform', 'distance']),
    mnb=dict(),
    mlp=dict(activation=['identity', 'relu', 'logistic'],
             alpha=[1e-3, 1e-5, 1e-6]),
)

# Metrics to optimise; each one triggers a separate grid search per model.
# 'precision_macro' and 'recall_macro' are further candidates.
scores = ['accuracy']

# Estimators to tune, keyed by their entry in `parameters`. The stochastic
# estimators get a fixed seed, like the train/test split below, so that
# re-running the cell reproduces the same results.
models = {'svm': SVC(),
          'knn': KNeighborsClassifier(),
          'rf': RandomForestClassifier(random_state=0),
          'mlp': MLPClassifier(random_state=0)}

for dataset in list(datasets.keys()):
    X, y = gr.read_dataset(datasets[dataset])
    # Split the dataset 70/30 into a development set and an evaluation set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    # Flatten the targets to 1-D arrays.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    for label, model in models.items():
        for score in scores:
            print("# Tuning hyper-parameters for %s, model %s" % (score, model.__class__.__name__))
            print()
            # 5-fold cross-validated exhaustive search over the model's grid.
            clf = GridSearchCV(model, parameters[label], cv=5, scoring=score)

            clf.fit(X_train, y_train)

            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print()
            print("Grid scores on development set:")
            print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            print()

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_true, y_pred = y_test, clf.predict(X_test)
            print(classification_report(y_true, y_pred))
            print()

            # Report the best values in alphabetical order of parameter name
            # so the P1_best/P2_best columns do not depend on dict ordering.
            # Pad with empty fields so a grid with fewer than two parameters
            # (e.g. the empty 'mnb' grid) does not raise IndexError.
            best_values = [clf.best_params_[name] for name in sorted(clf.best_params_)]
            p1_best, p2_best = (best_values + ['', ''])[:2]

            res_str = "%s,%s,%s,%s,%s,%.2f,%.2f" % (model.__class__.__name__,
                                                        dataset,
                                                        score,
                                                        p1_best,
                                                        p2_best,
                                                        # mean CV score of the best candidate
                                                        clf.best_score_,
                                                        accuracy_score(y_true, y_pred))
            print(res_str)
            with open(output, "a") as f:
                f.write(res_str + '\n')

# Tuning hyper-parameters for accuracy, model SVC

Best parameters set found on development set:

{'C': 100, 'kernel': 'linear'}

Grid scores on development set:

0.742 (+/-0.027) for {'C': 1, 'kernel': 'linear'}
0.542 (+/-0.020) for {'C': 1, 'kernel': 'poly'}
0.542 (+/-0.020) for {'C': 1, 'kernel': 'rbf'}
0.542 (+/-0.020) for {'C': 1, 'kernel': 'sigmoid'}
0.802 (+/-0.064) for {'C': 10, 'kernel': 'linear'}
0.542 (+/-0.020) for {'C': 10, 'kernel': 'poly'}
0.689 (+/-0.042) for {'C': 10, 'kernel': 'rbf'}
0.610 (+/-0.062) for {'C': 10, 'kernel': 'sigmoid'}
0.859 (+/-0.084) for {'C': 100, 'kernel': 'linear'}
0.725 (+/-0.047) for {'C': 100, 'kernel': 'poly'}
0.778 (+/-0.054) for {'C': 100, 'kernel': 'rbf'}
0.769 (+/-0.063) for {'C': 100, 'kernel': 'sigmoid'}
0.855 (+/-0.075) for {'C': 1000, 'kernel': 'linear'}
0.709 (+/-0.058) for {'C': 1000, 'kernel': 'poly'}
0.744 (+/-0.086) for {'C': 1000, 'kernel': 'rbf'}
0.758 (+/-0.092) for {'C': 1000, 'kernel': 'sigmoid'}

Detailed classification repo

Best parameters set found on development set:

{'max_depth': 3, 'n_estimators': 30}

Grid scores on development set:

0.502 (+/-0.042) for {'max_depth': None, 'n_estimators': 10}
0.513 (+/-0.072) for {'max_depth': None, 'n_estimators': 30}
0.522 (+/-0.064) for {'max_depth': None, 'n_estimators': 50}
0.529 (+/-0.064) for {'max_depth': None, 'n_estimators': 100}
0.481 (+/-0.116) for {'max_depth': 1, 'n_estimators': 10}
0.445 (+/-0.035) for {'max_depth': 1, 'n_estimators': 30}
0.451 (+/-0.036) for {'max_depth': 1, 'n_estimators': 50}
0.451 (+/-0.015) for {'max_depth': 1, 'n_estimators': 100}
0.528 (+/-0.072) for {'max_depth': 3, 'n_estimators': 10}
0.549 (+/-0.037) for {'max_depth': 3, 'n_estimators': 30}
0.539 (+/-0.069) for {'max_depth': 3, 'n_estimators': 50}
0.544 (+/-0.070) for {'max_depth': 3, 'n_estimators': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  

Best parameters set found on development set:

{'max_depth': None, 'n_estimators': 50}

Grid scores on development set:

0.989 (+/-0.028) for {'max_depth': None, 'n_estimators': 10}
0.994 (+/-0.022) for {'max_depth': None, 'n_estimators': 30}
1.000 (+/-0.000) for {'max_depth': None, 'n_estimators': 50}
0.994 (+/-0.022) for {'max_depth': None, 'n_estimators': 100}
0.897 (+/-0.198) for {'max_depth': 1, 'n_estimators': 10}
0.891 (+/-0.165) for {'max_depth': 1, 'n_estimators': 30}
0.920 (+/-0.096) for {'max_depth': 1, 'n_estimators': 50}
0.914 (+/-0.127) for {'max_depth': 1, 'n_estimators': 100}
0.977 (+/-0.068) for {'max_depth': 3, 'n_estimators': 10}
0.994 (+/-0.022) for {'max_depth': 3, 'n_estimators': 30}
0.989 (+/-0.045) for {'max_depth': 3, 'n_estimators': 50}
0.994 (+/-0.022) for {'max_depth': 3, 'n_estimators': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recal

Best parameters set found on development set:

{'max_depth': None, 'n_estimators': 10}

Grid scores on development set:

0.852 (+/-0.092) for {'max_depth': None, 'n_estimators': 10}
0.848 (+/-0.098) for {'max_depth': None, 'n_estimators': 30}
0.838 (+/-0.092) for {'max_depth': None, 'n_estimators': 50}
0.833 (+/-0.095) for {'max_depth': None, 'n_estimators': 100}
0.733 (+/-0.133) for {'max_depth': 1, 'n_estimators': 10}
0.757 (+/-0.070) for {'max_depth': 1, 'n_estimators': 30}
0.786 (+/-0.128) for {'max_depth': 1, 'n_estimators': 50}
0.767 (+/-0.097) for {'max_depth': 1, 'n_estimators': 100}
0.819 (+/-0.123) for {'max_depth': 3, 'n_estimators': 10}
0.805 (+/-0.114) for {'max_depth': 3, 'n_estimators': 30}
0.819 (+/-0.130) for {'max_depth': 3, 'n_estimators': 50}
0.848 (+/-0.077) for {'max_depth': 3, 'n_estimators': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recal

Best parameters set found on development set:

{'max_depth': None, 'n_estimators': 50}

Grid scores on development set:

0.495 (+/-0.289) for {'max_depth': None, 'n_estimators': 10}
0.552 (+/-0.177) for {'max_depth': None, 'n_estimators': 30}
0.581 (+/-0.204) for {'max_depth': None, 'n_estimators': 50}
0.571 (+/-0.214) for {'max_depth': None, 'n_estimators': 100}
0.419 (+/-0.069) for {'max_depth': 1, 'n_estimators': 10}
0.438 (+/-0.159) for {'max_depth': 1, 'n_estimators': 30}
0.448 (+/-0.135) for {'max_depth': 1, 'n_estimators': 50}
0.457 (+/-0.154) for {'max_depth': 1, 'n_estimators': 100}
0.533 (+/-0.141) for {'max_depth': 3, 'n_estimators': 10}
0.552 (+/-0.217) for {'max_depth': 3, 'n_estimators': 30}
0.533 (+/-0.146) for {'max_depth': 3, 'n_estimators': 50}
0.533 (+/-0.137) for {'max_depth': 3, 'n_estimators': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recal

Best parameters set found on development set:

{'n_neighbors': 30, 'weights': 'distance'}

Grid scores on development set:

0.612 (+/-0.015) for {'n_neighbors': 1, 'weights': 'uniform'}
0.612 (+/-0.015) for {'n_neighbors': 1, 'weights': 'distance'}
0.536 (+/-0.017) for {'n_neighbors': 3, 'weights': 'uniform'}
0.611 (+/-0.037) for {'n_neighbors': 3, 'weights': 'distance'}
0.545 (+/-0.018) for {'n_neighbors': 5, 'weights': 'uniform'}
0.623 (+/-0.032) for {'n_neighbors': 5, 'weights': 'distance'}
0.539 (+/-0.030) for {'n_neighbors': 8, 'weights': 'uniform'}
0.623 (+/-0.032) for {'n_neighbors': 8, 'weights': 'distance'}
0.534 (+/-0.046) for {'n_neighbors': 10, 'weights': 'uniform'}
0.626 (+/-0.027) for {'n_neighbors': 10, 'weights': 'distance'}
0.542 (+/-0.028) for {'n_neighbors': 30, 'weights': 'uniform'}
0.645 (+/-0.034) for {'n_neighbors': 30, 'weights': 'distance'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full ev

Best parameters set found on development set:

{'activation': 'logistic', 'alpha': 1e-05}

Grid scores on development set:

0.612 (+/-0.273) for {'activation': 'identity', 'alpha': 0.001}
0.691 (+/-0.219) for {'activation': 'identity', 'alpha': 1e-05}
0.520 (+/-0.207) for {'activation': 'identity', 'alpha': 1e-06}
0.618 (+/-0.413) for {'activation': 'relu', 'alpha': 0.001}
0.480 (+/-0.195) for {'activation': 'relu', 'alpha': 1e-05}
0.480 (+/-0.207) for {'activation': 'relu', 'alpha': 1e-06}
0.862 (+/-0.268) for {'activation': 'logistic', 'alpha': 0.001}
0.947 (+/-0.068) for {'activation': 'logistic', 'alpha': 1e-05}
0.895 (+/-0.294) for {'activation': 'logistic', 'alpha': 1e-06}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.98      0.93      0.95        45
          1       0.87      0.95      0.91        21

avg / total   

In [10]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex, HTML

Parameter combinations selected by GridSearchCV for all classifiers, sorted in order of accuracy. The results file has the following columns:
"model,dataset,scoring,P1_best,P2_best,cv_acc,val_acc"

In [18]:
# Load the per-run tuning results.
df_def = pd.read_csv('results/params_tuning.csv')

# The results file is written in append mode and may contain repeated header
# lines from earlier runs; pandas parses those as data rows whose 'model'
# field is the literal string 'model'. Exclude them so they are not counted
# as a parameter combination.
is_result_row = df_def['model'] != 'model'

# For every (model, P1_best, P2_best) combination, count the number of
# datasets on which it was selected as the best one.
res = (df_def[is_result_row]
       .drop(['scoring', 'cv_acc', 'val_acc'], axis=1)
       .groupby(['model', 'P1_best', 'P2_best'])
       .count())
display(HTML(res.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dataset
model,P1_best,P2_best,Unnamed: 3_level_1
KNeighborsClassifier,1,uniform,3
KNeighborsClassifier,10,distance,1
KNeighborsClassifier,10,uniform,1
KNeighborsClassifier,3,distance,3
KNeighborsClassifier,3,uniform,1
KNeighborsClassifier,30,distance,2
MLPClassifier,identity,0.001,1
MLPClassifier,logistic,0.001,1
MLPClassifier,logistic,1e-05,1
MLPClassifier,logistic,1e-06,1
