In [7]:
from __future__ import print_function

import os
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
warnings.filterwarnings("ignore")
# Registry of the datasets used in the experiments, keyed by a short name.
# Every entry shares the same shape:
#   train_name    - CSV path
#   X_col         - positional indices of the feature columns
#   Y_col         - one-element list with the positional index of the label column
#   has_header / filetype / encode_labels - the same constant values for all entries
# The table below lists only what differs between entries:
#   (name, csv path, feature column indices, label column index)
datasets = {
    name: {
        "train_name": csv_path,
        "X_col": feature_cols,
        "Y_col": [label_col],
        "has_header": True,
        "filetype": "CSV",
        "encode_labels": False,
    }
    for name, csv_path, feature_cols, label_col in [
        ("student", "prep_data/student/student_grades.csv", range(33), 33),
        ("contraceptive", "prep_data/contraceptive/contraceptive.csv", range(9), 9),
        ("autism", "prep_data/Autism-Adult-Data/Autism-Adult-Data-preproc.csv", range(20), 20),
        ("bankruptcy", "prep_data/bankruptcy/bankrupt.csv", range(6), 6),
        ("breast_cancer", "prep_data/breast-cancer/breast-cancer-wisconsin.data", range(9), 9),
        ("horse", "prep_data/horse-colic/horse-colic.data-preproc.csv", range(22), 22),
        ("hr", "prep_data/hr-analytics/HR_comma_sep.csv", range(9), 9),
        ("english", "prep_data/teaching-english/tae.csv", range(5), 5),
        ("phishing", "prep_data/website-phishing/PhishingData.csv", range(9), 9),
        ("wine", "prep_data/wine-quality/winequality-red.csv", range(11), 11),
        # The next two start at column 1, i.e. column 0 is not used as a feature.
        ("amazon", "prep_data/amazon/amzreviews.csv", range(1, 3093), 3093),
        ("congress", "prep_data/congress/congress_leave.csv", range(1, 17), 17),
        ("covertype", "prep_data/covertypes/covertype_scale.csv", range(54), 54),
        ("kidney", "prep_data/kidney/kidney_colMeanMode.csv", range(24), 24),
    ]
}

def read_dataset(dataset, base_dir='../'):
    """Load the feature and label columns of one dataset description.

    Parameters
    ----------
    dataset : dict
        A dataset description. Only three keys are read here:
        ``"train_name"`` (CSV path relative to ``base_dir``), ``"X_col"``
        (positional indices of the feature columns) and ``"Y_col"``
        (positional indices of the label column(s)). Any other keys are
        ignored by this function.
    base_dir : str, optional
        Directory that ``train_name`` is resolved against. Defaults to
        ``'../'``, which reproduces the previously hard-coded prefix.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data_X, data_y)``: independent copies of the selected feature and
        label columns.

    Raises
    ------
    AssertionError
        If the first selected label column is not named ``'Class'``.
    """
    csv_path = os.path.join(base_dir, dataset["train_name"])
    df = pd.read_csv(csv_path)
    data_X = df.iloc[:, dataset["X_col"]].copy()
    data_y = df.iloc[:, dataset["Y_col"]].copy()

    # Explicit check instead of a bare `assert`: it is not stripped under
    # `python -O`, and the message names the offending file and column.
    # AssertionError is kept so the failure type is unchanged for callers.
    label_name = data_y.columns[0]
    if label_name != 'Class':
        raise AssertionError(
            "%s: expected label column 'Class' at position %r, found %r"
            % (csv_path, list(dataset["Y_col"])[0], label_name)
        )
    return data_X, data_y


In [8]:
# Grid-search every model on every dataset for every scoring function and
# append one CSV row per (dataset, model, scoring) run to `output`.
header = "model,dataset,scoring,p1,p2,acc"
output = "../results/grid_search_res.csv"

# Hyper-parameter grids, keyed by the short model label used in `models`.
parameters = {'svm': {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1,10,100,1000]},
              'rf': {'n_estimators': [10, 30, 50, 100], 'max_depth': [None,1,3]},
              'knn': {'n_neighbors': [1,3,5,8,10,30,50,100], 'weights': ['uniform', 'distance']},
              'mnb': {},
              'mlp': {'activation': ['identity', 'relu', 'logistic'], 'alpha': [0.001, 1.0000000000000001e-05, 9.9999999999999995e-07]}}

# Estimators to tune. GridSearchCV fits clones of the estimator it is given,
# so the same unfitted instances can be reused for every dataset.
models = {'svm': SVC(),
          'knn': KNeighborsClassifier(),
          'rf': RandomForestClassifier(),
          'mlp': MLPClassifier()}
scores = ['accuracy','precision_macro', 'recall_macro']

# Write the header only when the results file is missing or empty, so that
# re-running the cell keeps appending rows without duplicating the header
# and a fresh file is still readable with pd.read_csv.
if not os.path.exists(output) or os.path.getsize(output) == 0:
    with open(output, "a") as f:
        f.write(header + '\n')

# Derived from the actual loop sizes so the progress counter cannot drift
# when datasets, models or scorings are added or removed.
total_iter = len(datasets) * len(models) * len(scores)
current_iter = 0
for dataset in list(datasets.keys()):
    print("Tuning parameters for %s dataset" % dataset)
    X, y = read_dataset(datasets[dataset])
    # Split the dataset 70/30 (train/test) with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    # Flatten the single-column label frames into 1-D arrays.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    for label, model in models.items():
        for score in scores:
            current_iter += 1
            print("[%s]%d/%d" % (dataset,current_iter, total_iter))
            clf = GridSearchCV(model, parameters[label], cv=5, scoring=score)
            clf.fit(X_train, y_train)

            means = clf.cv_results_['mean_test_score']
            y_true, y_pred = y_test, clf.predict(X_test)

            # Order the best parameter values by parameter name so the p1/p2
            # columns do not depend on dict ordering; pad with '' so a grid
            # with fewer than two parameters (e.g. 'mnb') still yields a row.
            best_values = [clf.best_params_[name] for name in sorted(clf.best_params_)]
            p1, p2 = (best_values + ['', ''])[:2]

            # NOTE(review): 'acc' is the mean of the best cross-validated
            # `score` and the held-out accuracy. For the precision/recall
            # scorings this averages two different metrics - confirm that
            # this is intended.
            res_str = "%s,%s,%s,%s,%s,%.2f" % (model.__class__.__name__,
                                               dataset,
                                               score,
                                               p1,
                                               p2,
                                               (max(means)+accuracy_score(y_true, y_pred))/2)

            # Append immediately so finished runs survive an interrupted cell.
            with open(output, "a") as f:
                f.write(res_str + '\n')

Tuning parameters for student dataset
[student]1/180
[student]2/180
[student]3/180
[student]4/180
[student]5/180
[student]6/180
[student]7/180
[student]8/180
[student]9/180
[student]10/180
[student]11/180
[student]12/180
Tuning parameters for contraceptive dataset
[contraceptive]13/180
[contraceptive]14/180
[contraceptive]15/180
[contraceptive]16/180
[contraceptive]17/180
[contraceptive]18/180
[contraceptive]19/180
[contraceptive]20/180
[contraceptive]21/180
[contraceptive]22/180
[contraceptive]23/180
[contraceptive]24/180
Tuning parameters for autism dataset
[autism]25/180
[autism]26/180
[autism]27/180
[autism]28/180
[autism]29/180
[autism]30/180
[autism]31/180
[autism]32/180
[autism]33/180
[autism]34/180
[autism]35/180
[autism]36/180
Tuning parameters for bankruptcy dataset
[bankruptcy]37/180
[bankruptcy]38/180
[bankruptcy]39/180
[bankruptcy]40/180
[bankruptcy]41/180
[bankruptcy]42/180
[bankruptcy]43/180
[bankruptcy]44/180
[bankruptcy]45/180
[bankruptcy]46/180
[bankruptcy]47/180
[ban

In [9]:
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex, HTML

Parameter combinations selected by GridSearchCV for all classifiers, sorted in order of accuracy.
Columns of the results file: "model,dataset,scoring,p1,p2,acc"

In [14]:
# Load the grid-search results and count, per model, how many rows selected
# each (p1, p2) combination.
df_def = pd.read_csv('../results/grid_search_res.csv')
res = (
    df_def
    .drop(columns=['dataset', 'scoring'])
    .groupby(['model', 'p1', 'p2'])
    .count()
)
display(HTML(res.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acc
model,p1,p2,Unnamed: 3_level_1
KNeighborsClassifier,1,uniform,10
KNeighborsClassifier,10,distance,3
KNeighborsClassifier,10,uniform,1
KNeighborsClassifier,3,distance,11
KNeighborsClassifier,3,uniform,3
KNeighborsClassifier,30,distance,8
KNeighborsClassifier,5,distance,2
KNeighborsClassifier,5,uniform,1
KNeighborsClassifier,8,distance,3
MLPClassifier,identity,0.001,2
