In [109]:
import pandas as pd
import numpy as np

In [110]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [111]:
df = pd.read_csv('../kyoto/kyoto.csv')

In [112]:
df.head()

Unnamed: 0,Duration,Service,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Flag,Label,Source_Port_Number,Destination_Port_Number,protocol_type
0,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,47904,23,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,58974,23,tcp
2,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,37174,23,tcp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,40711,3389,tcp
4,5.2e-05,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SH,-1,8429,22,tcp


In [113]:
df = df.sample(n=4000)

# Data Preprocessing

In [115]:
dummy_cols = ['Service' , 'Flag' , 'protocol_type']
df = pd.get_dummies( df , columns = dummy_cols)

In [119]:
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0   

In [120]:
df = df.dropna()
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0   

In [121]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
329583,0.000454,44,104,15,1.0,0.0,0.0,89,99,0.0,...,0,0,0,0,1,0,0,0,0,1
85372,0.0,0,0,0,0.0,0.0,1.0,2,2,1.0,...,0,0,1,0,0,0,0,0,1,0
267953,0.000453,44,104,24,1.0,0.0,0.0,92,99,0.0,...,0,0,0,0,1,0,0,0,0,1
375435,0.000516,44,104,11,1.0,0.0,0.0,84,99,0.0,...,0,0,0,0,1,0,0,0,0,1
347629,1.604261,536,1745,1,1.0,0.0,0.0,24,24,0.0,...,0,0,0,0,1,0,0,0,1,0


# Split Data

In [122]:
X = df.drop(['Label'], axis=1)
y = df['Label']

In [123]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [124]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
329583,0.000454,44,104,15,1.0,0.0,0.0,89,99,0.0,...,0,0,0,0,1,0,0,0,0,1
85372,0.0,0,0,0,0.0,0.0,1.0,2,2,1.0,...,0,0,1,0,0,0,0,0,1,0
267953,0.000453,44,104,24,1.0,0.0,0.0,92,99,0.0,...,0,0,0,0,1,0,0,0,0,1
375435,0.000516,44,104,11,1.0,0.0,0.0,84,99,0.0,...,0,0,0,0,1,0,0,0,0,1
347629,1.604261,536,1745,1,1.0,0.0,0.0,24,24,0.0,...,0,0,0,0,1,0,0,0,1,0


In [125]:
y.shape

(4000,)

In [126]:
y.head()

329583   -1
85372    -1
267953   -1
375435   -1
347629   -1
Name: Label, dtype: int64

# ChiSquared

In [None]:
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=25)
fit = selector.fit(X,y)
features = fit.transform(X)
print("scores_:",fit.scores_)
print("pvalues_:",fit.pvalues_)
print("selected index:",fit.get_support(True))
print("after transform:",fit.transform(X)) 
X = fit.transform(X)

# SVM

In [None]:
from sklearn import svm

In [136]:
svmModel = svm.SVC(gamma='scale')

In [137]:
svmScore = cross_validate(svmModel, X, y, cv=10, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    4.8s remaining:   19.6s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    4.9s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    5.0s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    5.1s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    5.1s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    5.1s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    5.4s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.4s finished


In [138]:
ss = pd.DataFrame(svmScore)



In [139]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_precision,train_precision,test_recall,train_recall,test_f1_score,train_f1_score
0,0.13592,0.046974,0.942643,0.944707,0.942643,0.944707,0.942643,0.944707,0.942643,0.944707
1,0.136429,0.042975,0.927681,0.94554,0.927681,0.94554,0.927681,0.94554,0.927681,0.94554
2,0.142918,0.045481,0.957606,0.942484,0.957606,0.942484,0.957606,0.942484,0.957606,0.942484
3,0.13792,0.045973,0.952618,0.942484,0.952618,0.942484,0.952618,0.942484,0.952618,0.942484
4,0.15142,0.203884,0.945137,0.943595,0.945137,0.943595,0.945137,0.943595,0.945137,0.943595


In [140]:
ss['test_accuracy'].mean()

0.9434958968493554

In [141]:
ss.to_csv('Kyoto_SVM_CHiSquared_Scores.csv')

# Random Forest

In [142]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [144]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=3 , scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


In [145]:
randomForestScore = pd.DataFrame(randomForestScore)



In [146]:
randomForestScore.to_csv('Kyoto_RandomForest_CHiSquared_scores.csv')

In [147]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [148]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [149]:
rf2 = RandomForestClassifier()

In [150]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=15, n_jobs=-1, cv=3)

In [151]:
dlGrid.fit(X, y)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 151 tasks      | elapsed:   14.0s
[Paralle

[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 282 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 283 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed:   28.5s
[Paralle

[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 415 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 417 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 419 tasks      | elapsed:   42.6s
[Paralle

[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done 547 tasks      | elapsed:   56.6s
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 549 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 550 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 551 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done 553 tasks      | elapsed:   57.3s
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:   57.5s
[Paralle

[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 681 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 682 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 683 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 685 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 687 tasks      | elapsed:  1.2min
[Paralle

[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 813 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 815 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 817 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 818 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 819 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 820 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 821 tasks      | elapsed:  1.4min
[Paralle

[Parallel(n_jobs=-1)]: Done 943 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 944 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 945 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 949 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 950 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 951 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 952 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 953 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 955 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 957 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 958 tasks      | elapsed:  1.6min
[Paralle

[Parallel(n_jobs=-1)]: Done 1075 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1077 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1078 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1079 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1081 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1082 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1083 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1085 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1086 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1087 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1088 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1089 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1090 tasks      | elapsed: 

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': range(80, 200, 20), 'max_depth': range(1, 20, 3), 'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn',
       scoring={'accuracy': make_scorer(accuracy_score), 'precision': make_scorer(precision_score, average=micro), 'recall': make_scorer(recall_score, average=micro), 'f1_score': make_scorer(f1_score, average=m

In [152]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_train_f1_score,split1_train_f1_score,split2_train_f1_score,mean_train_f1_score,std_train_f1_score
0,0.214544,0.020794,0.094612,0.003856,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.891304,0.891223,...,0.891223,0.891223,0.89125,3.8e-05,38,0.891223,0.891264,0.891264,0.89125,1.9e-05
1,0.267846,0.020204,0.120597,0.008571,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.891304,0.891223,...,0.891223,0.891223,0.89125,3.8e-05,38,0.891223,0.891264,0.891264,0.89125,1.9e-05
2,0.343803,0.065098,0.188891,0.068822,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.891304,0.891223,...,0.891223,0.891223,0.89125,3.8e-05,38,0.891223,0.891264,0.891264,0.89125,1.9e-05
3,0.370121,0.028295,0.165572,0.007538,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.891304,0.891223,...,0.891223,0.891223,0.89125,3.8e-05,38,0.891223,0.891264,0.891264,0.89125,1.9e-05
4,0.447076,0.04501,0.198553,0.011079,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.891304,0.891223,...,0.891223,0.891223,0.89125,3.8e-05,38,0.891223,0.891264,0.891264,0.89125,1.9e-05


In [153]:
dtGridScores.to_csv('Kyoto_RandomForest_CHiSquared_GridSearch')

In [154]:
dt3=dlGrid.best_estimator_

In [155]:
dlGrid.best_params_

{'max_depth': 16, 'min_samples_split': 0.1, 'n_estimators': 100}

In [156]:
dtFinalScore = cross_validate(dt3, X, y, cv=10, scoring=scoring_metrics, verbose=15, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    0.8s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.9s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    1.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [159]:
RandomForestFinalScore = pd.DataFrame(dtFinalScore)

In [160]:
RandomForestFinalScore.to_csv('Kyoto_RandomForest_CHiSquared_bestEstimator.csv')

# Neural Network

In [161]:
from sklearn.neural_network import MLPClassifier

In [162]:
mlpModel = MLPClassifier()

In [163]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    3.7s finished


In [164]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [165]:
mlpScore = pd.DataFrame(mlpScore)



In [166]:
mlpScore.to_csv("kyoto_MLP_CHiSquared_scores.csv")

In [167]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [168]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [169]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [170]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [171]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [172]:
mlp_grid.fit(X, y)

Fitting 4 folds for each of 312 candidates, totalling 1248 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 1248 out of 1248 | elapsed: 12.8min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'hidden_layer_sizes': [4, 7, 10, 13, 17, 20, 30, 50, 80, 100, 120, 140, 180, 220, (10, 10), (20, 20), (30, 30), (50, 50), (80, 80), (100, 100), (150, 150), (10, 10, 10), (20, 20, 20), (30, 30, 30), (50, 50, 50), (80, 80, 80)], 'solver': ['sgd', 'adam'], 'alpha': [0.001, 0.01, 0.05], 'learning_rate': ['constant', 'adaptive']},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn

In [173]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('Kyoto_MLP_CHiSquared_GridSearch.csv')



In [174]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

{'alpha': 0.01,
 'hidden_layer_sizes': (30, 30, 30),
 'learning_rate': 'adaptive',
 'solver': 'adam'}

In [175]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=4, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.1s finished


In [176]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)



In [177]:
mlpFinalScore.to_csv('Kyoto_MLP_CHiSquared_bestEstimator')

# Decision Trees

In [178]:
from sklearn.tree import tree

In [179]:
dt=tree.DecisionTreeClassifier()

In [180]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [181]:
dtScore = pd.DataFrame(dtScore)



In [182]:
dtScore.to_csv('kyoto_DescisionTree_CHiSquared_scores.csv')

In [183]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [184]:
dt2 = tree.DecisionTreeClassifier()

In [185]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [186]:
dlGrid.fit(X, y)

Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    4.8s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'min_samples_split': range(10, 500, 20), 'max_depth': range(1, 20, 2)},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn',
       scoring={'accuracy': make_scorer(accuracy_score), 'precision': make_scorer(precision_score, average=micro), 'recall': make_scorer(recall_score, average=micro), 'f1_score': make_scorer(f1_score, average=micro)},
       verbose=3)

In [187]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_train_f1_score,split1_train_f1_score,split2_train_f1_score,split3_train_f1_score,mean_train_f1_score,std_train_f1_score
0,0.004747,0.001478409,0.006747,0.001089,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.891109,0.891,0.891,...,0.891892,0.89125,0.000373,226,0.891297,0.891333,0.891333,0.891036,0.89125,0.000124
1,0.005996,6.529362e-07,0.007746,0.000433,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.891109,0.891,0.891,...,0.891892,0.89125,0.000373,226,0.891297,0.891333,0.891333,0.891036,0.89125,0.000124
2,0.005757,0.0002413942,0.00824,0.000435,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.891109,0.891,0.891,...,0.891892,0.89125,0.000373,226,0.891297,0.891333,0.891333,0.891036,0.89125,0.000124
3,0.005747,0.0004321558,0.008245,0.000433,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.891109,0.891,0.891,...,0.891892,0.89125,0.000373,226,0.891297,0.891333,0.891333,0.891036,0.89125,0.000124
4,0.004998,0.0007067185,0.006496,0.001118,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.891109,0.891,0.891,...,0.891892,0.89125,0.000373,226,0.891297,0.891333,0.891333,0.891036,0.89125,0.000124


In [188]:
dtGridScores.to_csv('Kyoto_DesicionTree_CHiSquared_GridSearch.csv')

In [189]:
dt3=dlGrid.best_estimator_

In [190]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [191]:
pd.DataFrame(dtFinalScore)



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_precision,train_precision,test_recall,train_recall,test_f1_score,train_f1_score
0,0.009979,0.006012,0.949051,0.953318,0.949051,0.953318,0.949051,0.953318,0.949051,0.953318
1,0.00998,0.00701,0.948,0.951333,0.948,0.951333,0.948,0.951333,0.948,0.951333
2,0.010008,0.004997,0.945,0.952,0.945,0.952,0.945,0.952,0.945,0.952
3,0.013006,0.007981,0.936937,0.956681,0.936937,0.956681,0.936937,0.956681,0.936937,0.956681


In [192]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore['test_accuracy'].mean()

0.9447469714969714

In [193]:
dtFinalScore.to_csv('Kyoto_DesicionTree_CHiSquared_bestEstimator.csv')

# KNN

In [194]:
from sklearn.neighbors import KNeighborsClassifier

In [195]:
knn = KNeighborsClassifier()

In [196]:
k_range = list(range(1, 101, 5))

In [197]:
param_dict = dict(n_neighbors=k_range)

In [198]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [199]:
grid.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   15.5s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81, 86, 91, 96]},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn',
       scoring={'accuracy': make_scorer(accuracy_score), 'precision': make_scorer(precision_score, average=micro), 'recall': make_scorer(recall_score, average=micro), 'f1_score': make_scorer(f1_score, average=micro)},
       verbose=3)

In [200]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_train_f1_score,split1_train_f1_score,split2_train_f1_score,split3_train_f1_score,mean_train_f1_score,std_train_f1_score
0,0.104186,0.002580008,0.373766,0.006755,1,{'n_neighbors': 1},0.952048,0.949,0.957,0.942943,...,0.942943,0.95025,0.005092,1,1.0,1.0,1.0,1.0,1.0,0.0
1,0.094192,0.00326168,0.420117,0.040886,6,{'n_neighbors': 6},0.943057,0.948,0.948,0.945946,...,0.945946,0.94625,0.002026,5,0.955652,0.952333,0.957,0.959014,0.956,0.002431
2,0.005997,0.001731581,0.29458,0.020861,11,{'n_neighbors': 11},0.946054,0.951,0.948,0.942943,...,0.942943,0.947,0.00293,2,0.95065,0.948667,0.951,0.952016,0.950583,0.001215
3,0.006247,0.002163771,0.307573,0.021217,16,{'n_neighbors': 16},0.944056,0.95,0.948,0.945946,...,0.945946,0.947,0.002224,2,0.947983,0.948,0.947333,0.949683,0.94825,0.00087
4,0.004747,0.0004330846,0.299829,0.006591,21,{'n_neighbors': 21},0.945055,0.95,0.947,0.942943,...,0.942943,0.94625,0.002597,5,0.946982,0.945667,0.946333,0.947351,0.946583,0.000642
5,0.005247,0.0004327405,0.309822,0.011084,26,{'n_neighbors': 26},0.943057,0.949,0.949,0.943944,...,0.943944,0.94625,0.002768,5,0.947649,0.946333,0.945667,0.947684,0.946833,0.000866
6,0.005247,0.0004327406,0.337338,0.014363,31,{'n_neighbors': 31},0.944056,0.949,0.947,0.945946,...,0.945946,0.9465,0.001788,4,0.946982,0.946,0.944667,0.946351,0.946,0.000846
7,0.004997,3.526258e-07,0.334314,0.017426,36,{'n_neighbors': 36},0.946054,0.945,0.947,0.944945,...,0.944945,0.94575,0.000846,8,0.946315,0.946333,0.945,0.946018,0.945917,0.000544
8,0.005496,0.00150009,0.357794,0.037074,41,{'n_neighbors': 41},0.946054,0.945,0.946,0.942943,...,0.942943,0.945,0.001259,9,0.944315,0.946,0.944333,0.945352,0.945,0.000714
9,0.005247,0.0004327753,0.373785,0.054499,46,{'n_neighbors': 46},0.944056,0.936,0.946,0.942943,...,0.942943,0.94225,0.003771,10,0.944982,0.937,0.944333,0.945685,0.943,0.003497


In [201]:
knn2=grid.best_estimator_
knn2

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [205]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.9s finished


In [206]:
knnFinalScore = pd.DataFrame(knnFinalScore)



In [207]:
knnFinalScore.to_csv('Kyoto_KNN_CHiSquared_bestEstimator.csv')

# All Results

In [208]:
allResults=pd.concat([
    ss.mean(),
    mlpScore.mean(),
    dtFinalScore.mean(),
    knnFinalScore.mean(),
    RandomForestFinalScore.mean()], axis=1)
allResults

Unnamed: 0,0,1,2,3,4
fit_time,0.142372,3.463258,0.010743,0.003998,0.619984
score_time,0.060667,0.014991,0.0065,0.247108,0.09848
test_accuracy,0.943496,0.944001,0.944747,0.950248,0.945753
train_accuracy,0.943944,0.946667,0.953333,1.0,0.947361
test_precision,0.943496,0.944001,0.944747,0.950248,0.945753
train_precision,0.943944,0.946667,0.953333,1.0,0.947361
test_recall,0.943496,0.944001,0.944747,0.950248,0.945753
train_recall,0.943944,0.946667,0.953333,1.0,0.947361
test_f1_score,0.943496,0.944001,0.944747,0.950248,0.945753
train_f1_score,0.943944,0.946667,0.953333,1.0,0.947361


In [209]:
allResults.rename(columns={0:'SVM', 1:'Multi-level Perceptron', 2:'Decision Tree', 3:'KNN' , 4:'Random Forest'},  inplace=True)
allResults

Unnamed: 0,SVM,Multi-level Perceptron,Decision Tree,KNN,Random Forest
fit_time,0.142372,3.463258,0.010743,0.003998,0.619984
score_time,0.060667,0.014991,0.0065,0.247108,0.09848
test_accuracy,0.943496,0.944001,0.944747,0.950248,0.945753
train_accuracy,0.943944,0.946667,0.953333,1.0,0.947361
test_precision,0.943496,0.944001,0.944747,0.950248,0.945753
train_precision,0.943944,0.946667,0.953333,1.0,0.947361
test_recall,0.943496,0.944001,0.944747,0.950248,0.945753
train_recall,0.943944,0.946667,0.953333,1.0,0.947361
test_f1_score,0.943496,0.944001,0.944747,0.950248,0.945753
train_f1_score,0.943944,0.946667,0.953333,1.0,0.947361


In [211]:
allResults.to_csv('Kyoto_CHiSquared_Final.csv')

In [None]:
pred[0:100]

In [None]:
X.head(100)