In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [3]:
df = pd.read_csv('../kyoto/kyoto.csv')

In [4]:
df.head()

Unnamed: 0,Duration,Service,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Flag,Label,Source_Port_Number,Destination_Port_Number,protocol_type
0,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,47904,23,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,58974,23,tcp
2,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,37174,23,tcp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,40711,3389,tcp
4,5.2e-05,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SH,-1,8429,22,tcp


In [5]:
df = df.sample(n=4000)

# Data Preprocessing

In [6]:
dummy_cols = ['Service' , 'Flag' , 'protocol_type']
df = pd.get_dummies( df , columns = dummy_cols)

In [7]:
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0                        0
Flag_S1   

In [8]:
df = df.dropna()
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0                        0
Flag_S1   

In [9]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
43179,0.0,0,0,1,1.0,1.0,1.0,32,61,0.03,...,0,0,1,0,0,0,0,0,1,0
269652,0.00042,44,104,21,1.0,0.0,0.0,93,99,0.0,...,0,0,0,0,1,0,0,0,0,1
344666,0.000419,44,104,16,1.0,0.0,0.06,86,95,0.0,...,0,0,0,0,1,0,0,0,0,1
348694,0.000589,44,104,11,1.0,0.0,0.0,84,99,0.0,...,0,0,0,0,1,0,0,0,0,1
365466,0.0,0,0,0,0.0,0.0,0.71,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0


# Split Data

In [10]:
X = df.drop(['Label'], axis=1)
y = df['Label']

In [11]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [12]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
43179,0.0,0,0,1,1.0,1.0,1.0,32,61,0.03,...,0,0,1,0,0,0,0,0,1,0
269652,0.00042,44,104,21,1.0,0.0,0.0,93,99,0.0,...,0,0,0,0,1,0,0,0,0,1
344666,0.000419,44,104,16,1.0,0.0,0.06,86,95,0.0,...,0,0,0,0,1,0,0,0,0,1
348694,0.000589,44,104,11,1.0,0.0,0.0,84,99,0.0,...,0,0,0,0,1,0,0,0,0,1
365466,0.0,0,0,0,0.0,0.0,0.71,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0


In [13]:
y.shape

(4000,)

In [14]:
y.head()

43179    -1
269652   -1
344666   -1
348694   -1
365466   -1
Name: Label, dtype: int64

# ChiSquared

In [15]:
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [16]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=23)
fit = selector.fit(X,y)
features = fit.transform(X)
print("scores_:",fit.scores_)
print("pvalues_:",fit.pvalues_)
print("selected index:",fit.get_support(True))
print("after transform:",fit.transform(X)) 
X = fit.transform(X)

before transform:         Duration  Source_bytes  Destination_bytes  Count  Same_srv_rate  \
43179   0.000000             0                  0      1            1.0   
269652  0.000420            44                104     21            1.0   
344666  0.000419            44                104     16            1.0   
348694  0.000589            44                104     11            1.0   
365466  0.000000             0                  0      0            0.0   
...          ...           ...                ...    ...            ...   
166677  0.000000             0                  0      0            0.0   
183781  0.000000             0                  0      0            0.0   
53392   0.000053             0                  0      0            0.0   
316372  1.596761           520               1745      1            1.0   
216696  0.000391            42                103     10            1.0   

        Serror_rate  Srv_serror_rate  Dst_host_count  Dst_host_srv_count  \
43179

# SVM

In [17]:
from sklearn import svm

In [18]:
svmModel = svm.SVC(gamma='scale')

In [19]:
svmScore = cross_validate(svmModel, X, y, cv=10, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    4.3s remaining:   17.4s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    4.3s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    4.3s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    4.3s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    4.4s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.4s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    4.4s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.7s finished


In [20]:
ss = pd.DataFrame(svmScore)

In [21]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.366255,0.148066,0.895262,0.895262,0.895262,0.895262
1,0.371335,0.14358,0.895262,0.895262,0.895262,0.895262
2,0.373444,0.150461,0.895262,0.895262,0.895262,0.895262
3,0.365306,0.151362,0.895262,0.895262,0.895262,0.895262
4,0.381418,0.153577,0.895262,0.895262,0.895262,0.895262


In [22]:
ss['test_accuracy'].mean()

0.8962524765779785

In [23]:
ss.to_csv('Kyoto_SVM_CHiSquared_Scores.csv')

# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [26]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=3 , scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


In [27]:
randomForestScore = pd.DataFrame(randomForestScore)

In [28]:
randomForestScore.to_csv('Kyoto_RandomForest_CHiSquared_scores.csv')

In [29]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [30]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [31]:
rf2 = RandomForestClassifier()

In [32]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=15, n_jobs=-1, cv=3)

In [33]:
dlGrid.fit(X, y)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   11.3s
[Paralle

[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:   22.7s
[Paralle

[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 415 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 417 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   34.3s
[Paralle

[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 547 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 549 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 550 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 551 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 553 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:   45.2s
[Paralle

[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 681 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 682 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 683 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done 685 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 687 tasks      | elapsed:   56.4s
[Paralle

[Parallel(n_jobs=-1)]: Done 805 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 813 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 815 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 817 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 818 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 819 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 820 tasks      | elapsed:  1.1min
[Paralle

[Parallel(n_jobs=-1)]: Done 942 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 943 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 944 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 945 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 949 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 950 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 951 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 952 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 953 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 955 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 957 tasks      | elapsed:  1.3min
[Paralle

[Parallel(n_jobs=-1)]: Done 1075 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1077 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1078 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1079 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1081 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1082 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1083 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1085 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1086 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1087 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1088 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1089 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1090 tasks      | elapsed: 

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [34]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.175208,0.016365,0.09623,0.028404,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.895802,0.896474,...,0.896474,0.89625,0.000317,73,0.895802,0.896474,0.896474,0.89625,0.000317,73
1,0.227227,0.040763,0.148969,0.03886,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.895802,0.896474,...,0.896474,0.89625,0.000317,73,0.895802,0.896474,0.896474,0.89625,0.000317,73
2,0.280786,0.02657,0.182377,0.025223,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.895802,0.896474,...,0.896474,0.89625,0.000317,73,0.895802,0.896474,0.896474,0.89625,0.000317,73
3,0.39468,0.006442,0.227981,0.033177,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.895802,0.896474,...,0.896474,0.89625,0.000317,73,0.895802,0.896474,0.896474,0.89625,0.000317,73
4,0.458836,0.028117,0.239403,0.009719,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.895802,0.896474,...,0.896474,0.89625,0.000317,73,0.895802,0.896474,0.896474,0.89625,0.000317,73


In [35]:
dtGridScores.to_csv('Kyoto_RandomForest_CHiSquared_GridSearch')

In [36]:
dt3=dlGrid.best_estimator_

In [37]:
dlGrid.best_params_

{'max_depth': 13, 'min_samples_split': 0.1, 'n_estimators': 160}

In [38]:
dtFinalScore = cross_validate(dt3, X, y, cv=10, scoring=scoring_metrics, verbose=15, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    0.9s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.9s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.9s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    1.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.7s finished


In [39]:
RandomForestFinalScore = pd.DataFrame(dtFinalScore)

In [40]:
RandomForestFinalScore.to_csv('Kyoto_RandomForest_CHiSquared_bestEstimator.csv')

# Neural Network

In [41]:
from sklearn.neural_network import MLPClassifier

In [42]:
mlpModel = MLPClassifier()

In [43]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    2.1s finished


In [44]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [45]:
mlpScore = pd.DataFrame(mlpScore)

In [46]:
mlpScore.to_csv("kyoto_MLP_CHiSquared_scores.csv")

In [47]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [48]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [50]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [51]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [52]:
mlp_grid.fit(X, y)

Fitting 4 folds for each of 312 candidates, totalling 1248 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 842 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 1194 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1248 out of 1248 | elapsed:  8.1min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_sta...
                                                (50, 50, 50), (80, 80, 80)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accu

In [53]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('Kyoto_MLP_CHiSquared_GridSearch.csv')

In [54]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

{'alpha': 0.01,
 'hidden_layer_sizes': 220,
 'learning_rate': 'constant',
 'solver': 'adam'}

In [55]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=4, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.1s finished


In [56]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)

In [57]:
mlpFinalScore.to_csv('Kyoto_MLP_CHiSquared_bestEstimator.csv')

# Decision Trees

In [58]:
from sklearn.tree import tree

In [59]:
dt=tree.DecisionTreeClassifier()

In [60]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [61]:
dtScore = pd.DataFrame(dtScore)

In [62]:
dtScore.to_csv('kyoto_DescisionTree_CHiSquared_scores.csv')

In [63]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [64]:
dt2 = tree.DecisionTreeClassifier()

In [65]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [66]:
dlGrid.fit(X, y)

Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    4.2s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)},
             pre_dispa

In [67]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.004336,0.001122,0.003777,0.000398,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.896104,0.896,0.896,...,0.89625,0.000376,226,0.896104,0.896,0.896,0.896897,0.89625,0.000376,226
1,0.003012,0.001739,0.00672,0.001616,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.896104,0.896,0.896,...,0.89625,0.000376,226,0.896104,0.896,0.896,0.896897,0.89625,0.000376,226
2,0.003931,0.000196,0.004647,0.002543,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.896104,0.896,0.896,...,0.89625,0.000376,226,0.896104,0.896,0.896,0.896897,0.89625,0.000376,226
3,0.003048,0.001195,0.005999,0.001431,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.896104,0.896,0.896,...,0.89625,0.000376,226,0.896104,0.896,0.896,0.896897,0.89625,0.000376,226
4,0.003806,0.002338,0.007379,0.001363,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.896104,0.896,0.896,...,0.89625,0.000376,226,0.896104,0.896,0.896,0.896897,0.89625,0.000376,226


In [68]:
dtGridScores.to_csv('Kyoto_DesicionTree_CHiSquared_GridSearch.csv')

In [69]:
dt3=dlGrid.best_estimator_

In [70]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [71]:
pd.DataFrame(dtFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.005022,0.008761,0.987013,0.987013,0.987013,0.987013
1,0.009729,0.000996,0.988,0.988,0.988,0.988
2,0.009757,0.00801,0.992,0.992,0.992,0.992
3,0.011611,0.004551,0.982983,0.982983,0.982983,0.982983


In [72]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore['test_accuracy'].mean()

0.9874989924989925

In [73]:
dtFinalScore.to_csv('Kyoto_DesicionTree_CHiSquared_bestEstimator.csv')

# KNN

In [74]:
from sklearn.neighbors import KNeighborsClassifier

In [75]:
knn = KNeighborsClassifier()

In [76]:
k_range = list(range(1, 101, 5))

In [77]:
param_dict = dict(n_neighbors=k_range)

In [78]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [79]:
grid.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    6.1s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                         51, 56, 61, 66, 71, 76, 81, 86, 91,
                                         96]},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1_score': make_scorer(f1_score, average=micro),
                      'precision': make_scorer(precision_score, average=micro),
                      'recall': make_scorer(recall_sco

In [80]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.010595,0.003392,0.365538,0.090261,1,{'n_neighbors': 1},0.896104,0.91,0.916,0.920921,...,0.91075,0.009303,1,0.896104,0.91,0.916,0.920921,0.91075,0.009303,1
1,0.014593,0.003806,0.412073,0.056937,6,{'n_neighbors': 6},0.892108,0.888,0.894,0.891892,...,0.8915,0.002181,19,0.892108,0.888,0.894,0.891892,0.8915,0.002181,19
2,0.023454,0.008172,0.61973,0.044769,11,{'n_neighbors': 11},0.888112,0.889,0.893,0.888889,...,0.88975,0.001907,20,0.888112,0.889,0.893,0.888889,0.88975,0.001907,20
3,0.022741,0.004885,0.655167,0.041949,16,{'n_neighbors': 16},0.894106,0.893,0.896,0.896897,...,0.895,0.001533,17,0.894106,0.893,0.896,0.896897,0.895,0.001533,17
4,0.022904,0.008772,0.586316,0.057272,21,{'n_neighbors': 21},0.895105,0.892,0.896,0.896897,...,0.895,0.001844,17,0.895105,0.892,0.896,0.896897,0.895,0.001844,17
5,0.022151,0.006602,0.573174,0.046632,26,{'n_neighbors': 26},0.896104,0.896,0.896,0.896897,...,0.89625,0.000376,2,0.896104,0.896,0.896,0.896897,0.89625,0.000376,2
6,0.018973,0.004013,0.497357,0.04009,31,{'n_neighbors': 31},0.896104,0.896,0.896,0.896897,...,0.89625,0.000376,2,0.896104,0.896,0.896,0.896897,0.89625,0.000376,2
7,0.013103,0.009231,0.601729,0.034241,36,{'n_neighbors': 36},0.896104,0.896,0.896,0.896897,...,0.89625,0.000376,2,0.896104,0.896,0.896,0.896897,0.89625,0.000376,2
8,0.018566,0.005197,0.549958,0.060839,41,{'n_neighbors': 41},0.896104,0.896,0.896,0.896897,...,0.89625,0.000376,2,0.896104,0.896,0.896,0.896897,0.89625,0.000376,2
9,0.011401,0.005534,0.697821,0.041306,46,{'n_neighbors': 46},0.896104,0.896,0.896,0.896897,...,0.89625,0.000376,2,0.896104,0.896,0.896,0.896897,0.89625,0.000376,2


In [81]:
knn2=grid.best_estimator_
knn2

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [82]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.2s finished


In [83]:
knnFinalScore = pd.DataFrame(knnFinalScore)

In [84]:
knnFinalScore.to_csv('Kyoto_KNN_CHiSquared_bestEstimator.csv')

# All Results

In [85]:
allResults=pd.concat([
    ss.mean(),
    mlpScore.mean(),
    dtFinalScore.mean(),
    knnFinalScore.mean(),
    RandomForestFinalScore.mean()], axis=1)
allResults

Unnamed: 0,0,1,2,3,4
fit_time,0.351134,1.247443,0.00903,0.017343,0.771068
score_time,0.141213,0.013843,0.00558,0.262968,0.189167
test_accuracy,0.896252,0.89075,0.987499,0.910756,0.969262
test_precision,0.896252,0.89075,0.987499,0.910756,0.969262
test_recall,0.896252,0.89075,0.987499,0.910756,0.969262
test_f1_score,0.896252,0.89075,0.987499,0.910756,0.969262


In [86]:
allResults.rename(columns={0:'SVM', 1:'Multi-level Perceptron', 2:'Decision Tree', 3:'KNN' , 4:'Random Forest'},  inplace=True)
allResults

Unnamed: 0,SVM,Multi-level Perceptron,Decision Tree,KNN,Random Forest
fit_time,0.351134,1.247443,0.00903,0.017343,0.771068
score_time,0.141213,0.013843,0.00558,0.262968,0.189167
test_accuracy,0.896252,0.89075,0.987499,0.910756,0.969262
test_precision,0.896252,0.89075,0.987499,0.910756,0.969262
test_recall,0.896252,0.89075,0.987499,0.910756,0.969262
test_f1_score,0.896252,0.89075,0.987499,0.910756,0.969262


In [87]:
allResults.to_csv('Kyoto_CHiSquared_Final.csv')