In [9]:
import pandas as pd
import numpy as np

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Reading Data

In [11]:
df = pd.read_csv('kyoto/kyoto.csv')

In [12]:
df.head()

Unnamed: 0,Duration,Service,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Flag,Label,Source_Port_Number,Destination_Port_Number,protocol_type
0,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,47904,23,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,58974,23,tcp
2,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,37174,23,tcp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,40711,3389,tcp
4,5.2e-05,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SH,-1,8429,22,tcp


In [13]:
df = df.sample(n=4000)

# Data Preprocessing

In [14]:
dummy_cols = ['Service' , 'Flag' , 'protocol_type']
df = pd.get_dummies( df , columns = dummy_cols)
df.isna().sum()


Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0   

In [15]:
df = df.dropna()
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH                     0
Flag_S0   

In [16]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTRH,Flag_S0,Flag_S1,Flag_S2,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
170959,0.0,0,0,0,0.0,0.0,0.93,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
96540,0.0,0,0,1,1.0,1.0,0.83,0,1,0.0,...,0,1,0,0,0,0,0,0,1,0
36253,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
152472,0.0,0,0,0,0.0,0.0,0.99,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
53178,0.0,0,0,0,0.0,0.0,0.93,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0


# Split Data

In [17]:
X = df.drop(['Label'], axis=1)
y = df['Label']

In [18]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [19]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTRH,Flag_S0,Flag_S1,Flag_S2,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
170959,0.0,0,0,0,0.0,0.0,0.93,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
96540,0.0,0,0,1,1.0,1.0,0.83,0,1,0.0,...,0,1,0,0,0,0,0,0,1,0
36253,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
152472,0.0,0,0,0,0.0,0.0,0.99,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0
53178,0.0,0,0,0,0.0,0.0,0.93,0,0,0.0,...,0,1,0,0,0,0,0,0,1,0


In [20]:
y.shape

(4000,)

In [21]:
y.head()

170959   -1
96540    -1
36253    -1
152472   -1
53178    -1
Name: Label, dtype: int64

# Feature Selection

In [22]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [23]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=25)
fit = selector.fit(X,y)
features = fit.transform(X)
print("scores_:",fit.scores_)
print("pvalues_:",fit.pvalues_)
print("selected index:",fit.get_support(True))
print("after transform:",fit.transform(X)) 
X = fit.transform(X)

before transform:         Duration  Source_bytes  Destination_bytes  Count  Same_srv_rate  \
170959  0.000000             0                  0      0            0.0   
96540   0.000000             0                  0      1            1.0   
36253   0.000000             0                  0      0            0.0   
152472  0.000000             0                  0      0            0.0   
53178   0.000000             0                  0      0            0.0   
...          ...           ...                ...    ...            ...   
115356  0.000000             0                  0      0            0.0   
252466  1.601011           536               1745      1            1.0   
165485  0.999594             0                  0      0            0.0   
1578    2.923007             0                  0      0            0.0   
229368  0.000000             0                  0      0            0.0   

        Serror_rate  Srv_serror_rate  Dst_host_count  Dst_host_srv_count  \
17095

# SVM

In [24]:
from sklearn import svm

In [25]:
svmModel = svm.SVC(gamma='scale')

In [26]:
svmScore = cross_validate(svmModel, X, y, cv=10, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:   29.3s remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   29.6s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   29.8s remaining:   44.8s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   29.9s remaining:   29.9s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   30.3s remaining:   20.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   30.5s remaining:   13.0s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:   31.5s remaining:    7.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.1s finished


In [27]:
ss = pd.DataFrame(svmScore)

In [28]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,1.587872,0.374149,0.887781,0.887781,0.887781,0.887781
1,1.204527,0.382251,0.887781,0.887781,0.887781,0.887781
2,1.038027,0.143389,0.887781,0.887781,0.887781,0.887781
3,0.836721,0.108632,0.8875,0.8875,0.8875,0.8875
4,1.797224,0.544137,0.8875,0.8875,0.8875,0.8875


In [29]:
ss['test_accuracy'].mean()

0.8882514578216114

In [30]:
ss.to_csv('Kyoto_SVM_Scores.csv')

# Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [33]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=3 , scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.8s finished


In [34]:
pd.DataFrame(randomForestScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.984348,0.52883,0.888306,0.888306,0.888306,0.888306
1,0.995861,0.550591,0.888222,0.888222,0.888222,0.888222
2,0.926413,0.44673,0.888222,0.888222,0.888222,0.888222


In [35]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [36]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [37]:
rf2 = RandomForestClassifier()

In [38]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=15, n_jobs=-1, cv=3)

In [39]:
dlGrid.fit(X, y)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   32.0s
[Paralle

[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Paralle

[Parallel(n_jobs=-1)]: Done 398 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 399 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:  1.5min
[Paralle

[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 547 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:  2.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 667 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 669 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 670 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 681 tasks      | elapsed:  2.4min
[Paralle

[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 801 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 803 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 805 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 813 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 815 tasks      | elapsed:  2.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 933 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 934 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 935 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 937 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 938 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 939 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 940 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 941 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 942 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 943 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 944 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 945 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:  3.4min
[Paralle

[Parallel(n_jobs=-1)]: Done 1065 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1066 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1067 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1068 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1069 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1070 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1071 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1073 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1074 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1075 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1077 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1078 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1079 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed: 

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [40]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,1.13049,0.086045,0.502337,0.064161,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.888306,0.888222,...,0.888222,0.88825,4e-05,73,0.888306,0.888222,0.888222,0.88825,4e-05,73
1,1.198738,0.21299,0.60886,0.034407,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.888306,0.888222,...,0.888222,0.88825,4e-05,73,0.888306,0.888222,0.888222,0.88825,4e-05,73
2,1.370229,0.230551,0.788649,0.101827,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.888306,0.888222,...,0.888222,0.88825,4e-05,73,0.888306,0.888222,0.888222,0.88825,4e-05,73
3,1.878566,0.105522,0.952182,0.052182,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.888306,0.888222,...,0.888222,0.88825,4e-05,73,0.888306,0.888222,0.888222,0.88825,4e-05,73
4,1.693392,0.676553,0.844925,0.436682,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.888306,0.888222,...,0.888222,0.88825,4e-05,73,0.888306,0.888222,0.888222,0.88825,4e-05,73


In [41]:
dtGridScores.to_csv('Kyoto_RandomForest_GridSearch')

In [42]:
dt3=dlGrid.best_estimator_

In [43]:
dlGrid.best_params_

{'max_depth': 13, 'min_samples_split': 0.1, 'n_estimators': 120}

In [44]:
dtFinalScore = cross_validate(dt3, X, y, cv=10, scoring=scoring_metrics, verbose=15, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:   32.5s remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   34.2s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   34.2s remaining:   51.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   35.5s remaining:   35.5s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   35.9s remaining:   23.9s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   36.6s remaining:   15.6s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:   37.4s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   39.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   39.4s finished


In [45]:
dtFinalScore = pd.DataFrame(dtFinalScore)

In [46]:
dtFinalScore.to_csv('Kyoto_RandomForest_Scores.csv')

# Neural Network

In [47]:
from sklearn.neural_network import MLPClassifier

In [48]:
mlpModel = MLPClassifier()

In [49]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    6.7s finished


In [50]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [51]:
mlpScore = pd.DataFrame(mlpScore)

In [52]:
mlpScore.to_csv("kyoto_MLP_scores.csv")

In [53]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [54]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [55]:
X_sample=X.sample(frac=0.2, random_state=1)
y_sample=y.sample(frac=0.2, random_state=1)

AttributeError: 'numpy.ndarray' object has no attribute 'sample'

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [None]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [None]:
mlp_grid.fit(X_sample, y_sample)

In [None]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('Kyoto_MLP_GridSearch')

In [None]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

In [None]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=4, n_jobs=-1)

In [None]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)

In [None]:
mlpFinalScore.to_csv('Kyoto_MLP_Scores')

# Decision Trees

In [56]:
from sklearn.tree import tree

In [57]:
dt=tree.DecisionTreeClassifier()

In [58]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.0s finished


In [59]:
pd.DataFrame(dtScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.015958,0.007979,0.99001,0.99001,0.99001,0.99001
1,0.01812,0.010061,0.99,0.99,0.99,0.99
2,0.020192,0.008027,0.986,0.986,0.986,0.986
3,0.013866,0.00803,0.991992,0.991992,0.991992,0.991992


In [60]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [61]:
dt2 = tree.DecisionTreeClassifier()

In [62]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [63]:
dlGrid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 612 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   20.6s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)},
             pre_dispa

In [64]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.009128,0.001619376,0.001957,0.00339,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.888112,0.888,0.888,...,0.88825,0.000371,226,0.888112,0.888,0.888,0.888889,0.88825,0.000371,226
1,0.005984,8.150813e-07,0.008228,0.000432,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.888112,0.888,0.888,...,0.88825,0.000371,226,0.888112,0.888,0.888,0.888889,0.88825,0.000371,226
2,0.005984,7.539457e-07,0.008228,0.000432,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.888112,0.888,0.888,...,0.88825,0.000371,226,0.888112,0.888,0.888,0.888889,0.88825,0.000371,226
3,0.005734,0.0004320186,0.036901,0.050096,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.888112,0.888,0.888,...,0.88825,0.000371,226,0.888112,0.888,0.888,0.888889,0.88825,0.000371,226
4,0.008976,0.005183181,0.054105,0.047936,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.888112,0.888,0.888,...,0.88825,0.000371,226,0.888112,0.888,0.888,0.888889,0.88825,0.000371,226


In [65]:
dtGridScores.to_csv('Kyoto_DesicionTree_GridSearch.csv')

In [66]:
dt3=dlGrid.best_estimator_

In [67]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [68]:
pd.DataFrame(dtFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.020126,0.010056,0.988012,0.988012,0.988012,0.988012
1,0.018087,0.00202,0.99,0.99,0.99,0.99
2,0.020107,0.008025,0.978,0.978,0.978,0.978
3,0.018081,0.002015,0.98999,0.98999,0.98999,0.98999


In [69]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore['test_accuracy'].mean()

0.9865004945004945

In [70]:
dtFinalScore.to_csv('Kyoto_DesicionTree_Scores.csv')

# KNN

In [71]:
from sklearn.neighbors import KNeighborsClassifier

In [72]:
knn = KNeighborsClassifier()

In [73]:
k_range = list(range(1, 101, 5))

In [74]:
param_dict = dict(n_neighbors=k_range)

In [75]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [76]:
grid.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   41.0s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                         51, 56, 61, 66, 71, 76, 81, 86, 91,
                                         96]},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1_score': make_scorer(f1_score, average=micro),
                      'precision': make_scorer(precision_score, average=micro),
                      'recall': make_scorer(recall_sco

In [77]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.013463,0.002056,1.097563,0.0704,1,{'n_neighbors': 1},0.901099,0.904,0.888,0.897898,...,0.89775,0.006029,1,0.901099,0.904,0.888,0.897898,0.89775,0.006029,1
1,0.013464,0.002056,1.387537,0.132323,6,{'n_neighbors': 6},0.875125,0.885,0.881,0.887888,...,0.88225,0.004788,20,0.875125,0.885,0.881,0.887888,0.88225,0.004788,20
2,0.035405,0.037187,1.344153,0.14835,11,{'n_neighbors': 11},0.88012,0.883,0.889,0.884885,...,0.88425,0.003225,19,0.88012,0.883,0.889,0.884885,0.88425,0.003225,19
3,0.085022,0.041766,1.506718,0.14611,16,{'n_neighbors': 16},0.887113,0.887,0.883,0.890891,...,0.887,0.00279,17,0.887113,0.887,0.883,0.890891,0.887,0.00279,17
4,0.038147,0.044229,1.523424,0.34996,21,{'n_neighbors': 21},0.884116,0.887,0.884,0.88989,...,0.88625,0.00242,18,0.884116,0.887,0.884,0.88989,0.88625,0.00242,18
5,0.01471,0.002579,1.268107,0.644033,26,{'n_neighbors': 26},0.888112,0.888,0.887,0.888889,...,0.888,0.000671,16,0.888112,0.888,0.887,0.888889,0.888,0.000671,16
6,0.01446,0.002685,1.497256,0.63093,31,{'n_neighbors': 31},0.888112,0.888,0.888,0.888889,...,0.88825,0.000371,2,0.888112,0.888,0.888,0.888889,0.88825,0.000371,2
7,0.042386,0.046376,1.145447,0.737226,36,{'n_neighbors': 36},0.888112,0.888,0.888,0.888889,...,0.88825,0.000371,2,0.888112,0.888,0.888,0.888889,0.88825,0.000371,2
8,0.041893,0.047291,1.160656,0.7192,41,{'n_neighbors': 41},0.888112,0.888,0.888,0.888889,...,0.88825,0.000371,2,0.888112,0.888,0.888,0.888889,0.88825,0.000371,2
9,0.071558,0.058898,1.203529,0.769655,46,{'n_neighbors': 46},0.888112,0.888,0.888,0.888889,...,0.88825,0.000371,2,0.888112,0.888,0.888,0.888889,0.88825,0.000371,2


In [78]:
knn2=grid.best_estimator_
knn2

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [79]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.9s finished


In [80]:
pd.DataFrame(knnFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.009974,0.874657,0.901099,0.901099,0.901099,0.901099
1,0.014959,0.950457,0.904,0.904,0.904,0.904
2,0.012963,0.924527,0.888,0.888,0.888,0.888
3,0.01496,0.96043,0.897898,0.897898,0.897898,0.897898


In [82]:
pd.DataFrame(mlpScore).mean()

fit_time          4.729520
score_time        0.073271
test_accuracy     0.794500
test_precision    0.794500
test_recall       0.794500
test_f1_score     0.794500
dtype: float64

In [83]:
knnFinalScore = pd.DataFrame(knnFinalScore)
knnFinalScore.to_csv('Kyoto_KNN_Scores_ChiSquared')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [None]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=4, scoring=scoring_metrics, verbose=3 ,n_jobs=-1)

In [None]:
randomForestScore = pd.DataFrame(randomForestScore)

In [None]:
randomForestScore.to_csv('Kyoto_RandomForest_scores_ChiSquared.csv')

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [None]:
rf2 = RandomForestClassifier()

In [None]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)

In [None]:
dlGrid.fit(X, y)

In [None]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

In [None]:
dt3=dlGrid.best_estimator_

In [None]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
pd.DataFrame(dtFinalScore)

In [None]:
dtFinalScore['test_accuracy'].mean()

# All Results

In [67]:
allResults=pd.concat([
    pd.DataFrame(linearScore).mean(),
    pd.DataFrame(mlpScore).mean(),
    pd.DataFrame(dtFinalScore).mean(),
    pd.DataFrame(knnFinalScore).mean()], axis=1)
allResults

Unnamed: 0,0,1,2,3
fit_time,0.475388,80.660868,0.187977,2.758369
score_time,0.014059,0.059341,0.00449,2.287742
test_r2,-141.704346,-5.804186,-0.607665,-0.159364
test_neg_mean_squared_error,-45841.923804,-4572.337706,-769.260438,-515.443445
test_neg_mean_absolute_error,-26.534008,-24.622919,-20.780519,-11.968235


In [68]:
allResults.rename(columns={0:'Linear Regression', 1:'Multi-level Perceptron', 2:'Decision Tree', 3:'KNN'},  inplace=True)
allResults

Unnamed: 0,Linear Regression,Multi-level Perceptron,Decision Tree,KNN
fit_time,0.475388,80.660868,0.187977,2.758369
score_time,0.014059,0.059341,0.00449,2.287742
test_r2,-141.704346,-5.804186,-0.607665,-0.159364
test_neg_mean_squared_error,-45841.923804,-4572.337706,-769.260438,-515.443445
test_neg_mean_absolute_error,-26.534008,-24.622919,-20.780519,-11.968235


In [69]:
pred = dt3.predict(X)
pred

array([45.73684211, 45.73684211, 45.73684211, ..., 84.74276527,
       24.49354839, 53.27044025])

In [None]:
pred[0:100]

In [None]:
X.head(100)