In [127]:
import pandas as pd
import numpy as np

In [128]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Reading Data

In [129]:
df = pd.read_csv('kyoto/kyoto.csv')

In [130]:
df.head()

Unnamed: 0,Duration,Service,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Flag,Label,Source_Port_Number,Destination_Port_Number,protocol_type
0,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,47904,23,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,58974,23,tcp
2,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,37174,23,tcp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,-1,40711,3389,tcp
4,5.2e-05,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SH,-1,8429,22,tcp


In [131]:
df = df.sample(n=4000)

# Data Preprocessing

In [132]:
dummy_cols = ['Service' , 'Flag' , 'protocol_type']
df = pd.get_dummies( df , columns = dummy_cols)
df.isna().sum()


Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Service_ssl                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH

In [133]:
df = df.dropna()
df.isna().sum()

Duration                       0
Source_bytes                   0
Destination_bytes              0
Count                          0
Same_srv_rate                  0
Serror_rate                    0
Srv_serror_rate                0
Dst_host_count                 0
Dst_host_srv_count             0
Dst_host_same_src_port_rate    0
Dst_host_serror_rate           0
Dst_host_srv_serror_rate       0
Label                          0
Source_Port_Number             0
Destination_Port_Number        0
Service_dns                    0
Service_http                   0
Service_other                  0
Service_rdp                    0
Service_sip                    0
Service_smtp                   0
Service_smtp,ssl               0
Service_snmp                   0
Service_ssh                    0
Service_ssl                    0
Flag_OTH                       0
Flag_REJ                       0
Flag_RSTO                      0
Flag_RSTOS0                    0
Flag_RSTR                      0
Flag_RSTRH

In [134]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
255018,1.612751,520,1745,1,1.0,0.0,0.0,11,11,0.0,...,0,0,0,0,1,0,0,0,1,0
301673,1.50498,536,1564,1,1.0,0.0,0.0,15,15,0.0,...,0,0,0,0,1,0,0,0,1,0
140811,3.046459,0,0,0,0.0,0.0,1.0,30,73,0.0,...,0,0,1,0,0,0,0,0,1,0
16557,2.99954,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0
77340,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0


# Split Data

In [135]:
X = df.drop(['Label'], axis=1)
y = df['Label']

In [136]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [137]:
df.head()

Unnamed: 0,Duration,Source_bytes,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_src_port_rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_SF,Flag_SH,Flag_SHR,protocol_type_icmp,protocol_type_tcp,protocol_type_udp
255018,1.612751,520,1745,1,1.0,0.0,0.0,11,11,0.0,...,0,0,0,0,1,0,0,0,1,0
301673,1.50498,536,1564,1,1.0,0.0,0.0,15,15,0.0,...,0,0,0,0,1,0,0,0,1,0
140811,3.046459,0,0,0,0.0,0.0,1.0,30,73,0.0,...,0,0,1,0,0,0,0,0,1,0
16557,2.99954,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0
77340,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0,0,1,0,0,0,0,0,1,0


In [138]:
y.shape

(4000,)

In [139]:
y.head()

255018   -1
301673    1
140811   -1
16557    -1
77340    -1
Name: Label, dtype: int64

# Feature Selection

In [140]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [141]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=23)
fit = selector.fit(X,y)
features = fit.transform(X)
print("scores_:",fit.scores_)
print("pvalues_:",fit.pvalues_)
print("selected index:",fit.get_support(True))
print("after transform:",fit.transform(X)) 
X = fit.transform(X)

before transform:          Duration  Source_bytes  Destination_bytes  Count  Same_srv_rate  \
255018   1.612751           520               1745      1            1.0   
301673   1.504980           536               1564      1            1.0   
140811   3.046459             0                  0      0            0.0   
16557    2.999540             0                  0      0            0.0   
77340    0.000000             0                  0      0            0.0   
...           ...           ...                ...    ...            ...   
148286   2.955127             0                  0      3            1.0   
225472   0.000409            44                104     11            1.0   
228087   0.000479            44                104     11            1.0   
206995   3.007578             0                  0      0            0.0   
168182  34.295373           647                738      0            0.0   

        Serror_rate  Srv_serror_rate  Dst_host_count  Dst_host_srv_co

# SVM

In [142]:
from sklearn import svm

In [143]:
svmModel = svm.SVC(gamma='scale')

In [144]:
svmScore = cross_validate(svmModel, X, y, cv=10, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    0.2s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [145]:
ss = pd.DataFrame(svmScore)

In [146]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.287713,0.135,0.89,0.89,0.89,0.89
1,0.331698,0.13927,0.89,0.89,0.89,0.89
2,0.236476,0.086725,0.89,0.89,0.89,0.89
3,0.23248,0.082685,0.89,0.89,0.89,0.89
4,0.225988,0.082685,0.89,0.89,0.89,0.89


In [147]:
ss['test_accuracy'].mean()

0.89

In [148]:
ss.to_csv('Kyoto_SVM_Scores.csv')

# Random Forest

In [149]:
from sklearn.ensemble import RandomForestClassifier

In [150]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [151]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=3 , scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


In [152]:
pd.DataFrame(randomForestScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.116549,0.099922,0.889805,0.889805,0.889805,0.889805
1,0.203819,0.051362,0.889805,0.889805,0.889805,0.889805
2,0.199803,0.056746,0.89039,0.89039,0.89039,0.89039


In [153]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [154]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [155]:
rf2 = RandomForestClassifier()

In [156]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=15, n_jobs=-1, cv=3)

In [157]:
dlGrid.fit(X, y)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:   10.7s
[Paralle

[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 282 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 283 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed:   22.1s
[Paralle

[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 415 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 417 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   33.4s
[Paralle

[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 547 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done 549 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 550 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 551 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:   44.4s
[Paralle

[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 681 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 682 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 683 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 685 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 687 tasks      | elapsed:   55.5s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:   55.6s
[Paralle

[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 813 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 815 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 817 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 818 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 819 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 820 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 821 tasks      | elapsed:  1.1min
[Paralle

[Parallel(n_jobs=-1)]: Done 943 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 944 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 945 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 949 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 950 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 951 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 952 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 953 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 955 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 957 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 958 tasks      | elapsed:  1.3min
[Paralle

[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1077 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1078 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1079 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1081 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1082 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1083 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1085 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1086 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1087 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1088 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1089 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1090 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1091 tasks      | elapsed: 

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [158]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.147166,0.027191,0.116096,0.012305,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.889805,0.889805,...,0.89039,0.89,0.000276,68,0.889805,0.889805,0.89039,0.89,0.000276,68
1,0.168447,0.025906,0.100235,0.013118,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.889805,0.889805,...,0.89039,0.89,0.000276,68,0.889805,0.889805,0.89039,0.89,0.000276,68
2,0.285409,0.039763,0.172872,0.012919,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.889805,0.889805,...,0.89039,0.89,0.000276,68,0.889805,0.889805,0.89039,0.89,0.000276,68
3,0.354544,0.025741,0.191666,0.019411,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.889805,0.889805,...,0.89039,0.89,0.000276,68,0.889805,0.889805,0.89039,0.89,0.000276,68
4,0.432694,0.04602,0.232668,0.019186,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.889805,0.889805,...,0.89039,0.89,0.000276,68,0.889805,0.889805,0.89039,0.89,0.000276,68


In [159]:
dtGridScores.to_csv('Kyoto_RandomForest_GridSearch')

In [160]:
dt3=dlGrid.best_estimator_

In [161]:
dlGrid.best_params_

{'max_depth': 16, 'min_samples_split': 0.1, 'n_estimators': 100}

In [162]:
dtFinalScore = cross_validate(dt3, X, y, cv=10, scoring=scoring_metrics, verbose=15, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    0.4s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.4s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished


In [163]:
dtFinalScore = pd.DataFrame(dtFinalScore)

In [164]:
dtFinalScore.to_csv('Kyoto_RandomForest_Scores.csv')

# Neural Network

In [56]:
from sklearn.neural_network import MLPClassifier

In [57]:
mlpModel = MLPClassifier()

In [58]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    3.6s finished


In [59]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [60]:
mlpScore = pd.DataFrame(mlpScore)

In [61]:
mlpScore.to_csv("kyoto_MLP_scores.csv")

In [62]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [63]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [64]:
X_sample=X.sample(frac=0.2, random_state=1)
y_sample=y.sample(frac=0.2, random_state=1)

AttributeError: 'numpy.ndarray' object has no attribute 'sample'

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [66]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [67]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [68]:
mlp_grid.fit(X_sample, y_sample)

NameError: name 'X_sample' is not defined

In [72]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('Kyoto_MLP_GridSearch')

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [73]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [74]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=4, n_jobs=-1)

NameError: name 'mlpFinalModel' is not defined

In [75]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)

NameError: name 'mlpFinalScore' is not defined

In [76]:
mlpFinalScore.to_csv('Kyoto_MLP_Scores')

NameError: name 'mlpFinalScore' is not defined

# Decision Trees

In [77]:
from sklearn.tree import tree

In [78]:
dt=tree.DecisionTreeClassifier()

In [79]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [80]:
pd.DataFrame(dtScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.011113,0.001765,0.989011,0.989011,0.989011,0.989011
1,0.009893,0.0,0.982,0.982,0.982,0.982
2,0.011481,0.005938,0.986,0.986,0.986,0.986
3,0.032331,0.016731,0.983984,0.983984,0.983984,0.983984


In [81]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [82]:
dt2 = tree.DecisionTreeClassifier()

In [83]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)


In [84]:
dlGrid.fit(X, y)

Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.7s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)},
             pre_dispa

In [85]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.003759,0.001857,0.003202,0.001276,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.886114,0.887,0.887,...,0.88675,0.00037,226,0.886114,0.887,0.887,0.886887,0.88675,0.00037,226
1,0.002242,0.00147,0.004058,0.002021,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.886114,0.887,0.887,...,0.88675,0.00037,226,0.886114,0.887,0.887,0.886887,0.88675,0.00037,226
2,0.003825,0.001613,0.004378,0.001196,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.886114,0.887,0.887,...,0.88675,0.00037,226,0.886114,0.887,0.887,0.886887,0.88675,0.00037,226
3,0.004005,0.000665,0.005891,2.6e-05,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.886114,0.887,0.887,...,0.88675,0.00037,226,0.886114,0.887,0.887,0.886887,0.88675,0.00037,226
4,0.006748,0.003213,0.009342,0.002352,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.886114,0.887,0.887,...,0.88675,0.00037,226,0.886114,0.887,0.887,0.886887,0.88675,0.00037,226


In [86]:
dtGridScores.to_csv('Kyoto_DesicionTree_GridSearch.csv')

In [87]:
dt3=dlGrid.best_estimator_

In [88]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [89]:
pd.DataFrame(dtFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.010646,0.005522,0.99001,0.99001,0.99001,0.99001
1,0.008833,0.004008,0.986,0.986,0.986,0.986
2,0.00985,0.009167,0.989,0.989,0.989,0.989
3,0.005522,0.004001,0.983984,0.983984,0.983984,0.983984


In [90]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore['test_accuracy'].mean()

0.9872484934984934

In [91]:
dtFinalScore.to_csv('Kyoto_DesicionTree_Scores.csv')

# KNN

In [92]:
from sklearn.neighbors import KNeighborsClassifier

In [93]:
knn = KNeighborsClassifier()

In [94]:
k_range = list(range(1, 101, 5))

In [95]:
param_dict = dict(n_neighbors=k_range)

In [96]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [97]:
grid.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    8.1s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                         51, 56, 61, 66, 71, 76, 81, 86, 91,
                                         96]},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1_score': make_scorer(f1_score, average=micro),
                      'precision': make_scorer(precision_score, average=micro),
                      'recall': make_scorer(recall_sco

In [98]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.009519,0.001367,0.321289,0.06174,1,{'n_neighbors': 1},0.918082,0.914,0.911,0.910911,...,0.9135,0.002925,1,0.918082,0.914,0.911,0.910911,0.9135,0.002925,1
1,0.011384,0.001035,0.412221,0.065765,6,{'n_neighbors': 6},0.884116,0.887,0.88,0.886887,...,0.8845,0.002843,17,0.884116,0.887,0.88,0.886887,0.8845,0.002843,17
2,0.013135,0.003135,0.458135,0.016729,11,{'n_neighbors': 11},0.879121,0.884,0.868,0.875876,...,0.87675,0.005821,20,0.879121,0.884,0.868,0.875876,0.87675,0.005821,20
3,0.023272,0.010234,0.559671,0.051853,16,{'n_neighbors': 16},0.886114,0.884,0.882,0.884885,...,0.88425,0.0015,18,0.886114,0.884,0.882,0.884885,0.88425,0.0015,18
4,0.010604,0.001615,0.608564,0.039773,21,{'n_neighbors': 21},0.884116,0.88,0.881,0.886887,...,0.883,0.002708,19,0.884116,0.88,0.881,0.886887,0.883,0.002708,19
5,0.024926,0.003559,0.641989,0.048109,26,{'n_neighbors': 26},0.885115,0.884,0.886,0.887888,...,0.88575,0.001423,16,0.885115,0.884,0.886,0.887888,0.88575,0.001423,16
6,0.018057,0.007963,0.73186,0.029964,31,{'n_neighbors': 31},0.886114,0.886,0.886,0.887888,...,0.8865,0.000802,15,0.886114,0.886,0.886,0.887888,0.8865,0.000802,15
7,0.021404,0.000837,0.733904,0.018486,36,{'n_neighbors': 36},0.886114,0.887,0.887,0.886887,...,0.88675,0.00037,2,0.886114,0.887,0.887,0.886887,0.88675,0.00037,2
8,0.02122,0.000731,0.755853,0.030832,41,{'n_neighbors': 41},0.886114,0.887,0.887,0.886887,...,0.88675,0.00037,2,0.886114,0.887,0.887,0.886887,0.88675,0.00037,2
9,0.026234,0.005151,0.776347,0.015892,46,{'n_neighbors': 46},0.886114,0.887,0.887,0.886887,...,0.88675,0.00037,2,0.886114,0.887,0.887,0.886887,0.88675,0.00037,2


In [99]:
knn2=grid.best_estimator_
knn2

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [100]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.2s finished


In [101]:
pd.DataFrame(knnFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.004028,0.22854,0.918082,0.918082,0.918082,0.918082
1,0.010982,0.254418,0.914,0.914,0.914,0.914
2,0.014081,0.24176,0.911,0.911,0.911,0.911
3,0.005964,0.236868,0.910911,0.910911,0.910911,0.910911


In [102]:
pd.DataFrame(mlpScore).mean()

fit_time          2.394076
score_time        0.013756
test_accuracy     0.873495
test_precision    0.873495
test_recall       0.873495
test_f1_score     0.873495
dtype: float64

In [103]:
knnFinalScore = pd.DataFrame(knnFinalScore)
knnFinalScore.to_csv('Kyoto_KNN_Scores_ChiSquared')

# Random Forest

In [104]:
from sklearn.ensemble import RandomForestClassifier

In [105]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [106]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=4, scoring=scoring_metrics, verbose=3 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.3s finished


In [107]:
randomForestScore = pd.DataFrame(randomForestScore)

In [108]:
randomForestScore.to_csv('Kyoto_RandomForest_scores_ChiSquared.csv')

In [109]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [110]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [111]:
rf2 = RandomForestClassifier()

In [112]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)

In [113]:
dlGrid.fit(X, y)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1134 out of 1134 | elapsed:  1.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [117]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.168518,0.031651,0.116137,0.008528,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.886807,0.886722,...,0.886722,0.88675,4e-05,73,0.886807,0.886722,0.886722,0.88675,4e-05,73
1,0.230921,0.017609,0.109871,0.015118,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.886807,0.886722,...,0.886722,0.88675,4e-05,73,0.886807,0.886722,0.886722,0.88675,4e-05,73
2,0.252736,0.01591,0.141414,0.041718,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.886807,0.886722,...,0.886722,0.88675,4e-05,73,0.886807,0.886722,0.886722,0.88675,4e-05,73
3,0.412391,0.024848,0.163979,0.023653,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.886807,0.886722,...,0.886722,0.88675,4e-05,73,0.886807,0.886722,0.886722,0.88675,4e-05,73
4,0.441892,0.045422,0.256334,0.050687,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.886807,0.886722,...,0.886722,0.88675,4e-05,73,0.886807,0.886722,0.886722,0.88675,4e-05,73


In [118]:
dt3=dlGrid.best_estimator_

In [119]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.2s finished


In [120]:
pd.DataFrame(dtFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.221735,0.060548,0.97003,0.97003,0.97003,0.97003
1,0.173175,0.087434,0.963,0.963,0.963,0.963
2,0.207163,0.061579,0.969,0.969,0.969,0.969
3,0.184816,0.047534,0.964965,0.964965,0.964965,0.964965


In [121]:
dtFinalScore['test_accuracy'].mean()

0.9667487337487337

# All Results

In [122]:
allResults=pd.concat([
    pd.DataFrame(linearScore).mean(),
    pd.DataFrame(mlpScore).mean(),
    pd.DataFrame(dtFinalScore).mean(),
    pd.DataFrame(knnFinalScore).mean()], axis=1)
allResults

NameError: name 'linearScore' is not defined

In [123]:
allResults.rename(columns={0:'Linear Regression', 1:'Multi-level Perceptron', 2:'Decision Tree', 3:'KNN'},  inplace=True)
allResults

NameError: name 'allResults' is not defined

In [124]:
pred = dt3.predict(X)
pred

array([ 1, -1, -1, ...,  1, -1, -1], dtype=int64)

In [125]:
pred[0:100]

array([ 1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,
       -1,  1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1],
      dtype=int64)

In [126]:
X.head(100)

AttributeError: 'numpy.ndarray' object has no attribute 'head'