In [1]:
#liberando Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [3]:
path_dataset = '/content/drive/My Drive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/dataset/training_dataset.csv'
data = pd.read_csv(path_dataset)

In [4]:
X = data[data.columns[:-1]]
y = data[data.columns[-1:]]

In [5]:
X_selec = X[['Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'web_traffic', 'Page_Rank', 'Google_Index']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_selec, y, test_size=0.2, random_state=42)

In [7]:
skf = StratifiedKFold(n_splits=5)

In [8]:
n_estimators = [50,100,200,500]

max_features = ['auto', 'sqrt']

max_depth = [10,25,50]
max_depth.append(None)

min_samples_split = [2, 16, 64]

min_samples_leaf = [1, 16, 64]

bootstrap = [True, False]

criterion = ['gini', 'entropy']

grid = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,
          'min_samples_split': min_samples_split,
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap,
          'criterion': criterion}

In [9]:
model = RandomForestClassifier()

In [10]:
modelo_grid = GridSearchCV(model, grid, scoring={'f1_score' : make_scorer(f1_score),'accuracy_score': make_scorer(accuracy_score),'matthews_corrcoef':make_scorer(matthews_corrcoef)}, cv=skf, refit='f1_score', n_jobs=-1, verbose=1)

In [11]:
modelo_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 59.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,...
                         'max_depth': [10, 25, 50, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 16, 64],
                

In [12]:
modelo_grid.best_score_

0.9534604203700286

In [13]:
pd_cv = pd.DataFrame(modelo_grid.cv_results_)
pd_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
0,0.287054,0.005346,0.029718,0.000921,True,gini,10,auto,1,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.949341,0.940767,0.950525,0.956434,0.955433,0.9505,0.005579,168,0.943471,0.93273,0.944036,0.951385,0.949661,0.944257,0.006535,189,0.885492,0.864302,0.887162,0.901569,0.898463,0.887397,0.013118,186
1,0.560085,0.002452,0.051962,0.003732,True,gini,10,auto,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.948809,0.944334,0.947368,0.956122,0.954637,0.950254,0.004452,183,0.942906,0.936687,0.940644,0.951385,0.949095,0.944143,0.00541,195,0.884347,0.872492,0.880079,0.901623,0.897007,0.88711,0.010764,196
2,1.094603,0.011903,0.09089,0.001994,True,gini,10,auto,1,2,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.947901,0.94569,0.948847,0.956875,0.953054,0.950473,0.003995,170,0.941775,0.938383,0.94234,0.95195,0.947398,0.944369,0.004761,181,0.882057,0.875752,0.883502,0.902706,0.893522,0.887508,0.009499,181
3,2.735388,0.016787,0.215192,0.004362,True,gini,10,auto,1,2,500,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.948223,0.944749,0.948371,0.956345,0.956083,0.950754,0.004643,160,0.94234,0.937253,0.941775,0.951385,0.950792,0.944709,0.005502,164,0.883212,0.873523,0.882379,0.901561,0.90041,0.888217,0.010971,166
4,0.266648,0.001718,0.031837,0.005499,True,gini,10,auto,1,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943878,0.943227,0.943888,0.955949,0.946889,0.946766,0.004764,255,0.937818,0.935557,0.936687,0.95082,0.940611,0.940299,0.005522,254,0.874172,0.870028,0.872055,0.900428,0.879717,0.87928,0.011057,253


In [14]:
pd_cv.to_csv('/content/drive/MyDrive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/classificação/1/6_grupo_selec/cv_grid.csv',index=False)

In [16]:
pd_cv.sort_values(by='rank_test_f1_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
218,1.165799,0.006521,0.096454,0.001075,True,gini,,auto,1,2,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.949032,0.949597,0.955511,0.960204,0.952959,0.953460,0.004110,1,0.943471,0.943471,0.950254,0.955907,0.947398,0.948100,0.004668,2,0.885580,0.885547,0.899302,0.910774,0.893485,0.894938,0.009463,2
73,0.588256,0.009230,0.052837,0.000850,True,gini,25,auto,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.948496,0.948640,0.955994,0.960123,0.953877,0.953426,0.004446,2,0.942906,0.942340,0.950820,0.955907,0.948529,0.948100,0.005073,1,0.884451,0.883276,0.900441,0.910828,0.895761,0.894951,0.010290,1
362,1.180370,0.016197,0.096681,0.003393,True,entropy,25,auto,1,2,200,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.947689,0.949648,0.955466,0.960123,0.953347,0.953254,0.004383,3,0.941775,0.943471,0.950254,0.955907,0.947964,0.947874,0.005036,4,0.882072,0.885572,0.899287,0.910828,0.894613,0.894475,0.010234,4
434,1.207569,0.026979,0.097896,0.004077,True,entropy,50,auto,1,2,200,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.948338,0.947316,0.956038,0.960614,0.953877,0.953237,0.004934,4,0.942906,0.940644,0.950820,0.956473,0.948529,0.947874,0.005657,3,0.884565,0.879950,0.900459,0.911988,0.895761,0.894545,0.011431,3
181,0.590612,0.014367,0.057443,0.003700,True,gini,50,sqrt,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.947583,0.948640,0.956038,0.960694,0.952864,0.953164,0.004831,5,0.941775,0.942340,0.950820,0.956473,0.947398,0.947761,0.005488,6,0.882108,0.883276,0.900459,0.911930,0.893468,0.894248,0.011132,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,0.489416,0.013267,0.045492,0.000818,True,gini,10,auto,64,16,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.934947,0.923615,0.932806,0.942240,0.932195,0.933161,0.005963,1148,0.927077,0.913510,0.923120,0.934992,0.923643,0.924468,0.006930,1148,0.852257,0.824954,0.845493,0.868494,0.845401,0.847320,0.014000,1149
132,0.357175,0.158286,0.045211,0.036742,True,gini,25,sqrt,64,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.937722,0.925447,0.936170,0.937500,0.928177,0.933003,0.005156,1149,0.930469,0.915206,0.927077,0.929904,0.919118,0.924355,0.006111,1150,0.859130,0.828666,0.853426,0.858058,0.836202,0.847096,0.012356,1150
348,0.240688,0.001496,0.027087,0.000554,True,entropy,10,sqrt,64,2,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.930280,0.931466,0.932612,0.940998,0.929293,0.932930,0.004186,1150,0.922555,0.922555,0.922555,0.933861,0.920814,0.924468,0.004745,1149,0.843191,0.843280,0.844862,0.866082,0.839561,0.847395,0.009504,1148
28,0.245957,0.007266,0.027103,0.000573,True,gini,10,auto,64,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.935566,0.925062,0.929911,0.941709,0.930233,0.932496,0.005681,1151,0.928208,0.914641,0.919729,0.934426,0.921946,0.923790,0.006874,1151,0.854568,0.827618,0.838662,0.867327,0.841854,0.846006,0.013691,1151
