In [1]:
#liberando Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [3]:
path_dataset = '/content/drive/My Drive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/dataset/training_dataset.csv'
data = pd.read_csv(path_dataset)

In [4]:
X = data[data.columns[:-1]]
y = data[data.columns[-1:]]

In [5]:
X_tree = X[['Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'URL_of_Anchor','Links_in_tags', 'web_traffic']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_tree, y, test_size=0.2, random_state=42)

In [7]:
skf = StratifiedKFold(n_splits=5)

In [8]:
n_estimators = [50,100,200,500]

max_features = ['auto', 'sqrt']

max_depth = [10,25,50]
max_depth.append(None)

min_samples_split = [2, 16, 64]

min_samples_leaf = [1, 16, 64]

bootstrap = [True, False]

criterion = ['gini', 'entropy']

grid = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,
          'min_samples_split': min_samples_split,
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap,
          'criterion': criterion}

In [9]:
model = RandomForestClassifier()

In [10]:
modelo_grid = GridSearchCV(model, grid, scoring={'f1_score' : make_scorer(f1_score),'accuracy_score': make_scorer(accuracy_score),'matthews_corrcoef':make_scorer(matthews_corrcoef)}, cv=skf, refit='f1_score', n_jobs=-1, verbose=1)

In [11]:
modelo_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 50.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,...
                         'max_depth': [10, 25, 50, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 16, 64],
                

In [12]:
modelo_grid.best_score_

0.9441866995040383

In [13]:
pd_cv = pd.DataFrame(modelo_grid.cv_results_)
pd_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
0,0.24957,0.005364,0.028789,0.001027,True,gini,10,auto,1,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.942482,0.937438,0.945636,0.95171,0.93728,0.942909,0.005417,247,0.935557,0.928773,0.938383,0.945732,0.929299,0.935549,0.006272,260,0.869463,0.856426,0.87579,0.890263,0.856937,0.869776,0.012642,261
1,0.476649,0.005489,0.04813,0.0006,True,gini,10,auto,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943968,0.937438,0.945473,0.952141,0.936747,0.943153,0.005666,203,0.937253,0.928773,0.938383,0.946297,0.928733,0.935888,0.006607,197,0.872899,0.856426,0.87559,0.891358,0.855768,0.870408,0.01328,205
2,0.945093,0.00666,0.085695,0.000953,True,gini,10,auto,1,2,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943073,0.936973,0.946108,0.95171,0.936683,0.942909,0.005687,246,0.936122,0.928208,0.938949,0.945732,0.928733,0.935549,0.006572,265,0.870636,0.855314,0.876905,0.890263,0.855734,0.86977,0.013249,265
3,2.362114,0.0202,0.203693,0.003896,True,gini,10,auto,1,2,500,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.944922,0.936973,0.945636,0.952141,0.936277,0.94319,0.005925,195,0.938383,0.928208,0.938383,0.946297,0.928167,0.935888,0.006919,204,0.875183,0.855314,0.87579,0.891358,0.854637,0.870456,0.013909,196
4,0.238957,0.004096,0.028569,0.000696,True,gini,10,auto,1,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943548,0.937965,0.945582,0.953101,0.936747,0.943389,0.005877,119,0.936687,0.929339,0.938383,0.947428,0.928733,0.936114,0.006839,119,0.871774,0.857622,0.875719,0.893624,0.855768,0.870901,0.013752,136


In [14]:
pd_cv.to_csv('/content/drive/MyDrive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/classificação/1/8_grupotree/cv_grid.csv',index=False)

In [15]:
pd_cv.sort_values(by='rank_test_f1_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
184,0.237258,0.008156,0.028130,0.000115,True,gini,50,sqrt,1,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.948058,0.936973,0.946054,0.953101,0.936747,0.944187,0.006408,1,0.941775,0.928208,0.938949,0.947428,0.928733,0.937019,0.007496,2,0.882093,0.855314,0.876836,0.893624,0.855768,0.872727,0.015047,2
328,0.244379,0.001946,0.028386,0.000329,True,entropy,10,sqrt,1,16,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.944810,0.937438,0.946054,0.953101,0.939470,0.944175,0.005498,2,0.938383,0.928773,0.938949,0.947428,0.931561,0.937019,0.006507,1,0.875176,0.856426,0.876836,0.893624,0.861681,0.872749,0.013018,1
545,0.493874,0.004929,0.047871,0.000726,True,entropy,,sqrt,1,16,100,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.944866,0.936910,0.946054,0.953101,0.939470,0.944080,0.005631,3,0.938383,0.928208,0.938949,0.947428,0.931561,0.936906,0.006653,3,0.875177,0.855231,0.876836,0.893624,0.861681,0.872510,0.013322,3
542,0.959821,0.005758,0.086083,0.001108,True,entropy,,sqrt,1,2,200,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.948111,0.937438,0.945636,0.952141,0.936747,0.944015,0.006025,4,0.941775,0.928773,0.938383,0.946297,0.928733,0.936792,0.007028,4,0.882115,0.856426,0.875790,0.891358,0.855768,0.872291,0.014121,4
508,0.244318,0.005851,0.028252,0.000479,True,entropy,,auto,1,16,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.945838,0.937438,0.946580,0.953101,0.936747,0.943941,0.006140,5,0.938949,0.928773,0.939514,0.947428,0.928733,0.936679,0.007133,6,0.876523,0.856426,0.878022,0.893624,0.855768,0.872072,0.014355,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,0.222481,0.004276,0.026616,0.000525,True,gini,50,sqrt,64,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.924012,0.922301,0.926441,0.935936,0.925645,0.926867,0.004752,1148,0.915206,0.912945,0.916337,0.927643,0.916855,0.917797,0.005103,1147,0.828191,0.823569,0.831097,0.853703,0.831528,0.833617,0.010436,1148
171,2.168632,0.020824,0.192436,0.010193,True,gini,50,auto,64,2,500,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.923313,0.919026,0.926368,0.935484,0.928608,0.926560,0.005498,1149,0.915206,0.907858,0.916337,0.927643,0.920249,0.917458,0.006481,1149,0.828518,0.813705,0.831026,0.853467,0.838417,0.833026,0.012996,1149
248,0.216951,0.000352,0.028830,0.004552,True,gini,,auto,64,64,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.925205,0.919026,0.925669,0.934673,0.926463,0.926207,0.004992,1150,0.917467,0.907858,0.915206,0.926512,0.916855,0.916780,0.005954,1150,0.833209,0.813705,0.829027,0.851243,0.831743,0.831786,0.011959,1150
106,0.866073,0.007601,0.078861,0.000527,True,gini,25,auto,64,64,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.923935,0.919026,0.924752,0.933805,0.928283,0.925960,0.004911,1151,0.915206,0.907858,0.914076,0.925947,0.919683,0.916554,0.006027,1151,0.828210,0.813705,0.826805,0.849996,0.837265,0.831196,0.012037,1151
