In [2]:
#liberando Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [4]:
path_dataset = '/content/drive/My Drive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/dataset/training_dataset.csv'
data = pd.read_csv(path_dataset)

In [5]:
X = data[data.columns[:-1]]
y = data[data.columns[-1:]]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
skf = StratifiedKFold(n_splits=5)

In [8]:
n_estimators = [50,100,200,500]

max_features = ['auto', 'sqrt']

max_depth = [10,25,50]
max_depth.append(None)

min_samples_split = [2, 16, 64]

min_samples_leaf = [1, 16, 64]

bootstrap = [True, False]

criterion = ['gini', 'entropy']

grid = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,
          'min_samples_split': min_samples_split,
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap,
          'criterion': criterion}

In [11]:
model = RandomForestClassifier()

In [14]:
modelo_grid = GridSearchCV(model, grid, scoring={'f1_score' : make_scorer(f1_score),'accuracy_score': make_scorer(accuracy_score),'matthews_corrcoef':make_scorer(matthews_corrcoef)}, cv=skf, refit='f1_score', n_jobs=-1, verbose=1)

In [15]:
modelo_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 30.1min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 39.2min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 50.5min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 63.3min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 73.7min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,...
                         'max_depth': [10, 25, 50, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 16, 64],
                

In [16]:
modelo_grid.best_score_

0.9741457096667194

In [17]:
pd_cv = pd.DataFrame(modelo_grid.cv_results_)
pd_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
0,0.331768,0.003299,0.032017,0.000582,True,gini,10,auto,1,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.961013,0.957393,0.955831,0.96416,0.956914,0.959062,0.003086,216,0.956473,0.95195,0.949689,0.959864,0.951357,0.953867,0.003747,216,0.911854,0.90299,0.899301,0.918837,0.901884,0.906973,0.007283,216
1,0.67097,0.010298,0.059922,0.006466,True,gini,10,auto,1,2,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.960446,0.953908,0.958209,0.964232,0.959839,0.959327,0.003352,212,0.955907,0.947993,0.952516,0.959864,0.954751,0.954206,0.003916,212,0.9107,0.894968,0.904814,0.918908,0.908671,0.907612,0.007823,212
2,1.318004,0.013113,0.099121,0.001149,True,gini,10,auto,1,2,200,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.95846,0.954248,0.961634,0.965169,0.95992,0.959886,0.0036,205,0.953646,0.948559,0.956473,0.960995,0.954751,0.954885,0.004036,204,0.90612,0.895956,0.91274,0.921132,0.90879,0.908947,0.008242,205
3,3.284892,0.03013,0.235724,0.004603,True,gini,10,auto,1,2,500,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.958862,0.956827,0.959722,0.965204,0.961924,0.960508,0.002861,199,0.954211,0.951385,0.954211,0.960995,0.957014,0.955563,0.003247,199,0.907268,0.901764,0.908322,0.921166,0.913393,0.910382,0.006535,199
4,0.328067,0.002271,0.032638,0.000499,True,gini,10,auto,1,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.950836,0.945838,0.953465,0.960202,0.954386,0.952945,0.00469,306,0.945167,0.938949,0.946863,0.955342,0.948529,0.94697,0.005294,306,0.88893,0.876523,0.893808,0.909725,0.896101,0.893018,0.010752,306


In [18]:
pd_cv.to_csv('/content/drive/MyDrive/UFPR/aplicacao_ciencia_dados/trabalho_disciplina/classificação/1/1_All/cv_grid.csv',index=False)

In [19]:
pd_cv.sort_values(by='rank_test_f1_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
1117,0.915742,0.006322,0.062984,0.000473,False,entropy,,sqrt,1,2,100,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973631,0.973993,0.976650,0.974910,0.971545,0.974146,0.001668,1,0.970605,0.971170,0.973997,0.972301,0.968326,0.971280,0.001877,1,0.940489,0.941654,0.947377,0.944027,0.935871,0.941883,0.003819,1
396,0.409202,0.007621,0.035893,0.001148,True,entropy,25,sqrt,1,2,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.971285,0.977029,0.973219,0.975013,0.972755,0.973860,0.001982,2,0.967778,0.974562,0.970040,0.972301,0.969457,0.970828,0.002363,3,0.934918,0.948536,0.939453,0.943942,0.938315,0.941033,0.004733,3
830,1.767105,0.014550,0.117052,0.001855,False,gini,,sqrt,1,2,200,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.973165,0.974464,0.975114,0.974936,0.971574,0.973850,0.001327,3,0.970040,0.971735,0.972301,0.972301,0.968326,0.970940,0.001548,2,0.939363,0.942828,0.943936,0.943998,0.935877,0.941200,0.003152,2
1083,4.638153,0.015173,0.285764,0.013783,False,entropy,,auto,1,2,500,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973631,0.973993,0.976154,0.973886,0.971081,0.973749,0.001612,4,0.970605,0.971170,0.973431,0.971170,0.967760,0.970827,0.001815,5,0.940489,0.941654,0.946238,0.941741,0.934736,0.940971,0.003685,5
1118,1.845346,0.020631,0.119669,0.006049,False,entropy,,sqrt,1,2,200,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973110,0.973940,0.976178,0.973388,0.972067,0.973737,0.001364,5,0.970040,0.971170,0.973431,0.970605,0.968891,0.970827,0.001504,4,0.939339,0.941698,0.946254,0.940582,0.937020,0.940979,0.003062,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,0.294052,0.004529,0.029716,0.000557,True,gini,,sqrt,64,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943302,0.930723,0.929955,0.939470,0.939288,0.936547,0.005273,1148,0.936122,0.921990,0.920294,0.931600,0.931561,0.928313,0.006109,1148,0.870755,0.842007,0.839209,0.861782,0.861538,0.855058,0.012289,1148
528,0.286274,0.006098,0.029570,0.000454,True,entropy,,auto,64,2,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.934893,0.932271,0.936170,0.945165,0.933468,0.936393,0.004578,1149,0.927643,0.923120,0.927077,0.937818,0.925339,0.928200,0.005061,1149,0.853478,0.844674,0.853426,0.874676,0.848767,0.855004,0.010367,1149
104,0.297793,0.008569,0.029430,0.000259,True,gini,25,auto,64,64,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.935761,0.932866,0.932612,0.943188,0.936937,0.936273,0.003835,1150,0.928208,0.924251,0.922555,0.936122,0.928733,0.927974,0.004696,1150,0.854540,0.846670,0.844862,0.870755,0.855898,0.854545,0.009169,1151
172,0.284300,0.006865,0.029425,0.000613,True,gini,50,auto,64,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.932790,0.933995,0.931373,0.944724,0.938469,0.936270,0.004849,1151,0.925382,0.924816,0.920859,0.937818,0.930430,0.927861,0.005834,1151,0.848930,0.848390,0.841857,0.874220,0.859378,0.854555,0.011321,1150


In [20]:
pd_cv.sort_values(by='rank_test_f1_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,split3_test_f1_score,split4_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_test_accuracy_score,split1_test_accuracy_score,split2_test_accuracy_score,split3_test_accuracy_score,split4_test_accuracy_score,mean_test_accuracy_score,std_test_accuracy_score,rank_test_accuracy_score,split0_test_matthews_corrcoef,split1_test_matthews_corrcoef,split2_test_matthews_corrcoef,split3_test_matthews_corrcoef,split4_test_matthews_corrcoef,mean_test_matthews_corrcoef,std_test_matthews_corrcoef,rank_test_matthews_corrcoef
1117,0.915742,0.006322,0.062984,0.000473,False,entropy,,sqrt,1,2,100,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973631,0.973993,0.976650,0.974910,0.971545,0.974146,0.001668,1,0.970605,0.971170,0.973997,0.972301,0.968326,0.971280,0.001877,1,0.940489,0.941654,0.947377,0.944027,0.935871,0.941883,0.003819,1
396,0.409202,0.007621,0.035893,0.001148,True,entropy,25,sqrt,1,2,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.971285,0.977029,0.973219,0.975013,0.972755,0.973860,0.001982,2,0.967778,0.974562,0.970040,0.972301,0.969457,0.970828,0.002363,3,0.934918,0.948536,0.939453,0.943942,0.938315,0.941033,0.004733,3
830,1.767105,0.014550,0.117052,0.001855,False,gini,,sqrt,1,2,200,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.973165,0.974464,0.975114,0.974936,0.971574,0.973850,0.001327,3,0.970040,0.971735,0.972301,0.972301,0.968326,0.970940,0.001548,2,0.939363,0.942828,0.943936,0.943998,0.935877,0.941200,0.003152,2
1083,4.638153,0.015173,0.285764,0.013783,False,entropy,,auto,1,2,500,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973631,0.973993,0.976154,0.973886,0.971081,0.973749,0.001612,4,0.970605,0.971170,0.973431,0.971170,0.967760,0.970827,0.001815,5,0.940489,0.941654,0.946238,0.941741,0.934736,0.940971,0.003685,5
1118,1.845346,0.020631,0.119669,0.006049,False,entropy,,sqrt,1,2,200,"{'bootstrap': False, 'criterion': 'entropy', '...",0.973110,0.973940,0.976178,0.973388,0.972067,0.973737,0.001364,5,0.970040,0.971170,0.973431,0.970605,0.968891,0.970827,0.001504,4,0.939339,0.941698,0.946254,0.940582,0.937020,0.940979,0.003062,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,0.294052,0.004529,0.029716,0.000557,True,gini,,sqrt,64,2,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.943302,0.930723,0.929955,0.939470,0.939288,0.936547,0.005273,1148,0.936122,0.921990,0.920294,0.931600,0.931561,0.928313,0.006109,1148,0.870755,0.842007,0.839209,0.861782,0.861538,0.855058,0.012289,1148
528,0.286274,0.006098,0.029570,0.000454,True,entropy,,auto,64,2,50,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.934893,0.932271,0.936170,0.945165,0.933468,0.936393,0.004578,1149,0.927643,0.923120,0.927077,0.937818,0.925339,0.928200,0.005061,1149,0.853478,0.844674,0.853426,0.874676,0.848767,0.855004,0.010367,1149
104,0.297793,0.008569,0.029430,0.000259,True,gini,25,auto,64,64,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.935761,0.932866,0.932612,0.943188,0.936937,0.936273,0.003835,1150,0.928208,0.924251,0.922555,0.936122,0.928733,0.927974,0.004696,1150,0.854540,0.846670,0.844862,0.870755,0.855898,0.854545,0.009169,1151
172,0.284300,0.006865,0.029425,0.000613,True,gini,50,auto,64,16,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.932790,0.933995,0.931373,0.944724,0.938469,0.936270,0.004849,1151,0.925382,0.924816,0.920859,0.937818,0.930430,0.927861,0.005834,1151,0.848930,0.848390,0.841857,0.874220,0.859378,0.854555,0.011321,1150
