# IESB - CIA035 - Aula 06 - GridSearchCV

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer-wisconsin-data/data.csv


In [2]:
# Importando as bibliotecas
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load the dataset into a DataFrame
data_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(data_path)

# (rows, columns) of the freshly loaded frame
df.shape

(569, 33)

In [4]:
# Inspect the columns: names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [5]:
# Peek at the first five records, transposed so each column of the wide
# frame is shown as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,842302,842517,84300903,84348301,84358402
diagnosis,M,M,M,M,M
radius_mean,17.99,20.57,19.69,11.42,20.29
texture_mean,10.38,17.77,21.25,20.38,14.34
perimeter_mean,122.8,132.9,130.0,77.58,135.1
area_mean,1001.0,1326.0,1203.0,386.1,1297.0
smoothness_mean,0.1184,0.08474,0.1096,0.1425,0.1003
compactness_mean,0.2776,0.07864,0.1599,0.2839,0.1328
concavity_mean,0.3001,0.0869,0.1974,0.2414,0.198
concave points_mean,0.1471,0.07017,0.1279,0.1052,0.1043


In [6]:
# Remove the columns that are not used for modelling: the record identifier
# and 'Unnamed: 32' (presumably an empty trailing column from the CSV export
# -- TODO confirm).
#
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-executed safely: once the columns are gone, a second run is a
# no-op rather than a KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Confirm the remaining columns and their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [7]:
# Check the target variable: number of records per diagnosis label
target_counts = df['diagnosis'].value_counts()
target_counts

B    357
M    212
Name: diagnosis, dtype: int64

In [8]:
# Split the frame into train and test partitions. test_size=0.25 is the
# library default written out explicitly; the fixed seed keeps the split
# reproducible. Note: the split is not stratified on the target.
train, test = train_test_split(df, test_size=0.25, random_state=42)

# Shapes of both partitions
(train.shape, test.shape)

((426, 31), (143, 31))

In [9]:
# Training columns: every column of the frame except the target
features = [col for col in df.columns if col != 'diagnosis']

features

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

In [10]:
# Baseline random forest with hand-picked settings (no hyperparameter search).
# oob_score=True asks the fit to compute an out-of-bag estimate; nothing in the
# visible notebook reads it afterwards.
rf_padrao = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
rf_padrao.fit(train[features], train['diagnosis'])

# Accuracy of the baseline on the held-out test split
rf_padrao_pred = rf_padrao.predict(test[features])
accuracy_score(test['diagnosis'], rf_padrao_pred)

0.965034965034965

In [11]:
# Use GridSearchCV to look for the best hyperparameters for the random forest.

# Candidate values per hyperparameter; the search tries every combination.
rf_param = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [2, 5, 10],
    'min_impurity_decrease': [0.01, 0.02, 0.0005],
}

# Base estimator; the fixed seed keeps every candidate fit reproducible.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy.
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param, scoring='accuracy', cv=5)

# Run the search on the training split; the fitted search object is the
# cell's displayed value.
rf_grid.fit(train[features], train['diagnosis'])

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [2, 5, 10],
                         'min_impurity_decrease': [0.01, 0.02, 0.0005],
                         'n_estimators': [100, 150, 200, 250]},
             scoring='accuracy')

In [12]:
# Inspect the search results: one row per hyperparameter combination
rf_cv_results = pd.DataFrame(rf_grid.cv_results_)
rf_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_impurity_decrease,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.204545,0.002553,0.0107,0.00032,2,0.01,100,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",0.930233,0.917647,0.952941,0.905882,0.941176,0.929576,0.016641,35
1,0.301003,0.001985,0.014517,0.000242,2,0.01,150,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",0.94186,0.929412,0.952941,0.917647,0.941176,0.936607,0.012054,24
2,0.398279,0.000663,0.018335,0.000102,2,0.01,200,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",0.94186,0.929412,0.952941,0.905882,0.941176,0.934254,0.016021,28
3,0.499068,0.002662,0.022507,0.000149,2,0.01,250,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",0.930233,0.929412,0.952941,0.917647,0.941176,0.934282,0.011938,27
4,0.201846,0.000933,0.010794,0.000456,2,0.02,100,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",0.94186,0.905882,0.952941,0.905882,0.929412,0.927196,0.018928,36
5,0.302174,0.002957,0.015093,0.001325,2,0.02,150,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",0.94186,0.941176,0.952941,0.905882,0.941176,0.936607,0.016001,24
6,0.403306,0.005647,0.018445,0.000134,2,0.02,200,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",0.94186,0.929412,0.952941,0.917647,0.941176,0.936607,0.012054,24
7,0.497057,0.001653,0.022499,8.7e-05,2,0.02,250,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",0.94186,0.941176,0.964706,0.917647,0.929412,0.93896,0.015631,21
8,0.201415,0.000499,0.01044,6.2e-05,2,0.0005,100,"{'max_depth': 2, 'min_impurity_decrease': 0.00...",0.930233,0.917647,0.964706,0.905882,0.941176,0.931929,0.020224,32
9,0.303225,0.002511,0.014437,6.9e-05,2,0.0005,150,"{'max_depth': 2, 'min_impurity_decrease': 0.00...",0.94186,0.929412,0.952941,0.905882,0.941176,0.934254,0.016021,28


In [13]:
# Narrow the results down to the parameters, their rank and the mean CV score
rf_summary_columns = ['params', 'rank_test_score', 'mean_test_score']
pd.DataFrame(rf_grid.cv_results_).loc[:, rf_summary_columns]

Unnamed: 0,params,rank_test_score,mean_test_score
0,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",35,0.929576
1,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",24,0.936607
2,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",28,0.934254
3,"{'max_depth': 2, 'min_impurity_decrease': 0.01...",27,0.934282
4,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",36,0.927196
5,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",24,0.936607
6,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",24,0.936607
7,"{'max_depth': 2, 'min_impurity_decrease': 0.02...",21,0.93896
8,"{'max_depth': 2, 'min_impurity_decrease': 0.00...",32,0.931929
9,"{'max_depth': 2, 'min_impurity_decrease': 0.00...",28,0.934254


In [14]:
# Same reduced view, ordered by rank so the best combinations come first
rf_ranked_columns = ['params', 'rank_test_score', 'mean_test_score']
(
    pd.DataFrame(rf_grid.cv_results_)
    .loc[:, rf_ranked_columns]
    .sort_values(by=['rank_test_score'])
)

Unnamed: 0,params,rank_test_score,mean_test_score
35,"{'max_depth': 10, 'min_impurity_decrease': 0.0...",1,0.957756
33,"{'max_depth': 10, 'min_impurity_decrease': 0.0...",1,0.957756
32,"{'max_depth': 10, 'min_impurity_decrease': 0.0...",3,0.957729
34,"{'max_depth': 10, 'min_impurity_decrease': 0.0...",4,0.955404
22,"{'max_depth': 5, 'min_impurity_decrease': 0.00...",5,0.955376
26,"{'max_depth': 10, 'min_impurity_decrease': 0.0...",5,0.955376
23,"{'max_depth': 5, 'min_impurity_decrease': 0.00...",5,0.955376
14,"{'max_depth': 5, 'min_impurity_decrease': 0.01...",8,0.953023
21,"{'max_depth': 5, 'min_impurity_decrease': 0.00...",8,0.953023
20,"{'max_depth': 5, 'min_impurity_decrease': 0.00...",8,0.953023


In [15]:
# Show the parameter combination that produced ".best_score_".
rf_grid.best_params_

{'max_depth': 10, 'min_impurity_decrease': 0.0005, 'n_estimators': 150}

In [16]:
# Show the best mean cross-validated score found by the search
rf_grid.best_score_

0.9577564979480163

In [17]:
# Take the winning model out of the grid search. rf_grid was built with the
# default refit=True, so best_estimator_ has already been refit on the full
# training split with the best parameters. Calling .fit() on it again with the
# same data and the same fixed random_state would only repeat that work and
# yield the same model, so no extra fit is done here.
rf_top = rf_grid.best_estimator_

# Evaluate the tuned model on the held-out test split
accuracy_score(test['diagnosis'], rf_top.predict(test[features]))

0.965034965034965

## GBM

In [18]:
# Baseline gradient boosting model with hand-picked settings (no search)
gbm_padrao = GradientBoostingClassifier(random_state=42, n_estimators=200)
gbm_padrao.fit(train[features], train['diagnosis'])

# Accuracy of the baseline on the held-out test split
gbm_padrao_pred = gbm_padrao.predict(test[features])
accuracy_score(test['diagnosis'], gbm_padrao_pred)

0.958041958041958

In [19]:
# Use GridSearchCV to look for the best hyperparameters for the GBM model.

# Candidate values per hyperparameter; the search tries every combination.
gbm_param = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.2, 0.05, 0.0001],
    'min_impurity_decrease': [0.01, 0.02, 0.0005],
}

# Base estimator; the fixed seed keeps every candidate fit reproducible.
gbm = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy.
gbm_grid = GridSearchCV(estimator=gbm, param_grid=gbm_param, scoring='accuracy', cv=5)

# Run the search on the training split; the fitted search object is the
# cell's displayed value.
gbm_grid.fit(train[features], train['diagnosis'])

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'learning_rate': [0.2, 0.05, 0.0001],
                         'min_impurity_decrease': [0.01, 0.02, 0.0005],
                         'n_estimators': [100, 150, 200, 250]},
             scoring='accuracy')

In [20]:
# Reduced view of the GBM search results, ordered by rank (best first)
gbm_ranked_columns = ['params', 'rank_test_score', 'mean_test_score']
(
    pd.DataFrame(gbm_grid.cv_results_)
    .loc[:, gbm_ranked_columns]
    .sort_values(by=['rank_test_score'])
)

Unnamed: 0,params,rank_test_score,mean_test_score
8,"{'learning_rate': 0.2, 'min_impurity_decrease'...",1,0.95067
9,"{'learning_rate': 0.2, 'min_impurity_decrease'...",1,0.95067
10,"{'learning_rate': 0.2, 'min_impurity_decrease'...",1,0.95067
11,"{'learning_rate': 0.2, 'min_impurity_decrease'...",1,0.95067
17,"{'learning_rate': 0.05, 'min_impurity_decrease...",5,0.948317
19,"{'learning_rate': 0.05, 'min_impurity_decrease...",5,0.948317
18,"{'learning_rate': 0.05, 'min_impurity_decrease...",5,0.948317
16,"{'learning_rate': 0.05, 'min_impurity_decrease...",5,0.948317
23,"{'learning_rate': 0.05, 'min_impurity_decrease...",9,0.945964
22,"{'learning_rate': 0.05, 'min_impurity_decrease...",9,0.945964


In [21]:
# Show the parameter combination that produced ".best_score_".
gbm_grid.best_params_

{'learning_rate': 0.2, 'min_impurity_decrease': 0.0005, 'n_estimators': 100}

In [22]:
# Show the best mean cross-validated score found by the search
gbm_grid.best_score_

0.9506703146374831

In [23]:
# Take the winning GBM out of the grid search. gbm_grid was built with the
# default refit=True, so best_estimator_ has already been refit on the full
# training split with the best parameters. Calling .fit() on it again with the
# same data and the same fixed random_state would only repeat that work and
# yield the same model, so no extra fit is done here.
gbm_top = gbm_grid.best_estimator_

# Evaluate the tuned model on the held-out test split
accuracy_score(test['diagnosis'], gbm_top.predict(test[features]))

0.958041958041958

## AdaBoost

In [24]:
# Baseline AdaBoost model with library defaults apart from the fixed seed
ada_padrao = AdaBoostClassifier(random_state=42)
ada_padrao.fit(train[features], train['diagnosis'])

# Accuracy of the baseline on the held-out test split
ada_padrao_pred = ada_padrao.predict(test[features])
accuracy_score(test['diagnosis'], ada_padrao_pred)

0.951048951048951

In [25]:
# Use GridSearchCV to look for the best hyperparameters for the AdaBoost model.

# Candidate values per hyperparameter; the search tries every combination.
ada_param = {
    'n_estimators': [100, 150, 200, 250],
    'learning_rate': [0.2, 0.05, 0.0001],
}

# Base estimator; the fixed seed keeps every candidate fit reproducible.
ada = AdaBoostClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy.
ada_grid = GridSearchCV(estimator=ada, param_grid=ada_param, scoring='accuracy', cv=5)

# Run the search on the training split; the fitted search object is the
# cell's displayed value.
ada_grid.fit(train[features], train['diagnosis'])

GridSearchCV(cv=5, estimator=AdaBoostClassifier(random_state=42),
             param_grid={'learning_rate': [0.2, 0.05, 0.0001],
                         'n_estimators': [100, 150, 200, 250]},
             scoring='accuracy')

In [26]:
# Reduced view of the AdaBoost search results, ordered by rank (best first)
ada_ranked_columns = ['params', 'rank_test_score', 'mean_test_score']
(
    pd.DataFrame(ada_grid.cv_results_)
    .loc[:, ada_ranked_columns]
    .sort_values(by=['rank_test_score'])
)

Unnamed: 0,params,rank_test_score,mean_test_score
2,"{'learning_rate': 0.2, 'n_estimators': 200}",1,0.971792
5,"{'learning_rate': 0.05, 'n_estimators': 150}",2,0.969439
0,"{'learning_rate': 0.2, 'n_estimators': 100}",3,0.967086
1,"{'learning_rate': 0.2, 'n_estimators': 150}",3,0.967086
3,"{'learning_rate': 0.2, 'n_estimators': 250}",3,0.967086
6,"{'learning_rate': 0.05, 'n_estimators': 200}",3,0.967086
7,"{'learning_rate': 0.05, 'n_estimators': 250}",3,0.967086
4,"{'learning_rate': 0.05, 'n_estimators': 100}",8,0.957784
8,"{'learning_rate': 0.0001, 'n_estimators': 100}",9,0.891956
9,"{'learning_rate': 0.0001, 'n_estimators': 150}",9,0.891956


In [27]:
# Show the parameter combination that produced ".best_score_".
ada_grid.best_params_

{'learning_rate': 0.2, 'n_estimators': 200}

In [28]:
# Show the best mean cross-validated score found by the search
ada_grid.best_score_

0.9717920656634748

In [29]:
# Take the winning AdaBoost model out of the grid search. ada_grid was built
# with the default refit=True, so best_estimator_ has already been refit on the
# full training split with the best parameters. Calling .fit() on it again with
# the same data and the same fixed random_state would only repeat that work and
# yield the same model, so no extra fit is done here.
ada_top = ada_grid.best_estimator_

# Evaluate the tuned model on the held-out test split
accuracy_score(test['diagnosis'], ada_top.predict(test[features]))

0.958041958041958