<a href="https://colab.research.google.com/github/ffelfis/OrgaDatosTPs/blob/main/TP2/resources/OPT_Grid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lectura de datos de Google Drive

In [24]:
# Lectura de Dataset desde Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Para importar funciones customizadas

Hay que especificar la ruta de donde se encuentra el módulo (archivo `.py`) para buscar las funciones.

La lectura puede ser muy celosa: las indentaciones son de 4 espacios no tabulaciones.

https://colab.research.google.com/drive/1uvHuizCBqFgvbCwEhK7FvU8JW0AfxgJw

In [25]:
import sys
sys.path.append('/content/drive/My Drive/75.06 - Organización de Datos/TP2/resources')

# Carga de librerías y directorios

In [26]:
import pandas as pd
import numpy as np

# Métrica
from sklearn.metrics import f1_score

# Clasificador
from sklearn.ensemble import RandomForestClassifier

# Paquete de Grid Search Cross Validation
from sklearn.model_selection import GridSearchCV

# Para el cambio de tipo de datos: para reducir el uso de memoria
from utilidades import cambio_tipos

# Rutas de los archivos a usar

In [27]:
# Ruta train_values.csv
dir_values = '/content/drive/My Drive/75.06 - Organización de Datos/TP1/Data/train_values.csv'
# Ruta train_labels.csv
dir_labels = '/content/drive/My Drive/75.06 - Organización de Datos/TP1/Data/train_labels.csv'
# Ruta test_values.csv
dir_test = '/content/drive/My Drive/75.06 - Organización de Datos/TP1/Data/test_values.csv'
# Ruta submission_format.csv
dir_sub_format = '/content/drive/My Drive/75.06 - Organización de Datos/TP1/Data/submission_format.csv'
# Ruta de Binary Encodings para train_values.csv
dir_Binary = '/content/drive/My Drive/75.06 - Organización de Datos/TP2/resources'

---
#Entrenamiento
---
### Carga de train

In [28]:
columnas = ['building_id',
 'geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_stone_flag',
 'has_superstructure_cement_mortar_stone',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_bamboo',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'has_superstructure_other',
 'count_families',
 'has_secondary_use',
 'has_secondary_use_agriculture',
 'has_secondary_use_hotel',
 'has_secondary_use_rental',
 'has_secondary_use_institution',
 'has_secondary_use_school',
 'has_secondary_use_industry',
 'has_secondary_use_health_post',
 'has_secondary_use_gov_office',
 'has_secondary_use_use_police',
 'has_secondary_use_other']
 
# Carga de train_values.csv
train = pd.read_csv(dir_values, usecols=columnas)

### Cambio de tipos de datos

In [29]:
train = cambio_tipos(train)

### Carga de columnas codificadas: Binary Encoding

In [30]:
# 28 columnas más.
train = train.join(pd.read_csv(dir_Binary+f'/BE_train.csv', dtype='uint8'))

X = train.values

### Carga de labels
GridSearchCV necesita los valores en una dimensión.

In [31]:
# Carga de train_labels.csv
labels = pd.read_csv(dir_labels, usecols=['damage_grade'], dtype='uint8')

y = labels.damage_grade.values

# Grid Search

### Instancia del clasificador

In [32]:
rfc = RandomForestClassifier()

### Grilla de parámetros para realizar la búsqueda

In [33]:
param_grid = {
    'n_estimators' : [10, 20, 50],
    'max_depth': [20, 30, 40]
}

### Instancia del modelo de Grid Search

In [34]:
model = GridSearchCV(estimator = rfc, param_grid = param_grid, scoring = 'f1_micro', n_jobs = 1, cv = 4, verbose = 10)

### Entrenamiento del modelo usando Grid Search Cross Validation

In [35]:
model.fit(X, y)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] max_depth=20, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... max_depth=20, n_estimators=10, score=0.679, total=   5.2s
[CV] max_depth=20, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=10, score=0.678, total=   5.5s
[CV] max_depth=20, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.7s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=10, score=0.684, total=   5.3s
[CV] max_depth=20, n_estimators=10 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.0s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=10, score=0.687, total=   5.1s
[CV] max_depth=20, n_estimators=20 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   21.1s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=20, score=0.692, total=  10.3s
[CV] max_depth=20, n_estimators=20 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   31.4s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=20, score=0.690, total=   9.9s
[CV] max_depth=20, n_estimators=20 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   41.4s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=20, score=0.694, total=   9.4s
[CV] max_depth=20, n_estimators=20 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   50.8s remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=20, score=0.696, total=   9.7s
[CV] max_depth=20, n_estimators=50 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.0min remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=50, score=0.697, total=  22.1s
[CV] max_depth=20, n_estimators=50 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.4min remaining:    0.0s


[CV] ....... max_depth=20, n_estimators=50, score=0.696, total=  19.2s
[CV] max_depth=20, n_estimators=50 ...................................
[CV] ....... max_depth=20, n_estimators=50, score=0.693, total=  20.4s
[CV] max_depth=20, n_estimators=50 ...................................
[CV] ....... max_depth=20, n_estimators=50, score=0.696, total=  20.9s
[CV] max_depth=30, n_estimators=10 ...................................
[CV] ....... max_depth=30, n_estimators=10, score=0.692, total=   6.0s
[CV] max_depth=30, n_estimators=10 ...................................
[CV] ....... max_depth=30, n_estimators=10, score=0.693, total=   5.7s
[CV] max_depth=30, n_estimators=10 ...................................
[CV] ....... max_depth=30, n_estimators=10, score=0.691, total=   5.6s
[CV] max_depth=30, n_estimators=10 ...................................
[CV] ....... max_depth=30, n_estimators=10, score=0.695, total=   5.5s
[CV] max_depth=30, n_estimators=20 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  8.3min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### Mejor score

In [36]:
print('Mejor score:', model.best_score_)
print('\nMejores parámetros de grilla:', model.best_params_)
print('\nParámetros del mejor modelo:')
display(model.best_estimator_.get_params())

Mejor score: 0.7131323344212246

Mejores parámetros de grilla: {'max_depth': 30, 'n_estimators': 50}

Parámetros del mejor modelo:


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}