<a href="https://colab.research.google.com/github/ffelfis/OrgaDatosTPs/blob/main/TP2/resources/OPT_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalación de optuna

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.8.0-py3-none-any.whl (301 kB)
[?25l[K     |█                               | 10 kB 20.5 MB/s eta 0:00:01[K     |██▏                             | 20 kB 23.5 MB/s eta 0:00:01[K     |███▎                            | 30 kB 16.7 MB/s eta 0:00:01[K     |████▍                           | 40 kB 12.6 MB/s eta 0:00:01[K     |█████▍                          | 51 kB 5.7 MB/s eta 0:00:01[K     |██████▌                         | 61 kB 5.7 MB/s eta 0:00:01[K     |███████▋                        | 71 kB 6.3 MB/s eta 0:00:01[K     |████████▊                       | 81 kB 6.2 MB/s eta 0:00:01[K     |█████████▊                      | 92 kB 6.3 MB/s eta 0:00:01[K     |██████████▉                     | 102 kB 5.4 MB/s eta 0:00:01[K     |████████████                    | 112 kB 5.4 MB/s eta 0:00:01[K     |█████████████                   | 122 kB 5.4 MB/s eta 0:00:01[K     |██████████████                  | 133 kB 5.4 MB/s eta 0:00:01[K 

# Lectura de datos de Google Drive

In [None]:
# Mount Google Drive so the dataset files can be read from /content/drive
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Para importar funciones customizadas

Hay que especificar la ruta de donde se encuentra el módulo (archivo `.py`) para buscar las funciones.

El import puede ser muy estricto con el formato del módulo: las indentaciones deben ser de 4 espacios, no tabulaciones.

https://colab.research.google.com/drive/1uvHuizCBqFgvbCwEhK7FvU8JW0AfxgJw

In [None]:
import sys

# Folder on the mounted Drive that is searched for the project's custom modules.
_resources_path = '/content/drive/My Drive/75.06 - Organización de Datos/TP2/resources'

# Only append once, so re-running this cell does not keep growing sys.path.
if _resources_path not in sys.path:
    sys.path.append(_resources_path)

# Carga de librerías y directorios

In [None]:
import pandas as pd
import numpy as np

# Métrica de evaluación
from sklearn.metrics import f1_score
# fold_score = f1_score(y_test, prediction, average='micro')
# Se especifica average por tener un target multiclase

# Clasificador
from sklearn.ensemble import RandomForestClassifier

# Para la división en k-folds
from sklearn.model_selection import StratifiedKFold

# Para pasar parámetros varias veces a una función
from functools import partial

# Librería de optimización
import optuna

# Librería para guardar los módulos de estudio: study
import joblib

# Función para cambiar tipos de datos
from utilidades import cambio_tipos

# Rutas de los archivos a usar

In [None]:
# Common root of the course folder on the mounted Drive
_drive_root = '/content/drive/My Drive/75.06 - Organización de Datos'

# Folder holding the Binary Encoding files for train_values.csv
dir_resources = _drive_root + '/TP2/resources'
# Path to train_labels.csv
dir_labels = _drive_root + '/TP1/Data/train_labels.csv'
# Path to train_values.csv
dir_values = _drive_root + '/TP1/Data/train_values.csv'

---
# Entrenamiento
---
### Carga de train

In [None]:
# Columns read from train_values.csv, grouped by name prefix.
# The concatenation below yields the same names in the same order as a flat list.
cols_id_geo = [
    'building_id',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]
cols_building = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
]
cols_superstructure = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]
cols_families = ['count_families']
cols_secondary_use = [
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

columnas = (
    cols_id_geo
    + cols_building
    + cols_superstructure
    + cols_families
    + cols_secondary_use
)

# Load only the selected columns of train_values.csv
train = pd.read_csv(dir_values, usecols=columnas)

### Cambio de tipos de datos

In [None]:
# Change the column data types with the project helper imported from `utilidades`
# (its implementation is not shown in this notebook).
# NOTE(review): this rebinds `train`, so re-running the cell applies the helper
# again to its own output — confirm the helper is safe to apply twice.
train = cambio_tipos(train)

### Carga de columnas codificadas: Binary Encoding

In [None]:
# Load the precomputed Binary Encoding columns (per the author's note, 28 more columns).
be_train = pd.read_csv(dir_resources + '/BE_train.csv', dtype='uint8')

# DataFrame.join aligns rows on the index; with a different number of rows it
# would silently drop or NaN-fill rows instead of failing, so check explicitly.
assert len(be_train) == len(train), (
    f'BE_train.csv has {len(be_train)} rows but train has {len(train)}'
)
train = train.join(be_train)

# Feature matrix as a NumPy array
X = train.values

### Carga de labels

In [None]:
# Read only the target column of train_labels.csv, stored as uint8
labels = pd.read_csv(dir_labels, dtype='uint8', usecols=['damage_grade'])

# Target vector as a NumPy array
y = labels['damage_grade'].values

# Optuna

- API Optuna: https://optuna.readthedocs.io/en/stable/reference/index.html

El espacio de parámetros hay que definirlo dentro de la función que opera para devolver el resultado a optimizar.

In [None]:
def optimize(trial, x, y, n_splits=5, random_state=42, n_jobs=-1):
  """Optuna objective: negated mean cross-validated micro-F1 of a random forest.

  The search space is declared inside the objective because Optuna samples
  each hyperparameter through the ``trial`` object when the objective runs.

  Parameters
  ----------
  trial : optuna.trial.Trial
      Trial used to sample ``criterion``, ``n_estimators``, ``max_depth``
      and ``max_features``.
  x : numpy.ndarray
      Features; indexed with the integer index arrays produced by the splitter.
  y : numpy.ndarray
      Target; indexed the same way as ``x``.
  n_splits : int, default 5
      Number of stratified folds.
  random_state : int or None, default 42
      Seed given to every forest, so that trials are compared under the same
      randomness and a given parameter set always yields the same score.
      Pass ``None`` for unseeded forests.
  n_jobs : int or None, default -1
      Forwarded to ``RandomForestClassifier``; ``-1`` uses all available cores.

  Returns
  -------
  float
      ``-1.0 * mean(fold scores)``. The score is negated, so the study must
      be created with ``direction='minimize'``.
  """
  # Search space. Parameter names are unchanged so saved studies stay compatible.
  criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
  # `step` defaults to 1; newer Optuna releases deprecate passing it positionally.
  n_estimators = trial.suggest_int('n_estimators', 10, 50)
  max_depth = trial.suggest_int('max_depth', 40, 60)
  # suggest_float replaces suggest_uniform, which is deprecated since Optuna 3.0.
  max_features = trial.suggest_float('max_features', 0.05, 1.0)

  kf = StratifiedKFold(n_splits=n_splits)
  scores = []
  for train_index, test_index in kf.split(X=x, y=y):
    # A fresh estimator per fold makes it explicit that no state is carried over.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion=criterion,
        max_features=max_features,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(x[train_index], y[train_index])
    prediction = model.predict(x[test_index])
    # average='micro' because the target is multiclass.
    scores.append(f1_score(y[test_index], prediction, average='micro'))

  # Negated so that minimizing the objective maximizes the mean F1.
  return -1.0 * np.mean(scores)

Función a optimizar

In [None]:
# Bind the dataset to the objective so it can be called with only the `trial` argument.
optimization_function = partial(optimize, x=X, y=y)

Instancia de `study`.

In [None]:
# The objective returns the negated mean F1, hence direction='minimize'.
# The sampler is seeded so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[32m[I 2021-07-25 03:47:46,940][0m A new study created in memory with name: no-name-459a21ed-9c02-4056-b4b7-af4499bdbd00[0m


### Optimización

In [None]:
# Run the search: 10 trials of the objective (each trial trains one forest per fold).
study.optimize(optimization_function, n_trials=10)

[32m[I 2021-07-25 04:13:44,830][0m Trial 1 finished with value: -0.714889817458381 and parameters: {'criterion': 'entropy', 'n_estimators': 31, 'max_depth': 57, 'max_features': 0.19835971978850486}. Best is trial 1 with value: -0.714889817458381.[0m
[32m[I 2021-07-25 04:22:40,483][0m Trial 2 finished with value: -0.7279826169146554 and parameters: {'criterion': 'entropy', 'n_estimators': 39, 'max_depth': 53, 'max_features': 0.7616559529753357}. Best is trial 2 with value: -0.7279826169146554.[0m
[32m[I 2021-07-25 04:30:19,866][0m Trial 3 finished with value: -0.724521383274526 and parameters: {'criterion': 'gini', 'n_estimators': 35, 'max_depth': 51, 'max_features': 0.9775923556993334}. Best is trial 2 with value: -0.7279826169146554.[0m
[32m[I 2021-07-25 04:32:06,351][0m Trial 4 finished with value: -0.7116818534732842 and parameters: {'criterion': 'entropy', 'n_estimators': 10, 'max_depth': 43, 'max_features': 0.5521929718993152}. Best is trial 2 with value: -0.72798261691

In [None]:
study.best_params

{'criterion': 'gini',
 'max_depth': 45,
 'max_features': 0.6208355802503962,
 'n_estimators': 41}

In [None]:
study.get_trials()

[FrozenTrial(number=0, values=None, datetime_start=datetime.datetime(2021, 7, 25, 4, 10, 16, 462661), datetime_complete=datetime.datetime(2021, 7, 25, 4, 10, 24, 217569), params={'criterion': 'gini', 'n_estimators': 16, 'max_depth': 48, 'max_features': 0.055478285192109865}, distributions={'criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'n_estimators': IntUniformDistribution(high=50, low=10, step=1), 'max_depth': IntUniformDistribution(high=60, low=40, step=1), 'max_features': UniformDistribution(high=1.0, low=0.05)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.FAIL, value=None),
 FrozenTrial(number=1, values=[-0.714889817458381], datetime_start=datetime.datetime(2021, 7, 25, 4, 11, 6, 398279), datetime_complete=datetime.datetime(2021, 7, 25, 4, 13, 44, 830279), params={'criterion': 'entropy', 'n_estimators': 31, 'max_depth': 57, 'max_features': 0.19835971978850486}, distributions={'criterion': CategoricalDistribution(choices=

# Guardar progreso de búsquedas

In [None]:
# Persist the whole study object so the search can be resumed later.
# NOTE(review): 'study.pkl' is a relative path, so the file is written to the
# current working directory rather than to the mounted Drive folder.
joblib.dump(study, 'study.pkl')

['study.pkl']

# Cargar avances anteriores

In [None]:
# Reload the previously saved study.
# NOTE: joblib.load unpickles the file, so only load files you created yourself.
study_loaded = joblib.load('study.pkl')

Se realizan tantas iteraciones adicionales de búsqueda como indique `n_trials`.

In [None]:
# Continue the search on the reloaded study; n_trials is the number of additional trials.
study_loaded.optimize(optimization_function, n_trials=1)

[32m[I 2021-07-25 05:05:18,940][0m Trial 11 finished with value: -0.7283893662163387 and parameters: {'criterion': 'gini', 'n_estimators': 49, 'max_depth': 46, 'max_features': 0.8217330839743342}. Best is trial 6 with value: -0.7286886748399951.[0m


In [None]:
study_loaded.best_params

{'criterion': 'gini',
 'max_depth': 45,
 'max_features': 0.6208355802503962,
 'n_estimators': 41}