# Carga de librerías y directorios

In [2]:
import pandas as pd
import numpy as np

# Métrica de evaluación
from sklearn.metrics import f1_score
# fold_score = f1_score(y_test, prediction, average='micro')
# Se especifica average por tener un target multiclase

# Clasificador LightGBM
from lightgbm import LGBMClassifier

# Para la división en k-folds
from sklearn.model_selection import KFold

# Para pasar parámetros varias veces a una función
from functools import partial

# Optuna
import optuna

# Librería para guardar los Trials.
import joblib

# Función para cambiar tipos de datos
from resources.utilidades import cambio_tipos

# Rutas de los archivos a usar

In [3]:
# Path to train_values.csv (source of the feature columns)
dir_values = 'data/train_values.csv'
# Path to train_labels.csv (source of the `damage_grade` target)
dir_labels = 'data/train_labels.csv'
# Directory holding the pre-computed encoded columns for train_values.csv
# (the one-hot file OHE_train.csv is read from here)
dir_resources = 'resources'

# Train set

In [4]:
# Columns requested from train_values.csv, assembled group by group.
# The resulting order is: id, geo levels, size/age columns,
# has_superstructure_* flags, family count, has_secondary_use* flags.
_superstructure = ['adobe_mud', 'mud_mortar_stone', 'stone_flag',
                   'cement_mortar_stone', 'mud_mortar_brick',
                   'cement_mortar_brick', 'timber', 'bamboo',
                   'rc_non_engineered', 'rc_engineered', 'other']
# 'use_police' is intentional: the resulting name is 'has_secondary_use_use_police'.
_secondary_use = ['agriculture', 'hotel', 'rental', 'institution', 'school',
                  'industry', 'health_post', 'gov_office', 'use_police', 'other']

columnas = (
    ['building_id']
    + [f'geo_level_{nivel}_id' for nivel in (1, 2, 3)]
    + ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']
    + [f'has_superstructure_{material}' for material in _superstructure]
    + ['count_families', 'has_secondary_use']
    + [f'has_secondary_use_{uso}' for uso in _secondary_use]
)

# Load train_values.csv, reading only the columns listed in `columnas`
train = pd.read_csv(dir_values, usecols=columnas)

### Cambio de tipos de datos

In [5]:
# Change the column data types with the project helper `cambio_tipos`.
# NOTE(review): the target dtypes are defined in resources/utilidades.py, not visible here.
train = cambio_tipos(train)

### Carga de columnas codificadas: One Hot Encoding

In [6]:
# Append the pre-computed one-hot-encoded columns (38 more columns).
# DataFrame.join matches rows by index label.
# NOTE(review): assumes OHE_train.csv keeps the same row order as `train` -- confirm.
ohe_train = pd.read_csv(dir_resources + '/OHE_train.csv', dtype='uint8')
train = train.join(ohe_train)

### Se quitan columnas

In [7]:
# Columns excluded from the feature matrix
to_drop = [
    'building_id',
    'has_secondary_use_school',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'C_a', 'C_f', 'C_m', 'C_n', 'C_o',
]
train = train.drop(to_drop, axis=1)

# Feature matrix as a NumPy array
X = train.values

### Carga de labels

In [10]:
# Read only the `damage_grade` column of train_labels.csv, stored as uint8
labels = pd.read_csv(dir_labels, dtype='uint8', usecols=['damage_grade'])

# Target vector as a NumPy array
y = labels['damage_grade'].values

Parámetros iniciales
```
    num_leaves = trial.suggest_int('num_leaves', 30, 200, 10)
    n_estimators = trial.suggest_int('n_estimators', 400, 1300, 50)
    max_depth = trial.suggest_int('max_depth', 20, 100, 5)
    learning_rate = trial.suggest_uniform('learning_rate', 0.05, 0.5)
```
Para un nuevo `study`, recordar revisar el número de splits por si se quiere cambiar.
```
    kf = KFold(n_splits=  números de splits  )
```

In [11]:
def optimize(trial, x, y, n_splits=8):
    """Optuna objective: negated mean micro-F1 of a LightGBM classifier under k-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    x : array
        Features; rows are selected with the index arrays produced by ``KFold.split``.
    y : array
        Target, indexed with the same row indices as ``x``.
    n_splits : int, default 8
        Number of folds handed to ``KFold``. Exposed as a parameter so a study
        with a different number of folds does not require editing the function body.

    Returns
    -------
    float
        ``-1.0 * mean(fold F1 scores)``. The sign is flipped so that minimising
        this value maximises the F1 score.
    """
    # Search space. `step` is given by keyword and `suggest_float` replaces the
    # deprecated `suggest_uniform`; the sampled ranges are unchanged.
    num_leaves = trial.suggest_int('num_leaves', 30, 200, step=10)
    n_estimators = trial.suggest_int('n_estimators', 400, 1300, step=50)
    max_depth = trial.suggest_int('max_depth', 20, 100, step=5)
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.5)

    # Model built from the sampled hyperparameters; it is re-fitted on every fold
    model = LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate)

    # K-fold split (no shuffling, same as before, so scores stay comparable
    # with previously saved studies)
    kf = KFold(n_splits=n_splits)
    scores = []
    for train_index, test_index in kf.split(X=x, y=y):
        X_train, y_train = x[train_index], y[train_index]
        X_test, y_test = x[test_index], y[test_index]
        # Fit on the training folds and score the held-out fold
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        # average='micro' because the target is multiclass
        scores.append(f1_score(y_test, prediction, average='micro'))

    # Negated so the study can minimise it
    return -1.0 * np.mean(scores)

# ===========================================================================
# Objective function with the data bound through functools.partial
# Re-run this statement every time the parameter space is changed
optimization_function = partial(optimize, x=X, y=y)

 Si `nuevo_studio = True` se crea una nueva instancia de Study.

### Si se quiere continuar con otro hay que cargar el archivo pickle guardado y usar ese Study.

In [12]:
#=====================
# True  -> start a brand-new study
# False -> resume the study stored in `study_path`
nuevo_studio = False
# Pickle of the study to resume -- verify the file name before running
study_path = 'pickle/study_LightGBM_stacked.pkl'
#=====================

if nuevo_studio:
    # The objective returns the negated F1 score, hence 'minimize'
    study = optuna.create_study(direction='minimize')
else:
    # This branch used to only print a reminder, leaving `study` undefined so the
    # optimization cell raised NameError on a fresh kernel. Load the saved study instead.
    # joblib.load unpickles the file: only load pickles written by this project.
    study = joblib.load(study_path)
    print('Study cargado:', study_path, '-', len(study.trials), 'evaluaciones')

[32m[I 2021-07-27 17:09:19,355][0m A new study created in memory with name: no-name-944e5519-8685-4b72-b5b9-00c28a98d79c[0m


### Optimización
Se realizan tantas evaluaciones como indique `n_trials`. Si es un study cargado, las evaluaciones se acumulan con las ya realizadas.

### Solo es necesario ejecutar la siguiente celda para agregar `n_trials` evaluaciones al study.

---
---

In [13]:
# Run 20 trials of the objective on `study`
study.optimize(optimization_function, n_trials=20)

[32m[I 2021-07-27 17:11:19,157][0m Trial 0 finished with value: -0.8569310083500823 and parameters: {'num_leaves': 88, 'n_estimators': 580, 'max_depth': 48, 'learning_rate': 0.1026384813254633}. Best is trial 0 with value: -0.8569310083500823.[0m
[32m[I 2021-07-27 17:13:02,673][0m Trial 1 finished with value: -0.856247966971527 and parameters: {'num_leaves': 78, 'n_estimators': 580, 'max_depth': 42, 'learning_rate': 0.13712964128093}. Best is trial 0 with value: -0.8569310083500823.[0m
[32m[I 2021-07-27 17:15:08,885][0m Trial 2 finished with value: -0.8580284757330446 and parameters: {'num_leaves': 86, 'n_estimators': 640, 'max_depth': 52, 'learning_rate': 0.06552767007213364}. Best is trial 2 with value: -0.8580284757330446.[0m
[32m[I 2021-07-27 17:16:58,228][0m Trial 3 finished with value: -0.858254877524965 and parameters: {'num_leaves': 82, 'n_estimators': 560, 'max_depth': 48, 'learning_rate': 0.07097183280046414}. Best is trial 3 with value: -0.858254877524965.[0m
[3

KeyboardInterrupt: 

In [143]:
# Study summary: number of trials, best value and best parameters
print('Cantidad de evaluaciones:', len(study.trials))
print('Mejor evaluación:', study.best_value, 'en trial', study.best_trial.number)
print('Mejores parámetros:', study.best_params)

# Ten trials with the lowest objective value
columnas_resumen = ['value', 'duration', 'params_learning_rate', 'params_max_depth',
                    'params_n_estimators', 'params_num_leaves']
df = study.trials_dataframe().nsmallest(10, 'value')[columnas_resumen]
# Trial duration in whole seconds. `.dt.seconds` only returns the seconds
# component of the timedelta (whole days are dropped); total_seconds() covers
# the full duration. `.assign` avoids writing into a sliced frame.
df = df.assign(duration=df['duration'].dt.total_seconds().astype(int))
df

Cantidad de evaluaciones: 140
Mejor evaluación: -0.74535784286882 en trial 122
Mejores parámetros: {'num_leaves': 112, 'n_estimators': 880, 'max_depth': 61, 'learning_rate': 0.0691427312603393}


Unnamed: 0,value,duration,params_learning_rate,params_max_depth,params_n_estimators,params_num_leaves
122,-0.745358,113,0.069143,61,880,112
124,-0.745277,113,0.070696,61,870,112
134,-0.745212,131,0.069161,61,870,138
110,-0.745201,119,0.070074,63,860,122
109,-0.745185,121,0.071428,59,870,122
32,-0.745143,108,0.083838,75,850,110
11,-0.745024,113,0.070147,50,1050,80
91,-0.745024,122,0.066258,64,925,115
74,-0.745009,106,0.071106,62,800,115
43,-0.744997,116,0.060453,60,750,140


### Rangos de parámetros de las 10 mejores evaluaciones acumuladas

In [145]:
# Range and spread of each hyperparameter over the 10 best trials
top_10 = study.trials_dataframe().nsmallest(10, 'value')
parametro = ['learning_rate', 'num_leaves', 'n_estimators', 'max_depth']
for nombre in parametro:
    valores = top_10['params_' + nombre]
    # Label fixed: the minimum was printed as 'mim'
    print(nombre,
          '\tmin:', valores.min().round(4),
          '\tmax:', valores.max().round(4),
          '\tmean:', valores.mean().round(4),
          '\tmedian:', valores.median().round(4),
          '\tstd:', valores.std().round(4))

learning_rate 	mim: 0.0605 	max: 0.0838 	mean: 0.0702 	median: 0.0701 	std: 0.0058
num_leaves 	mim: 80 	max: 140 	mean: 116.6 	median: 115.0 	std: 16.648
n_estimators 	mim: 750 	max: 1050 	mean: 872.5 	median: 870.0 	std: 78.3599
max_depth 	mim: 50 	max: 75 	mean: 61.6 	median: 61.0 	std: 6.0773


### Para 5 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_5Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.0605, 0.0838)`
- `'num_leaves'` $\in$ `(110, 140)`
- `'n_estimators'` $\in$ `(800, 900)` $\longleftarrow$ Probar con `(850, 950)`
- `'max_depth'` $\in$ `(55, 65)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 140
Mejor evaluación: -0.74535784286882 en trial 122
Mejores parámetros: {'num_leaves': 112, 'n_estimators': 880, 'max_depth': 61, 'learning_rate': 0.0691427312603393}
```

### Para 6 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_6Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.0606, 0.0773)`
- `'num_leaves'` $\in$ `(167, 172)`
- `'n_estimators'` $\in$ `(600, 640)`
- `'max_depth'` $\in$ `(50, 68)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 80
Mejor evaluación: -0.7458375076268551 en trial 41
Mejores parámetros: {'num_leaves': 168, 'n_estimators': 620, 'max_depth': 60, 'learning_rate': 0.06716342852245775}
```

### Para 7 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_7Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.0771, 0.1000)`
- `'num_leaves'` $\in$ `(105, 113)`
- `'n_estimators'` $\in$ `(830, 940)`
- `'max_depth'` $\in$ `(70, 92)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 100
Mejor evaluación: -0.7460869325437617 en trial 53
Mejores parámetros: {'num_leaves': 106, 'n_estimators': 830, 'max_depth': 85, 'learning_rate': 0.07781085417266771}
```

### Para 8 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_8Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.1242, 0.1393)`
- `'num_leaves'` $\in$ `(79, 93)`
- `'n_estimators'` $\in$ `(600, 625)`
- `'max_depth'` $\in$ `(51, 60)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 90
Mejor evaluación: -0.7458528540897384 en trial 63
Mejores parámetros: {'num_leaves': 82, 'n_estimators': 620, 'max_depth': 52, 'learning_rate': 0.12450509333352734}
```

### Para 9 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_9Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.12, 0.2)`
- `'num_leaves'` $\in$ `(58, 76)`
- `'n_estimators'` $\in$ `(580, 690)`
- `'max_depth'` $\in$ `(40, 50)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 60
Mejor evaluación: -0.7461675108209561 en trial 48
Mejores parámetros: {'num_leaves': 59, 'n_estimators': 755, 'max_depth': 59, 'learning_rate': 0.15673237668876525}
```

### Para 10 KFold, One_Hot_Encoding
```
- `study_LightGBM_clean_10Fold.pkl`
```
Tendencias:
- `'learning_rate'` $\in$ `(0.09, 0.1)`
- `'num_leaves'` $\in$ `(110, 130)`
- `'n_estimators'` $\in$ `(1045, 1200)`
- `'max_depth'` $\in$ `(33, 39)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 100
Mejor evaluación: -0.7462826329030334 en trial 87
Mejores parámetros: {'num_leaves': 123, 'n_estimators': 1065, 'max_depth': 36, 'learning_rate': 0.09406462135767253}
```

### Para 4 KFold, One_Hot_Encoding
`dropped_columns: 'building_id'`
- `study_LightGBM_4Fold.pkl`
```
Cantidad de evaluaciones: 25
Mejores parámetros: {'num_leaves': 72, 'n_estimators': 831, 'max_depth': 72, 'learning_rate': 0.10069045291345259}
```

# Guardar progreso de búsquedas
### Verificar el nombre del archivo

In [147]:
# Save the study to a pickle file. Verify the target file name before running.
joblib.dump(study, 'pickle/study_LightGBM_stacked.pkl')

['pickle/study_LightGBM_clean_5Fold.pkl']

# Cargar avances anteriores
### Verificar el nombre del archivo

In [148]:
# Resume a previously saved study.
# joblib.load unpickles the file, so only load pickles written by this project.
study = joblib.load('pickle/study_LightGBM_stacked.pkl')

mejor_trial = study.best_trial
print('Cantidad de evaluaciones:', len(study.trials))
print('Mejor evaluación:', study.best_value, 'en trial', mejor_trial.number)
print('Mejores parámetros:', study.best_params)
print('Evaluaciones:')

# Ten trials with the lowest objective value, displayed as the cell output
columnas_resumen = ['value', 'duration', 'params_learning_rate', 'params_max_depth',
                    'params_n_estimators', 'params_num_leaves']
mejores = study.trials_dataframe().nsmallest(10, 'value')
mejores[columnas_resumen]

Cantidad de evaluaciones: 140
Mejor evaluación: -0.74535784286882 en trial 122
Mejores parámetros: {'num_leaves': 112, 'n_estimators': 880, 'max_depth': 61, 'learning_rate': 0.0691427312603393}
Evaluaciones:


Unnamed: 0,value,duration,params_learning_rate,params_max_depth,params_n_estimators,params_num_leaves
122,-0.745358,0 days 00:01:53.997304,0.069143,61,880,112
124,-0.745277,0 days 00:01:53.912116,0.070696,61,870,112
134,-0.745212,0 days 00:02:11.776983,0.069161,61,870,138
110,-0.745201,0 days 00:01:59.218402,0.070074,63,860,122
109,-0.745185,0 days 00:02:01.608046,0.071428,59,870,122
32,-0.745143,0 days 00:01:48.595696,0.083838,75,850,110
11,-0.745024,0 days 00:01:53.607727,0.070147,50,1050,80
91,-0.745024,0 days 00:02:02.570888,0.066258,64,925,115
74,-0.745009,0 days 00:01:46.633308,0.071106,62,800,115
43,-0.744997,0 days 00:01:56.912877,0.060453,60,750,140
