# Carga de librerías y directorios

In [1]:
import pandas as pd
import numpy as np

# Métrica de evaluación
from sklearn.metrics import f1_score
# fold_score = f1_score(y_test, prediction, average='micro')
# Se especifica average por tener un target multiclase

# Clasificador LightGBM
from lightgbm import LGBMClassifier

# Para la división en k-folds
from sklearn.model_selection import KFold

# Para pasar parámetros varias veces a una función
from functools import partial

# Optuna
import optuna

# Librería para guardar los Trials.
import joblib

# Función para cambiar tipos de datos
from resources.utilidades import cambio_tipos

# Rutas de los archivos a usar

In [2]:
# Input locations, relative to the notebook's working directory:
# the training features and the training labels ...
dir_values, dir_labels = 'data/train_values.csv', 'data/train_labels.csv'
# ... and the folder holding the pre-computed encoded columns for the train set.
dir_resources = 'resources'

# Train set

In [3]:
# Columns to read from train_values.csv. The list is assembled from its
# natural groups; the resulting order is identical to the flat listing.
columnas = (
    # Identifier, location ids and basic numeric descriptors.
    ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
     'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']
    # Superstructure flags.
    + ['has_superstructure_' + material for material in (
        'adobe_mud', 'mud_mortar_stone', 'stone_flag', 'cement_mortar_stone',
        'mud_mortar_brick', 'cement_mortar_brick', 'timber', 'bamboo',
        'rc_non_engineered', 'rc_engineered', 'other')]
    + ['count_families', 'has_secondary_use']
    # Secondary-use flags ('use_police' really yields the doubled
    # 'has_secondary_use_use_police' name used by the original list).
    + ['has_secondary_use_' + uso for uso in (
        'agriculture', 'hotel', 'rental', 'institution', 'school',
        'industry', 'health_post', 'gov_office', 'use_police', 'other')]
)

# Load train_values.csv, keeping only the columns listed in `columnas`
# (every other column of the file is skipped at read time).
train = pd.read_csv(dir_values, usecols=columnas)

### Cambio de tipos de datos

In [4]:
# Re-type the columns with the project helper `cambio_tipos` imported from
# resources.utilidades. NOTE(review): the exact conversions are defined in that
# module and are not visible here — presumably dtype downcasting; confirm there.
train = cambio_tipos(train)

### Carga de columnas codificadas: One Hot Encoding

In [5]:
# Append the pre-computed one-hot-encoded columns (38 of them, per the original
# note), read as uint8. DataFrame.join aligns on the index, so OHE_train.csv is
# assumed to hold one row per row of `train`, in the same order — TODO confirm.
train = train.join(pd.read_csv(f'{dir_resources}/OHE_train.csv', dtype='uint8'))

### Se quitan columnas

In [6]:
# Columns left out of the feature matrix: the row identifier, four
# secondary-use flags and five `C_*` columns (presumably brought in by the
# one-hot file — verify against OHE_train.csv).
to_drop = [
    'building_id',
    'has_secondary_use_school', 'has_secondary_use_health_post',
    'has_secondary_use_gov_office', 'has_secondary_use_use_police',
    'C_a', 'C_f', 'C_m', 'C_n', 'C_o',
]
train = train.drop(columns=to_drop)

# The hyper-parameter search and the classifier are fed the raw NumPy array
# of feature values rather than the DataFrame.
X = train.values

### Carga de labels

In [7]:
# Read only the target column of train_labels.csv, stored as uint8.
labels = pd.read_csv(dir_labels, dtype='uint8', usecols=['damage_grade'])

# 1-D NumPy array of the target, as expected by the search and the classifier.
# Assumes the rows follow the same order as the feature rows in X — TODO confirm.
y = labels['damage_grade'].values

In [22]:
def optimize(trial, x, y):
    """Optuna objective: negated mean micro-F1 of an LGBMClassifier over 7 folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``num_leaves``, ``n_estimators``, ``max_depth``
        and ``learning_rate`` from the ranges hard-coded below.
    x : numpy.ndarray
        Feature matrix; must support indexing with an integer index array.
    y : numpy.ndarray
        Target labels, indexable in the same way and aligned with ``x``.

    Returns
    -------
    float
        ``-1.0 * mean(fold F1 scores)``. The score is negated so that a study
        created with ``direction='minimize'`` ends up maximising F1.
    """
    # Search space. The sampling order is kept fixed so the sampler sees the
    # parameters in the same sequence on every trial.
    # `step` is passed by keyword (positional use is deprecated in Optuna 3) and
    # `suggest_float` replaces the deprecated `suggest_uniform`.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 105, 120, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 560, 940, step=10),
        'max_depth': trial.suggest_int('max_depth', 45, 95, step=5),
        'learning_rate': trial.suggest_float('learning_rate', 0.0727, 0.1343),
    }

    # NOTE(review): KFold is used with its default of no shuffling, so the folds
    # are consecutive row blocks. Left unchanged so that new scores stay
    # comparable with trials already stored in a saved study.
    kf = KFold(n_splits=7)

    scores = []
    for train_index, test_index in kf.split(X=x, y=y):
        # A fresh estimator per fold makes it explicit that nothing is carried
        # over from the previous fold.
        model = LGBMClassifier(**params)
        model.fit(x[train_index], y[train_index])
        prediction = model.predict(x[test_index])
        # average='micro' because the target is multiclass.
        scores.append(f1_score(y[test_index], prediction, average='micro'))

    # Value to minimise: the negated average F1 across folds.
    return -1.0 * np.mean(scores)

# ===========================================================================
# Bind the data to the objective so Optuna can call it with just `trial`.
# `partial` stores the `optimize` function object it is given, so this line
# has to be re-run every time `optimize` is redefined (e.g. after changing
# the parameter space); otherwise the previous definition keeps being used.
optimization_function = partial(optimize, x=X, y=y)

 Si `nuevo_studio = True` se crea una nueva instancia de Study.

### Si se quiere continuar con otro hay que cargar el archivo pickle guardado y usar ese Study.

In [168]:
# Switch: True starts a brand-new Study, False resumes the one saved on disk.
nuevo_studio = False

if nuevo_studio:
    # The objective returns the negated F1 score, hence direction='minimize'.
    study = optuna.create_study(direction='minimize')
else:
    # Resume the saved search. Without this, `study` is undefined when the
    # optimisation cell runs on a fresh kernel, because the only other place
    # that loads it is a cell further down the notebook.
    # NOTE(review): joblib.load unpickles the file; only load pickles that
    # were produced by this notebook.
    study = joblib.load('pickle/study_LightGBM_clean_7Fold.pkl')
    print('Study cargado:', len(study.trials), 'evaluaciones previas')

[32m[I 2021-07-26 00:31:54,386][0m A new study created in memory with name: no-name-9479dd4d-fbc9-4c70-a423-c1a15079b13e[0m


### Optimización
Se realizan tantas evaluaciones (trials) como indique `n_trials`. Si es un study cargado, las evaluaciones nuevas se acumulan con las ya realizadas.

### Solo es necesario ejecutar la siguiente celda para agregar `n_trials` evaluaciones al study.

---
---

In [23]:
# Run 10 more trials of the objective. They are added to the trials the
# study already holds, so re-running this cell keeps extending the search.
study.optimize(optimization_function, n_trials=10)

[32m[I 2021-07-26 12:07:08,798][0m Trial 60 finished with value: -0.7457684394004939 and parameters: {'num_leaves': 113, 'n_estimators': 890, 'max_depth': 90, 'learning_rate': 0.08711406593170527}. Best is trial 53 with value: -0.7460869325437617.[0m
[32m[I 2021-07-26 12:09:51,809][0m Trial 61 finished with value: -0.7450163314630233 and parameters: {'num_leaves': 114, 'n_estimators': 890, 'max_depth': 90, 'learning_rate': 0.08822335635397603}. Best is trial 53 with value: -0.7460869325437617.[0m
[32m[I 2021-07-26 12:12:27,186][0m Trial 62 finished with value: -0.7448014468676198 and parameters: {'num_leaves': 112, 'n_estimators': 870, 'max_depth': 85, 'learning_rate': 0.09475943225671156}. Best is trial 53 with value: -0.7460869325437617.[0m
[32m[I 2021-07-26 12:15:10,969][0m Trial 63 finished with value: -0.7446748165458141 and parameters: {'num_leaves': 118, 'n_estimators': 840, 'max_depth': 95, 'learning_rate': 0.08651379974214805}. Best is trial 53 with value: -0.746086

In [24]:
# Summary of the search so far.
print('Cantidad de evaluaciones:', len(study.trials))
print('Mejor evaluación:', study.best_value, 'en trial', study.best_trial.number)
print('Mejores parámetros:', study.best_params)

# Ten best trials (lowest objective value = highest F1) with their parameters.
# The table is built in a single chain instead of assigning into a sliced frame.
df = (
    study.trials_dataframe()
    .nsmallest(10, 'value')[['value', 'duration', 'params_learning_rate',
                             'params_max_depth', 'params_n_estimators',
                             'params_num_leaves']]
    # Whole elapsed seconds. `.dt.seconds` would return only the seconds
    # component of the timedelta and silently drop any full days;
    # `.dt.total_seconds()` covers the complete duration. astype(int)
    # truncates, matching the integer values shown before.
    .assign(duration=lambda top: top['duration'].dt.total_seconds().astype(int))
)
df

Cantidad de evaluaciones: 70
Mejor evaluación: -0.7460869325437617 en trial 53
Mejores parámetros: {'num_leaves': 106, 'n_estimators': 830, 'max_depth': 85, 'learning_rate': 0.07781085417266771}


Unnamed: 0,value,duration,params_learning_rate,params_max_depth,params_n_estimators,params_num_leaves
53,-0.746087,145,0.077811,85,830,106
52,-0.74591,152,0.077106,90,870,107
15,-0.745795,105,0.134264,55,560,110
60,-0.745768,162,0.087114,90,890,113
47,-0.745768,146,0.090898,80,840,107
64,-0.745665,171,0.079717,90,930,113
68,-0.745653,163,0.084628,85,920,111
45,-0.745642,149,0.088214,90,890,107
44,-0.745626,156,0.086915,90,940,107
37,-0.745577,163,0.083053,70,920,105


### Rangos de parámetros de las 10 mejores evaluaciones acumuladas

In [25]:
# Spread of each hyper-parameter among the ten best trials so far.
top_10 = study.trials_dataframe().nsmallest(10, 'value')
parametro = ['learning_rate', 'num_leaves', 'n_estimators', 'max_depth']
for nombre in parametro:
    # Optuna's trials dataframe stores each parameter under 'params_<name>'.
    valores = top_10['params_' + nombre]
    print(nombre,
          '\tmim:', valores.min().round(4),
          '\tmax:', valores.max().round(4),
          '\tmean:', valores.mean().round(4),
          '\tmedian:', valores.median().round(4),
          '\tstd:', valores.std().round(4))

learning_rate 	mim: 0.0771 	max: 0.1343 	mean: 0.089 	median: 0.0858 	std: 0.0166
num_leaves 	mim: 105 	max: 113 	mean: 108.6 	median: 107.0 	std: 2.9136
n_estimators 	mim: 560 	max: 940 	mean: 859.0 	median: 890.0 	std: 111.4002
max_depth 	mim: 55 	max: 90 	mean: 82.5 	median: 87.5 	std: 11.607


### Para 7 KFold, One_Hot_Encoding
- `study_LightGBM_clean_7Fold.pkl`

Tendencias:
- `'learning_rate'` $\in$ `(0.0727, 0.1343)`
- `'num_leaves'` $\in$ `(105, 110)` $\longleftarrow$ __Probar elevando el máximo__
- `'n_estimators'` $\in$ `(560, 940)`
- `'max_depth'` $\in$ `(45, 95)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 60
Mejor evaluación: -0.7460869325437617 en trial 53
Mejores parámetros: {'num_leaves': 106, 'n_estimators': 830, 'max_depth': 85, 'learning_rate': 0.07781085417266771}
```

### Para 8 KFold, One_Hot_Encoding
- `study_LightGBM_clean_8Fold.pkl`

Tendencias:
- `'learning_rate'` $\in$ `(0.1242, 0.1393)`
- `'num_leaves'` $\in$ `(79, 93)`
- `'n_estimators'` $\in$ `(600, 625)`
- `'max_depth'` $\in$ `(51, 60)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 90
Mejor evaluación: -0.7458528540897384 en trial 63
Mejores parámetros: {'num_leaves': 82, 'n_estimators': 620, 'max_depth': 52, 'learning_rate': 0.12450509333352734}
```

### Para 9 KFold, One_Hot_Encoding
- `study_LightGBM_clean_9Fold.pkl`

Tendencias:
- `'learning_rate'` $\in$ `(0.12, 0.2)`
- `'num_leaves'` $\in$ `(58, 76)`
- `'n_estimators'` $\in$ `(580, 690)`
- `'max_depth'` $\in$ `(40, 50)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 60
Mejor evaluación: -0.7461675108209561 en trial 48
Mejores parámetros: {'num_leaves': 59, 'n_estimators': 755, 'max_depth': 59, 'learning_rate': 0.15673237668876525}
```

### Para 10 KFold, One_Hot_Encoding
- `study_LightGBM_clean_10Fold.pkl`

Tendencias:
- `'learning_rate'` $\in$ `(0.09, 0.1)`
- `'num_leaves'` $\in$ `(110, 130)`
- `'n_estimators'` $\in$ `(1045, 1200)`
- `'max_depth'` $\in$ `(33, 39)`
- `dropped_columns: 'building_id', 'has_secondary_use_school', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'C_a', 'C_f', 'C_m', 'C_n', 'C_o'`
```
Cantidad de evaluaciones: 100
Mejor evaluación: -0.7462826329030334 en trial 87
Mejores parámetros: {'num_leaves': 123, 'n_estimators': 1065, 'max_depth': 36, 'learning_rate': 0.09406462135767253}
```

### Para 4 KFold, One_Hot_Encoding
`dropped_columns: 'building_id'`
- `study_LightGBM_4Fold.pkl`
```
Cantidad de evaluaciones: 25
Mejores parámetros: {'num_leaves': 72, 'n_estimators': 831, 'max_depth': 72, 'learning_rate': 0.10069045291345259}
```

# Guardar progreso de búsquedas

In [26]:
# Persist the whole Study object (with every trial run so far) so the search
# can be resumed in a later session. The target folder `pickle/` must exist.
joblib.dump(study, 'pickle/study_LightGBM_clean_7Fold.pkl')

['pickle/study_LightGBM_clean_7Fold.pkl']

# Cargar avances anteriores

In [20]:
# Restore the Study written by a previous session, replacing the in-memory one.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load pickles that were produced by this notebook.
study = joblib.load('pickle/study_LightGBM_clean_7Fold.pkl')

In [21]:
# Summary of the loaded study, followed by its ten best trials
# (lowest objective value first) as the cell's displayed result.
print(f'Cantidad de evaluaciones: {len(study.trials)}')
print(f'Mejor evaluación: {study.best_value} en trial {study.best_trial.number}')
print(f'Mejores parámetros: {study.best_params}')
print('Evaluaciones:')
study.trials_dataframe().nsmallest(10, 'value')[[
    'value', 'duration',
    'params_learning_rate', 'params_max_depth',
    'params_n_estimators', 'params_num_leaves',
]]

Cantidad de evaluaciones: 60
Mejor evaluación: -0.7460869325437617 en trial 53
Mejores parámetros: {'num_leaves': 106, 'n_estimators': 830, 'max_depth': 85, 'learning_rate': 0.07781085417266771}
Evaluaciones:


Unnamed: 0,value,duration,params_learning_rate,params_max_depth,params_n_estimators,params_num_leaves
53,-0.746087,0 days 00:02:25.495716,0.077811,85,830,106
52,-0.74591,0 days 00:02:32.789452,0.077106,90,870,107
15,-0.745795,0 days 00:01:45.479879,0.134264,55,560,110
47,-0.745768,0 days 00:02:26.763885,0.090898,80,840,107
45,-0.745642,0 days 00:02:29.083680,0.088214,90,890,107
44,-0.745626,0 days 00:02:36.606736,0.086915,90,940,107
37,-0.745577,0 days 00:02:43.442802,0.083053,70,920,105
41,-0.745554,0 days 00:03:04.228723,0.074262,95,910,108
39,-0.745481,0 days 00:03:11.765937,0.07272,90,940,110
38,-0.745385,0 days 00:02:57.129261,0.081647,45,940,110
