In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [2]:
datos = pd.read_csv('data_train_clean_final.csv')

In [3]:
datos_test = pd.read_csv('data_test_clean_final.csv')

In [4]:
labels = datos['survived']
train = datos.drop(columns='survived')

### Búsqueda de mejores hiperparámetros con RandomizedSearch

In [40]:
param_grid = {
    'num_leaves': [20, 80, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 500, 1000, 2000],
    'max_depth': [3, 5, 20, 30],
    'min_child_samples': [2, 40, 100],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree':[0.5, 0.8, 1.0]}

In [41]:
random_search = RandomizedSearchCV(
    estimator=LGBMClassifier(),
    param_distributions=param_grid,
    n_iter=100, cv=5,
    scoring='accuracy', n_jobs=-1)

In [42]:
# Fit the model to the training data
random_search.fit(train, labels)

[LightGBM] [Info] Number of positive: 500, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 1049, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476644 -> initscore=-0.093490
[LightGBM] [Info] Start training from score -0.093490


In [43]:
mejores_param = random_search.best_params_
# Ver los mejores hiperparámetros encontrados
print("Mejores hiperparámetros encontrados:")
print(random_search.best_params_)

Mejores hiperparámetros encontrados:
{'subsample': 1.0, 'num_leaves': 80, 'n_estimators': 100, 'min_child_samples': 2, 'max_depth': 30, 'learning_rate': 0.01, 'colsample_bytree': 0.5}


### Evaluación Modelo con Cross Validation

In [44]:
# Creo instancia y utilizo los hiperparámetros anteriores.

lgbm = LGBMClassifier(learning_rate = 0.01, num_leaves = 80, max_depth = 30, min_child_samples = 2, n_estimators = 100, subsample = 1.0, colsample_bytree = 0.5)

In [45]:
# Entreno el modelo
lgbm.fit(train, labels)

[LightGBM] [Info] Number of positive: 500, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 1049, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476644 -> initscore=-0.093490
[LightGBM] [Info] Start training from score -0.093490


In [54]:
# Realiza la validación cruzada con 5 particiones
scores = cross_val_score(lgbm, train, labels, cv=15)

# Muestra los puntajes de validación cruzada
print("Puntajes de validación cruzada:", scores)
print("Precisión media:", scores.mean())

[LightGBM] [Info] Number of positive: 467, number of negative: 512
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 376
[LightGBM] [Info] Number of data points in the train set: 979, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477017 -> initscore=-0.091995
[LightGBM] [Info] Start training from score -0.091995
[LightGBM] [Info] Number of positive: 467, number of negative: 512
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 979, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477017 -> initscore=-0.091995
[LightGBM] [Info

Al realizar la validación cruzada con un total de 15 particiones, obtenemos una precisión media de 85,13%.

### Importancia de variables

In [47]:
# Obtener la importancia de las características
importances = lgbm.feature_importances_

# Crear un DataFrame para mostrar las importancias junto con el nombre de las características
feature_importances = pd.DataFrame(importances, index=train.columns, columns=['Importance'])

# Ordenar las características por su importancia de mayor a menor
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Mostrar las características ordenadas por su importancia
feature_importances

Unnamed: 0,Importance
fare,2357
age,1974
sibsp,479
family_size,449
parch,405
embarked,285
age_bin_adult,279
pclass,183
is_alone,175
age_bin_teen,152


Fare y age son las variables con mayor importancia para este modelo, por lejos.

### Predicciones

In [48]:
predicciones = lgbm.predict(datos_test)

In [49]:
predicciones_lgbm = pd.DataFrame(predicciones, columns=['Survived'])

In [50]:
columnas_train = pd.read_csv('test.csv')

In [51]:
columna_id = columnas_train['PassengerId']

In [52]:
intento_lgbm = pd.concat([columna_id, predicciones_lgbm], axis=1)

In [53]:
intento_lgbm.to_csv('intento_lgbm_final.csv', index = False)