In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit the Kaggle input directory and print the full path of
# every file found, one path per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/accidentes-usa-db/accidentes_usa.db


### <span style="color:#c70d39">**Cargando Data Preprocesada (Train y Test)**</span>

In [2]:
#!pip install --upgrade xgboost

In [2]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import sqlite3
import pandas as pd
import warnings
import cupy as cp
import numpy as np

# Silence only DeprecationWarning messages; every other warning category is still shown.
warnings.filterwarnings("ignore", category=DeprecationWarning)




In [32]:
# Open the SQLite database that stores the train/test splits.
conn = sqlite3.connect('/kaggle/input/accidentes-usa-db/accidentes_usa.db')

try:
    # Each split lives in its own table; read every table into a DataFrame.
    X_train = pd.read_sql('SELECT * FROM X_train', conn)
    X_test = pd.read_sql('SELECT * FROM X_test', conn)
    y_train = pd.read_sql('SELECT * FROM y_train', conn)
    y_test = pd.read_sql('SELECT * FROM y_test', conn)
finally:
    # Release the connection even when one of the reads raises, so a failed
    # cell run does not leave the database handle open in the kernel.
    conn.close()

In [21]:
# Count how many training rows fall in each distinct target value (class balance check).
y_train.value_counts()

Severity
4           16404
1           16386
3           16306
2           16184
Name: count, dtype: int64

In [33]:
# Shift the class labels down by one so that they start at 0.
y_train_adjusted, y_test_adjusted = y_train - 1, y_test - 1

# Baseline classifier: histogram tree method running on the CUDA device,
# every other hyperparameter left at its library default.
baseline_params = {'tree_method': 'hist', 'device': 'cuda'}
model_XGBC = xgb.XGBClassifier(**baseline_params)

# Fit on the training split using the zero-based labels.
model_XGBC.fit(X_train, y_train_adjusted)

# Predict the test split, then undo the label shift so the predictions are
# expressed in the original label values.
y_pred = model_XGBC.predict(X_test)
y_pred_adj = y_pred + 1

# Per-class precision / recall / F1 against the original test labels.
report = classification_report(y_test, y_pred_adj)
print(report)


              precision    recall  f1-score   support

           1       0.70      0.82      0.75      4014
           2       0.65      0.59      0.62      4216
           3       0.63      0.62      0.63      4094
           4       0.59      0.55      0.57      3996

    accuracy                           0.65     16320
   macro avg       0.64      0.65      0.64     16320
weighted avg       0.64      0.65      0.64     16320



### Optimización con GridSearchCV

In [28]:
!pip install tqdm
!pip install dill





In [14]:
# Row count per distinct target value in y_train (same check as run earlier in the notebook).
y_train.value_counts()

Severity
4           16404
1           16386
3           16306
2           16184
Name: count, dtype: int64

In [31]:
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import cupy as cp
import numpy as np

# Local import: only this cell needs a hold-out splitter.
from sklearn.model_selection import train_test_split

# Shift the class labels down by one so that they start at 0.
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Plain NumPy inputs for scikit-learn's cross-validation machinery.
# The previous version copied every array to the GPU with CuPy and then
# immediately copied it back, which cost time and GPU memory for no benefit;
# XGBoost moves the data to the device itself when device='cuda'.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
# Flatten the single-column target frame to the 1-D shape estimators expect.
y_train_np = y_train_adjusted.to_numpy().ravel()

# Carve a validation split out of the TRAINING data for early stopping.
# Using the test set as eval_set (as before) lets test data decide when each
# candidate stops training, which leaks test information into model selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_np,
    y_train_np,
    test_size=0.2,
    stratify=y_train_np,
    random_state=42,
)

# XGBoost classifier with early stopping; the seed is fixed because the grid
# below uses subsample < 1, which makes training stochastic.
model_XGBC = xgb.XGBClassifier(
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=10,
    random_state=42,
)

# Deliberately small hyperparameter grid (72 combinations).
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.001, 0.05],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.9],
    'gamma': [0, 0.1]
}

# Exhaustive 5-fold search scored on accuracy; a single job because all fits
# share the one CUDA device.
grid_search = GridSearchCV(
    model_XGBC,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=1
)

# Run the search. eval_set is forwarded to every XGBClassifier.fit call and
# drives early stopping; verbose=False silences the per-round evaluation log.
grid_search.fit(
    X_fit,
    y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Estimator refitted by GridSearchCV with the best parameter combination.
best_model = grid_search.best_estimator_

# Predict the untouched test split and restore the original label values.
y_pred_best = best_model.predict(X_test_np)
y_pred_adj = y_pred_best + 1

print("Mejores hiperparámetros:", grid_search.best_params_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.

In [38]:
# Shift the class labels down by one so that they start at 0.
y_train_adjusted, y_test_adjusted = y_train - 1, y_test - 1

# Hard-coded hyperparameters for the final classifier, trained on the CUDA device.
# NOTE(review): every value lies inside the grid searched earlier, so these are
# presumably the best parameters it reported — confirm against that output.
tuned_params = dict(
    colsample_bytree=0.9,
    gamma=0.1,
    learning_rate=0.05,
    max_depth=10,
    n_estimators=200,
    subsample=0.7,
    tree_method='hist',
    device='cuda',
    random_state=42,
)
model_xgbs_gs = xgb.XGBClassifier(**tuned_params)

# Fit on the training split using the zero-based labels.
model_xgbs_gs.fit(X_train, y_train_adjusted)

# Predict the test split, then undo the label shift so the predictions are
# expressed in the original label values.
y_pred_gs = model_xgbs_gs.predict(X_test)
y_pred_adj = y_pred_gs + 1

# Per-class precision / recall / F1 against the original test labels.
report = classification_report(y_test, y_pred_adj)
print(report)


              precision    recall  f1-score   support

           1       0.70      0.81      0.75      4014
           2       0.66      0.60      0.63      4216
           3       0.63      0.63      0.63      4094
           4       0.61      0.57      0.59      3996

    accuracy                           0.65     16320
   macro avg       0.65      0.65      0.65     16320
weighted avg       0.65      0.65      0.65     16320



In [39]:
from pickle import dump

# Serialize the tuned classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises; the previous
# bare open(...) call left the handle open.
with open("model_xgbc_42_con_scal.sav", "wb") as model_file:
    dump(model_xgbs_gs, model_file)