# XGBoost

In [17]:
import joblib
import pandas as pd
from multiprocessing import cpu_count
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
import os
from scipy.stats import uniform, loguniform, randint    
from xgboost import XGBClassifier
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import f1_score
from numpy import linspace

# 1 ABERTURA DOS DADOS

In [9]:
# Caminho
data_dir = "C:/Users/Elaine/Desktop/TCC_acidentes/data/" 

# Carregar o preprocess
preprocess = joblib.load('preprocess.pkl')

# Carregar os splits
X_train, X_test, y_train, y_test = joblib.load('data_splits.pkl')

X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 91628 entries, 44026 to 43353
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uf                      91628 non-null  object 
 1   br                      91433 non-null  float64
 2   km                      91433 non-null  float64
 3   dia_semana              91628 non-null  object 
 4   fase_dia                91628 non-null  object 
 5   sentido_via             91628 non-null  object 
 6   condicao_metereologica  91628 non-null  object 
 7   tipo_pista              91628 non-null  object 
 8   tracado_via             91628 non-null  object 
 9   uso_solo                91628 non-null  object 
 10  tipo_veiculo            88174 non-null  object 
 11  ano                     91628 non-null  int64  
 12  frota                   90614 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 9.8+ MB


Unnamed: 0,uf,br,km,dia_semana,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,tipo_veiculo,ano,frota
44026,GO,40.0,16.0,sÃ¡bado,Plena Noite,Crescente,CÃ©u Claro,Dupla,Reta,Sim,AutomÃ³vel,2019,66560.0
9134,PE,101.0,81.0,terÃ§a-feira,Pleno dia,Crescente,CÃ©u Claro,MÃºltipla,Retorno Regulamentado,NÃ£o,Semireboque,2021,217949.0
54794,PR,476.0,135.0,quarta-feira,Pleno dia,Decrescente,CÃ©u Claro,MÃºltipla,Reta,Sim,AutomÃ³vel,2020,1622891.0
106025,SC,101.0,311.0,quarta-feira,Plena Noite,Crescente,Nublado,Simples,RotatÃ³ria,NÃ£o,AutomÃ³vel,2021,31357.0
52102,MS,163.0,490.0,quinta-feira,Plena Noite,Crescente,CÃ©u Claro,Simples,RotatÃ³ria,Sim,Ãnibus,2019,613060.0


# 2 OTIMIZAÇÃO DO MODELO

In [10]:
# Variáveis Categóricas
cat_features = ['br', 'uf', 'dia_semana', 'fase_dia', 'sentido_via', 
                'condicao_metereologica', 'tipo_pista', 'tracado_via', 
                'uso_solo', 'tipo_veiculo'
                ] 


for col in cat_features:
    X_train[col] = X_train[col].astype(str).astype('category')
    X_test[col] = X_test[col].astype(str).astype('category')
    valid_categories = X_train[col].cat.categories
    X_test[col] = X_test[col].astype(str).astype('category')
    X_test[col] = X_test[col].cat.set_categories(valid_categories)

# Modelo
xgb = XGBClassifier(
    tree_method='hist',      
    enable_categorical=True,  #! No lugar de One Hot Encoder
    objective='binary:logistic',
    n_jobs=cpu_count() // 2,
    random_state=42
)

# Grid
param_grid = {
    'n_estimators': randint(500, 1000),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(4, 10),
    'scale_pos_weight': uniform(2, 6), # Ajuste pro seu desbalanceamento
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

# Busca
random_xgb = RandomizedSearchCV(
    xgb,
    param_grid,
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=cpu_count() // 2,
)

In [11]:
# Fit
random_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': <scipy.stats....0023D3180AFD0>, 'learning_rate': <scipy.stats....0023D31807620>, 'max_depth': <scipy.stats....0023D318DCE10>, 'n_estimators': <scipy.stats....0023D31807770>, ...}"
,n_iter,50
,scoring,'f1'
,n_jobs,2
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.8890746133104213)
,device,
,early_stopping_rounds,
,enable_categorical,True


# 3 DIAGNÓSTICO

## 3.1 Métricas Básicas

In [13]:
# Resultados
y_pred_xgb = random_xgb.predict(X_test)
print(f"Melhor F1 na Validação Cruzada: {random_xgb.best_score_:.4f}")
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

Melhor F1 na Validação Cruzada: 0.4921
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     19305
           1       0.51      0.56      0.53      3603

    accuracy                           0.84     22908
   macro avg       0.71      0.73      0.72     22908
weighted avg       0.85      0.84      0.85     22908

[[17351  1954]
 [ 1598  2005]]


## 3.2 Mudança de Limiar do Predict

In [14]:
y_proba = random_xgb.predict_proba(X_test)[:, 1]

# 2. Teste vários limiares
thresholds = linspace(0, 0.9, 25)

print("Limiar | F1-Score | Recall | Precision")
print("---------------------------------------")
for th in thresholds:
    y_pred_th = (y_proba >= th).astype(int)
    f1 = f1_score(y_test, y_pred_th)
    # Calculando recall e precision manualmente para printar
    rec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_test == 1)
    prec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_pred_th == 1)
    
    print(f" {th:.2f}  |  {f1:.4f}  |  {rec:.4f} |  {prec:.4f}")

Limiar | F1-Score | Recall | Precision
---------------------------------------
 0.00  |  0.2718  |  1.0000 |  0.1573
 0.04  |  0.3843  |  0.8951 |  0.2447
 0.07  |  0.4208  |  0.8385 |  0.2809
 0.11  |  0.4464  |  0.7971 |  0.3101
 0.15  |  0.4620  |  0.7610 |  0.3317
 0.19  |  0.4782  |  0.7333 |  0.3548
 0.22  |  0.4919  |  0.7111 |  0.3760
 0.26  |  0.4985  |  0.6847 |  0.3919
 0.30  |  0.5082  |  0.6633 |  0.4119
 0.34  |  0.5163  |  0.6400 |  0.4327
 0.38  |  0.5233  |  0.6223 |  0.4515
 0.41  |  0.5268  |  0.6012 |  0.4687
 0.45  |  0.5288  |  0.5823 |  0.4843
 0.49  |  0.5287  |  0.5629 |  0.4985
 0.53  |  0.5297  |  0.5443 |  0.5159
 0.56  |  0.5289  |  0.5251 |  0.5328
 0.60  |  0.5239  |  0.5032 |  0.5464
 0.64  |  0.5208  |  0.4782 |  0.5717
 0.67  |  0.5166  |  0.4574 |  0.5934
 0.71  |  0.5113  |  0.4346 |  0.6207
 0.75  |  0.4995  |  0.4108 |  0.6371
 0.79  |  0.4859  |  0.3836 |  0.6628
 0.82  |  0.4648  |  0.3478 |  0.7008
 0.86  |  0.4306  |  0.3056 |  0.7287
 0.90  | 

## 3.2Importância das variáveis

In [None]:
best_model = random_xgb.best_estimator_

r = permutation_importance(
    best_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring='f1',  # mesma métrica do treino
    n_jobs=-1
)

perm_importance = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': r.importances_mean,
    'importance_std': r.importances_std
}).sort_values(by='importance_mean', ascending=False)

perm_importance


Unnamed: 0,feature,importance_mean,importance_std
10,tipo_veiculo,0.266586,0.005711
1,br,0.147391,0.003401
0,uf,0.069709,0.004158
8,tracado_via,0.066861,0.002343
12,frota,0.056652,0.005984
3,dia_semana,0.047106,0.004392
2,km,0.040937,0.002934
6,condicao_metereologica,0.023738,0.002202
4,fase_dia,0.015146,0.002629
7,tipo_pista,0.011501,0.002179
