# XGBoost

In [22]:

import joblib
from multiprocessing import cpu_count
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import os
from scipy.stats import uniform, loguniform, randint    
from xgboost import XGBClassifier

from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import f1_score
from numpy import linspace

# I CONFIGURAÇÕES

In [23]:
# Defina os caminhos possíveis aqui
path_jorge = fr"G:\.shortcut-targets-by-id\1UEbcW3gnD82DQPOVUL7QL1Eitp5Qp_rx\Correções TCC\Tcc_aplicação\data"
path_elaine = "C:/Users/Elaine/Desktop/TCC_acidentes/data/" 


if os.path.exists(path_jorge):
    data_dir = path_jorge
    print(f"Ambiente detectado: Orientador (G:)")
elif os.path.exists(path_elaine):
    data_dir = path_elaine
    print(f"Ambiente detectado: Orientanda")
else:
    data_dir = "./dados" 
    print("Nenhum caminho absoluto encontrado. Tentando pasta local './dados'")

print(f"Lendo dados de: {data_dir}")

Ambiente detectado: Orientador (G:)
Lendo dados de: G:\.shortcut-targets-by-id\1UEbcW3gnD82DQPOVUL7QL1Eitp5Qp_rx\Correções TCC\Tcc_aplicação\data


# 1 ABERTURA DOS DADOS

In [24]:
# Carregar o preprocess
preprocess = joblib.load('preprocess.pkl')

# Carregar os splits
X_train, X_test, y_train, y_test = joblib.load('data_splits.pkl')

X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 91628 entries, 44026 to 43353
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uf                      91628 non-null  object 
 1   br                      91433 non-null  float64
 2   km                      91433 non-null  float64
 3   dia_semana              91628 non-null  object 
 4   fase_dia                91628 non-null  object 
 5   sentido_via             91628 non-null  object 
 6   condicao_metereologica  91628 non-null  object 
 7   tipo_pista              91628 non-null  object 
 8   tracado_via             91628 non-null  object 
 9   uso_solo                91628 non-null  object 
 10  tipo_veiculo            88174 non-null  object 
 11  ano                     91628 non-null  int64  
 12  frota                   90614 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 9.8+ MB


Unnamed: 0,uf,br,km,dia_semana,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,tipo_veiculo,ano,frota
44026,GO,40.0,16.0,sÃ¡bado,Plena Noite,Crescente,CÃ©u Claro,Dupla,Reta,Sim,AutomÃ³vel,2019,66560.0
9134,PE,101.0,81.0,terÃ§a-feira,Pleno dia,Crescente,CÃ©u Claro,MÃºltipla,Retorno Regulamentado,NÃ£o,Semireboque,2021,217949.0
54794,PR,476.0,135.0,quarta-feira,Pleno dia,Decrescente,CÃ©u Claro,MÃºltipla,Reta,Sim,AutomÃ³vel,2020,1622891.0
106025,SC,101.0,311.0,quarta-feira,Plena Noite,Crescente,Nublado,Simples,RotatÃ³ria,NÃ£o,AutomÃ³vel,2021,31357.0
52102,MS,163.0,490.0,quinta-feira,Plena Noite,Crescente,CÃ©u Claro,Simples,RotatÃ³ria,Sim,Ãnibus,2019,613060.0


# 2 OTIMIZAÇÃO DO MODELO

In [None]:
# Variáveis Categóricas
cat_features = ['br', 'uf', 'dia_semana', 'fase_dia', 'sentido_via', 
                'condicao_metereologica', 'tipo_pista', 'tracado_via', 
                'uso_solo', 'tipo_veiculo'
                ] 


for col in cat_features:
    X_train[col] = X_train[col].astype(str).astype('category')
    X_test[col] = X_test[col].astype(str).astype('category')
    valid_categories = X_train[col].cat.categories
    X_test[col] = X_test[col].astype(str).astype('category')
    X_test[col] = X_test[col].cat.set_categories(valid_categories)

# Modelo
xgb = XGBClassifier(
    tree_method='hist',      
    enable_categorical=True,  #! No lugar de One Hot Encoder
    objective='binary:logistic',
    n_jobs=cpu_count() // 2,
    random_state=42
)

# Grid
param_grid = {
    'n_estimators': randint(500, 1000),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(4, 10),
    'scale_pos_weight': uniform(2, 6), # Ajuste pro seu desbalanceamento
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

# Busca
random_xgb = RandomizedSearchCV(
    xgb,
    param_grid,
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=cpu_count() // 2,
)

# Fit
random_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


# 3 DIAGNÓSTICO

## 3.1 Métricas Básicas

In [26]:
# Resultados
y_pred_xgb = random_xgb.predict(X_test)
print(f"Melhor F1 na Validação Cruzada: {random_xgb.best_score_:.4f}")
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

Melhor F1 na Validação Cruzada: 0.4893
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     19305
           1       0.53      0.52      0.52      3603

    accuracy                           0.85     22908
   macro avg       0.72      0.72      0.72     22908
weighted avg       0.85      0.85      0.85     22908

[[17625  1680]
 [ 1726  1877]]


## 3.2 Mudança de Limiar do Predict

In [27]:
y_proba = random_xgb.predict_proba(X_test)[:, 1]

# 2. Teste vários limiares
thresholds = linspace(0, 0.9, 25)

print("Limiar | F1-Score | Recall | Precision")
print("---------------------------------------")
for th in thresholds:
    y_pred_th = (y_proba >= th).astype(int)
    f1 = f1_score(y_test, y_pred_th)
    # Calculando recall e precision manualmente para printar
    rec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_test == 1)
    prec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_pred_th == 1)
    
    print(f" {th:.2f}  |  {f1:.4f}  |  {rec:.4f} |  {prec:.4f}")

Limiar | F1-Score | Recall | Precision
---------------------------------------
 0.00  |  0.2718  |  1.0000 |  0.1573
 0.04  |  0.3724  |  0.9098 |  0.2341
 0.07  |  0.4144  |  0.8604 |  0.2729
 0.11  |  0.4419  |  0.8157 |  0.3030
 0.15  |  0.4644  |  0.7805 |  0.3305
 0.19  |  0.4795  |  0.7452 |  0.3535
 0.22  |  0.4928  |  0.7108 |  0.3771
 0.26  |  0.5054  |  0.6861 |  0.4000
 0.30  |  0.5154  |  0.6575 |  0.4239
 0.34  |  0.5198  |  0.6320 |  0.4415
 0.38  |  0.5218  |  0.6048 |  0.4588
 0.41  |  0.5258  |  0.5790 |  0.4815
 0.45  |  0.5256  |  0.5540 |  0.5000
 0.49  |  0.5256  |  0.5293 |  0.5220
 0.53  |  0.5219  |  0.5037 |  0.5415
 0.56  |  0.5145  |  0.4754 |  0.5605
 0.60  |  0.5077  |  0.4507 |  0.5812
 0.64  |  0.4996  |  0.4252 |  0.6055
 0.67  |  0.4877  |  0.3997 |  0.6255
 0.71  |  0.4749  |  0.3730 |  0.6534
 0.75  |  0.4516  |  0.3400 |  0.6723
 0.79  |  0.4218  |  0.3034 |  0.6918
 0.82  |  0.3942  |  0.2701 |  0.7299
 0.86  |  0.3486  |  0.2268 |  0.7537
 0.90  | 