# Regressão Logística

In [23]:

import joblib
from multiprocessing import cpu_count
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import os
from scipy.stats import uniform, loguniform
from numpy import linspace
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import f1_score

# 1 ABERTURA DOS DADOS

In [24]:
# Caminho
data_dir = "C:/Users/Elaine/Desktop/TCC_acidentes/data/"

# Carregar o preprocess
preprocess = joblib.load('preprocess.pkl')

# Carregar os splits
X_train, X_test, y_train, y_test = joblib.load('data_splits.pkl')

X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 91628 entries, 44026 to 43353
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uf                      91628 non-null  object 
 1   br                      91433 non-null  float64
 2   km                      91433 non-null  float64
 3   dia_semana              91628 non-null  object 
 4   fase_dia                91628 non-null  object 
 5   sentido_via             91628 non-null  object 
 6   condicao_metereologica  91628 non-null  object 
 7   tipo_pista              91628 non-null  object 
 8   tracado_via             91628 non-null  object 
 9   uso_solo                91628 non-null  object 
 10  tipo_veiculo            88174 non-null  object 
 11  ano                     91628 non-null  int64  
 12  frota                   90614 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 9.8+ MB


Unnamed: 0,uf,br,km,dia_semana,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,tipo_veiculo,ano,frota
44026,GO,40.0,16.0,sÃ¡bado,Plena Noite,Crescente,CÃ©u Claro,Dupla,Reta,Sim,AutomÃ³vel,2019,66560.0
9134,PE,101.0,81.0,terÃ§a-feira,Pleno dia,Crescente,CÃ©u Claro,MÃºltipla,Retorno Regulamentado,NÃ£o,Semireboque,2021,217949.0
54794,PR,476.0,135.0,quarta-feira,Pleno dia,Decrescente,CÃ©u Claro,MÃºltipla,Reta,Sim,AutomÃ³vel,2020,1622891.0
106025,SC,101.0,311.0,quarta-feira,Plena Noite,Crescente,Nublado,Simples,RotatÃ³ria,NÃ£o,AutomÃ³vel,2021,31357.0
52102,MS,163.0,490.0,quinta-feira,Plena Noite,Crescente,CÃ©u Claro,Simples,RotatÃ³ria,Sim,Ãnibus,2019,613060.0


# 2 OTMIZAÇÃO DO MODELO

In [25]:
# Pipeline
pipeline_lr = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=300))
])

# Grid
param_grid_lr = {
    'clf__solver': ['saga'],                
    'clf__penalty': ['elasticnet'],         
    'clf__C': loguniform(1e-2, 1e2),
    'clf__l1_ratio': uniform(0, 1),        
    'clf__class_weight': [None, 'balanced']
}

# Busca
random_lr = RandomizedSearchCV(
    estimator=pipeline_lr,
    param_distributions=param_grid_lr,  
    n_iter=5,                           
    scoring='f1',
    cv=5,
    n_jobs=cpu_count() // 2,
    random_state=42,
    verbose=1
)


In [26]:
# Fit
random_lr.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


0,1,2
,estimator,Pipeline(step...x_iter=300))])
,param_distributions,"{'clf__C': <scipy.stats....001E802D6BCE0>, 'clf__class_weight': [None, 'balanced'], 'clf__l1_ratio': <scipy.stats....001E804E06570>, 'clf__penalty': ['elasticnet'], ...}"
,n_iter,5
,scoring,'f1'
,n_jobs,2
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,2
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,0.001
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'elasticnet'
,dual,False
,tol,0.0001
,C,np.float64(0....8942406574443)
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,300


# 3 DIAGNÓSTICO

## 3.1 Métricas Básicas

In [27]:
# Resultados
y_pred_lr = random_lr.predict(X_test)
print(f"Melhor F1 na Validação Cruzada: {random_lr.best_score_:.4f}")
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))

Melhor F1 na Validação Cruzada: 0.3908
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     19305
           1       0.28      0.70      0.40      3603

    accuracy                           0.67     22908
   macro avg       0.60      0.68      0.58     22908
weighted avg       0.82      0.67      0.71     22908

[[12764  6541]
 [ 1095  2508]]


## 3.2 Mudança de Limiar do Predict

In [28]:
y_proba = random_lr.predict_proba(X_test)[:, 1]

# 2. Teste vários limiares
thresholds = linspace(0, 0.9, 25)

print("Limiar | F1-Score | Recall | Precision")
print("---------------------------------------")
for th in thresholds:
    y_pred_th = (y_proba >= th).astype(int)
    f1 = f1_score(y_test, y_pred_th)
    # Calculando recall e precision manualmente para printar
    rec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_test == 1)
    prec = sum((y_pred_th == 1) & (y_test == 1)) / sum(y_pred_th == 1)
    
    print(f" {th:.2f}  |  {f1:.4f}  |  {rec:.4f} |  {prec:.4f}")

Limiar | F1-Score | Recall | Precision
---------------------------------------
 0.00  |  0.2718  |  1.0000 |  0.1573
 0.04  |  0.2945  |  1.0000 |  0.1726
 0.07  |  0.2948  |  1.0000 |  0.1729
 0.11  |  0.2956  |  0.9997 |  0.1734
 0.15  |  0.2982  |  0.9986 |  0.1753
 0.19  |  0.3032  |  0.9947 |  0.1789
 0.22  |  0.3095  |  0.9831 |  0.1837
 0.26  |  0.3193  |  0.9703 |  0.1911
 0.30  |  0.3292  |  0.9470 |  0.1992
 0.34  |  0.3434  |  0.9245 |  0.2109
 0.38  |  0.3547  |  0.8845 |  0.2218
 0.41  |  0.3668  |  0.8354 |  0.2350
 0.45  |  0.3782  |  0.7724 |  0.2504
 0.49  |  0.3899  |  0.7133 |  0.2682
 0.53  |  0.4052  |  0.6520 |  0.2939
 0.56  |  0.4057  |  0.5748 |  0.3135
 0.60  |  0.4079  |  0.5046 |  0.3422
 0.64  |  0.3921  |  0.4230 |  0.3654
 0.67  |  0.3779  |  0.3639 |  0.3931
 0.71  |  0.3486  |  0.2992 |  0.4175
 0.75  |  0.3164  |  0.2417 |  0.4579
 0.79  |  0.2727  |  0.1868 |  0.5049
 0.82  |  0.1943  |  0.1185 |  0.5391
 0.86  |  0.1081  |  0.0594 |  0.5994
 0.90  | 