In [1]:
import pickle
from pathlib import Path

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
from sklearn.model_selection import train_test_split

# Read the Chicago crimes CSV into `df`; the path is relative to the
# directory the notebook is run from.
# NOTE(review): the recorded output of this cell echoes the read_csv line,
# which looks like a warning raised by the call (possibly a mixed-dtype
# warning) — confirm, and consider passing explicit dtypes if so.
DATA_PATH = 'data/Chicago_Crimes_All.csv'
df = pd.read_csv(DATA_PATH)

  df = pd.read_csv('data/Chicago_Crimes_All.csv')


In [2]:
# Display the column labels of the loaded frame.
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'Month', 'Hour', 'Weekday'],
      dtype='object')

In [3]:
# 5) Drop rows with critical fields missing (currently disabled).
# NOTE(review): the disabled call below names 'Primary_Type', while the column
# listing for this frame shows 'Primary Type' (with a space); the name must be
# corrected before this line is re-enabled.
# df.dropna(subset=['Year','Month','Hour','Primary_Type','Arrest'], inplace=True)

# 6) Define X and y (binary target: Arrest).
# A boolean column is cast directly. Any other dtype goes through a lookup of
# the strings 'True'/'False'; values the lookup does not recognise fall back to
# the raw value, so data that already holds 0/1 or real booleans still casts.
arrest = df['Arrest']
if arrest.dtype == 'bool':
    y = arrest.astype(int)
else:
    y = arrest.map({'True': 1, 'False': 0}).fillna(arrest)
    # A missing target cannot be cast to int. Fail here with an explicit
    # message rather than with an opaque cast error from astype().
    n_missing = int(y.isna().sum())
    if n_missing:
        raise ValueError(
            f"'Arrest' has {n_missing} missing value(s); drop or impute them "
            "before building the target"
        )
    y = y.astype(int)

# Every column except the target is kept as a feature.
# NOTE(review): this includes identifier-like columns such as 'ID' and
# 'Case Number' (see the column listing) — confirm they are intended inputs.
X = df.drop(columns=['Arrest'])

# 7) Convert object columns to pandas Categorical for LightGBM.
cat_cols = X.select_dtypes(include='object').columns.tolist()
for col in cat_cols:
    X[col] = X[col].astype('category')

# 8) Train / validation split: 20% held out, stratified on the target,
#    with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [4]:
# 9) Build the LightGBM datasets.
# Both splits declare the same categorical columns; the validation set is
# linked to the training set through `reference`.
shared_dataset_args = {'categorical_feature': cat_cols}
dtrain = lgb.Dataset(X_train, label=y_train, **shared_dataset_args)
dval = lgb.Dataset(X_val, label=y_val, reference=dtrain, **shared_dataset_args)

# 10) Parameters and callbacks.
# Binary objective evaluated with log-loss on the validation set.
params = dict(
    objective='binary',
    metric='binary_logloss',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
    seed=42,
)

# Stop once the validation metric has not improved for 50 rounds, and log the
# metric every 100 iterations.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

# NOTE(review): in the recorded run early stopping never fired and the best
# iteration was the last one (1000), so the round cap ended training —
# consider whether a higher num_boost_round is warranted.
bst = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=1000,
    valid_sets=[dval],
    callbacks=training_callbacks,
)

# 11) Evaluate on the validation set.
# `y_prob` holds the model's scores for X_val; a row is labelled 1 when its
# score is strictly above the threshold, otherwise 0.
# NOTE(review): X_val is also the set used for early stopping during
# training, so metrics computed from these predictions may be optimistic.
DECISION_THRESHOLD = 0.5
y_prob = bst.predict(X_val)
above_threshold = y_prob > DECISION_THRESHOLD
y_pred = above_threshold.astype(int)

Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.290238
[200]	valid_0's binary_logloss: 0.286597
[300]	valid_0's binary_logloss: 0.284944
[400]	valid_0's binary_logloss: 0.283798
[500]	valid_0's binary_logloss: 0.282732
[600]	valid_0's binary_logloss: 0.281959
[700]	valid_0's binary_logloss: 0.281171
[800]	valid_0's binary_logloss: 0.280626
[900]	valid_0's binary_logloss: 0.280186
[1000]	valid_0's binary_logloss: 0.279756
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.279756


In [5]:
# Compute each validation metric once, then print the report.
# The printed labels are user-facing output and are kept as they are.
acc = accuracy_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_mat = confusion_matrix(y_val, y_pred)
full_report = classification_report(y_val, y_pred)

print("=== Métricas de Classificação ===")
print(f"Acurácia : {acc:.4f}")
print(f"Precisão : {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-Score : {f1:.4f}\n")
print("Matriz de Confusão:\n", conf_mat)
print("\nRelatório Completo:\n", full_report)

# 12) Export the model plus metadata to a .pkl file.
# The payload bundles the trained booster with the categorical column names
# and the feature column names taken from X.
output = {
    'model': bst,
    'cat_cols': cat_cols,
    'features': X.columns.tolist()
}
out_path = 'models/modelo_lgb.pkl'

# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists; otherwise a fresh checkout fails with
# FileNotFoundError after the whole training run has completed.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE: unpickling executes arbitrary code — only load this file from a
# trusted location.
with open(out_path, 'wb') as f:
    pickle.dump(output, f)
print(f"\nModelo salvo em: {out_path}")


=== Métricas de Classificação ===
Acurácia : 0.8915
Precisão : 0.9147
Recall   : 0.6803
F1-Score : 0.7803

Matriz de Confusão:
 [[1109838   28535]
 [ 143816  306068]]

Relatório Completo:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93   1138373
           1       0.91      0.68      0.78    449884

    accuracy                           0.89   1588257
   macro avg       0.90      0.83      0.85   1588257
weighted avg       0.89      0.89      0.89   1588257


Modelo salvo em: models/modelo_lgb.pkl
