In [26]:
import os
import pickle
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

paths = [
    
    'data/Chicago_Crimes_2001_to_2004.csv',
    'data/Chicago_Crimes_2005_to_2007.csv',
    'data/Chicago_Crimes_2012_to_2017.csv'
]
dfs = []
for p in paths:
    if not os.path.exists(p):
        raise FileNotFoundError(f"Arquivo não encontrado: {p}")
    # parse_dates para otimizar extração de features de tempo
    dfs.append(pd.read_csv(p, parse_dates=['Date'], on_bad_lines='skip'))
df = pd.concat(dfs, ignore_index=True)

# 2) Sanitizar nomes de coluna (evita JSON errors no LightGBM)
df.columns = (
    df.columns
        .str.strip()
        .str.replace('[^0-9A-Za-z_]', '_', regex=True)
)

# 3) Drop de colunas irrelevantes
to_drop = [
    'ID','Case_Number','Block','IUCR','FBI_Code',
    'X_Coordinate','Y_Coordinate','Updated_On','Location'
]
df.drop(columns=[c for c in to_drop if c in df.columns], inplace=True)

# 4) Extrair features de data/hora
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Year']    = df['Date'].dt.year
df['Month']   = df['Date'].dt.month
df['Day']     = df['Date'].dt.day
df['Hour']    = df['Date'].dt.hour
df['Weekday'] = df['Date'].dt.weekday
df.drop(columns=['Date'], inplace=True)


  dfs.append(pd.read_csv(p, parse_dates=['Date'], on_bad_lines='skip'))
  dfs.append(pd.read_csv(p, parse_dates=['Date'], on_bad_lines='skip'))
  dfs.append(pd.read_csv(p, parse_dates=['Date'], on_bad_lines='skip'))
  dfs.append(pd.read_csv(p, parse_dates=['Date'], on_bad_lines='skip'))


In [27]:
# 5) Remover linhas com dados críticos faltando
df.dropna(subset=['Year','Month','Day','Hour','Primary_Type','Arrest'], inplace=True)

# 6) Definir X e y (target binário: Arrest)
if df['Arrest'].dtype == 'bool':
    y = df['Arrest'].astype(int)
else:
    y = df['Arrest'].map({'True':1,'False':0}).fillna(df['Arrest']).astype(int)
X = df.drop(columns=['Arrest'])

# 7) Converter object → pandas.Categorical para LightGBM
cat_cols = X.select_dtypes(include='object').columns.tolist()
for c in cat_cols:
    X[c] = X[c].astype('category')

# 8) Split treino / validação (estratificado)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [28]:
# 9) Criar lgb.Dataset
dtrain = lgb.Dataset(
    X_train, label=y_train,
    categorical_feature=cat_cols
)
dval = lgb.Dataset(
    X_val, label=y_val,
    reference=dtrain,
    categorical_feature=cat_cols
)

# 10) Parâmetros e callbacks (early stopping + log a cada 100 iters)
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'seed': 42
}

bst = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# 11) Avaliação no conjunto de validação
y_prob = bst.predict(X_val)
y_pred = (y_prob > 0.5).astype(int)


Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.294438
[200]	valid_0's binary_logloss: 0.29049
[300]	valid_0's binary_logloss: 0.288943
[400]	valid_0's binary_logloss: 0.287905
[500]	valid_0's binary_logloss: 0.287018
[600]	valid_0's binary_logloss: 0.28644
[700]	valid_0's binary_logloss: 0.285986
[800]	valid_0's binary_logloss: 0.28562
[900]	valid_0's binary_logloss: 0.285314
[1000]	valid_0's binary_logloss: 0.285014
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.285014


In [30]:
print("=== Métricas de Classificação ===")
print(f"Acurácia : {accuracy_score(y_val, y_pred):.4f}")
print(f"Precisão : {precision_score(y_val, y_pred):.4f}")
print(f"Recall   : {recall_score(y_val, y_pred):.4f}")
print(f"F1-Score : {f1_score(y_val, y_pred):.4f}\n")
print("Matriz de Confusão:\n", confusion_matrix(y_val, y_pred))
print("\nRelatório Completo:\n", classification_report(y_val, y_pred))

# 12) Exportar modelo + metadados para .pkl
output = {
    'model': bst,
    'cat_cols': cat_cols,
    'features': X.columns.tolist()
}
out_path = 'models/lgbm_chicago_arrest.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(output, f)
print(f"\nModelo salvo em: {out_path}")


=== Métricas de Classificação ===
Acurácia : 0.8891
Precisão : 0.9185
Recall   : 0.6773
F1-Score : 0.7796

Matriz de Confusão:
 [[727938  18293]
 [ 98201 206083]]

Relatório Completo:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93    746231
           1       0.92      0.68      0.78    304284

    accuracy                           0.89   1050515
   macro avg       0.90      0.83      0.85   1050515
weighted avg       0.89      0.89      0.88   1050515


Modelo salvo em: models/lgbm_chicago_arrest.pkl
