In [29]:
import pandas as pd
import numpy as np
from hdbscan import HDBSCAN
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA
import warnings
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import pickle
import mlflow

warnings.filterwarnings('ignore')

In [30]:
DATASET_PATH = Path("../dataset")
MODEL_PATH = Path("../models")
ARTIFACT_PATH = Path("../mlops")
OUTPUT_PATH = Path("../output")
VIOLENT_CRIMES = [
    "HOMICIDE",
    "ASSAULT",
    "BATTERY",
    "ROBBERY",
    "CRIMINAL SEXUAL ASSAULT",
    "SEX OFFENSE",
    "KIDNAPPING",
    "INTIMIDATION",
    "STALKING",
    "OFFENSE INVOLVING CHILDREN",
    "Sequestro", "Homicídio", "Estupro",
    "Roubo", "Latrocínio", "Violência Doméstica"
]

In [31]:
mlflow.set_tracking_uri(ARTIFACT_PATH)

mlflow.set_experiment("Hotspot_Forecasting")

<Experiment: artifact_location='file:///D:/senac-ads/projeto_integrador_policia/ml/mlops/751433899071555315', creation_time=1759455767505, experiment_id='751433899071555315', last_update_time=1759455767505, lifecycle_stage='active', name='Hotspot_Forecasting', tags={}>

In [32]:
min_cluster_size=200
min_samples=60
freq="D"
metric = "haversine"
cluster_selection_method = "eom"

In [33]:
df = pd.read_csv(DATASET_PATH / "dataset_ocorrencias_delegacia_5.csv")

In [34]:
violent_crimes_df = df#df[df["tipo_crime"].isin(VIOLENT_CRIMES)]

In [35]:
violent_crimes_df.shape

(5000, 14)

In [36]:
violent_crimes_df["data_ocorrencia"] = pd.to_datetime(violent_crimes_df["data_ocorrencia"])

In [None]:
coords = violent_crimes_df[['latitude', 'longitude']].dropna()
coords_radians = np.radians(coords)

In [38]:
partition_key = "recife"

In [39]:
with mlflow.start_run(run_name="HDBSCAN_Clustering"):
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        prediction_data=True
    )
    
    mlflow.log_params({
        "min_cluster_size": min_cluster_size,
        "min_samples": min_samples,
        "metric": "haversine",
        "cluster_selection_method": cluster_selection_method,
        "prediction_data": True
    })
    
    labels = clusterer.fit_predict(coords_radians)
    
    
    violent_crimes_df.loc[coords.index, "hotspot_id"] = labels
    
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # ignorando ruído
    n_noise = list(labels).count(-1)
    
    cluster_sizes = [list(labels).count(l) for l in set(labels) if l != -1]
    mean_cluster_size = np.mean(cluster_sizes) if cluster_sizes else 0
    max_cluster_size = np.max(cluster_sizes) if cluster_sizes else 0
    min_cluster_size = np.min(cluster_sizes) if cluster_sizes else 0
    
    mlflow.log_metrics({
        "n_clusters": n_clusters,
        "n_noise_points": n_noise,
        "mean_cluster_size": mean_cluster_size,
        "max_cluster_size": max_cluster_size,
        "min_cluster_size": min_cluster_size
    })
    dir_path = MODEL_PATH / partition_key
    dir_path.mkdir(parents=True, exist_ok=True)
    
    model_file = dir_path / f"hdbscan.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump(clusterer, f)
    print(f"✓ Modelo salvo: {model_file}")

✓ Modelo salvo: ..\models\recife\hdbscan.pkl


In [40]:
violent_crimes_df.to_csv(OUTPUT_PATH / "violent_crimes_chicago.csv")

In [41]:
for hotspot_id in violent_crimes_df["hotspot_id"].dropna().unique():
    if hotspot_id == -1:
        continue
    try:
        with mlflow.start_run(run_name=f"hotspot_{hotspot_id}"):
            print(f"\n{'='*50}")
            print(f"Processando Hotspot: {hotspot_id}")
            print(f"{'='*50}")
            
            # Filtra dados do hotspot
            df_hotspot = violent_crimes_df[violent_crimes_df["hotspot_id"] == hotspot_id]
            print(f"Total de registros: {len(df_hotspot)}")
            
            # Agrega por data (contagem diária)
            ts = df_hotspot.groupby("data_ocorrencia").size().reset_index(name="y")
            ts["unique_id"] = str(hotspot_id)
            ts = ts.rename(columns={"data_ocorrencia": "ds"})
            print(f"Dias únicos na série temporal: {len(ts)}")
            
            # Verificar se há dados suficientes
            if len(ts) < 14:  # Mínimo para treino/teste
                print(f"⚠️ Hotspot {hotspot_id} tem poucos dados ({len(ts)} dias). Pulando...")
                mlflow.log_param("status", "skipped_insufficient_data")
                continue
            
            # Separar treino/teste
            train_size = int(len(ts) * 0.8)
            train, test = ts.iloc[:train_size], ts.iloc[train_size:]
            print(f"Treino: {len(train)} dias | Teste: {len(test)} dias")
            
            # Treinar AutoARIMA
            print("Treinando modelo AutoARIMA...")
            sf = StatsForecast(models=[AutoARIMA(season_length=7)], freq="D", n_jobs=-1)
            fcst_df = sf.forecast(df=train, h=len(test), fitted=True)
            print("✓ Modelo treinado")
            
            # Pegar previsão correta
            y_pred = fcst_df.drop(columns=["unique_id","ds"]).iloc[:,0].values
            y_true = test['y'].values
            
            # Métricas
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            
            print(f"MAE: {mae:.2f} | RMSE: {rmse:.2f}")
            
            # Salvar métricas no MLflow
            mlflow.log_metrics({
                "MAE": mae,
                "RMSE": rmse,
                "train_size": len(train),
                "test_size": len(test)
            })
            
            # Logar parâmetros básicos
            mlflow.log_params({
                "model": "AutoARIMA",
                "season_length": 7,
                "freq": "D",
                "hotspot_id": str(hotspot_id)
            })
            
            # Salvar previsões como artifact
            forecast_file = f"forecast_hotspot_{hotspot_id}.csv"
            fcst_df.to_csv(forecast_file, index=False)
            mlflow.log_artifact(forecast_file)
            print(f"✓ Artifact salvo: {forecast_file}")
            
            # Salvar o StatsForecast completo
            dir_path = MODEL_PATH / partition_key
            dir_path.mkdir(parents=True, exist_ok=True)
            
            model_file = dir_path / f"{hotspot_id}_statsforecast.pkl"
            
            with open(model_file, 'wb') as f:
                pickle.dump(sf, f)
            
            print(f"✓ Modelo salvo: {model_file}")
            
            print(f"✅ Hotspot {hotspot_id} concluído com sucesso!\n")
            
    except Exception as e:
        print(f"❌ ERRO ao processar hotspot {hotspot_id}: {str(e)}")
        import traceback
        traceback.print_exc()
        
        # Tentar logar o erro no MLflow
        try:
            mlflow.log_param("status", "failed")
            mlflow.log_param("error", str(e)[:250])  # MLflow tem limite de chars
        except:
            pass
        
        continue  # Continua para o próximo hotspot


Processando Hotspot: 1.0
Total de registros: 1697
Dias únicos na série temporal: 1697
Treino: 1357 dias | Teste: 340 dias
Treinando modelo AutoARIMA...
✓ Modelo treinado
MAE: 0.00 | RMSE: 0.00
✓ Artifact salvo: forecast_hotspot_1.0.csv
✓ Modelo salvo: ..\models\recife\1.0_statsforecast.pkl
✅ Hotspot 1.0 concluído com sucesso!


Processando Hotspot: 0.0
Total de registros: 1278
Dias únicos na série temporal: 1278
Treino: 1022 dias | Teste: 256 dias
Treinando modelo AutoARIMA...
✓ Modelo treinado
MAE: 0.00 | RMSE: 0.00
✓ Artifact salvo: forecast_hotspot_0.0.csv
✓ Modelo salvo: ..\models\recife\0.0_statsforecast.pkl
✅ Hotspot 0.0 concluído com sucesso!

