In [None]:
import duckdb
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

# =========================
# CONFIG
# =========================
# Parquet file read by the DuckDB query below (relative path).
PARQUET_PATH = "combined_data_26122025_2154.parquet"
# Column names: month of the record, crime category, and the name given to
# the per-group row count produced by the aggregation below.
TIME_COL = "Month"
CRIME_COL = "Crime type"
COUNT_COL = "count"

# Number of trailing observations held out for backtesting in plot_crime.
BACKTEST = 6
# NOTE(review): FORECAST is not referenced anywhere else in this cell.
FORECAST = 12
# Seasonal period: used for the seasonal-naive window and SARIMAX seasonal_order.
SEASONAL = 12

# =========================
# LOAD + PREP
# =========================
# Only the two columns needed for counting are pulled from the parquet file;
# every selected row is materialised as a pandas DataFrame.
df = duckdb.query(f"""
SELECT Month, "Crime type"
FROM read_parquet('{PARQUET_PATH}')
""").to_df()

# Month strings are parsed as "%Y-%m"; values that do not parse become NaT
# (errors="coerce") and are removed, together with any other missing value,
# by the dropna() that follows.
df[TIME_COL] = pd.to_datetime(df[TIME_COL], format="%Y-%m", errors="coerce")
df = df.dropna()

# One row per (month, crime type) holding the number of source rows in that
# group, ordered by crime type and then month. No zero-filling is done here,
# so a month with no rows for a crime type simply has no entry.
crime_ts = (
    df.groupby([TIME_COL, CRIME_COL])
      .size()
      .reset_index(name=COUNT_COL)
      .sort_values([CRIME_COL, TIME_COL])
)

# =========================
# FEATURE ENGINEERING
# =========================
def make_features(y):
    """Build the regression feature table for a series.

    Returns a DataFrame with the input values in ``y``, a 0-based position
    index in ``t`` and a sine/cosine pair with a fixed period of 12 steps in
    ``sin12`` / ``cos12``.
    """
    step = np.arange(len(y))
    angle = 2 * np.pi * step / 12
    return pd.DataFrame(
        {
            "y": y,
            "t": step,
            "sin12": np.sin(angle),
            "cos12": np.cos(angle),
        }
    )

def metrics(y, p):
    """Score predictions ``p`` against observations ``y``.

    Returns a dict with keys ``R2``, ``MAE`` and ``RMSE`` (the square root of
    the mean squared error).
    """
    r2 = r2_score(y, p)
    mae = mean_absolute_error(y, p)
    rmse = np.sqrt(mean_squared_error(y, p))
    return {"R2": r2, "MAE": mae, "RMSE": rmse}

def plot_crime(crime):
    """Template showing where the figure is saved as standalone HTML.

    Creates a Plotly figure titled for ``crime`` (no traces are added in this
    version), writes it to ``<crime with spaces as underscores>_forecast.html``
    as a relative path and prints the file name.

    NOTE: a second ``plot_crime`` is defined further down in this cell and
    rebinds the name, so this version is not the one the run-all loop calls.
    """
    figure = go.Figure()

    # Model traces (fig.add_trace calls) belong here, before the layout is set.

    layout_opts = {
        "title": f"{crime} — Interactive Multi-Model Forecast",
        "xaxis_title": "Month",
        "yaxis_title": "Crime Count",
        "hovermode": "x unified",
    }
    figure.update_layout(**layout_opts)

    # Saving happens at the end, once both the figure and the crime name exist.
    stem = crime.replace(' ', '_')
    filename = f"{stem}_forecast.html"
    figure.write_html(filename)

    print(f"[SAVED] {filename}")


# =========================
# RUN ONE CRIME (INTERACTIVE)
# =========================
def plot_crime(crime):
    """Backtest several forecasters on one crime type and display the figure.

    The last ``BACKTEST`` observations of the crime's monthly counts are held
    out. A seasonal-naive baseline, Linear / Ridge / gradient-boosting
    regressions on the ``make_features`` columns and (best effort) a SARIMA
    model are fitted on the remainder. Their hold-out predictions are drawn
    over the actual series, with each model's R2 in its legend entry.

    Args:
        crime: Value of ``CRIME_COL`` selecting the series in ``crime_ts``.

    Returns:
        None. The figure is displayed with ``fig.show()``. Series that are too
        short are skipped with a ``[SKIP]`` message.
    """
    d = crime_ts[crime_ts[CRIME_COL] == crime]
    # After removing the hold-out window the training part must still contain
    # one full season, otherwise the seasonal-naive window would be truncated.
    if len(d) < max(24, BACKTEST + SEASONAL):
        print(f"[SKIP] {crime} (too little data)")
        return

    y = d[COUNT_COL].values
    m = d[TIME_COL].values

    y_train, y_test = y[:-BACKTEST], y[-BACKTEST:]
    m_test = m[-BACKTEST:]

    fig = go.Figure()

    # Actual
    fig.add_trace(go.Scatter(
        x=m, y=y, name="Actual", line=dict(color="black")
    ))

    # Seasonal naive: repeat the last training season as often as needed to
    # cover the hold-out window (ceil division keeps this valid for any
    # BACKTEST / SEASONAL combination).
    reps = -(-BACKTEST // SEASONAL)
    naive = np.tile(y_train[-SEASONAL:], reps)[:BACKTEST]
    met = metrics(y_test, naive)
    fig.add_trace(go.Scatter(
        x=m_test, y=naive,
        name=f"Naive (R2={met['R2']:.2f})"
    ))

    # Linear + Ridge + GBR, all on the same trend/seasonality features
    feature_cols = ["t", "sin12", "cos12"]
    X = make_features(y)
    Xtr, Xte = X.iloc[:-BACKTEST], X.iloc[-BACKTEST:]

    regressors = {
        "Linear": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "GBR": GradientBoostingRegressor(random_state=42),
    }
    for name, model in regressors.items():
        model.fit(Xtr[feature_cols], Xtr["y"])
        pred = model.predict(Xte[feature_cols])
        met = metrics(y_test, pred)

        fig.add_trace(go.Scatter(
            x=m_test, y=pred,
            name=f"{name} (R2={met['R2']:.2f})"
        ))

    # SARIMA (optional): a failed fit must not abort the whole figure, but it
    # is reported instead of being silently dropped. Catching Exception (not a
    # bare except) keeps Ctrl-C / KeyboardInterrupt working during long fits.
    try:
        sar = SARIMAX(
            y_train,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, SEASONAL),
            enforce_stationarity=False,
            enforce_invertibility=False
        ).fit(disp=False)

        pred = np.asarray(sar.get_forecast(BACKTEST).predicted_mean)
        met = metrics(y_test, pred)
    except Exception as exc:
        print(f"[WARN] {crime}: SARIMA skipped ({exc})")
    else:
        fig.add_trace(go.Scatter(
            x=m_test, y=pred,
            name=f"SARIMA (R2={met['R2']:.2f})",
            line=dict(dash="dash")
        ))

    fig.update_layout(
        title=f"{crime} — Interactive Multi-Model Forecast",
        xaxis_title="Month",
        yaxis_title="Crime Count",
        hovermode="x unified"
    )

    fig.show()

# =========================
# RUN ALL
# =========================
# Call plot_crime once per distinct crime type, in sorted order.
for c in sorted(crime_ts[CRIME_COL].unique()):
    plot_crime(c)




KeyboardInterrupt: 

In [None]:
# ============================================================
# INTERACTIVE MULTI-MODEL CRIME FORECASTING (FULL SCRIPT)
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import duckdb
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from statsmodels.tsa.statespace.sarimax import SARIMAX

import plotly.graph_objects as go

# ============================================================
# CONFIG
# ============================================================

# Parquet file read by the DuckDB query below (relative path).
PARQUET_PATH = "combined_data_26122025_2154.parquet"

# Column names: month of the record, crime category, and the name given to
# the per-group row count produced in section 2.
TIME_COL   = "Month"
CRIME_COL  = "Crime type"
COUNT_COL  = "count"

# Trailing observations held out for backtesting, number of future months to
# forecast, and the seasonal period used by the naive and SARIMA models.
BACKTEST_MONTHS = 36
FORECAST_MONTHS = 12
SEASONAL_PERIOD = 12

# ============================================================
# 1) LOAD DATA
# ============================================================

print("Loading data...")

# Only the two columns needed for counting are pulled from the parquet file;
# every selected row is materialised as a pandas DataFrame.
df_inc = duckdb.query(f"""
    SELECT
        Month,
        "Crime type"
    FROM read_parquet('{PARQUET_PATH}')
""").to_df()

# Month strings are parsed as "%Y-%m"; values that do not parse become NaT
# (errors="coerce") and are removed, together with any other missing value,
# by the dropna() that follows.
df_inc[TIME_COL] = pd.to_datetime(df_inc[TIME_COL], format="%Y-%m", errors="coerce")
df_inc = df_inc.dropna()

print("Rows loaded:", len(df_inc))

# ============================================================
# 2) BUILD MONTHLY TIME SERIES
# ============================================================

# One row per (month, crime type) holding the number of source rows in that
# group, ordered by crime type and then month, with a fresh 0..n-1 index.
# No zero-filling is done here, so a month with no rows for a crime type
# simply has no entry.
crime_ts = (
    df_inc
    .groupby([TIME_COL, CRIME_COL])
    .size()
    .reset_index(name=COUNT_COL)
    .sort_values([CRIME_COL, TIME_COL])
    .reset_index(drop=True)
)

print("Crime types:", crime_ts[CRIME_COL].nunique())

# ============================================================
# 3) MODEL HELPERS
# ============================================================

def metrics(y, p):
    """Score predictions ``p`` against observations ``y``.

    Returns a dict with keys ``MAE``, ``RMSE`` (square root of the mean
    squared error) and ``R2``.
    """
    rmse = np.sqrt(mean_squared_error(y, p))
    scores = dict(
        MAE=mean_absolute_error(y, p),
        RMSE=rmse,
        R2=r2_score(y, p),
    )
    return scores

def seasonal_naive(train, horizon, period=None):
    """Forecast by repeating the last full season of ``train``.

    Args:
        train: 1-D sequence of past observations, oldest first.
        horizon: Number of steps to forecast.
        period: Season length in steps. Defaults to the module-level
            ``SEASONAL_PERIOD`` when omitted, which is the original behaviour.

    Returns:
        numpy array of length ``horizon`` holding the last ``period`` training
        values tiled as often as needed.

    Raises:
        ValueError: If ``period`` is not positive or ``train`` holds fewer
            than ``period`` observations. Without a full season the tiled
            values would be too few or would not line up with the season.
    """
    if period is None:
        period = SEASONAL_PERIOD
    if period < 1:
        raise ValueError(f"period must be a positive integer, got {period}")

    history = np.asarray(train)
    if len(history) < period:
        raise ValueError(
            f"seasonal_naive needs at least {period} observations, "
            f"got {len(history)}"
        )

    last = history[-period:]
    # A non-positive horizon yields an empty forecast rather than an error.
    reps = max(int(np.ceil(horizon / period)), 0)
    return np.tile(last, reps)[:max(horizon, 0)]

def fit_sarima(train):
    """Fit a SARIMA(1,1,1)x(1,1,1,SEASONAL_PERIOD) model to ``train``.

    Stationarity and invertibility are not enforced, and optimiser output is
    suppressed (``disp=False``). Returns the fitted results object.
    """
    spec = dict(
        order=(1, 1, 1),
        seasonal_order=(1, 1, 1, SEASONAL_PERIOD),
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    fitted = SARIMAX(train, **spec).fit(disp=False)
    return fitted

def make_features(ts):
    """Build the regression feature table for a series.

    Returns a DataFrame with the input values in ``y``, a 0-based position
    index in ``t`` and a sine/cosine pair with a fixed period of 12 steps in
    ``sin12`` / ``cos12``.
    """
    idx = np.arange(len(ts))
    phase = 2 * np.pi * idx / 12
    frame = pd.DataFrame({"y": ts})
    return frame.assign(t=idx, sin12=np.sin(phase), cos12=np.cos(phase))

# ============================================================
# 4) PLOT ONE CRIME (INTERACTIVE)
# ============================================================

def plot_crime(crime):
    """Backtest and forecast one crime type, then save an interactive plot.

    The last ``BACKTEST_MONTHS`` observations are held out. A seasonal-naive
    baseline, SARIMA, gradient boosting and linear regression are fitted on
    the remaining history and scored on the hold-out window. The same fitted
    models are then extrapolated ``FORECAST_MONTHS`` beyond the last observed
    month. Everything is drawn in one Plotly figure that is written to
    ``<crime with spaces as underscores>_<YYYYmmdd_HHMMSS>.html``.

    Args:
        crime: Value of ``CRIME_COL`` selecting the series in ``crime_ts``.

    Returns:
        None. Series that are too short are skipped with a ``[SKIP]``
        message. If SARIMA cannot be fitted, its traces are omitted and a
        ``[WARN]`` message is printed.
    """
    df = crime_ts[crime_ts[CRIME_COL] == crime]
    # After removing the hold-out window the training part must still contain
    # at least one full season; a fixed 24-row check is not enough when
    # BACKTEST_MONTHS is large (the training slice could even be empty).
    if len(df) < max(24, BACKTEST_MONTHS + SEASONAL_PERIOD):
        print(f"[SKIP] {crime} (too little data)")
        return

    ts = df[COUNT_COL].values
    months = df[TIME_COL].values

    train_ts = ts[:-BACKTEST_MONTHS]
    test_ts = ts[-BACKTEST_MONTHS:]
    test_months = months[-BACKTEST_MONTHS:]

    # Month starts following the last observed month (the start itself is
    # dropped by the [1:] slice).
    future_months = pd.date_range(
        start=months[-1],
        periods=FORECAST_MONTHS + 1,
        freq="MS"
    )[1:]

    # ---------------- Naive
    naive_pred = seasonal_naive(train_ts, BACKTEST_MONTHS)
    m_naive = metrics(test_ts, naive_pred)

    # ---------------- SARIMA
    # The model only knows the training slice, so step k of its forecast is
    # month k after the END OF TRAINING. The future forecast therefore has to
    # be taken from steps BACKTEST_MONTHS+1 .. BACKTEST_MONTHS+FORECAST_MONTHS;
    # the first FORECAST_MONTHS steps would be hold-out months, not the future.
    # One long forecast is computed and split into the two windows.
    sarima_pred = sarima_fore = m_sarima = None
    try:
        sarima = fit_sarima(train_ts)
        sarima_path = np.asarray(
            sarima.get_forecast(BACKTEST_MONTHS + FORECAST_MONTHS).predicted_mean
        )
        m_sarima = metrics(test_ts, sarima_path[:BACKTEST_MONTHS])
    except Exception as exc:
        # Best effort: one failing fit should not abort the run over all
        # crime types, but it must be visible.
        print(f"[WARN] {crime}: SARIMA failed ({exc}); SARIMA traces omitted")
    else:
        sarima_pred = sarima_path[:BACKTEST_MONTHS]
        sarima_fore = sarima_path[BACKTEST_MONTHS:]

    # ---------------- GBR + Linear Regression
    feature_cols = ["t", "sin12", "cos12"]
    feat_train = make_features(train_ts)
    X_train, y_train = feat_train[feature_cols], feat_train["y"]

    # Features depend only on the position index, so build them once for
    # history + future (the placeholder values in "y" are never used) and
    # slice out the hold-out and future windows.
    n_obs = len(ts)
    feat_horizon = make_features(np.zeros(n_obs + FORECAST_MONTHS))[feature_cols]
    X_test = feat_horizon.iloc[n_obs - BACKTEST_MONTHS:n_obs]
    X_future = feat_horizon.iloc[n_obs:]

    gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
    gbr_pred = gbr.predict(X_test)
    m_gbr = metrics(test_ts, gbr_pred)

    lr = LinearRegression().fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    m_lr = metrics(test_ts, lr_pred)

    # ---------------- Forecast future (models trained on the training slice)
    gbr_fore = gbr.predict(X_future)
    lr_fore = lr.predict(X_future)

    # ========================================================
    # INTERACTIVE PLOT
    # ========================================================

    fig = go.Figure()

    def add_line(x, y, name, **line):
        """Append one line trace; keyword arguments become the line style."""
        fig.add_trace(go.Scatter(x=x, y=y, mode="lines", name=name, line=line))

    add_line(months, ts, "Actual", color="black", width=2)

    # Backtest window
    add_line(test_months, naive_pred,
             f"Naive (R2={m_naive['R2']:.2f})", dash="dot")
    if sarima_pred is not None:
        add_line(test_months, sarima_pred,
                 f"SARIMA (R2={m_sarima['R2']:.2f})", dash="dash")
    add_line(test_months, gbr_pred,
             f"GBR (R2={m_gbr['R2']:.2f})", dash="dashdot")
    add_line(test_months, lr_pred,
             f"Linear (R2={m_lr['R2']:.2f})", dash="longdash")

    # Future window
    if sarima_fore is not None:
        add_line(future_months, sarima_fore, "SARIMA Forecast", color="red")
    add_line(future_months, gbr_fore, "GBR Forecast", color="green")
    add_line(future_months, lr_fore, "Linear Forecast", color="blue")

    fig.update_layout(
        title=f"{crime} — Interactive Multi-Model Forecast",
        xaxis_title="Month",
        yaxis_title="Crime Count",
        hovermode="x unified"
    )

    # ---------------- SAVE (standalone)
    # The timestamp keeps repeated runs from overwriting earlier files.
    ts_now = datetime.now().strftime("%Y%m%d_%H%M%S")
    fname = f"{crime.replace(' ', '_')}_{ts_now}.html"
    fig.write_html(fname)

    print(f"[SAVED] {fname}")

# ============================================================
# 5) RUN ALL CRIMES
# ============================================================

# Call plot_crime once per distinct crime type, in sorted order.
for c in sorted(crime_ts[CRIME_COL].unique()):
    plot_crime(c)

# Printed after the loop has gone through every crime type.
print("\nALL DONE.")
