In [None]:
import pandas as pd
import numpy as np
import os

# 1. Read the cleaned collision records and lower-case every column label
df = pd.read_csv("../data/cleaned_data.csv")
df.columns = [label.lower() for label in df.columns]

# 2. County: use the municipality where county is missing, then remove the
#    ' län' suffix and any surrounding whitespace
df['county'] = (
    df['county']
    .fillna(df['municipality'])
    .str.replace(' län', '', regex=False)
    .str.strip()
)

# 3. Parse timestamps from the 'time' column alone (per the original author it
#    already holds the full datetime string); values that cannot be parsed
#    are coerced to NaT instead of raising
df['datetime'] = pd.to_datetime(df['time'], errors='coerce')

# 4. Derive the calendar month and hour-of-day from the parsed timestamp
for part in ('month', 'hour'):
    df[part] = getattr(df['datetime'].dt, part)

# 5. Keep only the rows in which every grouping column is present and is not
#    a blank / whitespace-only string
group_cols = ['county', 'species', 'month', 'hour']
keep = np.ones(len(df), dtype=bool)
for name in group_cols:
    column = df[name]
    has_value = column.notna()
    not_blank = column.astype(str).str.strip().ne('')
    keep &= (has_value & not_blank).to_numpy()
df_clean = df[keep].copy()

# Debug output: row counts before/after, plus per-column counts of missing
# values and of blank strings in the unfiltered frame
missing_counts = df[group_cols].isna().sum()
blank_counts = df[group_cols].astype(str).apply(lambda x: x.str.strip().eq('').sum())
print("✅ Total rows:", len(df))
print("✅ Rows after cleaning:", len(df_clean))
print("❗Missing per column:\n", missing_counts)
print("❗Empty strings per column:\n", blank_counts)

# 6. Count collisions in each (county, species, month, hour) cell
cell_counts = df_clean.groupby(group_cols).size()
agg = cell_counts.reset_index(name='n_collisions')


# 7. Min-max scale the counts within each county/species pair
def _scale_within_group(counts):
    """Return min-max scaled counts for one county/species group.

    The 1e-9 added to the denominator avoids a division by zero when every
    count in the group is identical; such a group scores 0 throughout.
    """
    low = counts.min()
    spread = counts.max() - low
    return (counts - low) / (spread + 1e-9)


per_pair_counts = agg.groupby(['county', 'species'])['n_collisions']
agg['risk_score'] = per_pair_counts.transform(_scale_within_group)

# 8. Fallback: All species
# Start again from the unfiltered frame so rows are only required to have a
# county, month and hour (the species column is not checked here).
required = ['county', 'month', 'hour']
subset = df[required]
has_value = subset.notna().all(axis=1)
not_blank = subset.astype(str).apply(lambda c: c.str.strip() != '').all(axis=1)
df_all = df.copy()
df_all = df_all[(has_value & not_blank).to_numpy()]

# Overwrite species with a single label and count rows per cell
tagged = df_all.assign(species='All species')
all_counts = tagged.groupby(['county', 'species', 'month', 'hour']).size()
agg_all = all_counts.reset_index(name='n_collisions')

# Same min-max scaling as the per-species table: scaled within each
# county/species pair, with 1e-9 guarding against a zero denominator
pair_counts = agg_all.groupby(['county', 'species'])['n_collisions']
agg_all['risk_score'] = pair_counts.transform(
    lambda grp: (grp - grp.min()) / (grp.max() - grp.min() + 1e-9)
)

# 9. Combine and export
# Stack the per-species rows and the 'All species' fallback rows into one table.
baseline = pd.concat([agg, agg_all], ignore_index=True)

# month/hour are float64 whenever any timestamp was coerced to NaT (a NaN
# forces a float dtype). groupby drops NaN keys, so every value that reaches
# this point is a whole number; cast back to int so that `float_format` below
# does not write the keys as "1.0000" / "13.0000".
for key in ('month', 'hour'):
    baseline[key] = baseline[key].astype(int)

# Round risk_score to 4 decimals
baseline['risk_score'] = baseline['risk_score'].round(4)

# Create the output directory if needed, then write the table without the
# index; float_format now only affects the genuinely fractional risk_score.
os.makedirs("../data/processed", exist_ok=True)
baseline.to_csv("../data/processed/baseline_risk.csv", index=False, float_format="%.4f")
print("✅ Saved data/processed/baseline_risk.csv")