# Planning Approval Prediction Model Training

Run this notebook on Google Colab or Kaggle if your local machine lacks resources.

**Pre-requisites:**
1. Run `python scripts/export_training_data.py` locally to generate `planning_applications_training.csv`.
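2. Upload that CSV file to this notebook environment, keeping the filename `planning_applications_training.csv` and placing it in the notebook's working directory (the load cell reads it by that exact name).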

In [None]:
# Install dependencies
!pip install pandas xgboost scikit-learn joblib

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import joblib

In [None]:
# Load data
# The CSV is read from the notebook's working directory. On Colab, upload
# 'planning_applications_training.csv' via the Files sidebar first, or use the
# upload prompt this cell falls back to when the file is absent.
DATA_FILE = 'planning_applications_training.csv'

try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print("File not found! Please upload 'planning_applications_training.csv' using the Files tab on the left.")
    try:
        # google.colab is only importable on Colab. Elsewhere (Kaggle, local
        # Jupyter) report the real problem -- the missing CSV -- rather than
        # letting an unrelated ImportError escape from the error handler.
        from google.colab import files
    except ImportError:
        raise FileNotFoundError(
            f"'{DATA_FILE}' was not found in the working directory and the Colab "
            "upload helper is not available in this environment. Add the file to "
            "the notebook environment and re-run this cell."
        ) from None

    uploaded = files.upload()
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        # Reached when the upload was cancelled or the file has another name.
        raise FileNotFoundError(
            f"'{DATA_FILE}' is still missing after the upload step. "
            "Upload the file under exactly that name and re-run this cell."
        ) from None

# Report the row count on both the direct-read and the upload path.
print(f"Loaded {len(df)} records.")

In [None]:
# Define features and target
FEATURE_COLS = [
    "flood_zone",
    "in_conservation_area",
    "in_greenbelt",
    "in_article4_zone",
    "local_approval_rate",
    "avg_decision_time_days",
    "similar_applications_nearby",
    "avg_price_per_m2",
    "price_trend_24m",
    "epc_score",
]
TARGET_COL = "approved"

# Fail early with the full list of absent columns instead of a partial
# KeyError from the column selection further down.
missing_cols = [col for col in FEATURE_COLS + [TARGET_COL] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Training data is missing required columns: {missing_cols}")

# Handle missing values (simple strategy): every NaN becomes 0.
# This must run BEFORE the integer cast below, because astype(int) raises on
# a column that still contains NaN.
# NOTE(review): this also turns a missing `approved` label into 0 -- confirm
# that is intended, or drop such rows before training.
df = df.fillna(0)

# Cast the boolean flag columns to 0/1 integers so the model receives plain numerics.
bool_cols = ["in_conservation_area", "in_greenbelt", "in_article4_zone"]
for col in bool_cols:
    df[col] = df[col].astype(int)

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# Split data: 80% train / 20% test, seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train XGBoost Model
# `use_label_encoder` is intentionally not passed: it is deprecated in the
# XGBoost scikit-learn wrapper and current releases only warn that it is unused.
# `random_state` is pinned because subsample < 1 samples rows per tree; the
# explicit seed matches the one used for the train/test split.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    eval_metric='logloss',
    random_state=42,
)

model.fit(X_train, y_train)

# Evaluate
# AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba); the report uses the model's hard predictions.
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test AUC: {auc:.4f}")
print(classification_report(y_test, model.predict(X_test)))

In [None]:
# Save Model
# Persist the fitted classifier with joblib under a single named path.
MODEL_FILE = 'planning_model.pkl'
joblib.dump(model, MODEL_FILE)
print("Model saved as planning_model.pkl")

# Download the model file
# The browser download helper exists only on Colab; anywhere else the import
# fails and the user is pointed at the file browser instead.
try:
    from google.colab import files
    files.download(MODEL_FILE)
except ImportError:
    print("Download manually from the file browser if not using Colab.")