In [None]:
import os
import json
import joblib
import warnings
warnings.filterwarnings("ignore")

import pandas as pd, numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

# Try optional imports
try:
    import xgboost as xgb
    has_xgb = True
except Exception:
    has_xgb = False

try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
    has_tf = True
except Exception:
    has_tf = False

# Dataset location (an absolute path); edit it if the CSV lives elsewhere.
DATA_PATH = "/content/bangalore_parking_6months.csv"
# Raise explicitly instead of using `assert`, which is skipped under `python -O`.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

# 1) Load and basic preprocessing
df = pd.read_csv(DATA_PATH)
df['timestamp'] = pd.to_datetime(df['timestamp'])  # explicit datetime conversion
# Order rows chronologically inside each location so that shift()/rolling()
# below step backwards in time; the fresh 0..n-1 index keeps the row order and
# the index labels in agreement.
df = df.sort_values(["loc_id","timestamp"]).reset_index(drop=True)

# time features
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
df['date'] = df['timestamp'].dt.date

target_col = 'occupancy_rate'  # between 0 and 1

# 2) Create lag features (in 15-min steps)
lag_steps = [1,4,96]  # 15min, 1h, 24h
for lag in lag_steps:
    df[f'lag_{lag}'] = df.groupby('loc_id')[target_col].shift(lag)

# Rolling statistics over the previous observations of the SAME location.
# Both shift() and rolling() run inside transform(), i.e. once per loc_id group,
# so a window never reaches back into the rows of the preceding location and
# the result comes back aligned on df's own index.
df['roll_mean_4'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=4, min_periods=1).mean())
df['roll_mean_96'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=96, min_periods=1).mean())

# Fill missing lag/rolls with loc mean (the first rows of each location have
# no history to look back on)
df['loc_mean'] = df.groupby('loc_id')[target_col].transform('mean')
for c in [f'lag_{l}' for l in lag_steps] + ['roll_mean_4','roll_mean_96']:
    df[c] = df[c].fillna(df['loc_mean'])


# Map every loc_id to an integer code; the fitted encoder is kept in `le`
# so the same mapping can be persisted and reused.
le = LabelEncoder().fit(df['loc_id'])
df['loc_enc'] = le.transform(df['loc_id'])

# Input columns of the tree model (order matters: it defines the column
# positions of the feature matrix).
feature_cols = [
    'loc_enc', 'capacity', 'hour', 'dayofweek', 'is_weekend',
    'lag_1', 'lag_4', 'lag_96', 'roll_mean_4', 'roll_mean_96',
    'rain', 'is_event',
]

# Remove rows that still miss the target or any feature, and report how many
# rows were discarded.
initial_rows = len(df)
df = df.dropna(subset=[target_col, *feature_cols])
if len(df) < initial_rows:
    print(f"Dropped {initial_rows - df.shape[0]} rows with NaNs after feature engineering.")

# 3) Time-based splits by calendar month: train = May, val = June, test = July
# NOTE(review): the filter looks at the month number only, so data spanning
# more than one year would mix years — confirm the dataset covers a single year.
df['month'] = df['timestamp'].dt.month
train_df = df[df['month']==5] # May
val_df = df[df['month']==6] # June
test_df = df[df['month']==7] # July

# Fail early with a clear message instead of an opaque error further down
# (scaler fitting / model training / metrics on zero rows).
empty_splits = [name for name, part in (("train", train_df), ("val", val_df), ("test", test_df))
                if part.empty]
if empty_splits:
    raise ValueError(
        "No rows found for split(s): " + ", ".join(empty_splits)
        + ". Check which months the dataset covers.")

X_train = train_df[feature_cols].values
y_train = train_df[target_col].values
X_val = val_df[feature_cols].values
y_val = val_df[target_col].values
X_test = test_df[feature_cols].values
y_test = test_df[target_col].values

# 4) Scale continuous features
# The scaler is fitted on the training split only; cont_idx holds the column
# positions of the continuous features inside feature_cols.
cont_cols = ['capacity','lag_1','lag_4','lag_96','roll_mean_4','roll_mean_96']
cont_idx = [feature_cols.index(c) for c in cont_cols]
scaler = StandardScaler()
scaler.fit(X_train[:, cont_idx])

def scale_X(X):
    """Return a float copy of ``X`` with the continuous columns standardized.

    The columns listed in the module-level ``cont_idx`` are passed through the
    module-level ``scaler``; all other columns are only cast to float. ``X``
    itself is left untouched.
    """
    scaled = X.astype(float, order="C")
    scaled[:, cont_idx] = scaler.transform(scaled[:, cont_idx])
    return scaled

X_train_s = scale_X(X_train)
X_val_s = scale_X(X_val)
X_test_s = scale_X(X_test)

# Save label encoder and scaler
os.makedirs("artifacts", exist_ok=True)
joblib.dump(le, "artifacts/loc_label_encoder.joblib")
joblib.dump(scaler, "artifacts/feature_scaler.joblib")

# 5) Train XGBoost (or fallback HGB)
if has_xgb:
    dtrain = xgb.DMatrix(X_train_s, label=y_train, feature_names=feature_cols)
    dval = xgb.DMatrix(X_val_s, label=y_val, feature_names=feature_cols)
    params = {
        "objective":"reg:squarederror",
        "eval_metric":"rmse",
        "tree_method":"hist",
        "learning_rate":0.1,
        "max_depth":8,
        "subsample":0.8,
        "colsample_bytree":0.8,
        "seed":42
    }
    # Early stopping watches the LAST entry of `evals`, i.e. the validation set.
    xgb_model = xgb.train(params, dtrain, num_boost_round=300,
                          evals=[(dtrain,'train'),(dval,'val')],
                          early_stopping_rounds=20, verbose_eval=20)
    xgb_model.save_model("artifacts/xgb_parking_model.json")
    # xgb.train() returns the model of the last round, and the native
    # Booster.predict() uses every tree unless told otherwise. Restrict the
    # prediction to the best round so the reported metrics describe the
    # early-stopped model. (0, 0) is the library default meaning "all trees".
    best_iteration = getattr(xgb_model, "best_iteration", None)
    iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)
    y_pred_xgb = xgb_model.predict(xgb.DMatrix(X_test_s, feature_names=feature_cols),
                                   iteration_range=iteration_range)
else:
    print("xgboost not found — using sklearn HistGradientBoostingRegressor fallback")
    hgb = HistGradientBoostingRegressor(max_iter=300, learning_rate=0.05, max_depth=10, random_state=42)
    hgb.fit(X_train_s, y_train)
    joblib.dump(hgb, "artifacts/hgb_parking_model.joblib")
    y_pred_xgb = hgb.predict(X_test_s)

# Evaluate XGB/HGB on the held-out test split
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost/HGB Results — MAE: {:.4f}, RMSE: {:.4f}, R2: {:.4f}".format(mae_xgb, rmse_xgb, r2_xgb))

# Save the test rows together with their predictions
test_out = test_df[['timestamp','loc_id','occupancy_rate']].copy()
test_out['pred_xgb'] = y_pred_xgb
test_out.to_csv("artifacts/test_predictions_xgb_sample.csv", index=False)

# Initialize LSTM metrics; they stay None unless the LSTM section computes them
mae_lstm = None
rmse_lstm = None
r2_lstm = None

# 6) LSTM pipeline (OPTIONAL — requires TensorFlow)
if not has_tf:
    print("TensorFlow not installed. Skipping LSTM training. To run the LSTM, install tensorflow and re-run.")
else:
    # We'll train a demo LSTM on a subset of locations (to keep runtime reasonable)
    sample_locs = df['loc_id'].unique()[:8]  # choose 8 locs
    lstm_df = df[df['loc_id'].isin(sample_locs)].copy()
    # Restrict to a 60-day period starting 2025-07-01 (end bound is inclusive,
    # hence the 15-minute step back).
    period_start = datetime(2025,7,1)
    period_end = period_start + pd.Timedelta(days=60) - pd.Timedelta(minutes=15)
    lstm_df = lstm_df[(lstm_df['timestamp']>=period_start) & (lstm_df['timestamp']<=period_end)]

    window = 96  # 24h
    # Sequences are split 80/10/10 in time order WITHIN each location, so
    # validation and test always come after the training part of the same
    # location instead of consisting of whichever locations happen to be last.
    X_tr_parts, X_va_parts, X_te_parts = [], [], []
    y_tr_parts, y_va_parts, y_te_parts = [], [], []
    for loc in lstm_df['loc_id'].unique():
        sub = lstm_df[lstm_df['loc_id']==loc].sort_values('timestamp')
        # Per-step inputs: [occupancy_rate, hour, is_weekend]
        features = np.column_stack([sub[target_col].values,
                                    sub['hour'].values,
                                    sub['is_weekend'].values])
        # Window i covers rows i..i+window-1 and predicts row i+window, so
        # there are exactly len(features) - window usable windows.
        n_windows = len(features) - window
        if n_windows <= 0:
            continue  # not enough history for a single window at this location
        X_loc = np.stack([features[i:i+window] for i in range(n_windows)])
        y_loc = features[window:, 0]
        loc_train_n = int(n_windows*0.8)
        loc_val_n = int(n_windows*0.1)
        X_tr_parts.append(X_loc[:loc_train_n])
        y_tr_parts.append(y_loc[:loc_train_n])
        X_va_parts.append(X_loc[loc_train_n:loc_train_n+loc_val_n])
        y_va_parts.append(y_loc[loc_train_n:loc_train_n+loc_val_n])
        X_te_parts.append(X_loc[loc_train_n+loc_val_n:])
        y_te_parts.append(y_loc[loc_train_n+loc_val_n:])

    lstm_data_ok = bool(X_tr_parts)
    if lstm_data_ok:
        X_tr, y_tr = np.concatenate(X_tr_parts), np.concatenate(y_tr_parts)
        X_va, y_va = np.concatenate(X_va_parts), np.concatenate(y_va_parts)
        X_te, y_te = np.concatenate(X_te_parts), np.concatenate(y_te_parts)
        lstm_data_ok = min(len(X_tr), len(X_va), len(X_te)) > 0

    if not lstm_data_ok:
        # Leave the LSTM metrics at None instead of crashing on empty arrays.
        print("Not enough data in the selected period to build LSTM sequences. Skipping LSTM training.")
    else:
        tf.random.set_seed(42)
        model = Sequential([
            LSTM(64, input_shape=(window, X_tr.shape[-1]), return_sequences=False),
            BatchNormalization(),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dense(1, activation='linear')
        ])
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        model.summary()

        # Train (adjust epochs/batch_size to your GPU/CPU resources)
        history = model.fit(X_tr, y_tr, validation_data=(X_va,y_va), epochs=6, batch_size=128, verbose=2)

        y_pred_lstm = model.predict(X_te).flatten()
        mae_lstm = mean_absolute_error(y_te, y_pred_lstm)
        rmse_lstm = np.sqrt(mean_squared_error(y_te, y_pred_lstm))
        r2_lstm = r2_score(y_te, y_pred_lstm)
        print("LSTM demo results — MAE: {:.4f}, RMSE: {:.4f}, R2: {:.4f}".format(mae_lstm, rmse_lstm, r2_lstm))

        model.save("artifacts/lstm_parking_demo_model.keras")
        # Keep only the first 200 test points as a small inspection sample
        pd.DataFrame({"y_true":y_te[:200], "y_pred":y_pred_lstm[:200]}).to_csv("artifacts/lstm_test_sample.csv", index=False)

# 7) Save run summary
# An LSTM metric is exported as a plain float only when TensorFlow was
# available and the metric was actually computed; otherwise it is written
# as JSON null.
summary_lstm_mae = float(mae_lstm) if has_tf and mae_lstm is not None else None
summary_lstm_rmse = float(rmse_lstm) if has_tf and rmse_lstm is not None else None
summary_lstm_r2 = float(r2_lstm) if has_tf and r2_lstm is not None else None

# float() turns the metric values into built-in floats before serialization.
summary = {
    "xgb": {"mae": float(mae_xgb), "rmse": float(rmse_xgb), "r2": float(r2_xgb)},
    "lstm_demo": {
        "mae": summary_lstm_mae,
        "rmse": summary_lstm_rmse,
        "r2": summary_lstm_r2,
    },
}

with open("artifacts/run_summary.json", "w") as f:
    f.write(json.dumps(summary, indent=2))

print("Artifacts saved to artifacts/ directory.")

In [None]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from tensorflow.keras.models import load_model
from datetime import datetime, timedelta

# Coordinates of the point of interest ("Cool Joint")
COOL_JOINT_LAT = 12.9289
COOL_JOINT_LON = 77.5850

# Paths to saved models/scalers
XGB_MODEL_PATH = "artifacts/xgb_parking_model.json"
SCALER_PATH = "artifacts/feature_scaler.joblib"
LABEL_ENCODER_PATH = "artifacts/loc_label_encoder.joblib"

LSTM_MODEL_PATH = "artifacts/lstm_parking_demo_model.keras"           # native Keras format (.keras file)
# or: LSTM_MODEL_PATH = "artifacts/lstm_parking_model.h5" if .h5

# Your data
DATA_PATH = "bangalore_parking_6months.csv"

df = pd.read_csv(DATA_PATH)
df['timestamp'] = pd.to_datetime(df['timestamp'])  # explicit datetime conversion
# Order rows chronologically inside each location and renumber the index so
# that row order and index labels agree.
df = df.sort_values(["loc_id", "timestamp"]).reset_index(drop=True)

# --- Re-create feature engineering for df ---
target_col = 'occupancy_rate'

# time features
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
df['date'] = df['timestamp'].dt.date

# Create lag features (in 15-min steps)
lag_steps = [1,4,96]  # 15min, 1h, 24h
for lag in lag_steps:
    df[f'lag_{lag}'] = df.groupby('loc_id')[target_col].shift(lag)

# Rolling statistics over the previous observations of the SAME location.
# transform() runs shift()+rolling() once per loc_id group and returns a
# Series aligned on df's index, so values neither leak across locations nor
# get attached to the wrong rows.
df['roll_mean_4'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=4, min_periods=1).mean())
df['roll_mean_96'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=96, min_periods=1).mean())

# Fill missing lag/rolls with loc mean
df['loc_mean'] = df.groupby('loc_id')[target_col].transform('mean')
for c in [f'lag_{l}' for l in lag_steps] + ['roll_mean_4','roll_mean_96']:
    df[c] = df[c].fillna(df['loc_mean'])
# --- End re-create feature engineering ---

# Restore the persisted tree model together with the scaler and label encoder.
xgb_model = XGBRegressor()
xgb_model.load_model(XGB_MODEL_PATH)

scaler = joblib.load(SCALER_PATH)
le = joblib.load(LABEL_ENCODER_PATH)

# The LSTM is optional: if it cannot be loaded, report why and carry on.
try:
    lstm_model = load_model(LSTM_MODEL_PATH)
except Exception as e:
    print("LSTM model not loaded:", e)
    lstm_model = None
has_lstm = lstm_model is not None

# Distance from every location to the point of interest, measured as a plain
# Euclidean distance on the raw lat/lon values (an approximation).
locs = df.groupby("loc_id")[["lat","lon"]].first().reset_index()
d_lat = locs["lat"] - COOL_JOINT_LAT
d_lon = locs["lon"] - COOL_JOINT_LON
locs["dist"] = np.sqrt(d_lat**2 + d_lon**2)
nearest = locs.sort_values("dist").iloc[0]
loc_id = nearest["loc_id"]
print("Nearest synthetic loc_id to Cool Joint:", loc_id, "with distance:", nearest["dist"])

# History of the selected location only
loc_df = df[df["loc_id"] == loc_id].copy().reset_index(drop=True)

# The model input is the LAST row of that history; its lag/rolling features
# were derived from the rows before it, so the estimate refers to that row's
# own timestamp.
latest = loc_df.iloc[-1].copy()
latest_ts = latest["timestamp"]
latest_hour, latest_dow = latest_ts.hour, latest_ts.weekday()
latest["hour"], latest["dayofweek"] = latest_hour, latest_dow
latest["is_weekend"] = int(latest_dow >= 5)

# Same column order as in the training cell
feat_cols = [
    'loc_enc', 'capacity', 'hour', 'dayofweek', 'is_weekend',
    'lag_1', 'lag_4', 'lag_96', 'roll_mean_4', 'roll_mean_96',
    'rain', 'is_event',
]

# Integer code of the location, from the persisted encoder
latest["loc_enc"] = le.transform([latest["loc_id"]])[0]

xgb_input = latest[feat_cols].to_numpy(dtype=float).reshape(1, -1)
# Standardize the continuous columns with the persisted scaler
# (capacity, lag_* and roll_mean_* — the columns the scaler was fitted on).
cont_idx = [feat_cols.index(c) for c in ['capacity','lag_1','lag_4','lag_96','roll_mean_4','roll_mean_96']]
xgb_input[:, cont_idx] = scaler.transform(xgb_input[:, cont_idx])

pred_xgb = xgb_model.predict(xgb_input)[0]

pred_lstm = None
if has_lstm:
    # Input sequence: the most recent `window` rows of the location with the
    # columns [occupancy_rate, hour, is_weekend], fed as raw (unscaled) values.
    window = 96  # 24h if using 15-min steps
    recent = loc_df[["occupancy_rate", "hour", "is_weekend"]].tail(window)
    if len(recent) < window:
        raise ValueError("Not enough history for LSTM prediction")

    # Add the leading batch axis: (1, window, n_features)
    features = recent.to_numpy(dtype=float)[np.newaxis, :, :]
    pred_lstm = lstm_model.predict(features)[0][0]

# Report
print("Prediction for Cool Joint (approx) at coordinates:", COOL_JOINT_LAT, COOL_JOINT_LON)
print("Using synthetic location:", loc_id)
print("XGBoost prediction (occupancy_rate):", round(pred_xgb, 3))
if has_lstm:
    print("LSTM prediction (occupancy_rate):", round(pred_lstm, 3))

Nearest synthetic loc_id to Cool Joint: L002 with distance: 0.0012899352696991244
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
Prediction for Cool Joint (approx) at coordinates: 12.9289 77.585
Using synthetic location: L002
XGBoost prediction (occupancy_rate): 0.658
LSTM prediction (occupancy_rate): 0.727


In [None]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from tensorflow.keras.models import load_model
from datetime import datetime

# Coordinates of the point of interest (Clarence Public School)
CLARENCE_LAT = 12.9250  # <— approximate latitude
CLARENCE_LON = 77.5760  # <— approximate longitude

# Paths to your saved models / scaler
XGB_MODEL_PATH = "artifacts/xgb_parking_model.json"
SCALER_PATH = "artifacts/feature_scaler.joblib"
LABEL_ENCODER_PATH = "artifacts/loc_label_encoder.joblib"

LSTM_MODEL_PATH = "artifacts/lstm_parking_demo_model.keras"  # or .h5 path, as per your setup

# Data
DATA_PATH = "bangalore_parking_6months.csv"

df = pd.read_csv(DATA_PATH, parse_dates=["timestamp"])
# Order rows chronologically inside each location and renumber the index so
# that row order and index labels agree.
df = df.sort_values(["loc_id","timestamp"]).reset_index(drop=True)

# Re-create feature engineering for df
target_col = 'occupancy_rate'

df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
df['date'] = df['timestamp'].dt.date

lag_steps = [1,4,96]  # 15min, 1h, 24h
for lag in lag_steps:
    df[f'lag_{lag}'] = df.groupby('loc_id')[target_col].shift(lag)

# Rolling means over the previous observations of the SAME location.
# transform() runs shift()+rolling() once per loc_id group and returns a
# Series aligned on df's index, so values neither leak across locations nor
# get attached to the wrong rows.
df['roll_mean_4'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=4, min_periods=1).mean())
df['roll_mean_96'] = df.groupby('loc_id')[target_col].transform(
    lambda s: s.shift(1).rolling(window=96, min_periods=1).mean())

# Rows without enough history fall back to the location's mean occupancy
df['loc_mean'] = df.groupby('loc_id')[target_col].transform('mean')
for c in [f'lag_{l}' for l in lag_steps] + ['roll_mean_4','roll_mean_96']:
    df[c] = df[c].fillna(df['loc_mean'])

# Restore the persisted tree model, scaler and label encoder
xgb_model = XGBRegressor()
xgb_model.load_model(XGB_MODEL_PATH)

scaler = joblib.load(SCALER_PATH)
le = joblib.load(LABEL_ENCODER_PATH)

# The LSTM is optional: on any loading error, report it and continue without it.
try:
    lstm_model = load_model(LSTM_MODEL_PATH)
except Exception as e:
    print("Could not load LSTM:", e)
    has_lstm = False
else:
    has_lstm = True

# One row per location, then the plain Euclidean distance on raw lat/lon
# values to the point of interest (an approximation).
locs = df.groupby("loc_id")[["lat", "lon"]].first().reset_index()
lat_offset = locs["lat"] - CLARENCE_LAT
lon_offset = locs["lon"] - CLARENCE_LON
locs["dist"] = np.sqrt(lat_offset**2 + lon_offset**2)
nearest = locs.sort_values("dist").iloc[0]
loc_id = nearest["loc_id"]
print("Nearest synthetic loc_id to Clarence Public School:", loc_id, "distance:", nearest["dist"])

# History of the selected location only
loc_df = df[df["loc_id"] == loc_id].copy().reset_index(drop=True)

# The model input is the LAST row of that history; its lag/rolling features
# were derived from the rows before it, so the estimate refers to that row's
# own timestamp.
latest = loc_df.iloc[-1].copy()
latest_ts = latest["timestamp"]
latest["hour"] = latest_ts.hour
latest["dayofweek"] = latest_ts.dayofweek
latest["is_weekend"] = int(latest["dayofweek"] >= 5)

# Feature columns — should match training features
feat_cols = [
    'loc_enc', 'capacity', 'hour', 'dayofweek', 'is_weekend',
    'lag_1', 'lag_4', 'lag_96', 'roll_mean_4', 'roll_mean_96',
    'rain', 'is_event',
]

# Integer code of the location, from the persisted encoder
latest["loc_enc"] = le.transform([latest["loc_id"]])[0]

xgb_input = latest[feat_cols].to_numpy(dtype=float).reshape(1, -1)

# Standardize the continuous columns with the persisted scaler
cont_cols = ['capacity','lag_1','lag_4','lag_96','roll_mean_4','roll_mean_96']
cont_idx = [feat_cols.index(c) for c in cont_cols]
xgb_input[:, cont_idx] = scaler.transform(xgb_input[:, cont_idx])

pred_xgb = xgb_model.predict(xgb_input)[0]

pred_lstm = None
if has_lstm:
    window = 96  # same window used in training
    # Most recent `window` rows of the location
    history = loc_df.iloc[-window:]
    if len(history) < window:
        raise ValueError("Not enough data for LSTM sequence")

    # Per-step inputs [occupancy_rate, hour, is_weekend] as raw values, with a
    # leading batch axis: (1, window, n_features)
    features = np.column_stack([
        history["occupancy_rate"].values,
        history["hour"].values,
        history["is_weekend"].values,
    ])
    features = features[np.newaxis, :, :]

    pred_lstm = lstm_model.predict(features)[0][0]

# Report
print("Predicted occupancy for Clarence Public School (JP Nagar 4th Phase)")
print("Synthetic loc_id used:", loc_id)
print(f"XGBoost occupancy_rate: {pred_xgb:.3f}")
if has_lstm:
    print(f"LSTM occupancy_rate: {pred_lstm:.3f}")

Nearest synthetic loc_id to Clarence Public School: L004 distance: 0.008076448786450918
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
Predicted occupancy for Clarence Public School (JP Nagar 4th Phase)
Synthetic loc_id used: L004
XGBoost occupancy_rate: 0.665
LSTM occupancy_rate: 0.684
