# Load and Verify Saved Model
Load the previously saved model, feature list, and metrics using joblib. Refit the model on the current data and compare its performance against the saved baseline metrics.

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer

# Locations of the saved artifacts and the raw data. The project root is
# spelled once so the four paths cannot drift apart. `model_path` and
# `metrics_path` are reused by the save cell further down.
project_root = "/Users/chetan/Documents/GitHub/nj_transit"
model_path = f"{project_root}/models/delay_predictor.joblib"
features_path = f"{project_root}/models/features_list.joblib"
metrics_path = f"{project_root}/models/metrics.joblib"
data_path = f"{project_root}/data/data.csv"

try:
    # Load the estimator, the ordered list of feature column names, and the
    # previously recorded metrics (a mapping with 'MAE' and 'RMSE' keys).
    print("Loading saved model and features...")
    model = joblib.load(model_path)
    features = joblib.load(features_path)
    baseline_metrics = joblib.load(metrics_path)

    print("Loading data...")
    df = pd.read_csv(data_path)

    # Preprocess data
    print("Preprocessing data...")
    # Unparseable timestamps become NaT and are dropped just below.
    df['scheduled_time'] = pd.to_datetime(df['scheduled_time'], errors='coerce')
    df['actual_time'] = pd.to_datetime(df['actual_time'], errors='coerce')
    # No coercion here: a malformed `date` value raises instead of becoming NaT.
    df['date'] = pd.to_datetime(df['date'])

    # Drop rows whose scheduled or actual time could not be parsed.
    df = df.dropna(subset=['scheduled_time', 'actual_time'])

    # Calendar features derived from the scheduled time.
    df['hour_of_day'] = df['scheduled_time'].dt.hour
    df['day_of_week'] = df['scheduled_time'].dt.dayofweek
    df['month'] = df['scheduled_time'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_rush_hour'] = df['hour_of_day'].isin([7, 8, 9, 16, 17, 18, 19]).astype(int)

    # Handle missing values: a missing delay is treated as 0 minutes, and a
    # missing station id is replaced by the sentinel -1.
    df['delay_minutes'] = df['delay_minutes'].fillna(0)
    df['from_id'] = df['from_id'].fillna(-1)
    df['to_id'] = df['to_id'].fillna(-1)

    # Remove outliers: keep delays within 1.5 * IQR of the quartiles.
    q1 = df['delay_minutes'].quantile(0.25)
    q3 = df['delay_minutes'].quantile(0.75)
    iqr = q3 - q1
    df = df[df['delay_minutes'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

    # One-hot encode the categoricals in a single pass. The dummy columns are
    # appended in the listed order, the same as three sequential calls.
    categorical_columns = ['line', 'type', 'status']
    df = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns)

    # Ensure all expected features exist. Categories absent from this data
    # produce no dummy column, so add them as all-zero columns. A single
    # reindex avoids fragmenting the frame with one insert per column.
    missing_features = list(
        dict.fromkeys(name for name in features if name not in df.columns)
    )
    if missing_features:
        df = df.reindex(columns=[*df.columns, *missing_features], fill_value=0)

    # Prepare features (in the saved column order) and target.
    X = df[features]
    y = df['delay_minutes']

    # Replace any remaining NaNs in the feature matrix with the column mean.
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X),
        columns=X.columns,
        index=X.index
    )

    print("Further training model...")
    # NOTE(review): presumably `model` is a scikit-learn estimator. If so,
    # fit() refits from scratch on this data rather than continuing from the
    # saved state, unless the estimator supports warm_start/partial_fit.
    # Confirm the intended behaviour.
    model.fit(X_imputed, y)

    # NOTE(review): predictions are made on the same rows the model was just
    # fitted on, so these are in-sample errors. If the saved baseline was
    # computed on held-out data, the two sets of numbers are not directly
    # comparable. Confirm.
    y_pred = model.predict(X_imputed)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print("\nModel Performance:")
    print(f"Previous MAE: {baseline_metrics['MAE']:.2f} minutes")
    print(f"Updated MAE: {mae:.2f} minutes")
    print(f"Previous RMSE: {baseline_metrics['RMSE']:.2f} minutes")
    print(f"Updated RMSE: {rmse:.2f} minutes")

except Exception as e:
    print(f"Error occurred: {str(e)}")
    # Re-raise so execution stops here. Swallowing the error would let a
    # later cell save a model or metrics left over from a failed or partial
    # run.
    raise

Loading saved model and features...
Loading data...
Preprocessing data...
Further training model...


In [None]:
   # Save updated model and metrics
# Write each artifact back to the path it was loaded from, overwriting the
# previous model and metrics files.
print("\nSaving updated model...")
refreshed_metrics = {'MAE': mae, 'RMSE': rmse}
for artifact, destination in ((model, model_path), (refreshed_metrics, metrics_path)):
    joblib.dump(artifact, destination)
print("Model updated successfully!")