# Import Required Libraries
Import huggingface_hub and other necessary libraries for model handling and API interactions.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load Data
print("Loading data...")
data_path = '/Users/chetan/Documents/GitHub/nj_transit/data/data.csv'
df = pd.read_csv(data_path)

# Step 2: Data Preprocessing
print("Preprocessing data...")

# Convert datetime columns
df['scheduled_time'] = pd.to_datetime(df['scheduled_time'], errors='coerce')
df['actual_time'] = pd.to_datetime(df['actual_time'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])

# Drop rows where time conversion failed
df = df.dropna(subset=['scheduled_time', 'actual_time'])

# Create basic features
df['hour_of_day'] = df['scheduled_time'].dt.hour
df['day_of_week'] = df['scheduled_time'].dt.dayofweek
df['month'] = df['scheduled_time'].dt.month
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_rush_hour'] = df['hour_of_day'].isin([7, 8, 9, 16, 17, 18, 19]).astype(int)

# Handle missing values
df['delay_minutes'] = df['delay_minutes'].fillna(0)
df['from_id'] = df['from_id'].fillna(-1)
df['to_id'] = df['to_id'].fillna(-1)

# Remove outliers
q1 = df['delay_minutes'].quantile(0.25)
q3 = df['delay_minutes'].quantile(0.75)
iqr = q3 - q1
df = df[df['delay_minutes'].between(q1 - 1.5*iqr, q3 + 1.5*iqr)]

# One-hot encode categorical variables
if 'line' in df.columns:
    df = pd.get_dummies(df, columns=['line'], prefix='line')
if 'type' in df.columns:
    df = pd.get_dummies(df, columns=['type'], prefix='type')
if 'status' in df.columns:
    df = pd.get_dummies(df, columns=['status'], prefix='status')

# Step 3: Prepare Features
print("Preparing features...")
features = [
    'hour_of_day', 'day_of_week', 'month', 
    'is_weekend', 'is_rush_hour',
    'from_id', 'to_id'
]

# Add one-hot encoded columns
features.extend([col for col in df.columns if col.startswith(('line_', 'type_', 'status_'))])

# Verify all features exist
features = [f for f in features if f in df.columns]
print(f"Using features: {features}")

X = df[features]
y = df['delay_minutes']

# Step 4: Split the data
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train model
print("Training model...")
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train, y_train)

# Step 6: Evaluate model
print("Evaluating model...")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"\nModel Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")

# Feature importance
importance_df = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

# Simple prediction function
def predict_delay(hour_of_day, day_of_week, from_id, to_id, month=None):
    if month is None:
        month = pd.Timestamp.now().month
        
    input_data = pd.DataFrame([{
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': 1 if day_of_week in [5, 6] else 0,
        'is_rush_hour': 1 if hour_of_day in [7, 8, 9, 16, 17, 18, 19] else 0,
        'from_id': from_id,
        'to_id': to_id
    }])
    
    # Add dummy columns for categorical variables
    for col in features:
        if col not in input_data.columns:
            input_data[col] = 0
            
    input_data = input_data[features]
    return model.predict(input_data)[0]

# Test prediction
print("\nTesting prediction...")
try:
    example_prediction = predict_delay(8, 1, 105, 107)  # Example IDs
    print(f"Predicted delay: {example_prediction:.2f} minutes")
except Exception as e:
    print(f"Error in example prediction: {e}")

print("\nProcess completed successfully!")

Loading data...
Preprocessing data...
Preparing features...
Using features: ['hour_of_day', 'day_of_week', 'month', 'is_weekend', 'is_rush_hour', 'from_id', 'to_id', 'line_Atl. City Line', 'line_Bergen Co. Line ', 'line_Gladstone Branch', 'line_Main Line', 'line_Montclair-Boonton', 'line_Morristown Line', 'line_No Jersey Coast', 'line_Northeast Corrdr', 'line_Pascack Valley', 'line_Princeton Shuttle', 'line_Raritan Valley', 'type_NJ Transit', 'status_cancelled', 'status_departed', 'status_estimated']
Splitting data...
Training model...
Evaluating model...

Model Performance:
Mean Absolute Error (MAE): 1.40 minutes
Root Mean Squared Error (RMSE): 1.86 minutes

Top 10 Most Important Features:
                   feature  importance
6                    to_id    0.194014
0              hour_of_day    0.183709
5                  from_id    0.179001
20         status_departed    0.129046
2                    month    0.076188
7      line_Atl. City Line    0.038401
1              day_of_week 

In [None]:

import os

# Save model locally
model_dir = '/Users/chetan/Documents/GitHub/nj_transit/models'
os.makedirs(model_dir, exist_ok=True)

# Save model and metadata
joblib.dump(model, os.path.join(model_dir, 'delay_predictor.joblib'))
joblib.dump(features, os.path.join(model_dir, 'features_list.joblib'))
joblib.dump({'MAE': mae, 'RMSE': rmse}, os.path.join(model_dir, 'metrics.joblib'))


['/Users/chetan/Documents/GitHub/nj_transit/models/metrics.joblib']

# Save Model to File
Save the trained model and necessary metadata (features list, performance metrics) to local files.

In [None]:
# Load model and metadata
model_path = "/Users/chetan/Documents/GitHub/nj_transit/models/delay_predictor.joblib"
features_path = "/Users/chetan/Documents/GitHub/nj_transit/models/features_list.joblib"
metrics_path = "/Users/chetan/Documents/GitHub/nj_transit/models/metrics.joblib"

loaded_model = joblib.load(model_path)
loaded_features = joblib.load(features_path)
loaded_metrics = joblib.load(metrics_path)

# Prediction function
def predict_delay(hour_of_day, day_of_week, from_id, to_id, month=None):
    if month is None:
        month = datetime.now().month
        
    input_data = pd.DataFrame([{
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': 1 if day_of_week in [5, 6] else 0,
        'is_rush_hour': 1 if hour_of_day in [7, 8, 9, 16, 17, 18, 19] else 0,
        'from_id': from_id,
        'to_id': to_id
    }])
    
    # Add dummy columns
    for col in loaded_features:
        if col not in input_data.columns:
            input_data[col] = 0
            
    return loaded_model.predict(input_data[loaded_features])[0]

# Test prediction
example_delay = predict_delay(8, 1, 105, 107)
print(f"Predicted delay: {example_delay:.2f} minutes")

Predicted delay: 0.63 minutes
