# Import Required Libraries
Import huggingface_hub and other necessary libraries for model handling and API interactions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load Data
# Reads the raw trip records; the columns used below are scheduled_time,
# actual_time, date, delay_minutes, from_id, to_id and (optionally) line,
# type and status.
print("Loading data...")
data_path = '/Users/chetan/Documents/GitHub/nj_transit/data/data.csv'
df = pd.read_csv(data_path)

# Step 2: Data Preprocessing
print("Preprocessing data...")

# Convert datetime columns; unparseable timestamps become NaT and are dropped below
df['scheduled_time'] = pd.to_datetime(df['scheduled_time'], errors='coerce')
df['actual_time'] = pd.to_datetime(df['actual_time'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])

# Drop rows where time conversion failed, and rows with no recorded delay.
# delay_minutes is the regression target: filling its gaps with 0 would invent
# "on time" labels for trips whose delay is simply unknown.
df = df.dropna(subset=['scheduled_time', 'actual_time', 'delay_minutes'])

# Create basic calendar features from the scheduled departure
df['hour_of_day'] = df['scheduled_time'].dt.hour
df['day_of_week'] = df['scheduled_time'].dt.dayofweek
df['month'] = df['scheduled_time'].dt.month
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_rush_hour'] = df['hour_of_day'].isin([7, 8, 9, 16, 17, 18, 19]).astype(int)

# Handle missing station ids with a sentinel value (these are inputs, not labels)
df['from_id'] = df['from_id'].fillna(-1)
df['to_id'] = df['to_id'].fillna(-1)

# Remove outliers with the 1.5 * IQR rule on the target.
# NOTE(review): this runs before the train/test split, so the reported metrics
# only describe trips whose delay falls inside the IQR fence.
q1 = df['delay_minutes'].quantile(0.25)
q3 = df['delay_minutes'].quantile(0.75)
iqr = q3 - q1
df = df[df['delay_minutes'].between(q1 - 1.5*iqr, q3 + 1.5*iqr)]

# One-hot encode whichever categorical columns are present, in one pass.
# NOTE(review): confirm 'status' is known at prediction time; if it is only
# recorded after the trip it leaks information about the target.
categorical_cols = [col for col in ('line', 'type', 'status') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)

# Step 3: Prepare Features
print("Preparing features...")
features = [
    'hour_of_day', 'day_of_week', 'month',
    'is_weekend', 'is_rush_hour',
    'from_id', 'to_id'
]

# Add one-hot encoded columns
features.extend([col for col in df.columns if col.startswith(('line_', 'type_', 'status_'))])

# Keep only the features that actually exist in the frame
features = [f for f in features if f in df.columns]
print(f"Using features: {features}")

X = df[features]
y = df['delay_minutes']

# Step 4: Split the data (fixed seed so the split is reproducible)
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train model
print("Training model...")
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train, y_train)

# Step 6: Evaluate model on the held-out 20%
print("Evaluating model...")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nModel Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")

# Feature importance, highest first
importance_df = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

# Simple prediction function
def predict_delay(hour_of_day, day_of_week, from_id, to_id, month=None,
                  line=None, train_type=None, status=None):
    """Predict the delay in minutes for a single trip.

    Builds a one-row frame laid out like the module-level ``features`` list
    and passes it to the module-level ``model``.

    Args:
        hour_of_day: Hour of the scheduled departure; 7-9 and 16-19 are
            flagged as rush hour.
        day_of_week: Day index; 5 and 6 are flagged as weekend.
        from_id: Origin station id.
        to_id: Destination station id.
        month: Month number; defaults to the current month.
        line: Optional category value; switches on the ``line_<value>``
            one-hot column.
        train_type: Optional category value; switches on the
            ``type_<value>`` one-hot column.
        status: Optional category value; switches on the
            ``status_<value>`` one-hot column.

    Returns:
        The first element of ``model.predict`` for the constructed row.

    Raises:
        ValueError: If a supplied category has no matching column in
            ``features``.
    """
    if month is None:
        month = pd.Timestamp.now().month

    row = {
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': 1 if day_of_week in [5, 6] else 0,
        'is_rush_hour': 1 if hour_of_day in [7, 8, 9, 16, 17, 18, 19] else 0,
        'from_id': from_id,
        'to_id': to_id
    }

    # Switch on the one-hot column for every category the caller supplied.
    # Omitted categories keep all of their columns at 0, as before.
    for prefix, value in (('line', line), ('type', train_type), ('status', status)):
        if value is None:
            continue
        column = f"{prefix}_{value}"
        if column not in features:
            raise ValueError(
                f"Unknown {prefix} {value!r}: no '{column}' column in the trained features"
            )
        row[column] = 1

    # reindex orders the columns like `features` and zero-fills the missing
    # one-hot columns in one step instead of inserting them one by one.
    input_data = pd.DataFrame([row]).reindex(columns=features, fill_value=0)
    return model.predict(input_data)[0]

# Test prediction
# Smoke-test predict_delay with one hard-coded trip:
# hour_of_day=8, day_of_week=1, from_id=105, to_id=107.
print("\nTesting prediction...")
try:
    example_prediction = predict_delay(8, 1, 105, 107)  # Example IDs
    print(f"Predicted delay: {example_prediction:.2f} minutes")
except Exception as e:
    # Best-effort: report the failure and continue instead of aborting the run
    print(f"Error in example prediction: {e}")

# NOTE(review): this message is printed even when the example prediction failed
print("\nProcess completed successfully!")

In [None]:
# Import Required Libraries
from huggingface_hub import HfApi, HfFolder, Repository
import joblib
import os
import datetime

# Save Model to File
Save the trained model and necessary metadata (features list, performance metrics) to local files.

In [None]:
# Save Model to File

# Save the trained model locally
model_dir = '/Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/models'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'delay_prediction_model.joblib')
joblib.dump(model, model_path)

# Save features list and performance metrics next to the model so the
# prediction code can rebuild the exact input layout later
features_path = os.path.join(model_dir, 'features_list.joblib')
joblib.dump(features, features_path)

metrics = {
    'MAE': mae,
    'RMSE': rmse
}
metrics_path = os.path.join(model_dir, 'metrics.joblib')
joblib.dump(metrics, metrics_path)

# Upload the model to Hugging Face
# create_repo takes `repo_id` (the legacy `name` keyword is no longer
# accepted); exist_ok=True lets this cell be re-run without failing.
api = HfApi()
repo_url = api.create_repo(repo_id="nj_transit_delay_prediction", private=False, exist_ok=True)
# Fully qualified "<namespace>/<name>" id of the repo that was just created
repo_id = repo_url.repo_id

# Push the saved artifacts over HTTP. Cloning with Repository(clone_from=...)
# is not usable here because model_dir already contains the files saved above.
api.upload_folder(
    folder_path=model_dir,
    repo_id=repo_id,
    commit_message="Add delay prediction model and metadata",
)

# Load the model from Hugging Face for further predictions
from huggingface_hub import hf_hub_download

# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a
# repository you control.
model_path = hf_hub_download(repo_id=repo_id, filename="delay_prediction_model.joblib")
model = joblib.load(model_path)

features_path = hf_hub_download(repo_id=repo_id, filename="features_list.joblib")
features = joblib.load(features_path)

metrics_path = hf_hub_download(repo_id=repo_id, filename="metrics.joblib")
metrics = joblib.load(metrics_path)

In [None]:
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load Data
# Reads the raw trip records; the columns used below are scheduled_time,
# actual_time, date, delay_minutes, from_id, to_id and (optionally) line,
# type and status.
print("Loading data...")
data_path = '/Users/chetan/Documents/GitHub/nj_transit/data/data.csv'
df = pd.read_csv(data_path)

# Step 2: Data Preprocessing
print("Preprocessing data...")

# Convert datetime columns; unparseable timestamps become NaT and are dropped below
df['scheduled_time'] = pd.to_datetime(df['scheduled_time'], errors='coerce')
df['actual_time'] = pd.to_datetime(df['actual_time'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])

# Drop rows where time conversion failed, and rows with no recorded delay.
# delay_minutes is the regression target: filling its gaps with 0 would invent
# "on time" labels for trips whose delay is simply unknown.
df = df.dropna(subset=['scheduled_time', 'actual_time', 'delay_minutes'])

# Create basic calendar features from the scheduled departure
df['hour_of_day'] = df['scheduled_time'].dt.hour
df['day_of_week'] = df['scheduled_time'].dt.dayofweek
df['month'] = df['scheduled_time'].dt.month
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_rush_hour'] = df['hour_of_day'].isin([7, 8, 9, 16, 17, 18, 19]).astype(int)

# Handle missing station ids with a sentinel value (these are inputs, not labels)
df['from_id'] = df['from_id'].fillna(-1)
df['to_id'] = df['to_id'].fillna(-1)

# Remove outliers with the 1.5 * IQR rule on the target.
# NOTE(review): this runs before the train/test split, so the reported metrics
# only describe trips whose delay falls inside the IQR fence.
q1 = df['delay_minutes'].quantile(0.25)
q3 = df['delay_minutes'].quantile(0.75)
iqr = q3 - q1
df = df[df['delay_minutes'].between(q1 - 1.5*iqr, q3 + 1.5*iqr)]

# One-hot encode whichever categorical columns are present, in one pass.
# NOTE(review): confirm 'status' is known at prediction time; if it is only
# recorded after the trip it leaks information about the target.
categorical_cols = [col for col in ('line', 'type', 'status') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)

# Step 3: Prepare Features
print("Preparing features...")
features = [
    'hour_of_day', 'day_of_week', 'month',
    'is_weekend', 'is_rush_hour',
    'from_id', 'to_id'
]

# Add one-hot encoded columns
features.extend([col for col in df.columns if col.startswith(('line_', 'type_', 'status_'))])

# Keep only the features that actually exist in the frame
features = [f for f in features if f in df.columns]
print(f"Using features: {features}")

X = df[features]
y = df['delay_minutes']

# Step 4: Split the data (fixed seed so the split is reproducible)
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train model
print("Training model...")
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train, y_train)

# Step 6: Evaluate model on the held-out 20%
print("Evaluating model...")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nModel Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")

# Feature importance, highest first
importance_df = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

# Simple prediction function
def predict_delay(hour_of_day, day_of_week, from_id, to_id, month=None,
                  line=None, train_type=None, status=None):
    """Predict the delay in minutes for a single trip.

    Builds a one-row frame laid out like the module-level ``features`` list
    and passes it to the module-level ``model``.

    Args:
        hour_of_day: Hour of the scheduled departure; 7-9 and 16-19 are
            flagged as rush hour.
        day_of_week: Day index; 5 and 6 are flagged as weekend.
        from_id: Origin station id.
        to_id: Destination station id.
        month: Month number; defaults to the current month.
        line: Optional category value; switches on the ``line_<value>``
            one-hot column.
        train_type: Optional category value; switches on the
            ``type_<value>`` one-hot column.
        status: Optional category value; switches on the
            ``status_<value>`` one-hot column.

    Returns:
        The first element of ``model.predict`` for the constructed row.

    Raises:
        ValueError: If a supplied category has no matching column in
            ``features``.
    """
    if month is None:
        month = pd.Timestamp.now().month

    row = {
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': 1 if day_of_week in [5, 6] else 0,
        'is_rush_hour': 1 if hour_of_day in [7, 8, 9, 16, 17, 18, 19] else 0,
        'from_id': from_id,
        'to_id': to_id
    }

    # Switch on the one-hot column for every category the caller supplied.
    # Omitted categories keep all of their columns at 0, as before.
    for prefix, value in (('line', line), ('type', train_type), ('status', status)):
        if value is None:
            continue
        column = f"{prefix}_{value}"
        if column not in features:
            raise ValueError(
                f"Unknown {prefix} {value!r}: no '{column}' column in the trained features"
            )
        row[column] = 1

    # reindex orders the columns like `features` and zero-fills the missing
    # one-hot columns in one step instead of inserting them one by one.
    input_data = pd.DataFrame([row]).reindex(columns=features, fill_value=0)
    return model.predict(input_data)[0]

# Test prediction
# Smoke-test predict_delay with one hard-coded trip:
# hour_of_day=8, day_of_week=1, from_id=105, to_id=107.
print("\nTesting prediction...")
try:
    example_prediction = predict_delay(8, 1, 105, 107)  # Example IDs
    print(f"Predicted delay: {example_prediction:.2f} minutes")
except Exception as e:
    # Best-effort: report the failure and continue instead of aborting the run
    print(f"Error in example prediction: {e}")

# NOTE(review): this message is printed even when the example prediction failed
print("\nProcess completed successfully!")

# Upload Model to Hugging Face
Create a Hugging Face repository and upload the model files using the huggingface_hub API.

In [None]:
# Upload Model to Hugging Face

from huggingface_hub import HfApi, HfFolder, Repository
import joblib
import os
import datetime

# Save Model to File

# Save the trained model locally
model_dir = '/Users/chetan/Documents/GitHub/nj_transit_data_ru_hack/models'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'delay_prediction_model.joblib')
joblib.dump(model, model_path)

# Save features list and performance metrics next to the model so the
# prediction code can rebuild the exact input layout later
features_path = os.path.join(model_dir, 'features_list.joblib')
joblib.dump(features, features_path)

metrics = {
    'MAE': mae,
    'RMSE': rmse
}
metrics_path = os.path.join(model_dir, 'metrics.joblib')
joblib.dump(metrics, metrics_path)

# Upload the model to Hugging Face
# create_repo takes `repo_id` (the legacy `name` keyword is no longer
# accepted); exist_ok=True lets this cell be re-run without failing.
api = HfApi()
repo_url = api.create_repo(repo_id="nj_transit_delay_prediction", private=False, exist_ok=True)
# Fully qualified "<namespace>/<name>" id of the repo that was just created
repo_id = repo_url.repo_id

# Push the saved artifacts over HTTP. Cloning with Repository(clone_from=...)
# is not usable here because model_dir already contains the files saved above.
api.upload_folder(
    folder_path=model_dir,
    repo_id=repo_id,
    commit_message="Add delay prediction model and metadata",
)

# Load the model from Hugging Face for further predictions
from huggingface_hub import hf_hub_download

# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a
# repository you control.
model_path = hf_hub_download(repo_id=repo_id, filename="delay_prediction_model.joblib")
model = joblib.load(model_path)

features_path = hf_hub_download(repo_id=repo_id, filename="features_list.joblib")
features = joblib.load(features_path)

metrics_path = hf_hub_download(repo_id=repo_id, filename="metrics.joblib")
metrics = joblib.load(metrics_path)

# Download and Use Model from Hugging Face
Download the saved model from Hugging Face and prepare it for making predictions.

In [None]:
# Download and Use Model from Hugging Face

from huggingface_hub import hf_hub_download
import joblib

# Every artifact lives in the same Hub repository, so name it once.
# The "username" namespace is a placeholder to replace with the real account.
hub_repo_id = "username/nj_transit_delay_prediction"

# Fetch and deserialize the trained estimator
model_path = hf_hub_download(repo_id=hub_repo_id, filename="delay_prediction_model.joblib")
model = joblib.load(model_path)

# Fetch the ordered feature list the estimator was fitted on
features_path = hf_hub_download(repo_id=hub_repo_id, filename="features_list.joblib")
features = joblib.load(features_path)

# Fetch the metrics recorded at training time
metrics_path = hf_hub_download(repo_id=hub_repo_id, filename="metrics.joblib")
metrics = joblib.load(metrics_path)

# Report the stored evaluation numbers
print("Loaded Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {metrics['MAE']:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.2f} minutes")

# Run one sample trip through predict_delay, which reads the freshly loaded
# `model` and `features` globals
example_prediction = predict_delay(8, 1, 105, 107)  # Example IDs
print(f"Predicted delay: {example_prediction:.2f} minutes")

# Test Predictions with Loaded Model
Verify the loaded model works correctly by making test predictions and comparing with original results.

In [None]:
# Test Predictions with Loaded Model

# Verify the loaded model works correctly by making test predictions and comparing with original results
# NOTE(review): the code below only prints the stored metrics and one sample
# prediction; it does not compare them against the original results.

# Display the loaded metrics (values read from the `metrics` dict)
print("Loaded Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {metrics['MAE']:.2f} minutes")
print(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.2f} minutes")

# Example prediction using the loaded model
# (hour_of_day=8, day_of_week=1, from_id=105, to_id=107)
example_prediction = predict_delay(8, 1, 105, 107)  # Example IDs
print(f"Predicted delay: {example_prediction:.2f} minutes")