In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# 1. Load the Dataset
#print("Loading data...")
df = pd.read_csv('GTFS_Data.csv')

# 2. Data Cleaning & Feature Engineering
# Extract hour and minute from arrival_time
df['arrival_time'] = pd.to_datetime(df['arrival_time'], format='%H:%M:%S', errors='coerce')
df = df.dropna(subset=['arrival_time', 'time', 'SRI'])

df['hour'] = df['arrival_time'].dt.hour
df['minute'] = df['arrival_time'].dt.minute

# Convert SRI to numeric (Surface Roughness Index - affects speed)
df['SRI'] = pd.to_numeric(df['SRI'], errors='coerce')
df = df.dropna(subset=['SRI'])

# Convert target 'time' (originally in hours) to minutes
df['travel_time_min'] = df['time'] * 60

# Encode Congestion Levels (Very smooth, Smooth, Heavy congestion, etc.)
le = LabelEncoder()
df['congestion_level'] = le.fit_transform(df['Degree_of_congestion'])

# 3. Feature Selection
# Features: From-Stop, To-Stop, Hour of day, Minute, Road Surface Index, Congestion Level
features = ['stop_id_from', 'stop_id_to', 'hour', 'minute', 'SRI', 'congestion_level']
X = df[features]
y = df['travel_time_min']

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Model Building (Random Forest)
print("Training the model... this might take a minute.")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"\nModel Performance:")
print(f"Mean Absolute Error: {mae:.2f} minutes")
print(f"R2 Score (Accuracy): {r2_score(y_test, y_pred):.2f}")

# 7. Save the Model
joblib.dump(model, 'pmpml_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
print("\nModel saved as pmpml_model.pkl")

# 8. Test a real-world scenario
test_bus = pd.DataFrame([{
    'stop_id_from': 36156, 
    'stop_id_to': 38709, 
    'hour': 11, 
    'minute': 15, 
    'SRI': 5.14, 
    'congestion_level': 2 # 'Heavy congestion'
}])
prediction = model.predict(test_bus)
print(f"\nExample Prediction:")
print(f"For a bus at 11:15 AM in heavy traffic, predicted travel time: {prediction[0]:.2f} mins")

Training the model... this might take a minute.

Model Performance:
Mean Absolute Error: 0.23 minutes
R2 Score (Accuracy): 0.76

Model saved as pmpml_model.pkl

Example Prediction:
For a bus at 11:15 AM in heavy traffic, predicted travel time: 2.70 mins
