In [1]:
# Import relevant libraries and packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, r2_score
import warnings

In [2]:
# --- 1. Load Data ---
DB_PATH = 'data/cta_database.db'
conn = sqlite3.connect(DB_PATH)
query = "SELECT * FROM train_positions"
df = pd.read_sql_query(query, conn)
conn.close()

In [3]:
# Re-create the datetime features from EDA
df['fetch_datetime'] = pd.to_datetime(df['fetch_timestamp'], unit='s')
df['hour_of_day'] = df['fetch_datetime'].dt.hour
df['day_of_week'] = df['fetch_datetime'].dt.day_name()

In [4]:
# --- 2. Define Target Variable and Features ---
# For this first model, let's predict speed.
# We'll drop rows with missing data for simplicity.
df.dropna(subset=['latitude', 'longitude', 'heading'], inplace=True)

In [5]:
TARGET = 'speed_kmh' # We need to calculate this feature first!

In [6]:
# We'll calculate speed just like in the EDA notebook
# (This ensures our modeling notebook is self-contained)
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371000
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [7]:
df_sorted = df.sort_values(by=['run_number', 'fetch_timestamp']).copy()
df_sorted['time_diff_s'] = df_sorted.groupby('run_number')['fetch_timestamp'].diff()
df_sorted['lat_prev'] = df_sorted.groupby('run_number')['latitude'].shift(1)
df_sorted['lon_prev'] = df_sorted.groupby('run_number')['longitude'].shift(1)
df_sorted['distance_m'] = haversine_distance(df_sorted['lat_prev'], df_sorted['lon_prev'], df_sorted['latitude'], df_sorted['longitude'])
df_sorted['speed_mps'] = (df_sorted['distance_m'] / df_sorted['time_diff_s']).fillna(0)
df_sorted['speed_kmh'] = df_sorted['speed_mps'] * 3.6
df = df_sorted.dropna(subset=[TARGET]).copy()

In [8]:
# Define which columns are features (X) and which is the target (y)
features = ['latitude', 'longitude', 'heading', 'hour_of_day', 'day_of_week', 'is_delayed', 'next_station_name']
X = df[features]
y = df[TARGET]

In [9]:
# --- 3. Time-Based Data Split ---
# It's crucial to split time-series data chronologically
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 12171
Validation set size: 4057
Test set size: 4058


In [10]:
# --- 4. Preprocessing Pipeline Setup ---
# Define which columns need which transformation
numerical_features = ['latitude', 'longitude', 'heading', 'hour_of_day']
categorical_features = ['day_of_week', 'next_station_name', 'is_delayed']

In [11]:
# Create the preprocessing pipelines for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [12]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

In [13]:
# Display the first few rows of the training data
display(X_train.head())

Unnamed: 0,latitude,longitude,heading,hour_of_day,day_of_week,is_delayed,next_station_name
4811,42.01906,-87.67289,130,16,Wednesday,0,Morse
4832,42.00836,-87.66591,178,16,Wednesday,0,Loyola
19486,41.72238,-87.62441,358,18,Thursday,0,87th
19508,41.73537,-87.62475,357,18,Thursday,0,79th
19530,41.74091,-87.62488,357,18,Thursday,0,79th


In [14]:
# --- 5. Create and Train the LightGBM Model ---

# Create a full pipeline that includes the preprocessor and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42))
])

In [15]:
# Train the entire pipeline on the training data
print("Training the LightGBM model...")
pipeline.fit(X_train, y_train)
print("Training complete.")

Training the LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 12171, number of used features: 41
[LightGBM] [Info] Start training from score 31.624262
Training complete.


In [16]:
# --- 6. Evaluate the Model ---

# Make predictions on the validation set
y_pred_val = pipeline.predict(X_val)



In [17]:
# Calculate performance metrics
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)
print(f"\n--- Model Performance on Validation Set ---")
print(f"Mean Absolute Error (MAE): {mae:.2f} km/h")
print(f"R-squared (R²): {r2:.2f}")
print("-----------------------------------------")


--- Model Performance on Validation Set ---
Mean Absolute Error (MAE): 12.71 km/h
R-squared (R²): 0.43
-----------------------------------------


In [18]:
# Display a few sample predictions vs actual values
results_df = pd.DataFrame({'Actual Speed': y_val, 'Predicted Speed': y_pred_val})
print("\nSample Predictions:")
display(results_df.head(10))


Sample Predictions:


Unnamed: 0,Actual Speed,Predicted Speed
4463,22.319575,15.521432
4482,44.376751,32.645638
4500,0.0,32.645638
4518,36.233518,20.301292
4537,29.735171,23.044429
4555,0.0,22.563419
4573,28.755693,22.059349
4592,32.258811,18.120851
4611,0.0,18.120851
4630,71.28501,26.430163


### MLFLOW

In [19]:
import mlflow
import mlflow.sklearn

In [20]:
# --- 7. Log Experiment with MLflow ---

# Set the experiment name. MLflow will create it if it doesn't exist.
mlflow.set_experiment("CTA Train Speed Prediction")

<Experiment: artifact_location='file:///Users/gauravkhanal/Documents/transit_anomaly/mlruns/967605301230387316', creation_time=1756408536360, experiment_id='967605301230387316', last_update_time=1756408536360, lifecycle_stage='active', name='CTA Train Speed Prediction', tags={}>

In [21]:
# Start an MLflow run. Everything inside this block will be logged.
with mlflow.start_run(run_name="LightGBM_Baseline"):
    print("Starting MLflow run...")

    pipeline.fit(X_train, y_train)
    y_pred_val = pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred_val)
    r2 = r2_score(y_val, y_pred_val)

    print(f"\n--- Logging metrics ---")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.2f}")

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # --- FIX: Add the input_example parameter ---
    # This will automatically create and save the model's signature.
    mlflow.sklearn.log_model(
        pipeline,
        "lightgbm_model",
        input_example=X_train.head(1) # Use one row of training data as an example
    )

    print("\nMLflow run complete. Model, metrics, and signature have been logged.")

Starting MLflow run...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 12171, number of used features: 41
[LightGBM] [Info] Start training from score 31.624262





--- Logging metrics ---
MAE: 12.71
R2: 0.43

MLflow run complete. Model, metrics, and signature have been logged.




In [22]:
mlflow ui

SyntaxError: invalid syntax (3574192917.py, line 1)