1. Setup + Imports

In [104]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_error

2. Data Preparation
    - loading dataset
    - preprocessing
    - creating sequences
    - splitting data

In [105]:
# Load the raw train-delay records from a CSV file resolved relative to the
# current working directory.
df = pd.read_csv("train delay data.csv")
# Print (rows, columns) as a quick sanity check, then preview the first rows.
print(df.shape)
df.head()

(2878, 7)


Unnamed: 0,Distance Between Stations (km),Weather Conditions,Day of the Week,Time of Day,Train Type,Historical Delay (min),Route Congestion
0,100,Clear,Monday,Morning,Express,5,Low
1,150,Rainy,Tuesday,Afternoon,Superfast,10,Medium
2,200,Foggy,Wednesday,Evening,Local,15,High
3,50,Clear,Thursday,Night,Express,2,Low
4,75,Rainy,Friday,Morning,Superfast,8,Medium


In [106]:
def preprocess_data(df, feature_cols: list, target_col: str, ordinal_vars=None):
    """Encode categorical columns and min-max scale the features and target.

    The input frame is never modified; all work happens on a copy restricted
    to ``feature_cols`` plus ``target_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing every column named in ``feature_cols`` and
        ``target_col``.
    feature_cols : list
        Names of the columns to use as model inputs.
    target_col : str
        Name of the column to predict. It is scaled like the features and
        placed as the last column of the result.
    ordinal_vars : dict, optional
        Maps a column name to its category labels ordered from smallest to
        largest. Entries whose key is not in ``feature_cols`` are ignored.
        Each listed column is replaced by ``<name>_Encoded`` holding integer
        codes (0, 1, 2, ...) before scaling.

    Returns
    -------
    pandas.DataFrame
        All-numeric frame scaled to the range [0, 1] with a fresh RangeIndex:
        numeric and ordinal-encoded columns first, then one-hot columns, then
        ``target_col`` last.

    Raises
    ------
    KeyError
        If a name in ``feature_cols`` or ``target_col`` is missing from ``df``.
    """
    # Copy only the requested columns so the caller's DataFrame is left untouched
    # and columns outside feature_cols cannot leak into the model inputs.
    selected = [col for col in feature_cols if col != target_col] + [target_col]
    data = df[selected].copy()

    # The order dict may be omitted entirely; keep only entries that are features.
    ordinal_vars = {
        col: order for col, order in (ordinal_vars or {}).items() if col in feature_cols
    }

    for col, order in ordinal_vars.items():
        ordered_dtype = pd.CategoricalDtype(categories=order, ordered=True)
        # NOTE: labels absent from `order` (and missing values) receive code -1.
        data[f'{col}_Encoded'] = data[col].astype(ordered_dtype).cat.codes
        del data[col]  # the raw column is gone, so it cannot be one-hot encoded as well

    # Remaining object-dtype feature columns are unordered categoricals -> one-hot.
    nominal_cols = [
        col for col in data.select_dtypes(include=['object']).columns
        if col != target_col
    ]
    encoded = pd.get_dummies(data, columns=nominal_cols)

    # NOTE(review): the scaler is fitted on every row passed in; if a train/test
    # split happens afterwards, test-set ranges influence the scaling - confirm
    # this is acceptable.
    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(encoded), columns=encoded.columns)

    # Re-inserting the popped column moves the target to the last position.
    scaled[target_col] = scaled.pop(target_col)
    return scaled


In [107]:
# Ordinal columns dictionary
#   keys:   column names
#   values: category labels ordered from smallest to largest
ordinal_vars = {'Route Congestion': ['Low', 'Medium', 'High']}

target_cols = 'Historical Delay (min)'
# Every column except the target is used as a feature.
feature_cols = df.drop(columns=[target_cols]).columns.tolist()

# Pass a copy so `df` is guaranteed to stay identical to the raw preview shown
# earlier, regardless of how preprocessing treats its input; this keeps the
# cell safe to re-run.
df_scaled = preprocess_data(df.copy(), feature_cols, target_cols, ordinal_vars)
df_scaled

Unnamed: 0,Distance Between Stations (km),Route Congestion_Encoded,Weather Conditions_Clear,Weather Conditions_Foggy,Weather Conditions_Rainy,Day of the Week_Friday,Day of the Week_Monday,Day of the Week_Saturday,Day of the Week_Sunday,Day of the Week_Thursday,Day of the Week_Tuesday,Day of the Week_Wednesday,Time of Day_Afternoon,Time of Day_Evening,Time of Day_Morning,Time of Day_Night,Train Type_Express,Train Type_Local,Train Type_Superfast,Historical Delay (min)
0,0.104712,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.004065
1,0.157068,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.008130
2,0.209424,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.012195
3,0.052356,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.001626
4,0.078534,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.006504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2873,0.989529,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.983740
2874,0.968586,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.987805
2875,0.994764,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.991870
2876,0.973822,0.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.995935


3. Model Setup
    - Define LSTM architecture
    - Initialize model, optimizer, and loss function


4. Training Loop
    - Train model over epochs
    - Track loss and accuracy
    - Save best model (optional)

5. Evaluation & Results
    - Evaluate on test set
    - Plot training/validation loss curves
    - Visualize predicted vs actual delays


In [110]:
# Alias the test targets and the model predictions for the evaluation helpers.
# NOTE(review): no conversion back to the original delay scale happens here; the
# values are used as-is. `y_test` and `y_pred` are expected to come from earlier
# cells (train/test split and model prediction) - TODO confirm they are defined
# before this cell runs.
y_test_actual = y_test  
# squeeze() drops length-1 axes from the predictions.
y_pred_actual = y_pred.squeeze()  

def scores(y_test_actual, y_pred_actual):
    """Summarise regression quality on the test set as a single string.

    Parameters
    ----------
    y_test_actual : array-like
        Ground-truth target values.
    y_pred_actual : array-like
        Predicted target values, aligned with ``y_test_actual``.

    Returns
    -------
    str
        Message reporting the mean absolute error and the R^2 score.
    """
    # The two metrics are independent of each other; compute both, then format.
    r2 = r2_score(y_test_actual, y_pred_actual)
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    return f"Test Mean Absolute Error: {mae}, Test R^2: {r2}"

def visualizing(y_test_actual, y_pred_actual):
    """Scatter predicted delays against actual delays with a y = x reference line.

    Parameters
    ----------
    y_test_actual : array-like
        Ground-truth delays, plotted on the x-axis.
    y_pred_actual : array-like
        Predicted delays, plotted on the y-axis.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=y_test_actual, y=y_pred_actual, alpha=0.5, ax=ax)

    # Diagonal across the range of the actual values: points lying on it are
    # perfect predictions.
    lowest, highest = min(y_test_actual), max(y_test_actual)
    ax.plot([lowest, highest], [lowest, highest],
            color='red', linestyle='--', label="Perfect Prediction")

    # Labels and title
    ax.set_xlabel("Actual Delay")
    ax.set_ylabel("Predicted Delay")
    ax.set_title("Actual vs. Predicted Train Delays")
    ax.legend()
    plt.show()

NameError: name 'y_test' is not defined