## Imports/Setup

In [1]:
import pandas as pd
import os
import re
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Train numbers by direction. Each list is wrapped in a one-element outer
# list, so the numbers themselves are found at index 0.
northbound = [[
    66, 82, 86, 88, 94, 132, 176, 178, 190, 194,
    150, 160, 162, 164, 166, 168, 170, 172, 174,
]]
southbound = [[
    67, 93, 95, 99, 135, 169, 177, 137, 139,
    161, 163, 165, 167, 171, 173, 175, 195,
]]

# Every train number from both directions, in ascending order.
all_trains = sorted([*northbound[0], *southbound[0]])

In [3]:
# Years to process: 2011 through 2024, inclusive.
years = list(range(2011, 2024 + 1))
print(years)

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


## Create output folders

In [4]:
# Create one output folder per (year, train number) under ../data/3-augmented,
# the directory the augmentation step writes its CSVs into.
for year in years:
    for train_num in all_trains:
        train_subfolder = f"../data/3-augmented/{year}/{train_num}"
        try:
            # exist_ok=True makes this cell safe to re-run.
            os.makedirs(train_subfolder, exist_ok=True)
        except OSError as e:
            # Best-effort: report the failing path and reason, then carry on
            # with the remaining folders. Only filesystem errors are handled
            # here so interrupts and programming errors still surface.
            print(f"Failed to create folder for train number {train_num}. ({train_subfolder}: {e})")

In [5]:
def parse_custom_time(time_str):
    """
    Parse a compact 12-hour time string such as '450P' or '125A'.

    Parameters
    ----------
    time_str : str or missing value
        One or two hour digits, two minute digits, then a single-letter
        meridiem suffix (case-insensitive). Surrounding whitespace is ignored.

    Returns
    -------
    datetime.time or pd.NaT
        The time of day on a 24-hour clock, or pd.NaT when the value is
        missing (NaN/None), blank, or the placeholder '*'.

    Raises
    ------
    ValueError
        If the value is not 3-4 digits followed by a letter, or if the
        resulting hour/minute is out of range.
    """
    if pd.isna(time_str):
        return pd.NaT

    time_str = str(time_str).strip()
    if time_str in ('', '*'):
        return pd.NaT

    am_pm = time_str[-1].upper()
    time_val = time_str[:-1]

    # Reject malformed values up front. Without this check a string with no
    # meridiem letter (e.g. '1205') would have its last digit taken as the
    # suffix and be silently parsed as a different, valid-looking time.
    if not (am_pm.isalpha() and time_val.isdigit() and len(time_val) in (3, 4)):
        raise ValueError(f"Unrecognized time string: {time_str!r}")

    # The last two digits are always the minutes; whatever precedes them
    # (one or two digits) is the hour.
    hour = int(time_val[:-2])
    minute = int(time_val[-2:])

    # Convert the 12-hour clock to 24-hour.
    if am_pm == 'P' and hour != 12:
        hour += 12
    elif am_pm == 'A' and hour == 12:  # 12 AM is midnight
        hour = 0

    # pd.to_datetime validates the range and raises ValueError for e.g. hour 25.
    return pd.to_datetime(f"{hour:02d}:{minute:02d}", format='%H:%M').time()

## Augment raw data with information from filename

In [7]:
# Files that could not be augmented, stored as [filepath, exception] pairs.
exceptions = []

# Columns kept in the augmented output, in the order they are written.
output_columns = [
    'Origin Date', 'Train Number', 'Service Disruption', 'Cancellation', 'Station Code',
    'Scheduled Departure Datetime', 'Scheduled Arrival Datetime',
    'Actual Departure Datetime', 'Actual Arrival Datetime', 'Comments',
]

# Output datetime column -> (date column, parsed time-of-day column) it is built from.
# NOTE(review): the "Actual" datetimes are paired with the *scheduled* date, so an
# actual time that crosses midnight relative to the schedule keeps the scheduled
# calendar day. Confirm whether a later pipeline stage corrects this.
datetime_sources = {
    'Scheduled Departure Datetime': ('Scheduled Departure Date', 'Parsed Schedule Departure Time'),
    'Scheduled Arrival Datetime': ('Scheduled Arrival Date', 'Parsed Schedule Arrival Time'),
    'Actual Departure Datetime': ('Scheduled Departure Date', 'Parsed Actual Departure Time'),
    'Actual Arrival Datetime': ('Scheduled Arrival Date', 'Parsed Actual Arrival Time'),
}


def _combine_date_and_time(df, date_col, time_col):
    """Return one datetime per row built from date_col and time_col (NaT if either is missing)."""
    return df.apply(
        lambda row: pd.to_datetime(str(row[date_col]).split()[0] + ' ' + str(row[time_col]))
        if pd.notna(row[date_col]) and pd.notna(row[time_col])
        else pd.NaT, axis=1)


for year in years:
    print(year)
    for train_num in all_trains:
        input_dir = f"../data/2-csv/{year}/{train_num}"
        output_dir = f"../data/3-augmented/{year}/{train_num}"
        if not os.path.exists(input_dir):
            continue

        # Sorted so the processing order (and the failure log) is reproducible.
        for file in sorted(os.listdir(input_dir)):
            # Skip hidden files and the per-folder index.
            if file.startswith(".") or file == 'index.csv':
                continue

            filepath = os.path.join(input_dir, file)
            try:
                # The second "_"-separated field of the filename is the origin
                # date as YYYYMMDD. Drop the ".csv" suffix explicitly:
                # str.rstrip(".csv") strips a *set of characters*, not a suffix.
                stem = file[:-len(".csv")] if file.endswith(".csv") else file
                date_string = stem.split("_")[1]
                date_parsed = datetime.strptime(date_string, "%Y%m%d").date()

                df = pd.read_csv(filepath)

                # Augment with run-level columns taken from the folder/filename.
                origination_date = pd.to_datetime(date_parsed)

                df['Origin Date'] = origination_date
                df['Train Number'] = train_num
                # Both flags are initialised to 0 for every row here.
                df['Service Disruption'] = 0
                df['Cancellation'] = 0

                # Augment with the scheduled calendar date of each stop: the
                # schedule "Day" columns are 1-based offsets from the origin date.
                departure_day_offset = pd.to_numeric(df['Schedule Departure Day'], errors='coerce') - 1
                arrival_day_offset = pd.to_numeric(df['Schedule Arrival Day'], errors='coerce') - 1

                df['Scheduled Departure Date'] = origination_date + pd.to_timedelta(departure_day_offset, unit='d')
                df['Scheduled Arrival Date'] = origination_date + pd.to_timedelta(arrival_day_offset, unit='d')

                # Parse the compact time strings into time-of-day values.
                for kind in ('Schedule Departure', 'Schedule Arrival', 'Actual Departure', 'Actual Arrival'):
                    df[f'Parsed {kind} Time'] = df[f'{kind} Time'].apply(parse_custom_time)

                # Augment with full datetime columns.
                for target_col, (date_col, time_col) in datetime_sources.items():
                    df[target_col] = _combine_date_and_time(df, date_col, time_col)

                # Output cleaned + augmented data
                df = df[output_columns]
                output_filepath = os.path.join(output_dir, file)
                df.to_csv(output_filepath, index=False)
            except (ValueError, IndexError, KeyError) as e:
                # Best-effort: one malformed file (bad name, bad time string,
                # missing column) is logged and must not abort the whole run.
                exceptions.append([filepath, e])

2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


In [8]:
# Number of files the augmentation loop recorded as failed.
len(exceptions)

2286

In [9]:
# Tabulate the recorded failures as (filepath, error) rows.
# Note: this rebinds `exceptions` from a list to a DataFrame.
exceptions = pd.DataFrame(exceptions, columns=['filepath', 'error'])

In [10]:
# Save the failure log as a CSV; the path is relative to the current working directory.
exceptions.to_csv("exceptions_full.csv", index=False)