## Imports/Setup

In [1]:
import pandas as pd
import os
import re
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Train numbers handled by this notebook, grouped by direction of travel
# (the northbound list holds the even numbers, the southbound list the odd ones).
# Each list is wrapped in an outer one-element list; that shape is kept as-is.
northbound = [[
    66, 82, 86, 88, 94, 132, 176, 178, 190, 194,
    150, 160, 162, 164, 166, 168, 170, 172, 174,
]]
southbound = [[
    67, 93, 95, 99, 135, 169, 177, 137, 139,
    161, 163, 165, 167, 171, 173, 175, 195,
]]

# Flat list of every train number in ascending order, both directions combined.
all_trains = sorted([*northbound[0], *southbound[0]])

In [3]:
# Every calendar year covered by the data, 2011 through 2024 inclusive
# (range's upper bound is exclusive, hence the + 1).
years = list(range(2011, 2024 + 1))
print(years)

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


### Create merged data folder for each year

In [4]:
# Make sure ../data/4-merged/<year>/ exists for every year before any file is
# written into it. exist_ok=True makes re-running this cell harmless.
for year in years:
    train_subfolder = f"../data/4-merged/{year}"
    try:
        os.makedirs(train_subfolder, exist_ok=True)
    except OSError as err:
        # Only filesystem errors are expected here (permissions, a regular file
        # in the way, ...). Report the reason and carry on with the next year;
        # KeyboardInterrupt and friends are no longer swallowed.
        print(f"Failed to create folder for {year}: {err}")

### Merge all files per year/train number into one file per train within the year folder

In [15]:
# For every (year, train) pair, stack all per-run CSVs found under
# ../data/3-augmented/<year>/<train>/ and write the result to
# ../data/4-merged/<year>/<train>.csv.
for year in years:
    for train_num in all_trains:
        train_dir = f"../data/3-augmented/{year}/{train_num}"
        # os.listdir returns names in arbitrary order; sorting keeps the row
        # order of the merged file reproducible from one run to the next.
        train_files = sorted(os.listdir(train_dir))
        all_run_dfs = []
        for filename in train_files:
            df = pd.read_csv(f"{train_dir}/{filename}")
            try:
                # Tag each row with its train number (stored as a string) and
                # cast the two flag columns to bool.
                df['Train Number'] = str(train_num)
                df['Service Disruption'] = df['Service Disruption'].astype(bool)
                df['Cancellation'] = df['Cancellation'].astype(bool)
            except (KeyError, TypeError, ValueError) as err:
                # Still best-effort: a file missing a flag column (or holding
                # values that cannot be cast) is left out of the merge, but the
                # skip is now reported instead of happening silently.
                print(f"Skipping {train_dir}/{filename}: {err!r}")
                continue
            all_run_dfs.append(df)

        if not all_run_dfs:
            # pd.concat raises ValueError on an empty list; report the gap and
            # move on rather than aborting the whole multi-year run.
            print(f"No usable files for train {train_num} in {year}; nothing written.")
            continue

        merged_df = pd.concat(all_run_dfs, axis=0)
        merged_df.to_csv(f"../data/4-merged/{year}/{train_num}.csv", index=False)


In [17]:
# Spot-check: print the first rows of every merged per-train file for 2011,
# followed by blank lines so consecutive previews are easy to tell apart.
merged_2011_dir = '../data/4-merged/2011/'
for file in os.listdir(merged_2011_dir):
    df = pd.read_csv(f"{merged_2011_dir}{file}")
    preview = df.head()
    print(preview)
    print("\n")

  Origin Date  Train Number  Service Disruption  Cancellation Station Code  \
0  2011-02-26           169               False         False          BOS   
1  2011-02-26           169               False         False          BBY   
2  2011-02-26           169               False         False          RTE   
3  2011-02-26           169               False         False          PVD   
4  2011-02-26           169               False         False          KIN   

  Scheduled Departure Datetime Scheduled Arrival Datetime  \
0          2011-02-26 18:40:00                        NaN   
1          2011-02-26 18:45:00                        NaN   
2          2011-02-26 18:55:00                        NaN   
3          2011-02-26 19:20:00                        NaN   
4          2011-02-26 19:41:00                        NaN   

  Actual Departure Datetime Actual Arrival Datetime            Comments  
0       2011-02-26 18:40:00                     NaN  Departed: on time.  
1       2011-02-

### Merge all files by year

In [18]:
# Output folder for the one-file-per-year CSVs, which are written as
# ../data/5-yearly/<year>.csv. This used to be built from a `year` variable
# left over from an earlier loop, which only worked when that cell had already
# run and created an unused ../data/5-yearly/<last year>/ subfolder.
yearly_folder = "../data/5-yearly"
os.makedirs(yearly_folder, exist_ok=True)

In [19]:
# Combine the per-train files of each year into one ../data/5-yearly/<year>.csv.
for year in years:
    all_train_dfs = []
    for train_num in all_trains:
        filepath = f"../data/4-merged/{year}/{train_num}.csv"
        df = pd.read_csv(filepath)
        all_train_dfs.append(df)
    merged_df = pd.concat(all_train_dfs, axis=0)
    # index=False, matching the per-train files: after concat the row labels are
    # just each input file's own 0..n-1 repeated, so writing them would only add
    # a meaningless unnamed first column to the yearly CSV.
    merged_df.to_csv(f"../data/5-yearly/{year}.csv", index=False)