In [1]:
import pandas as pd
import os
import sys
import zipfile
import re
from io import StringIO

In [5]:
# Train numbers grouped by direction (every entry in `northbound` is even, every
# entry in `southbound` is odd). Each name holds a single inner list, so the
# numbers themselves live at index 0.
northbound = [[
    66, 82, 86, 88, 94, 132, 176, 178, 190, 194,
    150, 160, 162, 164, 166, 168, 170, 172, 174,
]]
southbound = [[
    67, 93, 95, 99, 135, 169, 177, 137, 139,
    161, 163, 165, 167, 171, 173, 175, 195,
]]

# Flat list of every train number from both directions, in ascending order.
all_trains = sorted([*northbound[0], *southbound[0]])

In [6]:
# Every year from 2011 through 2024 inclusive (range() excludes its stop value).
years = list(range(2011, 2025))
print(years)

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


## Function to parse train data text file

In [7]:
def _parse_status_line(line):
    """
    Turn one station line into an 8-item CSV row, or return None for any other line.

    A station line starts with '* ' followed by a 3-letter upper-case station code.
    The row is [station code, six schedule/actual fields, comments]. Missing
    fields are padded with '' and fields beyond the sixth are dropped.
    """
    # Header, legend and free-text lines do not match this pattern and are skipped.
    if not re.match(r'\* [A-Z]{3}', line):
        return None

    # Drop the leading '* ' marker; split() with no argument also absorbs the
    # inconsistent runs of spaces between columns.
    parts = line.lstrip('* ').split()
    station_code = parts[0].strip('*')

    # The comment text starts at whichever of 'Arrived:' / 'Departed:' appears
    # FIRST. Taking the earliest one keeps an "Arrived: ... | Departed: ..."
    # comment whole instead of leaking its first half into the time columns.
    marker_positions = [
        index for index, token in enumerate(parts)
        if token in ('Arrived:', 'Departed:')
    ]
    # With no marker at all, every remaining token is treated as a data field.
    comment_start_index = marker_positions[0] if marker_positions else len(parts)

    # Exactly six time-related columns: pad short rows, truncate long ones.
    data_fields = parts[1:comment_start_index]
    data_fields = (data_fields + [''] * 6)[:6]

    # '|' separates the arrival and departure remarks; replace it with a space
    # and collapse the result so the comment is single-spaced.
    comments = ' '.join(' '.join(parts[comment_start_index:]).replace('|', ' ').split())

    return [station_code] + data_fields + [comments]


def parse_train_file(input_file):
    """
    Parse individual train .txt file from https://dixielandsoftware.net/Amtrak/status/StatusPages/index.html archives

    Every station line of `input_file` becomes one CSV row. The CSV is written
    under "data/csv/", mirroring the input's location relative to
    "data/extracted" (e.g. data/extracted/2011/66/x.txt -> data/csv/2011/66/x.csv).
    The output folder is created if it does not exist yet.

    Parameters
    ----------
    input_file : str
        Path to the status text file.

    Returns
    -------
    None
        The result is the CSV file on disk. If `input_file` does not exist, a
        message is printed and nothing is written.

    Raises
    ------
    UnicodeDecodeError
        Propagated if the file cannot be decoded with the default encoding.
    """
    # CSV column headers
    headers = [
        'Station Code',
        'Schedule Arrival Day',
        'Schedule Arrival Time',
        'Schedule Departure Day',
        'Schedule Departure Time',
        'Actual Arrival Time',
        'Actual Departure Time',
        'Comments'
    ]

    # Only the read is guarded, so a problem on the output side is never
    # misreported as a missing input file.
    try:
        with open(input_file, 'r') as data_file:
            # Iterate the file lazily rather than loading it with readlines().
            parsed_data = [
                row for row in map(_parse_status_line, data_file)
                if row is not None
            ]
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return

    output_directory = "data/csv/"
    input_file_root = os.path.splitext(input_file)[0]

    # str.lstrip() removes a *set of characters*, not a prefix, so it would also
    # eat the start of names such as "test/..." ; relpath removes exactly the
    # "data/extracted" prefix and copes with a leading "./" as well.
    input_file_root = os.path.relpath(input_file_root, "data/extracted")
    if input_file_root == os.pardir or input_file_root.startswith(os.pardir + os.sep):
        # Input lives outside data/extracted: keep the output inside data/csv/.
        input_file_root = os.path.basename(os.path.splitext(input_file)[0])

    output_filename = os.path.join(output_directory, f"{input_file_root}.csv")
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    # Write parsed data to CSV
    df = pd.DataFrame(parsed_data, columns=headers)
    df.to_csv(output_filename, index=False)

## 1. Unzip all files to extracted folders

In [8]:
# Extract every per-train archive of every year into ./data/extracted/<year>/.
# An archive that is missing or cannot be extracted is reported and skipped so
# the remaining archives are still processed.
for year in years:
    # The destination depends only on the year, so create it once per year.
    extract_to = f"./data/extracted/{year}/"
    os.makedirs(extract_to, exist_ok=True)

    for train_num in all_trains:
        zip_path = f"./data/zips/{year}/{train_num}.zip"
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        except Exception:
            # Still best-effort for any extraction error, but unlike a bare
            # `except:` this lets KeyboardInterrupt / SystemExit through, so the
            # long-running loop can be interrupted.
            print(f"Failed to unzip {year} train number {train_num} data.")

## 2. Create output csv subfolders

In [9]:
# Create one output folder per (year, train number) under ./data/csv/.
for year in years:
    for train_num in all_trains:
        train_subfolder = f"./data/csv/{year}/{train_num}"
        try:
            # exist_ok=True makes this safe to re-run.
            os.makedirs(train_subfolder, exist_ok=True)
        except OSError:
            # os.makedirs reports failures (permissions, a file in the way, ...)
            # as OSError; catching only that keeps the loop interruptible.
            print(f"Failed to create folder for train number {train_num}.")

## 3. Parse all files

In [10]:
# Convert every extracted status file to CSV, year by year and train by train.
for year in years:
    print(year)
    for train_num in all_trains:
        input_dir = f"data/extracted/{year}/{train_num}"
        # Not every train has data for every year; isdir() also rules out a
        # plain file with that name, which os.listdir() would choke on.
        if not os.path.isdir(input_dir):
            continue

        # List the directory once and reuse it for both the count and the loop.
        file_names = os.listdir(input_dir)
        num_files = len(file_names)
        print(f"\tNum files for {year} and {train_num}: {num_files}")

        for file_name in file_names:
            filepath = os.path.join(input_dir, file_name)
            try:
                parse_train_file(filepath)
            except UnicodeDecodeError:
                # Report the undecodable file and carry on with the rest.
                print(filepath)

2011
	Num files for 2011 and 66: 365
	Num files for 2011 and 67: 365
	Num files for 2011 and 82: 57
	Num files for 2011 and 86: 252
	Num files for 2011 and 88: 112
	Num files for 2011 and 93: 202
	Num files for 2011 and 94: 253
	Num files for 2011 and 95: 253
	Num files for 2011 and 99: 112
	Num files for 2011 and 132: 51
	Num files for 2011 and 135: 108
	Num files for 2011 and 137: 246
	Num files for 2011 and 139: 51
	Num files for 2011 and 150: 108
	Num files for 2011 and 160: 108
	Num files for 2011 and 161: 108
	Num files for 2011 and 162: 108
	Num files for 2011 and 163: 108
	Num files for 2011 and 164: 111
	Num files for 2011 and 165: 107
	Num files for 2011 and 166: 51
	Num files for 2011 and 167: 58
	Num files for 2011 and 168: 58
	Num files for 2011 and 169: 107
	Num files for 2011 and 170: 246
	Num files for 2011 and 171: 253
	Num files for 2011 and 172: 245
	Num files for 2011 and 173: 246
	Num files for 2011 and 174: 253
	Num files for 2011 and 175: 246
	Num files for 2011 