# ETL and EDA Notebook

# Part 1 - Amtrak Northeast Regional Train Data

## SECTION 1A - Setup

#### Imports for processing train data

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import date, timedelta
import re
import requests
import lxml.html as lh

#### Helper functions for retrieving the data

In [2]:
def make_dict():
    """
    Creates dictionary to hold raw data sorted by arrival and direction, then by station.
    """
    dictionary = {'Arrive': {s: [] for s in ['NYP', 'BOS', 'WAS']},
                  'Depart': {s: [] for s in ['BOS', 'WAS', 'NHV', 'NYP', 'PHL', 'BAL',
                                             'PVD', 'WIL', 'BWI', 'NWK', 'BBY', 'RTE',
                                             'TRE', 'STM', 'NCR', 'KIN', 'NLC']}}
    return dictionary

def convert_train_nums_to_string(train_nums_list):
    """
    Give a list of train numbers, converts it to a string that can be used in a url.
    """
    output = str(train_nums_list[0])
    for train_num in train_nums_list[1:]:
        output += '%2C' + str(train_num)
    return output


def convert_dates_to_string(dt_start, dt_end):
    """
    Function to convert a date object to a url string.
    """
    start = '&date_start=' + str(dt_start.month) + '%2F' + str(dt_start.day) + \
        '%2F' + str(dt_start.year)
    end = '&date_end=' + str(dt_end.month) + '%2F' + str(dt_end.day) + \
        '%2F' + str(dt_end.year)
    return start + end


def construct_urls(northbound_trains, southbound_trains, start_date, end_date):
    """
    Inputs: 2 lists of lists of train numbers, 2 dates
        - list of northbound train subset lists
        - list of southbound train subset lists
        - start date for fetching data
        - end date for fetching data
    Outputs: dictionary of urls based on arrivals and departures from select stations
    """
    URL_BEGIN = 'https://juckins.net/amtrak_status/archive/html/history.php?train_num='
    URL_END_DP = '&df1=1&df2=1&df3=1&df4=1&df5=1&df6=1&df7=1&sort=schDp&sort_dir=ASC&co=gt&limit_mins=&dfon=1'
    URL_END_AR = '&df1=1&df2=1&df3=1&df4=1&df5=1&df6=1&df7=1&sort=schAr&sort_dir=ASC&co=gt&limit_mins=&dfon=1'
    dates_string = convert_dates_to_string(start_date, end_date)
    arrive = ['NYP']
    depart = ['NHV', 'NYP', 'PHL', 'BAL', 'PVD', 'WIL', 'BWI', 'NWK',
              'BBY', 'RTE', 'TRE', 'STM', 'NCR', 'KIN', 'NLC']
    urls = {'Arrive': [], 'Depart': []}
    for trains_list in northbound_trains:
        trains_string = convert_train_nums_to_string(trains_list)
        URL_BASE = URL_BEGIN + trains_string + dates_string + '&station='
        for station in depart + ['WAS']:
            urls['Depart'].append((station, URL_BASE + station + URL_END_DP))
        for station in arrive + ['BOS']:
            urls['Arrive'].append((station, URL_BASE + station + URL_END_AR))
    for trains_list in southbound_trains:
        trains_string = convert_train_nums_to_string(trains_list)
        URL_BASE = URL_BEGIN + trains_string + dates_string + '&station='
        for station in depart + ['BOS']:
            urls['Depart'].append((station, URL_BASE + station + URL_END_DP))
        for station in arrive + ['WAS']:
            urls['Arrive'].append((station, URL_BASE + station + URL_END_AR))
    return urls


def make_request(url):
    """
    Given a url, request the data and return the page content or None if
    retrieving data failed on the first try.
    """
    page = None
    try:
        req = requests.get(url, timeout=20)
        page = req.content
    except requests.exceptions.RequestException as e:
        print("An exception occurred: ", e)
    return page


def fetch_data_from_urls(urls):
    """
    For each url in the urls dictionary, fetch the data and save it to a
    dictionary for later access.
    """
    raw_data = make_dict()
    start_time = time.time()
    for station, url in urls['Depart']:
        data = make_request(url)
        raw_data['Depart'][station].append(data)
    for station, url in urls['Arrive']:
        raw_data['Arrive'][station].append(make_request(url))
    print('Retrieved data in {} seconds'.format(time.time() - start_time))
    return raw_data

#### Function to retrieve the raw HTML data from website
* Train data sourced from [Chris Juckins' website - Amtrak Status Maps Archive Database (ASMAD)](https://juckins.net/amtrak_status/archive/html/home.php), retrieved with his permission.
* Thank you, Chris!

In [3]:
def get_data(start=date.today()-timedelta(days=1), end=date.today()):
    """
    Function to retrieve new data from the website for specified dates, or else skip to next step.
    """
    # If querying a long time period, it is better to use smaller groups of trains (i.e. more requests)
    # northbound = [[66, 82, 86, 88], [94, 132, 150], [160, 162, 164, 166], [168, 170, 172, 174]]
    # southbound = [[67, 83, 93, 95], [99, 135, 137, 139], [161, 163, 165,167],[171, 173, 175, 195]]
    
    # If only querying a few days, we can just do them all at once
    northbound = [[66, 82, 86, 88, 94, 132,150, 160, 162, 164, 166, 168, 170, 172, 174]]
    southbound = [[67, 83, 93, 95, 99, 135, 137, 139, 161, 163, 165, 167, 171, 173, 175, 195]]
    # Function can be found in fetch_data.py. It constructs the proper URL to run the query
    urls = construct_urls(northbound, southbound, start, end)
    # The function that retrieves the raw data
    data = fetch_data_from_urls(urls)
    return data

#### Helper functions for data cleaning and processing

In [4]:
def get_direction(num):
    """
    Return direction of the train (odd = Southbound, even = Northbound).
    """
    if num % 2 == 0:
        return 'Northbound'
    else:
        return 'Southbound'


def get_num(re_match):
    """
    Assuming input contains a match , extract and return the numerical data from input.
    """
    num_match = re.search('(?P<num>[0-9]+)', re_match)
    return int(num_match.group('num'))


def make_dict_from_cols(col_names):
    """
    Create dictionary from a list of column names
    """
    dictionary = { col_name: [] for col_name in col_names }
    return dictionary


def get_html_col_names(raw_data, arrive_or_depart):
    """
    Using NYP (station with both arrival times and departure times), 
    retrieve column names from the HTML table, located in the 2nd row.
    """
    data_list = raw_data[arrive_or_depart]['NYP']
    page_content = data_list[0]
    doc = lh.fromstring(page_content)
    tr_elements = doc.xpath('//tr')
    html_col_names = [entry.text_content().strip() for entry in tr_elements[1]]        
    return html_col_names

#### Main function to convert html data to a basic pandas data frame

In [5]:
def raw_data_to_raw_df(raw_data, arrive_or_depart):
    """
    Function to put the raw html data in a dataframe for ease of processing.
    """
    col_names = get_html_col_names(raw_data, arrive_or_depart)
    N = 7
    data_dict = make_dict_from_cols(['Direction', 'Station'] + col_names)
    for station in raw_data[arrive_or_depart].keys():
        data_list = raw_data[arrive_or_depart][station]
        L = len(data_list)
        for i in range(L):
            page_content = data_list[i]
            doc = lh.fromstring(page_content)
            tr_elements = doc.xpath('//tr')
            if len(tr_elements) > 3:
                title = tr_elements[0].text_content()
                direction = get_direction(get_num(title))
                for j in range(2, len(tr_elements)):
                    table_row = tr_elements[j] 
                    if len(table_row) == N:
                        data_dict['Direction'].append(direction)
                        data_dict['Station'].append(station)
                        for col_name, entry in zip(col_names, table_row):
                            data = entry.text_content()
                            data_dict[col_name].append(data)
                    else:
                        continue            
            else:
                print("No data for this period, or an error occurred", station, arrive_or_depart)
    return pd.DataFrame.from_dict(data_dict)

## SECTION 1B - Retrieving Data

#### Retrieving new data
* Run cell below and wait for the request to complete

In [6]:
start =  date(2021,4,1)
end = date(2021,4,8) 
data = get_data(start=start, end=end)

Retrieved data in 21.8351628780365 seconds


In [7]:
print(data.keys())

dict_keys(['Arrive', 'Depart'])


## SECTION 1C - Save Raw Data to File

#### Process departure data to clean format

In [8]:
start_time = time.time()
depart =  raw_data_to_raw_df(data, 'Depart')
print('Elapsed:', time.time() - start_time)
depart.head()

Elapsed: 0.07958030700683594


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Dp,Act Dp,Comments,Service Disruption,Cancellations
0,Southbound,BOS,95,04/01/2021 (Th),04/01/2021 6:10 AM (Th),6:11AM,Dp: 1 min late.,,
1,Southbound,BOS,171,04/01/2021 (Th),04/01/2021 8:15 AM (Th),8:16AM,Dp: 1 min late.,,
2,Southbound,BOS,93,04/01/2021 (Th),04/01/2021 9:30 AM (Th),9:31AM,Dp: 1 min late.,,
3,Southbound,BOS,173,04/01/2021 (Th),04/01/2021 11:15 AM (Th),11:15AM,Dp: On time.,,
4,Southbound,BOS,137,04/01/2021 (Th),04/01/2021 1:40 PM (Th),1:41PM,Dp: 1 min late.,,


#### Proces arrival data to clean format

In [9]:
start_time = time.time()
arrive = raw_data_to_raw_df(data, 'Arrive')
print('elapsed:', time.time() - start_time)
arrive.head()

elapsed: 0.020621061325073242


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Ar,Act Ar,Comments,Service Disruption,Cancellations
0,Northbound,NYP,66,03/31/2021 (We),04/01/2021 1:25 AM (Th),1:20AM,Ar: 5 min early. | Dp: 1 min late.,,
1,Northbound,NYP,170,04/01/2021 (Th),04/01/2021 8:15 AM (Th),8:15AM,Ar: On time. | Dp: On time.,,
2,Northbound,NYP,172,04/01/2021 (Th),04/01/2021 10:44 AM (Th),10:48AM,Ar: 4 min late. | Dp: 1 min late.,,
3,Northbound,NYP,174,04/01/2021 (Th),04/01/2021 1:35 PM (Th),1:34PM,Ar: 1 min early. | Dp: On time.,,
4,Northbound,NYP,94,04/01/2021 (Th),04/01/2021 5:22 PM (Th),5:24PM,Ar: 2 min late. | Dp: On time.,,


#### Save the raw data

In [10]:
arrive_filestring = './data/trains/raw_arrive_' + str(start) + '_' + str(end) + '.csv'
depart_filestring = './data/trains/raw_depart_' + str(start) + '_' +  str(end) + '.csv'
arrive.to_csv(arrive_filestring, line_terminator='\n', index=False)
depart.to_csv(depart_filestring, line_terminator='\n', index=False)

In [11]:
print(arrive.shape[0])
arrive.head()

183


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Ar,Act Ar,Comments,Service Disruption,Cancellations
0,Northbound,NYP,66,03/31/2021 (We),04/01/2021 1:25 AM (Th),1:20AM,Ar: 5 min early. | Dp: 1 min late.,,
1,Northbound,NYP,170,04/01/2021 (Th),04/01/2021 8:15 AM (Th),8:15AM,Ar: On time. | Dp: On time.,,
2,Northbound,NYP,172,04/01/2021 (Th),04/01/2021 10:44 AM (Th),10:48AM,Ar: 4 min late. | Dp: 1 min late.,,
3,Northbound,NYP,174,04/01/2021 (Th),04/01/2021 1:35 PM (Th),1:34PM,Ar: 1 min early. | Dp: On time.,,
4,Northbound,NYP,94,04/01/2021 (Th),04/01/2021 5:22 PM (Th),5:24PM,Ar: 2 min late. | Dp: On time.,,


In [12]:
print(depart.shape[0])
depart.head()

1450


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Dp,Act Dp,Comments,Service Disruption,Cancellations
0,Southbound,BOS,95,04/01/2021 (Th),04/01/2021 6:10 AM (Th),6:11AM,Dp: 1 min late.,,
1,Southbound,BOS,171,04/01/2021 (Th),04/01/2021 8:15 AM (Th),8:16AM,Dp: 1 min late.,,
2,Southbound,BOS,93,04/01/2021 (Th),04/01/2021 9:30 AM (Th),9:31AM,Dp: 1 min late.,,
3,Southbound,BOS,173,04/01/2021 (Th),04/01/2021 11:15 AM (Th),11:15AM,Dp: On time.,,
4,Southbound,BOS,137,04/01/2021 (Th),04/01/2021 1:40 PM (Th),1:41PM,Dp: 1 min late.,,


## SECTION 1D - Transforming Raw Data Columns to Processed Form

#### Helper functions to process the columns

In [13]:
def get_col_names(arrive_or_depart):
    """
    This function returns the proper column names for the data depending on whether the
    data being processed is arrival or departure data
    """
    
    if arrive_or_depart == 'Arrive':
        return ['Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year',
                'Origin Quarter', 'Origin Month', 'Origin Day', 'Origin Week Day',
                'Full Sch Ar Date', 'Sch Ar Date', 'Sch Ar Day', 'Sch Ar Time',
                'Act Ar Time', 'Arrive Diff', 'Service Disruption', 'Cancellations']
    elif arrive_or_depart == 'Depart':
        return [ 'Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year', 
                 'Origin Quarter', 'Origin Month', 'Origin Day', 'Origin Week Day',
                 'Full Sch Dp Date','Sch Dp Date', 'Sch Dp Day', 'Sch Dp Time',
                 'Act Dp Time', 'Depart Diff', 'Service Disruption', 'Cancellations']

    
def get_key_names(arrive_or_depart):
    """
    This function returns the proper keys to create the column names depending on 
    whether the data being processed is arrival or departure data.
    """
    if arrive_or_depart == 'Arrive':
        return {'Sch Full Date': 'Full Sch Ar Date', 'Sch Abbr': 'Sch Ar', 
                'Act Abbr': 'Act Ar', 'Diff': 'Arrive Diff'}
    
    elif arrive_or_depart == 'Depart':
        return {'Sch Full Date': 'Full Sch Dp Date', 'Sch Abbr': 'Sch Dp', 
                'Act Abbr': 'Act Dp', 'Diff': 'Depart Diff'}

#### Main function to process each column in the semi-raw data frame to a better form

In [14]:
def process_columns(df, arrive_or_depart):
    """
    This function takes an input of the initial data (a pandas data frame) and whether it is 
    arrival or departure data. It takes each column of the initial data and does various operations
    to create the fully processed data frame.
    """
    ad_keys = get_key_names(arrive_or_depart) # the specific keys depending on if new_df is for arr or dep data
    
    new_df = pd.DataFrame()
    new_df['Train Num'] = pd.to_numeric(df['Train #'])
    new_df['Station'] = df['Station']
    new_df['Direction'] = df['Direction']
    
    origin_date = pd.to_datetime(df['Origin Date'], format="%m/%d/%Y", exact=False, errors='coerce')    
    new_df['Origin Date'] = origin_date
    new_df['Origin Year'] = origin_date.dt.year
    new_df['Origin Quarter'] = origin_date.dt.quarter
    new_df['Origin Month'] = origin_date.dt.month
    new_df['Origin Day'] = origin_date.dt.day
    new_df['Origin Week Day'] = origin_date.dt.day_name()
    
    sched_full_date = pd.to_datetime(df[ad_keys['Sch Abbr']], format='%m/%d/%Y %I:%M %p', exact=False, errors='coerce')
    new_df[ad_keys['Sch Full Date']] = sched_full_date
    new_df[ad_keys['Sch Abbr'] + ' Date'] = sched_full_date.dt.date
    new_df[ad_keys['Sch Abbr'] + ' Day'] = sched_full_date.dt.day_name()
    new_df[ad_keys['Sch Abbr'] + ' Time'] = sched_full_date.dt.time
    act_time = pd.to_datetime(df[ad_keys['Act Abbr']], format='%I:%M%p', exact=False, errors='coerce')
    new_df[ad_keys['Act Abbr'] + ' Time'] = act_time.dt.time
    
    df['Sched Date'] = sched_full_date 
    df['Act Date'] = pd.to_datetime(sched_full_date.dt.date.astype(str) + " " + df[ad_keys['Act Abbr']].astype(str),exact=False, errors='coerce')
    max_expected_delay = pd.Timedelta(hours=10)
    delta = df['Act Date'] - df['Sched Date']
    m_late = (delta < max_expected_delay) & (-1*max_expected_delay > delta)
    m_early = (-1*delta < max_expected_delay) & (-1*max_expected_delay > -1*delta)
    df.loc[m_late, 'Act Date'] += pd.Timedelta(days=1)
    df.loc[m_early, 'Act Date'] -= pd.Timedelta(days=1)
    new_df[ad_keys['Diff']] = np.rint((df['Act Date'] - df['Sched Date']).dt.total_seconds()/60).astype(int)
    new_df['Service Disruption'] = df['Service Disruption'].replace('SD', 1).replace('', 0)
    new_df['Cancellations'] =  df['Cancellations'].replace('C', 1).replace('', 0)
    return new_df.replace('', np.nan).dropna()

#### Process the arrival data to final format

In [15]:
full_arrive = process_columns(arrive, 'Arrive')
full_arrive.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Ar Date,Sch Ar Date,Sch Ar Day,Sch Ar Time,Act Ar Time,Arrive Diff,Service Disruption,Cancellations
0,66,NYP,Northbound,2021-03-31,2021,1,3,31,Wednesday,2021-04-01 01:25:00,2021-04-01,Thursday,01:25:00,01:20:00,-5,0,0
1,170,NYP,Northbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 08:15:00,2021-04-01,Thursday,08:15:00,08:15:00,0,0,0
2,172,NYP,Northbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 10:44:00,2021-04-01,Thursday,10:44:00,10:48:00,4,0,0
3,174,NYP,Northbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 13:35:00,2021-04-01,Thursday,13:35:00,13:34:00,-1,0,0
4,94,NYP,Northbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 17:22:00,2021-04-01,Thursday,17:22:00,17:24:00,2,0,0


In [16]:
full_arrive.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Ar Date,Sch Ar Date,Sch Ar Day,Sch Ar Time,Act Ar Time,Arrive Diff,Service Disruption,Cancellations
178,171,WAS,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 16:17:00,2021-04-08,Thursday,16:17:00,16:13:00,-4,0,0
179,93,WAS,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 17:18:00,2021-04-08,Thursday,17:18:00,17:32:00,14,0,0
180,173,WAS,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 19:10:00,2021-04-08,Thursday,19:10:00,19:06:00,-4,0,0
181,137,WAS,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 21:52:00,2021-04-08,Thursday,21:52:00,22:08:00,16,0,0
182,175,WAS,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 23:16:00,2021-04-08,Thursday,23:16:00,23:12:00,-4,0,0


In [17]:
full_arrive.shape[0]

183

#### Process the departure data to final format

In [18]:
full_depart = process_columns(depart, "Depart")
full_depart.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
0,95,BOS,Southbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 06:10:00,2021-04-01,Thursday,06:10:00,06:11:00,1,0,0
1,171,BOS,Southbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 08:15:00,2021-04-01,Thursday,08:15:00,08:16:00,1,0,0
2,93,BOS,Southbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 09:30:00,2021-04-01,Thursday,09:30:00,09:31:00,1,0,0
3,173,BOS,Southbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 11:15:00,2021-04-01,Thursday,11:15:00,11:15:00,0,0,0
4,137,BOS,Southbound,2021-04-01,2021,2,4,1,Thursday,2021-04-01 13:40:00,2021-04-01,Thursday,13:40:00,13:41:00,1,0,0


In [19]:
full_depart.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
1445,93,NLC,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 11:12:00,2021-04-08,Thursday,11:12:00,11:12:00,0,0,0
1446,173,NLC,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 12:48:00,2021-04-08,Thursday,12:48:00,12:49:00,1,0,0
1447,137,NLC,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 15:38:00,2021-04-08,Thursday,15:38:00,15:38:00,0,0,0
1448,175,NLC,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 16:57:00,2021-04-08,Thursday,16:57:00,16:57:00,0,0,0
1449,67,NLC,Southbound,2021-04-08,2021,2,4,8,Thursday,2021-04-08 23:31:00,2021-04-08,Thursday,23:31:00,23:31:00,0,0,0


In [20]:
full_depart.shape[0]

1442

#### Special processing for new 2021 data: concatenate with previously retrieved 2021 data
* It is possible that there will be duplicates for , but these do not matter as the database ignores all repeated entries

In [21]:
prev_arrive_2021 = pd.read_csv('./data/trains/processed_arrive_2021.csv')
prev_depart_2021 = pd.read_csv('./data/trains/processed_depart_2021.csv')

In [22]:
new_arrive_2021 = pd.concat([prev_arrive_2021, full_arrive], ignore_index=True, axis=0)
new_depart_2021 = pd.concat([prev_depart_2021, full_depart], ignore_index=True, axis=0)

In [23]:
new_arrive_2021.to_csv('./data/trains/processed_arrive_2021.csv', line_terminator='\n', index=False)
new_depart_2021.to_csv('./data/trains/processed_depart_2021.csv', line_terminator='\n', index=False)

In [24]:
new_depart_2021.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
0,99,BOS,Southbound,2021-01-01 00:00:00,2021,1,1,1,Friday,2021-01-01 08:40:00,2021-01-01,Friday,08:40:00,08:40:00,0,0,0
1,99,BOS,Southbound,2021-01-02 00:00:00,2021,1,1,2,Saturday,2021-01-02 08:40:00,2021-01-02,Saturday,08:40:00,08:40:00,0,0,0
2,99,BOS,Southbound,2021-01-03 00:00:00,2021,1,1,3,Sunday,2021-01-03 08:40:00,2021-01-03,Sunday,08:40:00,08:40:00,0,0,0
3,67,BOS,Southbound,2021-01-03 00:00:00,2021,1,1,3,Sunday,2021-01-03 21:30:00,2021-01-03,Sunday,21:30:00,21:30:00,0,0,0
4,95,BOS,Southbound,2021-01-04 00:00:00,2021,1,1,4,Monday,2021-01-04 06:10:00,2021-01-04,Monday,06:10:00,06:11:00,1,0,0


In [25]:
new_depart_2021.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Quarter,Origin Month,Origin Day,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
18027,93,NLC,Southbound,2021-04-08 00:00:00,2021,2,4,8,Thursday,2021-04-08 11:12:00,2021-04-08,Thursday,11:12:00,11:12:00,0,0,0
18028,173,NLC,Southbound,2021-04-08 00:00:00,2021,2,4,8,Thursday,2021-04-08 12:48:00,2021-04-08,Thursday,12:48:00,12:49:00,1,0,0
18029,137,NLC,Southbound,2021-04-08 00:00:00,2021,2,4,8,Thursday,2021-04-08 15:38:00,2021-04-08,Thursday,15:38:00,15:38:00,0,0,0
18030,175,NLC,Southbound,2021-04-08 00:00:00,2021,2,4,8,Thursday,2021-04-08 16:57:00,2021-04-08,Thursday,16:57:00,16:57:00,0,0,0
18031,67,NLC,Southbound,2021-04-08 00:00:00,2021,2,4,8,Thursday,2021-04-08 23:31:00,2021-04-08,Thursday,23:31:00,23:31:00,0,0,0


# Part 2 - Visual Crossing Weather Data


## SECTION 2A - Setup

#### Imports for retrieving weather data

In [None]:
import requests
import pandas as pd
import os

In [None]:
assert os.environ.get('VC_TOKEN') != None , 'empty token!'

#### All locations in lists containing proper URL format and filename format

In [None]:
locations = ['Boston,MA', 'Providence,RI', 'Kingston,RI', 'New%20London,CT', 'New%20Haven,CT', 'Stamford,CT', 
             'Manhattan,NY', 'Newark,NJ', 'Trenton,NJ', 'Philadelphia,PA', 'Wilmington,DE', 'Baltimore,MD', 
             'Baltimore%20BWI%20Airport,MD', 'New%20Carrollton,MD', 'Washington,DC']

location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT', 'New_Haven_CT', 
                            'Stamford_CT', 'Manhattan_NY', 'Newark_NJ', 'Trenton_NJ', 'Philadelphia_PA', 
                            'Wilmington_DE', 'Baltimore_MD', 'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 
                            'Washington_DC']

#### Dates List
* Showing start and end dates for the yearly weather files

In [None]:
OLD_DATES_LIST = """
dates_list = [('2011-01-01','2012-01-01'), 
              ('2012-01-01','2013-01-01'),
              ('2013-01-01','2014-01-01'),
              ('2014-01-01','2015-01-01'),
              ('2015-01-01','2016-01-01'),
              ('2016-01-01','2017-01-01'),
              ('2017-01-01','2018-01-01'),
              ('2018-01-01','2019-01-01'),
              ('2019-01-01','2020-01-01'),
              ('2020-01-01','2021-01-01')]"""

## SECTION 2B - Retrieve the data

#### Retrieve and save the raw data to CSV

In [None]:
dates_list = [('2021-04-02', '2021-04-08')]

In [None]:
url_base = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=1&startDateTime='

for location, filename in zip(locations, location_names_for_files):
    print('Running urls for', filename)
    for startdate, enddate in dates_list:
        url = url_base + startdate + 'T00:00:00&endDateTime=' + enddate + 'T23:59:00&unitGroup=us&contentType=csv&location=' + location + '&key='+ os.environ.get('VC_TOKEN')
        csv_bytes = requests.get(url).content
        filestring = './data/weather_original/' + filename + '_weather_data_' + startdate + '_' + enddate + '.csv'
        with open(filestring, 'w', newline='\n') as csvfile:
            csvfile.write(csv_bytes.decode())
        csvfile.close()

## SECTION 2C - Data Cleaning

#### Run if reopening file and variables are not in the workspace

In [None]:
locations = ['Boston,MA', 'Providence,RI', 'Kingston,RI', 'New%20London,CT', 'New%20Haven,CT', 'Stamford,CT', 
             'Manhattan,NY', 'Newark,NJ', 'Trenton,NJ', 'Philadelphia,PA', 'Wilmington,DE', 'Baltimore,MD', 
             'Baltimore%20BWI%20Airport,MD', 'New%20Carrollton,MD', 'Washington,DC']

location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT', 'New_Haven_CT', 
                            'Stamford_CT', 'Manhattan_NY', 'Newark_NJ', 'Trenton_NJ', 'Philadelphia_PA', 
                            'Wilmington_DE', 'Baltimore_MD', 'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 
                            'Washington_DC']

In [None]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
dates_list = ('2021-04-02', '2021-04-08')

####  Processing old data by year - fixing the old data and adding columns for separate date and time
* Select subset of columns and make minor fixes to data

In [None]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
for location in location_names_for_files:
    for year in years:
        filename = './data/weather/' + location + '_weather_' + str(year) + '_subset.csv'
        full_weather = pd.read_csv(filename)
        full_datetime = pd.to_datetime(full_weather['Date time'], format='%Y-%m-%d %H:%M:%S', exact=False, errors='raise')    
        full_weather['Date'] = full_datetime.dt.date
        full_weather['Time'] = full_datetime.dt.time
        cols_reordered = ['Address', 'Date time', 'Date', 'Time', 'Latitude', 'Longitude', 'Temperature', 
                          'Precipitation', 'Cloud Cover', 'Conditions']
        full_weather = full_weather[cols_reordered]
        full_weather.to_csv(filename, index=False)

#### Processing recent data by year - add new columns, make minor fixes to string format, take subset of full columns list
* This part is assuming 2021 data is being read and concatenates the previously retrieved data with the new data to create a single combined file

In [None]:
for location in location_names_for_files:
    for startdate, enddate in dates_list:
        filename = './data/weather_original/' + location + '_weather_data_' + startdate + '_' + enddate + '.csv'
        cols_list = ['Address', 'Date time', 'Latitude', 'Longitude', 'Temperature', 
                     'Precipitation', 'Cloud Cover', 'Conditions']
        full_weather = pd.read_csv(filename, usecols = cols_list)
        full_weather['Address'] = full_weather['Address'].str.replace(',', ', ')
        full_datetime = pd.to_datetime(full_weather['Date time'], format='%m/%d/%Y %H:%M:%S', exact=False, errors='raise')    
        full_weather['Date'] = full_datetime.dt.date
        full_weather['Time'] = full_datetime.dt.time
        dropna_weather = full_weather.replace('', np.nan).dropna()
        print(location, 'fraction of data kept: ', dropna_weather.shape[0]/full_weather.shape[0])
        prev_weather = pd.read_csv('./data/weather/' +  location + '_weather_2021_subset.csv')
        combined_weather = pd.concat([prev_weather, dropna_weather], ignore_index=True, axis=0)
        cols_reordered = ['Address', 'Date time', 'Date', 'Time', 'Latitude', 'Longitude', 'Temperature', 
                          'Precipitation', 'Cloud Cover', 'Conditions']
        combined_weather = combined_weather[cols_reordered]
        print(combined_weather.shape)
        combined_weather.to_csv('./data/weather/' +  location + '_weather_2021_subset.csv', index=False)

#### Data sample for viewing

In [26]:
sample = pd.read_csv('./data/weather/Providence_RI_weather_2021_subset.csv')
sample.head()

Unnamed: 0,Address,Date time,Date,Time,Latitude,Longitude,Temperature,Precipitation,Cloud Cover,Conditions
0,"Providence, RI",2021-01-01 00:00:00,2021-01-01,00:00:00,41.8239,-71.412,31.7,0.0,0.0,Clear
1,"Providence, RI",2021-01-01 01:00:00,2021-01-01,01:00:00,41.8239,-71.412,30.2,0.0,0.0,Clear
2,"Providence, RI",2021-01-01 02:00:00,2021-01-01,02:00:00,41.8239,-71.412,29.8,0.0,0.0,Clear
3,"Providence, RI",2021-01-01 03:00:00,2021-01-01,03:00:00,41.8239,-71.412,27.2,0.0,0.0,Clear
4,"Providence, RI",2021-01-01 04:00:00,2021-01-01,04:00:00,41.8239,-71.412,27.6,0.0,0.0,Clear


In [27]:
sample.tail()

Unnamed: 0,Address,Date time,Date,Time,Latitude,Longitude,Temperature,Precipitation,Cloud Cover,Conditions
2391,"Providence, RI",04/08/2021 19:00:00,2021-04-08,19:00:00,41.8239,-71.412,58.0,0.0,28.3,Partially cloudy
2392,"Providence, RI",04/08/2021 20:00:00,2021-04-08,20:00:00,41.8239,-71.412,54.1,0.0,14.1,Clear
2393,"Providence, RI",04/08/2021 21:00:00,2021-04-08,21:00:00,41.8239,-71.412,49.7,0.0,0.0,Clear
2394,"Providence, RI",04/08/2021 22:00:00,2021-04-08,22:00:00,41.8239,-71.412,47.5,0.0,0.0,Clear
2395,"Providence, RI",04/08/2021 23:00:00,2021-04-08,23:00:00,41.8239,-71.412,45.9,0.0,0.0,Clear


# Part 3: Loading Data into Postgres Database

## SECTION 3A - Setup

#### Imports for loading into database

In [28]:
import psycopg2
import csv
import os
import sys 
import time
assert os.environ.get('DB_PASS') != None , 'empty password!'

#### Functions to create tables in the database

In [30]:
def create_data_tables(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS train_info CASCADE;
        CREATE TABLE train_info (
            train_info_id SERIAL PRIMARY KEY,
            train_num text UNIQUE,
            operating_direction text,
            reg_operates_on_mon boolean,
            reg_operates_on_tues boolean,
            reg_operates_on_wed boolean,
            reg_operates_on_thurs boolean,
            reg_operates_on_fri boolean,
            reg_operates_on_sat boolean,
            reg_operates_on_sun boolean,
            depart_origin_time text,
            depart_NY_time text,
            arrive_dest_time text
            
        );
        """,
        """ 
        DROP TABLE IF EXISTS arrivals CASCADE;
        CREATE TABLE arrivals (
            dataset_id SERIAL PRIMARY KEY,
            train_num text REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_arr_datetime timestamp,
            sched_arr_date date,
            sched_arr_week_day text,
            sched_arr_time time,
            act_arr_time time,
            arrive_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """,
        """ 
        DROP TABLE IF EXISTS departures CASCADE;
        CREATE TABLE departures (
            dataset_id SERIAL PRIMARY KEY,
            train_num text REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_dep_datetime timestamp,
            sched_dep_date date,
            sched_dep_week_day text,
            sched_dep_time time,
            act_dep_time time,
            depart_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """,
        """
        DROP TABLE IF EXISTS weather_hourly CASCADE;
        CREATE TABLE weather_hourly (
            weather_id SERIAL PRIMARY KEY,
            location text,
            date_time timestamp,
            date date,
            time time,
            latitude real,
            longitude real,
            temperature real,
            precipitation real,
            cloud_cover real,
            conditions text
        );
        """
    ]
    try:
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)

In [31]:
def update_train_info_table(conn, csv_file):
    c = conn.cursor()
    command = """INSERT INTO train_info (train_num, operating_direction, reg_operates_on_mon, 
                   reg_operates_on_tues, reg_operates_on_wed, reg_operates_on_thurs, 
                   reg_operates_on_fri, reg_operates_on_sat, reg_operates_on_sun, 
                   depart_origin_time, depart_NY_time, arrive_dest_time)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""               
                
    with open(csv_file, newline='') as file:
        info_reader = csv.reader(file, delimiter=',')
        next(info_reader) # skip header                                                                          
        for row in info_reader:                                           
            try:
                c.execute(command, tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                conn.rollback()
        conn.commit() 


def update_arrive_table(conn, csv_file):
    c = conn.cursor()
    command = """INSERT INTO arrivals (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_arr_datetime, sched_arr_date, sched_arr_week_day,
                               sched_arr_time, act_arr_time, arrive_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""                      
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)     # skip header                                                                         
        for row in train_reader:                                           
            try:
                c.execute(command, tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()


def update_depart_table(conn, csv_file):
    c = conn.cursor()
    command = """INSERT INTO departures (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_dep_datetime, sched_dep_date, sched_dep_week_day,
                               sched_dep_time, act_dep_time, depart_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)   # skip header                                                                           
        for row in train_reader:                                           
            try:
                c.execute(command, tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

In [39]:
def update_weather_table(conn, csv_file):
    c = conn.cursor()
    command = """INSERT INTO weather_hourly (location, date_time, date, time, latitude, 
                   longitude, temperature, precipitation, cloud_cover, conditions)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""                
                   
    with open(csv_file, newline='') as file: 
        data_reader = csv.reader(file, delimiter=',')
        next(data_reader, None)   # skip header                                                                           
        for row in data_reader:                                           
            try:
                c.execute(command, tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

## SECTION 3B - Load Train Data

#### Create connection to database

In [34]:
DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)
assert conn is not None, 'need to fix conn!!'

#### Create tables for train arrivals and departures, train info, and hourly weather data

In [35]:
create_data_tables(conn)

#### Load train data into database

In [37]:
conn = psycopg2.connect(DSN)

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

begin_everything = time.time()

update_train_info_table(conn, './data/trains/train_nums.csv')

for year in years:
    start = time.time()
    arrive_csv = './data/trains/processed_arrive_' + str(year) + '.csv'
    depart_csv = './data/trains/processed_depart_' + str(year) + '.csv'
    update_arrive_table(conn, arrive_csv)
    update_depart_table(conn, depart_csv)
    print('DONE WITH', year, 'in', time.time() - start)
print('COMPLETE in', time.time() - begin_everything)

DONE WITH 2011 in 3.511457681655884
DONE WITH 2012 in 3.423931837081909
DONE WITH 2013 in 3.6111598014831543
DONE WITH 2014 in 3.753896951675415
DONE WITH 2015 in 3.7109007835388184
DONE WITH 2016 in 3.804558038711548
DONE WITH 2017 in 3.7870168685913086
DONE WITH 2018 in 3.8256869316101074
DONE WITH 2019 in 3.8487741947174072
DONE WITH 2020 in 5.315364122390747
DONE WITH 2021 in 0.8932220935821533
COMPLETE in 39.49402904510498


## SECTION 3C - Load Weather Data

#### Load data

In [40]:
conn = psycopg2.connect(DSN)

location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT', 'New_Haven_CT', 
                            'Stamford_CT', 'Manhattan_NY', 'Newark_NJ', 'Trenton_NJ', 'Philadelphia_PA', 
                            'Wilmington_DE', 'Baltimore_MD', 'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 
                            'Washington_DC']

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

begin_everything = time.time()
for location in location_names_for_files:
    start = time.time()
    for year in years:
        weather_csv = './data/weather/' + location + '_weather_' + str(year) + '_subset.csv'
        update_weather_table(conn, weather_csv)
    print('Finished adding location', location, 'to the database in', time.time() - start, 'seconds')
print("COMPLETE in", time.time() - begin_everything)

Finished adding location Boston_MA to the database in 2.7674989700317383 seconds
Finished adding location Providence_RI to the database in 2.702225923538208 seconds
Finished adding location Kingston_RI to the database in 2.7043211460113525 seconds
Finished adding location New_London_CT to the database in 2.7071149349212646 seconds
Finished adding location New_Haven_CT to the database in 2.738985776901245 seconds
Finished adding location Stamford_CT to the database in 2.713691234588623 seconds
Finished adding location Manhattan_NY to the database in 2.7072160243988037 seconds
Finished adding location Newark_NJ to the database in 2.753577947616577 seconds
Finished adding location Trenton_NJ to the database in 2.713850736618042 seconds
Finished adding location Philadelphia_PA to the database in 2.757599115371704 seconds
Finished adding location Wilmington_DE to the database in 2.873321056365967 seconds
Finished adding location Baltimore_MD to the database in 2.924015998840332 seconds
Fini

In [41]:
%load_ext sql

In [42]:
%sql postgresql://appuser:test@localhost:5432/amtrakproject

In [43]:
%%sql

SELECT COUNT(*)
FROM weather_hourly;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
1349722


In [44]:
%%sql

SELECT COUNT(*)
FROM departures;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
862239


## SECTION 3D - Add Stations Table Linking Weather Stations and Amtrak Stations

#### Imports for updating database

In [1]:
import pandas as pd
import psycopg2
import csv
import os
import sys 
import time
assert os.environ.get('DB_PASS') != None , 'empty password!'

In [2]:
DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)
assert conn is not None, 'need to fix conn!!'

In [3]:
geo_stations = pd.read_csv("./data/visualization/geo_stations_info.csv")

In [4]:
geo_stations

Unnamed: 0,STNNAME,CITY2,STATE,LON,LAT,Northbound Mile,Southbound Mile,STNCODE
0,"Boston (South Station), Massachusetts",Boston,MA,-71.05517,42.351642,457,0,BOS
1,"Boston (Back Bay), Massachusetts",Boston,MA,-71.075828,42.347317,456,1,BBY
2,"Westwood, Route 128 Station, Massachusetts",Route 128,MA,-71.147894,42.210242,446,11,RTE
3,"Providence, Rhode Island",Providence,RI,-71.413478,41.82949,414,43,PVD
4,"West Kingston, Rhode Island",Kingston,RI,-71.560597,41.483959,387,70,KIN
5,"New London, Connecticut",New London,CT,-72.093225,41.354267,352,105,NLC
6,"New Haven, Connecticut",New Haven,CT,-72.92667,41.297714,301,156,NHV
7,"Stamford, Connecticut",Stamford,CT,-73.54216,41.04713,262,195,STM
8,"New York (Penn Station), New York",New York,NY,-73.994459,40.750327,226,231,NYP
9,"Newark (Penn Station), New Jersey",Newark,NJ,-74.16475,40.734706,216,241,NWK


In [5]:
def create_stations_table(conn):
    """
    Create table that can link weather data to train station data.
    """
    command = """ 
        DROP TABLE IF EXISTS station_info CASCADE;
        CREATE TABLE station_info (
            station_info_id SERIAL PRIMARY KEY,
            station_name text,
            city text,
            state text,
            longitude real,
            latitude real, 
            nb_mile numeric,
            sb_mile numeric,
            station_code text
        );
        """
    try:
        cur = conn.cursor()
        cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
        conn.rollback()

In [6]:
def update_stations_table(conn, csv_file):
    c = conn.cursor()
    command = """INSERT INTO station_info (station_name, city, state,
                   longitude, latitude, nb_mile, sb_mile, station_code)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""                
                   
    with open(csv_file, newline='') as file: 
        data_reader = csv.reader(file, delimiter=',')
        next(data_reader, None)   # skip header                                                                           
        for row in data_reader:                                           
            try:
                c.execute(command, tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

In [7]:
create_stations_table(conn)

In [8]:
update_stations_table(conn, "./data/visualization/geo_stations_info.csv")