# ETL and EDA Notebook

# Part 1 - Amtrak Northeast Regional Train Data

## SECTION 1A - Setup

#### Imports for processing train data

In [1]:
import time
import requests
import re
import lxml.html as lh
import pandas as pd
import numpy as np
from datetime import date, timedelta

#### Helper functions for retrieving the data

In [2]:
def make_dict():
    """
    Build the empty container for the raw pages: one list per station code,
    grouped under the 'Arrive' and 'Depart' keys.
    """
    arrival_stations = ('NYP', 'BOS', 'WAS')
    departure_stations = ('BOS', 'WAS', 'NHV', 'NYP', 'PHL', 'BAL',
                          'PVD', 'WIL', 'BWI', 'NWK', 'BBY', 'RTE',
                          'TRE', 'STM', 'NCR', 'KIN', 'NLC')
    raw_pages = {}
    # Every station gets its own fresh list so appends never leak across stations.
    raw_pages['Arrive'] = dict((code, []) for code in arrival_stations)
    raw_pages['Depart'] = dict((code, []) for code in departure_stations)
    return raw_pages


def convert_train_nums_to_string(train_nums_list):
    """
    Given a list of train numbers, converts it to a string that can be used in a url.

    The numbers are joined with '%2C' (a URL-encoded comma). An empty list
    yields an empty string instead of raising.
    """
    return '%2C'.join(str(train_num) for train_num in train_nums_list)


def convert_dates_to_string(dt_start, dt_end):
    """
    Build the '&date_start=...&date_end=...' url fragment from two date objects.
    Each date is written as month/day/year with '%2F' (url-encoded '/') as the separator.
    """
    def as_query_value(day):
        # No zero padding: month and day are emitted exactly as their integer values.
        parts = [str(day.month), str(day.day), str(day.year)]
        return '%2F'.join(parts)

    return '&date_start=' + as_query_value(dt_start) + '&date_end=' + as_query_value(dt_end)


def construct_urls(northbound_trains, southbound_trains, start_date, end_date):
    """
    Inputs: 2 lists of lists of train numbers, 2 dates
        - list of northbound train subset lists
        - list of southbound train subset lists
        - start date for fetching data
        - end date for fetching data
    Outputs: dictionary with keys 'Arrive' and 'Depart', each holding a list of
        (station, url) tuples, one per train subset and station
    """
    url_root = 'https://juckins.net/amtrak_status/archive/html/history.php?train_num='
    day_filters = '&df1=1&df2=1&df3=1&df4=1&df5=1&df6=1&df7=1'
    url_tail = '&sort_dir=ASC&co=gt&limit_mins=&dfon=1'
    sort_param = {'Depart': '&sort=schDp', 'Arrive': '&sort=schAr'}
    date_query = convert_dates_to_string(start_date, end_date)

    # Stations queried in both directions.
    shared_arrivals = ['NYP']
    shared_departures = ['NHV', 'NYP', 'PHL', 'BAL', 'PVD', 'WIL', 'BWI', 'NWK',
                         'BBY', 'RTE', 'TRE', 'STM', 'NCR', 'KIN', 'NLC']

    # Each direction adds one extra departure station and one extra arrival station:
    # northbound uses WAS for departures and BOS for arrivals, southbound the reverse.
    directions = [(northbound_trains, 'WAS', 'BOS'),
                  (southbound_trains, 'BOS', 'WAS')]

    urls = {'Arrive': [], 'Depart': []}
    for train_groups, extra_departure, extra_arrival in directions:
        stations_by_kind = {'Depart': shared_departures + [extra_departure],
                            'Arrive': shared_arrivals + [extra_arrival]}
        for group in train_groups:
            train_query = convert_train_nums_to_string(group)
            for kind, stations in stations_by_kind.items():
                for code in stations:
                    url = (url_root + train_query + date_query + '&station=' + code
                           + day_filters + sort_param[kind] + url_tail)
                    urls[kind].append((code, url))
    return urls


def make_request(url):
    """
    Given a url, request the data and return the page content, or None if
    retrieving data failed on the first try.

    Any requests-level failure (HTTP error status, connection error, timeout)
    is reported on stdout and turned into a None return so the caller can
    record the url as a failed retrieval.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("An error occurred while retrieving data for the following url:")
        print('        {}'.format(url))
        print("Error: {}".format(e))
        # Do not fall through: the body of an error response is not usable data.
        return None
    return response.content

#### Function to retrieve the raw HTML data from website
* Train data sourced from [Chris Juckins' website - Amtrak Status Maps Archive Database (ASMAD)](https://juckins.net/amtrak_status/archive/html/home.php), retrieved with his permission.
* Thank you, Chris!

In [3]:
def retrieve_data(start=None, end=None):
    """
    Function to retrieve new data from the website for specified dates.

    Inputs:
        - start: first date (datetime.date) of the query window; defaults to yesterday
        - end: last date (datetime.date) of the query window; defaults to today
      The defaults are resolved each time the function is called, so they stay
      correct when the notebook kernel is left running across days.
    Outputs: dictionary shaped like make_dict(), holding the raw page content
        for every station whose request succeeded
    """
    today = date.today()
    if start is None:
        start = today - timedelta(days=1)
    if end is None:
        end = today
    # If querying a long time period, it is better to use smaller groups of trains (more requests)
    # northbound = [[66, 82, 86, 88], [94, 132, 150], [160, 162, 164, 166], [168, 170, 172, 174]]
    # southbound = [[67, 83, 93, 95], [99, 135, 137, 139], [161, 163, 165,167],[171, 173, 175, 195]]
    # If only querying a few days, we can just do them all at once
    northbound = [[66, 82, 86, 88, 94, 132, 150, 160, 162, 164, 166, 168, 170, 172, 174]]
    southbound = [[67, 83, 93, 95, 99, 135, 137, 139, 161, 163, 165, 167, 171, 173, 175, 195]]
    # construct_urls builds one (station, url) pair per train group and station
    urls = construct_urls(northbound, southbound, start, end)
    raw_data = make_dict()
    failed_retrievals = []
    start_time = time.time()
    # Departures are fetched before arrivals, matching the order of the report below.
    for kind in ('Depart', 'Arrive'):
        for station, url in urls[kind]:
            data = make_request(url)
            if data is None:
                failed_retrievals.append((station, url))
            else:
                raw_data[kind][station].append(data)
    if failed_retrievals:
        print('Failed to retrieve data for the following stations:')
        for station, url in failed_retrievals:
            print('        STATION:   {}'.format(station))
            print('        URL:   {}'.format(url))
    print('Complete in {} seconds'.format(time.time() - start_time))
    return raw_data

#### Helper functions for data cleaning and processing

In [4]:
def get_direction(num):
    """
    Map a train number to its direction: even numbers are 'Northbound',
    odd numbers are 'Southbound'.
    """
    is_even = num % 2 == 0
    return 'Northbound' if is_even else 'Southbound'


def get_num(re_match):
    """
    Return the first run of digits found in the input string as an int.
    The input is assumed to contain at least one digit.
    """
    digits_pattern = re.compile('(?P<num>[0-9]+)')
    first_run = digits_pattern.search(re_match)
    digits = first_run.group('num')
    return int(digits)


def make_dict_from_cols(col_names):
    """
    Map every column name in the given list to its own new empty list.
    """
    columns = {}
    for name in col_names:
        columns[name] = []
    return columns


def get_html_col_names(raw_data, arrive_or_depart):
    """
    Read the column names from the 2nd table row of the first NYP page.
    NYP is used because it is a station with both arrival and departure times.
    """
    first_nyp_page = raw_data[arrive_or_depart]['NYP'][0]
    table_rows = lh.fromstring(first_nyp_page).xpath('//tr')
    header_row = table_rows[1]
    names = []
    for cell in header_row:
        names.append(cell.text_content().strip())
    return names

#### Main function to convert html data to a basic pandas data frame

In [5]:
def raw_data_to_raw_df(raw_data, arrive_or_depart):
    """
    Flatten the raw html pages for 'Arrive' or 'Depart' into one dataframe,
    with a row per table entry plus the 'Direction' and 'Station' it belongs to.
    """
    col_names = get_html_col_names(raw_data, arrive_or_depart)
    expected_cells = 7  # only rows with exactly this many cells are kept
    data_dict = make_dict_from_cols(['Direction', 'Station'] + col_names)
    for station, pages in raw_data[arrive_or_depart].items():
        for page_content in pages:
            tr_elements = lh.fromstring(page_content).xpath('//tr')
            if len(tr_elements) <= 3:
                print("No data for this period, or an error occurred", station, arrive_or_depart)
                continue
            # The first row is the title; the first number in it decides the direction.
            direction = get_direction(get_num(tr_elements[0].text_content()))
            # Rows 0 and 1 are the title and the column header, so data starts at row 2.
            for table_row in tr_elements[2:]:
                if len(table_row) != expected_cells:
                    continue
                data_dict['Direction'].append(direction)
                data_dict['Station'].append(station)
                for col_name, entry in zip(col_names, table_row):
                    data_dict[col_name].append(entry.text_content())
    return pd.DataFrame.from_dict(data_dict)

## SECTION 1B - Retrieving Data

#### Retrieving new data
* Run cell below and wait for the request to complete

In [6]:
# Query window: from yesterday through today, both computed when this cell runs.
start = date.today()-timedelta(days=1)
end = date.today()
# Download the raw pages for the window; the result is keyed by 'Arrive'/'Depart', then station.
raw_data = retrieve_data(start=start, end=end)

Retrieved data in 18.308887004852295 seconds


In [7]:
# The retrieval cell stores its result in `raw_data`; `data` is never defined.
print(raw_data.keys())

dict_keys(['Arrive', 'Depart'])


## SECTION 1C - Save Raw Data to File

#### Process departure data to clean format

In [8]:
# Convert the raw departure pages into a dataframe and report how long it took.
# Uses `raw_data` (the name assigned by the retrieval cell); `data` is never defined.
start_time = time.time()
depart = raw_data_to_raw_df(raw_data, 'Depart')
print('Elapsed:', time.time() - start_time)
depart.head()

Elapsed: 0.03826594352722168


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Dp,Act Dp,Comments,Service Disruption,Cancellations
0,Southbound,BOS,95,04/09/2021 (Fr),04/09/2021 6:10 AM (Fr),6:10AM,Dp: On time.,,
1,Southbound,BOS,171,04/09/2021 (Fr),04/09/2021 8:15 AM (Fr),8:15AM,Dp: On time.,,
2,Southbound,BOS,93,04/09/2021 (Fr),04/09/2021 9:30 AM (Fr),9:30AM,Dp: On time.,,
3,Southbound,BOS,173,04/09/2021 (Fr),04/09/2021 11:15 AM (Fr),11:16AM,Dp: 1 min late.,,
4,Southbound,BOS,137,04/09/2021 (Fr),04/09/2021 1:50 PM (Fr),1:50PM,Dp: On time.,,


#### Process arrival data to clean format

In [9]:
# Convert the raw arrival pages into a dataframe and report how long it took.
# Uses `raw_data` (the name assigned by the retrieval cell); `data` is never defined.
start_time = time.time()
arrive = raw_data_to_raw_df(raw_data, 'Arrive')
print('elapsed:', time.time() - start_time)
arrive.head()

elapsed: 0.00764918327331543


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Ar,Act Ar,Comments,Service Disruption,Cancellations
0,Northbound,NYP,66,04/08/2021 (Th),04/09/2021 1:55 AM (Fr),1:51AM,Ar: 4 min early. | Dp: On time.,,
1,Northbound,NYP,170,04/09/2021 (Fr),04/09/2021 8:15 AM (Fr),8:15AM,Ar: On time. | Dp: On time.,,
2,Northbound,NYP,172,04/09/2021 (Fr),04/09/2021 10:44 AM (Fr),10:49AM,Ar: 5 min late. | Dp: On time.,,
3,Northbound,NYP,174,04/09/2021 (Fr),04/09/2021 1:35 PM (Fr),1:35PM,Ar: On time. | Dp: On time.,,
4,Northbound,NYP,94,04/09/2021 (Fr),04/09/2021 5:22 PM (Fr),5:31PM,Ar: 9 min late. | Dp: 4 min late.,,


#### Save the raw data

In [10]:
# The file names embed the start and end dates of the query window.
arrive_filestring = './data/trains/raw_arrive_{}_{}.csv'.format(str(start), str(end))
depart_filestring = './data/trains/raw_depart_{}_{}.csv'.format(str(start), str(end))

# Write the raw frames without the index column, using '\n' line endings.
# NOTE(review): `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the old
# spelling was removed in pandas 2.0 - confirm which pandas version this notebook runs on.
arrive.to_csv(arrive_filestring, line_terminator='\n', index=False)
depart.to_csv(depart_filestring, line_terminator='\n', index=False)

In [11]:
# Number of raw arrival rows, followed by a preview of the first rows.
print(arrive.shape[0])
arrive.head()

26


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Ar,Act Ar,Comments,Service Disruption,Cancellations
0,Northbound,NYP,66,04/08/2021 (Th),04/09/2021 1:55 AM (Fr),1:51AM,Ar: 4 min early. | Dp: On time.,,
1,Northbound,NYP,170,04/09/2021 (Fr),04/09/2021 8:15 AM (Fr),8:15AM,Ar: On time. | Dp: On time.,,
2,Northbound,NYP,172,04/09/2021 (Fr),04/09/2021 10:44 AM (Fr),10:49AM,Ar: 5 min late. | Dp: On time.,,
3,Northbound,NYP,174,04/09/2021 (Fr),04/09/2021 1:35 PM (Fr),1:35PM,Ar: On time. | Dp: On time.,,
4,Northbound,NYP,94,04/09/2021 (Fr),04/09/2021 5:22 PM (Fr),5:31PM,Ar: 9 min late. | Dp: 4 min late.,,


In [12]:
# Number of raw departure rows, followed by a preview of the first rows.
print(depart.shape[0])
depart.head()

195


Unnamed: 0,Direction,Station,Train #,Origin Date,Sch Dp,Act Dp,Comments,Service Disruption,Cancellations
0,Southbound,BOS,95,04/09/2021 (Fr),04/09/2021 6:10 AM (Fr),6:10AM,Dp: On time.,,
1,Southbound,BOS,171,04/09/2021 (Fr),04/09/2021 8:15 AM (Fr),8:15AM,Dp: On time.,,
2,Southbound,BOS,93,04/09/2021 (Fr),04/09/2021 9:30 AM (Fr),9:30AM,Dp: On time.,,
3,Southbound,BOS,173,04/09/2021 (Fr),04/09/2021 11:15 AM (Fr),11:16AM,Dp: 1 min late.,,
4,Southbound,BOS,137,04/09/2021 (Fr),04/09/2021 1:50 PM (Fr),1:50PM,Dp: On time.,,


## SECTION 1D - Transforming Raw Data Columns to Processed Form

#### Helper functions to process the columns

In [13]:
def get_key_names(arrive_or_depart):
    """
    This function returns the proper keys to create the column names depending on
    whether the data being processed is arrival or departure data.

    Inputs: 'Arrive' or 'Depart'
    Outputs: dictionary with the keys 'Sch Full Date', 'Sch Abbr', 'Act Abbr' and 'Diff'
    Raises: ValueError for any other input, rather than silently returning None
        and letting the caller fail later with an unrelated error
    """
    key_names = {
        'Arrive': {'Sch Full Date': 'Full Sch Ar Date', 'Sch Abbr': 'Sch Ar',
                   'Act Abbr': 'Act Ar', 'Diff': 'Arrive Diff'},
        'Depart': {'Sch Full Date': 'Full Sch Dp Date', 'Sch Abbr': 'Sch Dp',
                   'Act Abbr': 'Act Dp', 'Diff': 'Depart Diff'},
    }
    if arrive_or_depart not in key_names:
        raise ValueError("arrive_or_depart must be 'Arrive' or 'Depart', got {!r}"
                         .format(arrive_or_depart))
    return key_names[arrive_or_depart]

#### Main function to process each column in the semi-raw data frame to a better form

In [14]:
def process_columns(df, arrive_or_depart):
    """
    This function takes an input of the initial data (a pandas data frame) and whether it is
    arrival or departure data. It takes each column of the initial data and does various
    operations to create the fully processed data frame.

    Inputs:
        - df: raw data frame with the columns 'Train #', 'Station', 'Direction',
          'Origin Date', 'Service Disruption', 'Cancellations' and the scheduled/actual
          time columns for the chosen kind
        - arrive_or_depart: 'Arrive' or 'Depart'
    Outputs: new data frame; rows containing an empty or missing value are dropped.
        The input df is left unmodified.
    """
    # The specific keys depending on if new_df is for arr or dep data
    ad_keys = get_key_names(arrive_or_depart)
    sch_key = ad_keys['Sch Abbr']
    act_key = ad_keys['Act Abbr']

    new_df = pd.DataFrame()
    new_df['Train Num'] = pd.to_numeric(df['Train #'])
    new_df['Station'] = df['Station']
    new_df['Direction'] = df['Direction']

    # exact=False lets the format match inside a longer string, e.g. with a trailing "(Fr)"
    origin_date = pd.to_datetime(df['Origin Date'], format="%m/%d/%Y", exact=False, errors='coerce')
    new_df['Origin Date'] = origin_date
    new_df['Origin Year'] = origin_date.dt.year
    new_df['Origin Month'] = origin_date.dt.month
    new_df['Origin Week Day'] = origin_date.dt.day_name()

    sched_full_date = pd.to_datetime(df[sch_key],
                                     format='%m/%d/%Y %I:%M %p',
                                     exact=False, errors='coerce')
    new_df[ad_keys['Sch Full Date']] = sched_full_date
    new_df[sch_key + ' Date'] = sched_full_date.dt.date
    new_df[sch_key + ' Day'] = sched_full_date.dt.day_name()
    new_df[sch_key + ' Time'] = sched_full_date.dt.time
    act_time = pd.to_datetime(df[act_key], format='%I:%M%p',
                              exact=False, errors='coerce')
    new_df[act_key + ' Time'] = act_time.dt.time

    # The actual time has no date of its own, so start from the scheduled date.
    # This is kept in a local Series so the caller's frame gains no helper columns.
    act_full_date = pd.to_datetime(sched_full_date.dt.date.astype(str) + " " +
                                   df[act_key].astype(str),
                                   exact=False, errors='coerce')

    # A gap of more than 10 hours is taken to mean the actual time fell on the other
    # side of midnight, not that the train was that far off schedule.
    max_expected_delay = pd.Timedelta(hours=10)
    one_day = pd.Timedelta(days=1)
    delta = act_full_date - sched_full_date
    ran_past_midnight = delta < -max_expected_delay    # late: actual is on the next day
    ran_before_midnight = delta > max_expected_delay   # early: actual is on the previous day
    act_full_date = act_full_date.mask(ran_past_midnight, act_full_date + one_day)
    act_full_date = act_full_date.mask(ran_before_midnight, act_full_date - one_day)

    diff = (act_full_date - sched_full_date).dt.total_seconds()/60
    # NOTE(review): astype(int) is expected to raise if any difference is NaN (a time that
    # failed to parse) - confirm whether such rows should be dropped before this cast.
    new_df[ad_keys['Diff']] = np.rint(diff).astype(int)
    new_df['Service Disruption'] = df['Service Disruption'].replace('SD', 1).replace('', 0)
    new_df['Cancellations'] = df['Cancellations'].replace('C', 1).replace('', 0)
    return new_df.replace('', np.nan).dropna()

#### Process the arrival data to final format

In [15]:
# Build the processed arrival frame and preview its first rows.
full_arrive = process_columns(arrive, 'Arrive')
full_arrive.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Ar Date,Sch Ar Date,Sch Ar Day,Sch Ar Time,Act Ar Time,Arrive Diff,Service Disruption,Cancellations
0,66,NYP,Northbound,2021-04-08,2021,4,Thursday,2021-04-09 01:55:00,2021-04-09,Friday,01:55:00,01:51:00,-4,0,0
1,170,NYP,Northbound,2021-04-09,2021,4,Friday,2021-04-09 08:15:00,2021-04-09,Friday,08:15:00,08:15:00,0,0,0
2,172,NYP,Northbound,2021-04-09,2021,4,Friday,2021-04-09 10:44:00,2021-04-09,Friday,10:44:00,10:49:00,5,0,0
3,174,NYP,Northbound,2021-04-09,2021,4,Friday,2021-04-09 13:35:00,2021-04-09,Friday,13:35:00,13:35:00,0,0,0
4,94,NYP,Northbound,2021-04-09,2021,4,Friday,2021-04-09 17:22:00,2021-04-09,Friday,17:22:00,17:31:00,9,0,0


In [16]:
# Preview the last rows of the processed arrival frame.
full_arrive.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Ar Date,Sch Ar Date,Sch Ar Day,Sch Ar Time,Act Ar Time,Arrive Diff,Service Disruption,Cancellations
21,171,WAS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 16:17:00,2021-04-09,Friday,16:17:00,16:39:00,22,0,0
22,93,WAS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 17:18:00,2021-04-09,Friday,17:18:00,17:28:00,10,0,0
23,173,WAS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 19:10:00,2021-04-09,Friday,19:10:00,19:10:00,0,0,0
24,137,WAS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 21:52:00,2021-04-09,Friday,21:52:00,21:46:00,-6,0,0
25,175,WAS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 23:16:00,2021-04-09,Friday,23:16:00,23:11:00,-5,0,0


In [17]:
# Row count after processing (process_columns drops rows with missing values).
full_arrive.shape[0]

25

#### Process the departure data to final format

In [18]:
# Build the processed departure frame and preview its first rows.
full_depart = process_columns(depart, "Depart")
full_depart.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
0,95,BOS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 06:10:00,2021-04-09,Friday,06:10:00,06:10:00,0,0,0
1,171,BOS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 08:15:00,2021-04-09,Friday,08:15:00,08:15:00,0,0,0
2,93,BOS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 09:30:00,2021-04-09,Friday,09:30:00,09:30:00,0,0,0
3,173,BOS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 11:15:00,2021-04-09,Friday,11:15:00,11:16:00,1,0,0
4,137,BOS,Southbound,2021-04-09,2021,4,Friday,2021-04-09 13:50:00,2021-04-09,Friday,13:50:00,13:50:00,0,0,0


In [19]:
# Preview the last rows of the processed departure frame.
full_depart.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
190,171,NLC,Southbound,2021-04-09,2021,4,Friday,2021-04-09 09:48:00,2021-04-09,Friday,09:48:00,09:48:00,0,0,0
191,93,NLC,Southbound,2021-04-09,2021,4,Friday,2021-04-09 11:12:00,2021-04-09,Friday,11:12:00,11:12:00,0,0,0
192,173,NLC,Southbound,2021-04-09,2021,4,Friday,2021-04-09 12:48:00,2021-04-09,Friday,12:48:00,12:49:00,1,0,0
193,137,NLC,Southbound,2021-04-09,2021,4,Friday,2021-04-09 15:38:00,2021-04-09,Friday,15:38:00,15:39:00,1,0,0
194,175,NLC,Southbound,2021-04-09,2021,4,Friday,2021-04-09 16:57:00,2021-04-09,Friday,16:57:00,16:59:00,2,0,0


In [20]:
# Row count after processing (process_columns drops rows with missing values).
full_depart.shape[0]

187

#### Special processing for new 2021 data: concatenate with previously retrieved 2021 data
* It is possible that there will be duplicate rows, but these do not matter as the database ignores all repeated entries

In [30]:
# Paths of the cumulative processed 2021 files.
arrive_filestring2021 = './data/trains/processed_arrive_2021.csv'
depart_filestring2021 = './data/trains/processed_depart_2021.csv'

# Load the previously saved 2021 data.
prev_arrive2021 = pd.read_csv(arrive_filestring2021)
prev_depart2021 = pd.read_csv(depart_filestring2021)

In [22]:
# Stack the newly processed rows beneath the previously saved ones, renumbering the index.
new_arrive2021 = pd.concat([prev_arrive2021, full_arrive], ignore_index=True, axis=0)
new_depart2021 = pd.concat([prev_depart2021, full_depart], ignore_index=True, axis=0)
        

In [31]:
# Preview the first rows of the combined departure data.
new_depart2021.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
0,99,BOS,Southbound,2021-01-01 00:00:00,2021,1,Friday,2021-01-01 08:40:00,2021-01-01,Friday,08:40:00,08:40:00,0,0,0
1,99,BOS,Southbound,2021-01-02 00:00:00,2021,1,Saturday,2021-01-02 08:40:00,2021-01-02,Saturday,08:40:00,08:40:00,0,0,0
2,99,BOS,Southbound,2021-01-03 00:00:00,2021,1,Sunday,2021-01-03 08:40:00,2021-01-03,Sunday,08:40:00,08:40:00,0,0,0
3,67,BOS,Southbound,2021-01-03 00:00:00,2021,1,Sunday,2021-01-03 21:30:00,2021-01-03,Sunday,21:30:00,21:30:00,0,0,0
4,95,BOS,Southbound,2021-01-04 00:00:00,2021,1,Monday,2021-01-04 06:10:00,2021-01-04,Monday,06:10:00,06:11:00,1,0,0


In [32]:
# Preview the last rows of the combined departure data.
new_depart2021.tail()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Depart Diff,Service Disruption,Cancellations
18214,171,NLC,Southbound,2021-04-09 00:00:00,2021,4,Friday,2021-04-09 09:48:00,2021-04-09,Friday,09:48:00,09:48:00,0,0,0
18215,93,NLC,Southbound,2021-04-09 00:00:00,2021,4,Friday,2021-04-09 11:12:00,2021-04-09,Friday,11:12:00,11:12:00,0,0,0
18216,173,NLC,Southbound,2021-04-09 00:00:00,2021,4,Friday,2021-04-09 12:48:00,2021-04-09,Friday,12:48:00,12:49:00,1,0,0
18217,137,NLC,Southbound,2021-04-09 00:00:00,2021,4,Friday,2021-04-09 15:38:00,2021-04-09,Friday,15:38:00,15:39:00,1,0,0
18218,175,NLC,Southbound,2021-04-09 00:00:00,2021,4,Friday,2021-04-09 16:57:00,2021-04-09,Friday,16:57:00,16:59:00,2,0,0


In [None]:
# Overwrite the cumulative 2021 files with the combined data (no index column, '\n' line endings).
# NOTE(review): `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the old
# spelling was removed in pandas 2.0 - confirm which pandas version this notebook runs on.
new_arrive2021.to_csv(arrive_filestring2021, line_terminator='\n', index=False)
new_depart2021.to_csv(depart_filestring2021, line_terminator='\n', index=False)

# Part 2 - Visual Crossing Weather Data


## SECTION 2A - Setup

#### Imports for retrieving weather data

In [8]:
import requests
import os
import pandas as pd
import numpy as np
from datetime import date, timedelta

In [2]:
# Stop early if the VC_TOKEN environment variable (sent as the API key) is not set.
assert os.environ.get('VC_TOKEN') is not None , 'empty token!'

## SECTION 2B - Retrieve the data

#### Retrieve and save the raw data to CSV

In [None]:
# Location strings as placed in the request url ('%20' is a url-encoded space).
locations = ['Boston,MA', 'Providence,RI', 'Kingston,RI', 'New%20London,CT',
             'New%20Haven,CT', 'Stamford,CT', 'Manhattan,NY', 'Newark,NJ',
             'Trenton,NJ', 'Philadelphia,PA', 'Wilmington,DE', 'Baltimore,MD',
             'Baltimore%20BWI%20Airport,MD', 'New%20Carrollton,MD', 'Washington,DC']

# File-name-safe spelling of the same locations, in the same order as `locations`.
location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT',
                            'New_Haven_CT', 'Stamford_CT', 'Manhattan_NY', 'Newark_NJ',
                            'Trenton_NJ', 'Philadelphia_PA', 'Wilmington_DE', 'Baltimore_MD',
                            'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 'Washington_DC']

# Yesterday's date as a 'YYYY-MM-DD' string, computed once when this cell runs.
yesterday = str(date.today()-timedelta(days=1))

def retrieve_weather_data(start=None, end=None):
    """
    Function to retrieve data from Visual Crossing Weather API for a range of dates and
    save one raw CSV file per location. If no params are given, defaults to retrieving
    data for the previous day only (resolved each time the function is called).

    Input:
                start       formatted as 'YYYY-MM-DD'
                end         formatted as 'YYYY-MM-DD'
    Returns:
                successful_retrievals
                    list of (location, filepath) tuples indicating successfully created files
    Raises:
                RuntimeError if the VC_TOKEN environment variable is not set
    Example:
                retrieve_weather_data(start='2021-04-09', end='2021-04-09') for April 9, 2021 data
    """
    if start is None or end is None:
        previous_day = str(date.today() - timedelta(days=1))
        start = previous_day if start is None else start
        end = previous_day if end is None else end
    token = os.environ.get('VC_TOKEN')
    if not token:
        raise RuntimeError('VC_TOKEN environment variable is not set')
    URL_ROOT = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/'
    QUERY_TYPE = 'weatherdata/history?&aggregateHours=1'
    DATES = '&startDateTime={}T00:00:00&endDateTime={}T23:59:00'.format(start, end)
    OPTIONS = '&unitGroup=us&contentType=csv'
    URL_BASE = URL_ROOT + QUERY_TYPE + DATES + OPTIONS
    URL_KEY = '&key=' + token
    successful_retrievals = []
    failed_retrievals = []
    for locname, filename in zip(locations, location_names_for_files):
        print('Retrieving data for LOCATION: {}'.format(filename))
        print('    and DATE RANGE: {}T00:00:00 to {}T23:59:00'.format(start, end))
        CSVstring = './data/weather_original/{}_weather_data_{}_{}.csv'.format(filename, start, end)
        # Never overwrite a raw file that was already downloaded.
        if os.path.exists(CSVstring):
            failed_retrievals.append((CSVstring, 'Error: File Already Exists'))
            continue
        URL = URL_BASE + '&location=' + locname + URL_KEY
        try:
            response = requests.get(URL, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # The error text contains the request url, which carries the API key;
            # redact it so the key never ends up in the notebook output.
            failed_retrievals.append((CSVstring, str(e).replace(token, '<redacted>')))
            continue
        # Write the response bytes as received, avoiding a decode/encode round trip
        # through the platform's default text encoding.
        with open(CSVstring, 'wb') as csvfile:
            csvfile.write(response.content)
        successful_retrievals.append((filename, CSVstring))
    if successful_retrievals:
        print('Successfully collected data has been saved at the following filenames:')
        for location, filestring in successful_retrievals:
            print('        FILE:   {}'.format(filestring))
    if failed_retrievals:
        print('Failed to retrieve data for the following filenames:')
        for filestring, error in failed_retrievals:
            print('        FILE:   {}'.format(filestring))
            print('        REASON: {}'.format(error))
    return successful_retrievals

In [5]:
# Fetch with the default arguments; keeps the (location, filepath) tuples of the files created.
successful_retrievals = retrieve_weather_data()

[('2021-04-10', '2021-04-10')]
Running urls for Boston_MA
Running urls for Providence_RI
Running urls for Kingston_RI
Running urls for New_London_CT
Running urls for New_Haven_CT
Running urls for Stamford_CT
Running urls for Manhattan_NY
Running urls for Newark_NJ
Running urls for Trenton_NJ
Running urls for Philadelphia_PA
Running urls for Wilmington_DE
Running urls for Baltimore_MD
Running urls for Baltimore_BWI_Airport_MD
Running urls for New_Carrollton_MD
Running urls for Washington_DC


## SECTION 2C - Data Cleaning

#### Processing recent data by year - add new columns, make minor fixes to string format, take subset of full columns list
* This part assumes 2021 data is being read; it concatenates the previously retrieved data with the new data to create a single combined file

In [6]:
def process_weather_data(files_to_process):
    """
    This function is set for processing current (2021) weather data which is being retrieved daily.
    For every raw file given, it keeps a subset of columns, drops rows with missing values,
    adds a 'Precipitation Type' column, and concatenates the result with the previously
    processed data for that location. The complete 2021 data is then saved to a CSV
    (with same name as the previously processed 2021 full data).

    Input:
                list of (location, filepath) tuples to process and combine with previous data
    Returns:
                nothing (updates the yearly combined data CSV file on disk)
    Example:
            files_to_process = [
                (
                    'Boston_MA',
                    './data/weather_original/Boston_MA_weather_data_2021-04-11_2021-04-11.csv'
                )
            ]
            process_weather_data(files_to_process)
    """
    cols_list = ['Address', 'Date time', 'Latitude', 'Longitude', 'Temperature',
                 'Precipitation', 'Cloud Cover', 'Conditions']
    precip_types = ['Rain', 'Snow']
    successful_processes = []
    for location, CSVstring in files_to_process:
        full_weather = pd.read_csv(CSVstring, usecols=cols_list)
        full_weather['Address'] = full_weather['Address'].str.replace(',', ', ', regex=False)
        # .copy() makes the cleaned frame independent before new columns are assigned.
        dropna_weather = full_weather.replace('', np.nan).dropna().copy()
        total_rows = full_weather.shape[0]
        frac_kept = dropna_weather.shape[0]/total_rows if total_rows else 0.0
        # Store timestamps as 'YYYY-MM-DD HH:MM:SS' so appended rows use the same format
        # as rows already written in that form and can be recognised as duplicates.
        # Rows stored earlier in another format are left as they are.
        parsed_times = pd.to_datetime(dropna_weather['Date time'])
        dropna_weather['Date time'] = parsed_times.dt.strftime('%Y-%m-%d %H:%M:%S')
        # The first entry of 'Conditions' names the precipitation when it is Rain or Snow.
        # Working on the Series itself keeps the result aligned by label even after
        # dropna() has left gaps in the index.
        first_condition = dropna_weather['Conditions'].astype(str).str.split(', ').str[0]
        dropna_weather['Precipitation Type'] = first_condition.where(
            first_condition.isin(precip_types), 'No Precipitation')
        prev_2021_CSVstring = './data/weather/{}_weather_2021_subset.csv'.format(location)
        prev_weather = pd.read_csv(prev_2021_CSVstring)
        combined_weather = pd.concat([prev_weather, dropna_weather], ignore_index=True, axis=0)
        combined_weather = combined_weather.drop_duplicates(ignore_index=True)
        combined_weather.to_csv(prev_2021_CSVstring, index=False)
        successful_processes.append((CSVstring, frac_kept))
    print('Successfully processed and combined the following raw data files with previous data:')
    for filestring, fraction in successful_processes:
        print('        FILE:          {}'.format(filestring))
        print('        FRACTION KEPT: {}'.format(fraction))

In [None]:
# Process the files created by the retrieval cell and merge them into the per-location 2021 files.
process_weather_data(files_to_process=successful_retrievals)

#### Data sample for viewing

In [11]:
# Load one processed file to inspect its contents; preview the first rows.
sample = pd.read_csv('./data/weather/Providence_RI_weather_2021_subset.csv')
sample.head()

Unnamed: 0,Address,Date time,Latitude,Longitude,Temperature,Precipitation,Cloud Cover,Conditions,Precipitation Type
0,"Providence, RI",2021-01-01 00:00:00,41.8239,-71.412,31.7,0.0,0.0,Clear,No Precipitation
1,"Providence, RI",2021-01-01 01:00:00,41.8239,-71.412,30.2,0.0,0.0,Clear,No Precipitation
2,"Providence, RI",2021-01-01 02:00:00,41.8239,-71.412,29.8,0.0,0.0,Clear,No Precipitation
3,"Providence, RI",2021-01-01 03:00:00,41.8239,-71.412,27.2,0.0,0.0,Clear,No Precipitation
4,"Providence, RI",2021-01-01 04:00:00,41.8239,-71.412,27.6,0.0,0.0,Clear,No Precipitation


In [12]:
# Preview the last rows of the sample file.
sample.tail()

Unnamed: 0,Address,Date time,Latitude,Longitude,Temperature,Precipitation,Cloud Cover,Conditions,Precipitation Type
2439,"Providence, RI",04/10/2021 19:00:00,41.8239,-71.412,63.4,0.0,0.0,Clear,No Precipitation
2440,"Providence, RI",04/10/2021 20:00:00,41.8239,-71.412,60.8,0.0,0.0,Clear,No Precipitation
2441,"Providence, RI",04/10/2021 21:00:00,41.8239,-71.412,57.7,0.0,0.0,Clear,No Precipitation
2442,"Providence, RI",04/10/2021 22:00:00,41.8239,-71.412,55.9,0.0,0.0,Clear,No Precipitation
2443,"Providence, RI",04/10/2021 23:00:00,41.8239,-71.412,54.8,0.0,0.0,Clear,No Precipitation


# Part 3: Loading Data into Postgres Database

## SECTION 3A - Setup

#### Imports for loading into database

In [1]:
import psycopg2
import csv
import os
import sys 
import time
# Stop early if the DB_PASS environment variable is not set.
assert os.environ.get('DB_PASS') is not None, 'empty password!'

#### Functions to create and update tables in the database

In [2]:
def create_table(conn, command):
    """
    Create a table in the PostgreSQL database based on given command.

    The command is executed and committed on the given connection. If anything
    goes wrong, the error is printed and the transaction is rolled back; the
    exception is not re-raised.
    """
    try:
        # The context manager closes the cursor even when execute() fails.
        with conn.cursor() as cur:
            cur.execute(command)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is a subclass of Exception
        tb = error.__traceback__
        line_num = tb.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", tb, "-- type:", type(error))
        conn.rollback()

In [3]:
def update_table(conn, command, csv_file):
    """
    Insert rows from a CSV file into table specified by the command.

    The first line of the file is treated as a header and skipped. Every other
    row is passed to ``command`` as a tuple of strings, so the CSV column order
    must match the placeholder order of the command. A row that fails is
    printed and skipped; rows inserted before it are kept. Everything is
    committed once, after the whole file has been read.

    conn: open database connection, not in autocommit mode (savepoints need a
        transaction).
    command: parameterized INSERT statement.
    csv_file: path of the CSV file to load.
    Returns None.
    """
    savepoint = 'update_table_row'
    with conn.cursor() as cur, open(csv_file, newline='') as file:
        info_reader = csv.reader(file, delimiter=',')
        next(info_reader, None)  # skip header; an empty file simply loads nothing
        for row in info_reader:
            # Guard each row with a savepoint: a failing INSERT aborts the
            # transaction, and a full rollback would also throw away every
            # earlier row of this file that has not been committed yet.
            cur.execute('SAVEPOINT ' + savepoint)
            try:
                cur.execute(command, tuple(row))
            except Exception as error:
                print(error)
                cur.execute('ROLLBACK TO SAVEPOINT ' + savepoint)
            # Release in both cases so savepoints do not pile up.
            cur.execute('RELEASE SAVEPOINT ' + savepoint)
    conn.commit()

#### Commands for creating each table 

In [4]:
# DDL for train_info: drops any existing table (CASCADE) and recreates it.
# train_num is UNIQUE, which lets arrivals, departures and test_dep declare it
# as a foreign-key target. The three schedule-time columns are stored as text.
create_train_info_table_command = """
                                  DROP TABLE IF EXISTS train_info CASCADE;
                                  CREATE TABLE train_info (
                                      train_info_id SERIAL PRIMARY KEY,
                                      train_num text UNIQUE,
                                      operating_direction text,
                                      reg_operates_on_mon boolean,
                                      reg_operates_on_tues boolean,
                                      reg_operates_on_wed boolean,
                                      reg_operates_on_thurs boolean,
                                      reg_operates_on_fri boolean,
                                      reg_operates_on_sat boolean,
                                      reg_operates_on_sun boolean,
                                      depart_origin_time text,
                                      depart_NY_time text,
                                      arrive_dest_time text
                                  );
                                  """

In [5]:
# DDL for station_info: drops any existing table (CASCADE) and recreates it.
# station_code carries no UNIQUE constraint, and none of the tables created in
# this section reference it through a foreign key.
create_station_info_table_command = """ 
                                    DROP TABLE IF EXISTS station_info CASCADE;

                                    CREATE TABLE station_info (
                                        station_info_id SERIAL PRIMARY KEY,
                                        station_code text,
                                        station_name text,
                                        state text,
                                        amtrak_city text,
                                        weather_loc text,
                                        longitude real,
                                        latitude real,
                                        nb_mile numeric,
                                        sb_mile numeric
                                    );
                                    """

In [6]:
# DDL for arrivals: drops any existing table (CASCADE) and recreates it.
# train_num is a foreign key to train_info(train_num), so train_info must exist
# first; station_code is plain text with no foreign key. dataset_id is a SERIAL
# surrogate key.
create_arrivals_table_command = """ 
                                DROP TABLE IF EXISTS arrivals CASCADE;

                                CREATE TABLE arrivals (
                                    dataset_id SERIAL PRIMARY KEY,
                                    train_num text REFERENCES train_info (train_num),
                                    station_code text, 
                                    direction text,
                                    origin_date date,
                                    origin_year int,
                                    origin_month int,
                                    origin_week_day text,
                                    full_sched_arr_datetime timestamp,
                                    sched_arr_date date,
                                    sched_arr_week_day text,
                                    sched_arr_time time,
                                    act_arr_time time,
                                    arrive_diff numeric,
                                    service_disruption boolean,
                                    cancellations boolean     
                                );
                                """

In [7]:
# DDL for departures: same layout as arrivals with dep-named columns in place
# of the arr-named ones. train_num is a foreign key to train_info(train_num),
# so train_info must exist first.
create_departures_table_command = """ 
                                  DROP TABLE IF EXISTS departures CASCADE;
                                  CREATE TABLE departures (
                                      dataset_id SERIAL PRIMARY KEY,
                                      train_num text REFERENCES train_info (train_num),
                                      station_code text, 
                                      direction text,
                                      origin_date date,
                                      origin_year int,
                                      origin_month int,
                                      origin_week_day text,
                                      full_sched_dep_datetime timestamp,
                                      sched_dep_date date,
                                      sched_dep_week_day text,
                                      sched_dep_time time,
                                      act_dep_time time,
                                      depart_diff numeric,
                                      service_disruption boolean,
                                      cancellations boolean     
                                  );
                                  """

In [8]:
# DDL for weather_hourly: drops any existing table (CASCADE) and recreates it.
# The only key is the SERIAL weather_id; (location, date_time) is not declared
# unique.
create_weather_table_command = """
                               DROP TABLE IF EXISTS weather_hourly CASCADE;
                               CREATE TABLE weather_hourly (
                                   weather_id SERIAL PRIMARY KEY,
                                   location text,
                                   date_time timestamp,
                                   latitude real,
                                   longitude real,
                                   temperature real,
                                   precipitation real,
                                   cloud_cover real,
                                   conditions text,
                                   precip_type text
                               );
                               """

In [9]:
# DDL for test_dep, a scratch table shaped like departures plus two extra
# columns (origin_quarter, origin_day). train_num is a foreign key to
# train_info(train_num), so train_info must exist first.
create_test_table_command = """ 
                            DROP TABLE IF EXISTS test_dep CASCADE;
                            CREATE TABLE test_dep (
                                dataset_id SERIAL PRIMARY KEY,
                                train_num text REFERENCES train_info (train_num),
                                station_code text, 
                                direction text,
                                origin_date date,
                                origin_year int,
                                origin_quarter int,
                                origin_month int,
                                origin_day int,
                                origin_week_day text,
                                full_sched_dep_datetime timestamp,
                                sched_dep_date date,
                                sched_dep_week_day text,
                                sched_dep_time time,
                                act_dep_time time,
                                depart_diff numeric,
                                service_disruption boolean,
                                cancellations boolean     
                            );
                            """

In [6]:
# DDL for regional_route: drops any existing table (CASCADE) and recreates it.
# Each row holds one longitude/latitude pair plus grouping columns; coord_id is
# a SERIAL surrogate key.
create_route_table_command = """
                             DROP TABLE IF EXISTS regional_route CASCADE;
                            
                             CREATE TABLE regional_route (
                                 coord_id SERIAL PRIMARY KEY,
                                 longitude real,
                                 latitude real,
                                 path_group numeric,
                                 connecting_path text, 
                                 nb_station_group text,
                                 sb_station_group text
                             );
                             """

In [10]:
# All CREATE TABLE scripts, in execution order. train_info comes first because
# arrivals, departures and test_dep declare foreign keys to its train_num.
create_table_commands = [
    create_train_info_table_command,
    create_station_info_table_command,
    create_arrivals_table_command,
    create_departures_table_command,
    create_weather_table_command,
    create_test_table_command,
    create_route_table_command,
]

#### Commands for inserting data into each table

In [11]:
# Parameterized INSERT for train_info: 12 placeholders for the 12 listed
# columns, in order; train_info_id is omitted so the SERIAL default fills it.
# ON CONFLICT DO NOTHING skips a row whose train_num already exists (the
# column is UNIQUE).
insert_into_train_info_table_command = """
                               INSERT INTO
                                   train_info (
                                       train_num,
                                       operating_direction,
                                       reg_operates_on_mon,
                                       reg_operates_on_tues,
                                       reg_operates_on_wed,
                                       reg_operates_on_thurs,
                                       reg_operates_on_fri,
                                       reg_operates_on_sat,
                                       reg_operates_on_sun,
                                       depart_origin_time,
                                       depart_NY_time,
                                       arrive_dest_time
                                    )
                               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                               ON CONFLICT DO NOTHING;
                               """   

In [12]:
# Parameterized INSERT for station_info: 9 placeholders for the 9 listed
# columns, in order; station_info_id is omitted so the SERIAL default fills it.
# NOTE(review): the table's only unique constraint is that SERIAL key, so
# ON CONFLICT DO NOTHING presumably never triggers here; re-running the load
# would duplicate rows. Confirm whether that is acceptable.
insert_into_station_info_table_command = """
                                         INSERT INTO
                                             station_info (
                                                 station_code,
                                                 station_name,
                                                 state,
                                                 amtrak_city,
                                                 weather_loc,
                                                 longitude,
                                                 latitude,
                                                 nb_mile,
                                                 sb_mile
                                             )
                                         VALUES
                                             (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                                         ON CONFLICT DO NOTHING;
                                         """    

In [13]:
# Parameterized INSERT for arrivals: 15 placeholders for the 15 listed columns,
# in order; dataset_id is omitted so the SERIAL default fills it. update_table
# passes each CSV row as the parameter tuple, so the CSV column order must
# match this list.
insert_into_arrivals_table_command = """
                                     INSERT INTO
                                         arrivals (
                                             train_num,
                                             station_code,
                                             direction,
                                             origin_date,
                                             origin_year,
                                             origin_month,
                                             origin_week_day,
                                             full_sched_arr_datetime,
                                             sched_arr_date,
                                             sched_arr_week_day,
                                             sched_arr_time,
                                             act_arr_time,
                                             arrive_diff,
                                             service_disruption,
                                             cancellations
                                         )
                                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                     ON CONFLICT DO NOTHING; 
                                     """  

In [14]:
# Parameterized INSERT for departures: 15 placeholders for the 15 listed
# columns, in order; dataset_id is omitted so the SERIAL default fills it.
# update_table passes each CSV row as the parameter tuple, so the CSV column
# order must match this list.
insert_into_departures_table_command = """
                                       INSERT INTO
                                           departures (
                                               train_num,
                                               station_code,
                                               direction,
                                               origin_date,
                                               origin_year,
                                               origin_month,
                                               origin_week_day,
                                               full_sched_dep_datetime,
                                               sched_dep_date,
                                               sched_dep_week_day,
                                               sched_dep_time,
                                               act_dep_time,
                                               depart_diff,
                                               service_disruption,
                                               cancellations
                                           )
                                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                       ON CONFLICT DO NOTHING;
                                       """

In [15]:
# Parameterized INSERT for weather_hourly: 9 placeholders for the 9 listed
# columns, in order; weather_id is omitted so the SERIAL default fills it.
# update_table passes each CSV row as the parameter tuple, so the weather CSV
# files must have exactly these 9 columns in this order.
insert_into_weather_table_command = """
                                    INSERT INTO
                                        weather_hourly (
                                            location,
                                            date_time,
                                            latitude,
                                            longitude,
                                            temperature,
                                            precipitation,
                                            cloud_cover,
                                            conditions,
                                            precip_type
                                        )
                                    VALUES
                                        (%s, %s, %s, %s, %s, %s, %s, %s, %s) 
                                    ON CONFLICT DO NOTHING;
                                    """ 


In [24]:
# Parameterized INSERT for test_dep: 15 placeholders for the 15 listed columns,
# in order. test_dep's origin_quarter and origin_day columns are not listed, so
# they stay NULL for rows inserted with this command.
insert_into_test_table_command = """
                                 INSERT INTO
                                     test_dep (
                                         train_num,
                                         station_code,
                                         direction,
                                         origin_date,
                                         origin_year,
                                         origin_month,
                                         origin_week_day,
                                         full_sched_dep_datetime,
                                         sched_dep_date,
                                         sched_dep_week_day,
                                         sched_dep_time,
                                         act_dep_time,
                                         depart_diff,
                                         service_disruption,
                                         cancellations
                                     )
                                     VALUES 
                                         (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                                     ON CONFLICT DO NOTHING;
                                     """

In [5]:
# Parameterized INSERT for regional_route: 6 placeholders for the 6 listed
# columns, in order (longitude before latitude); coord_id is omitted so the
# SERIAL default fills it.
insert_into_route_table_command = """
                                  INSERT INTO
                                      regional_route (
                                          longitude,
                                          latitude, 
                                          path_group,
                                          connecting_path,
                                          nb_station_group,
                                          sb_station_group
                                      )
                                  VALUES 
                                      (%s, %s, %s, %s, %s, %s) 
                                  ON CONFLICT DO NOTHING;
                                  """

In [17]:
# All INSERT statements, listed in the same table order as create_table_commands.
update_table_commands = [
    insert_into_train_info_table_command,
    insert_into_station_info_table_command,
    insert_into_arrivals_table_command,
    insert_into_departures_table_command,
    insert_into_weather_table_command,
    insert_into_test_table_command,
    insert_into_route_table_command,
]

## SECTION 3B - Load Data

#### Create connection to database

In [7]:
# Pass the credentials as keyword arguments instead of formatting them into a
# DSN string: an unquoted password containing spaces, quotes or backslashes
# would otherwise break DSN parsing. psycopg2.connect raises on failure rather
# than returning None, so no follow-up check on conn is needed.
conn = psycopg2.connect(dbname='amtrakproject', user='appuser', password=os.environ.get('DB_PASS'))

#### Create all tables: train info, station info, arrivals, departures, hourly weather, test, and route

In [19]:
# Run every CREATE TABLE script, in list order, on the open connection.
for ddl_script in create_table_commands:
    create_table(conn, ddl_script)

#### Load train info and station info into database

In [20]:
# Load the two fact tables: station facts first, then train facts.
fact_loads = (
    (insert_into_station_info_table_command, './data/facts/geo_stations_info.csv'),
    (insert_into_train_info_table_command, './data/facts/train_nums.csv'),
)
for insert_command, facts_csv in fact_loads:
    update_table(conn, insert_command, facts_csv)

#### Load train data into database

In [21]:
years = list(range(2011, 2022))

begin_everything = time.time()

# For each year, load the processed arrivals file and then the departures file,
# reporting the elapsed wall-clock time per year and overall.
for year in years:
    start = time.time()
    arrive_csv = './data/trains/processed_arrive_{}.csv'.format(year)
    depart_csv = './data/trains/processed_depart_{}.csv'.format(year)
    for insert_command, train_csv in ((insert_into_arrivals_table_command, arrive_csv),
                                      (insert_into_departures_table_command, depart_csv)):
        update_table(conn, insert_command, train_csv)
    print('DONE WITH', year, 'in', time.time() - start)
print('COMPLETE in', time.time() - begin_everything)

DONE WITH 2011 in 3.6504440307617188
DONE WITH 2012 in 3.468768835067749
DONE WITH 2013 in 3.6414060592651367
DONE WITH 2014 in 3.806241989135742
DONE WITH 2015 in 3.747544765472412
DONE WITH 2016 in 3.8473920822143555
DONE WITH 2017 in 3.8376760482788086
DONE WITH 2018 in 3.88606595993042
DONE WITH 2019 in 3.8881258964538574
DONE WITH 2020 in 5.339155912399292
DONE WITH 2021 in 0.907721996307373
COMPLETE in 40.0218071937561


#### Load weather data into database

In [22]:
# Location prefixes used in the weather CSV file names.
location_names_for_files = [
    'Boston_MA',
    'Providence_RI',
    'Kingston_RI',
    'New_London_CT',
    'New_Haven_CT',
    'Stamford_CT',
    'Manhattan_NY',
    'Newark_NJ',
    'Trenton_NJ',
    'Philadelphia_PA',
    'Wilmington_DE',
    'Baltimore_MD',
    'Baltimore_BWI_Airport_MD',
    'New_Carrollton_MD',
    'Washington_DC',
]

years = list(range(2011, 2022))

# Load one weather file per (location, year) pair and report the elapsed
# wall-clock time per location and overall.
begin_everything = time.time()
for location in location_names_for_files:
    start = time.time()
    for year in years:
        weather_csv = './data/weather/{}_weather_{}_subset.csv'.format(location, year)
        update_table(conn, insert_into_weather_table_command, weather_csv)
    print('Finished adding location', location, 'to the database in', time.time() - start, 'seconds')
print("COMPLETE in", time.time() - begin_everything)

Finished adding location Boston_MA to the database in 2.924304962158203 seconds
Finished adding location Providence_RI to the database in 2.7978601455688477 seconds
Finished adding location Kingston_RI to the database in 2.788480043411255 seconds
Finished adding location New_London_CT to the database in 2.780427932739258 seconds
Finished adding location New_Haven_CT to the database in 2.763170003890991 seconds
Finished adding location Stamford_CT to the database in 2.7738327980041504 seconds
Finished adding location Manhattan_NY to the database in 2.7858219146728516 seconds
Finished adding location Newark_NJ to the database in 2.7966361045837402 seconds
Finished adding location Trenton_NJ to the database in 2.7949910163879395 seconds
Finished adding location Philadelphia_PA to the database in 2.8063650131225586 seconds
Finished adding location Wilmington_DE to the database in 2.7906439304351807 seconds
Finished adding location Baltimore_MD to the database in 2.822316884994507 seconds
F

In [25]:
# Insert small amount of data into test table for test queries
# NOTE(review): this loads the 2021 *arrivals* file into test_dep, whose
# columns are departure-named -- confirm whether processed_depart_2021.csv was
# intended. origin_quarter and origin_day are not populated by this insert.
arrive_csv = './data/trains/processed_arrive_2021.csv'
update_table(conn, insert_into_test_table_command, arrive_csv)

#### Create route table to store coordinates

In [8]:
# Pass the credentials as keyword arguments instead of formatting them into a
# DSN string: an unquoted password containing spaces, quotes or backslashes
# would otherwise break DSN parsing. psycopg2.connect raises on failure rather
# than returning None, so no follow-up check on conn is needed.
conn = psycopg2.connect(dbname='amtrakproject', user='appuser', password=os.environ.get('DB_PASS'))

In [9]:
# Drop and recreate regional_route (the command starts with DROP TABLE IF EXISTS).
create_table(conn, create_route_table_command)

In [10]:
# Load the route coordinates CSV into regional_route; update_table skips the
# file's first line as a header.
update_table(conn, insert_into_route_table_command, './data/facts/NE_regional_lonlat.csv')

#### Quick stats on the quantity of information loaded

In [26]:
%load_ext sql

In [27]:
%sql postgresql://appuser:test@localhost:5432/amtrakproject

In [28]:
%%sql

SELECT COUNT(*)
FROM weather_hourly;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
1350082


In [29]:
%%sql

SELECT COUNT(*)
FROM departures;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
793126
