# EDA and ETL Notebook


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import date, timedelta
import re
import requests
import lxml.html as lh
from fetch_data import construct_urls, fetch_data_from_urls

## A. Helper Functions for loading/requesting and processing raw data

In [None]:
def pull_data(option='from_saved', start=date.today()-timedelta(days=1), end=date.today()):
    """
    Function to either load raw data from previously saved CSV files, or retrieve it again
    from the website.
    """
    if option == 'request':
        northbound = [[66, 82, 86, 88, 94], [132, 150, 160, 162, 164, 166], [168, 170, 172, 174]]
        southbound = [[67, 83, 93, 95, 99], [135, 137, 139, 161, 163, 165], [167, 171, 173, 175, 195]]
        urls = construct_urls(northbound, southbound, start, end)
        data = fetch_data_from_urls(urls)
    elif option == 'from_saved':
        data = None
        print("Skip this section and go to part B!")
    return data

### Choose an option
* If `option = 'from_saved'`, go to section on Raw Data.
* Otherwise, uncomment the other line and wait for request to complete.


In [None]:
#data = pull_data(option='from_saved')
start = date(2020,11,29)
end = date(2021,2,18)
data = pull_data(option='request', start, end)

In [None]:
def get_direction(num):
    """
    Return direction of the train (odd = Southbound, even = Northbound).
    """
    if num % 2 == 0:
        return 'Northbound'
    else:
        return 'Southbound'


def get_num(re_match):
    """
    Assuming input contains a match , extract and return the numerical data from input.
    """
    num_match = re.search('(?P<num>[0-9]+)', re_match)
    return int(num_match.group('num'))


def make_dict_from_cols(col_names):
    """
    Create dictionary from a list of column names
    """
    dictionary = { col_name: [] for col_name in col_names }
    return dictionary


def get_html_col_names(raw_data, arrive_or_depart):
    """
    Using NYP (station with both arrival times and departure times), 
    retrieve column names from the HTML table, located in the 2nd row.
    """
    data_list = raw_data[arrive_or_depart]['BOS']
    page_content = data_list[0]
    doc = lh.fromstring(page_content)
    tr_elements = doc.xpath('//tr')
    html_col_names = [entry.text_content().strip() for entry in tr_elements[1]]        
    return html_col_names

In [None]:
def raw_data_to_raw_df(raw_data, arrive_or_depart):
    """
    Function to put the raw html data in a dataframe for ease of processing.
    """
    col_names = get_html_col_names(raw_data, arrive_or_depart)
    N = 7
    data_dict = make_dict_from_cols(['Direction', 'Station'] + col_names)
    for station in raw_data[arrive_or_depart].keys():
        data_list = raw_data[arrive_or_depart][station]
        L = len(data_list)
        for i in range(L):
            page_content = data_list[i]
            doc = lh.fromstring(page_content)
            tr_elements = doc.xpath('//tr')
            if len(tr_elements) > 3:
                title = tr_elements[0].text_content()
                direction = get_direction(get_num(title))
                for j in range(2, len(tr_elements)):
                    table_row = tr_elements[j] 
                    if len(table_row) == N:
                        data_dict['Direction'].append(direction)
                        data_dict['Station'].append(station)
                        for col_name, entry in zip(col_names, table_row):
                            data = entry.text_content()
                            data_dict[col_name].append(data)
                    else:
                        continue
                        
            else:
                print("Potentially no data for this time period, or an error occurred", station, arrive_or_depart)
    return pd.DataFrame.from_dict(data_dict)

In [None]:
start_time = time.time()
depart =  raw_data_to_raw_df(data, 'Depart')
print('elapsed:', time.time() - start_time)
depart.head()

In [None]:
arrive = data['Arrive']
start_time = time.time()
arrive = raw_data_to_raw_df(data, 'Arrive')
print('elapsed:', time.time() - start_time)
arrive.head()

In [None]:
start = date(2020,11,29)
end = date(2021,2,18)
arrive_filestring = './data/trains/raw_arrive_' + str(start) + '_' + str(end) + '.csv'
depart_filestring = './data/trains/raw_depart_' + str(start) + '_' +  str(end) + '.csv'
print(arrive_filestring)
print(depart_filestring)

In [None]:
arrive.to_csv(arrive_filestring, line_terminator='\n', index=False)
depart.to_csv(depart_filestring, line_terminator='\n', index=False)

## Raw Train Data - Scraped and Loaded into Pandas DF

The data is scraped from an HTML table, so the raw data doesn't look nice after scraping until it's put back in a dataframe. The data was then processed into an initial dataframe and saved as a CSV for later processing.


In [None]:
arrive = pd.read_csv(arrive_filestring, lineterminator='\n', keep_default_na=False)
depart = pd.read_csv(depart_filestring, lineterminator='\n', keep_default_na=False)

In [None]:
arrive.head()

In [None]:
arrive.shape[0]

In [None]:
depart.head()

In [None]:
depart.shape[0]

In [None]:
def get_col_names(arrive_or_depart):
    if arrive_or_depart == 'Arrive':
        return ['Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year', 'Origin Quarter', 
                 'Origin Month', 'Origin Day', 'Origin Week Day', 'Full Sch Ar Date', 'Sch Ar Date', 
                 'Sch Ar Day', 'Sch Ar Time','Act Ar Time', 'Arrive Diff', 'Service Disruption', 'Cancellations']
    elif arrive_or_depart == 'Depart':
        return [ 'Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year', 'Origin Quarter', 
                 'Origin Month', 'Origin Day', 'Origin Week Day', 'Full Sch Dp Date','Sch Dp Date', 
                 'Sch Dp Day', 'Sch Dp Time','Act Dp Time', 'Depart Diff', 'Service Disruption', 'Cancellations']

    
def get_key_names(arrive_or_depart):
    if arrive_or_depart == 'Arrive':
        return {'Sch Full Date': 'Full Sch Ar Date', 'Sch Abbr': 'Sch Ar', 'Act Abbr': 'Act Ar', 'Diff': 'Arrive Diff'}
    
    elif arrive_or_depart == 'Depart':
        return {'Sch Full Date': 'Full Sch Dp Date', 'Sch Abbr': 'Sch Dp', 'Act Abbr': 'Act Dp', 'Diff': 'Depart Diff'}


def process_columns(df, arrive_or_depart):
    new_cols = get_col_names(arrive_or_depart)
    ad_keys = get_key_names(arrive_or_depart) # the specific keys depending on if new_df is for arr or dep data
    
    new_df = pd.DataFrame(columns=new_cols)
    new_df['Train Num'] = pd.to_numeric(df['Train #'])
    new_df['Station'] = df['Station']
    new_df['Direction'] = df['Direction']
    
    origin_date = pd.to_datetime(df['Origin Date'], format="%m/%d/%Y", exact=False, errors='coerce')    
    new_df['Origin Date'] = origin_date
    new_df['Origin Year'] = origin_date.dt.year
    new_df['Origin Quarter'] = origin_date.dt.quarter
    new_df['Origin Month'] = origin_date.dt.month
    new_df['Origin Day'] = origin_date.dt.day
    new_df['Origin Week Day'] = origin_date.dt.day_name()
    
    sched_full_date = pd.to_datetime(df[ad_keys['Sch Abbr']], format='%m/%d/%Y %I:%M %p', exact=False, errors='coerce')
    new_df[ad_keys['Sch Full Date']] = sched_full_date
    new_df[ad_keys['Sch Abbr'] + ' Date'] = sched_full_date.dt.date
    new_df[ad_keys['Sch Abbr'] + ' Day'] = sched_full_date.dt.day_name()
    new_df[ad_keys['Sch Abbr'] + ' Time'] = sched_full_date.dt.time
    act_time = pd.to_datetime(df[ad_keys['Act Abbr']], format='%I:%M%p', exact=False, errors='coerce')
    new_df[ad_keys['Act Abbr'] + ' Time'] = act_time.dt.time
    
    df['Sched Date'] = sched_full_date 
    df['Act Date'] = pd.to_datetime(sched_full_date.dt.date.astype(str) + " " + df[ad_keys['Act Abbr']].astype(str),exact=False, errors='coerce')
    max_expected_delay = pd.Timedelta(hours=10)
    delta = df['Act Date'] - df['Sched Date']
    m_late = (delta < max_expected_delay) & (-1*max_expected_delay > delta)
    m_early = (-1*delta < max_expected_delay) & (-1*max_expected_delay > -1*delta)
    df.loc[m_late, 'Act Date'] += pd.Timedelta(days=1)
    df.loc[m_early, 'Act Date'] -= pd.Timedelta(days=1)
    new_df[ad_keys['Diff']] = np.rint((df['Act Date'] - df['Sched Date']).dt.total_seconds()/60).astype(int)
    new_df['Service Disruption'] = df['Service Disruption'].replace('SD', 1).replace('', 0)
    new_df['Cancellations'] =  df['Cancellations'].replace('C', 1).replace('', 0)
    return new_df.replace('', np.nan).dropna()


In [None]:
full_depart = process_columns(depart, "Depart")
full_depart.head()

In [None]:
full_depart.tail()

In [None]:
full_depart.shape[0]

In [None]:
full_arrive = process_columns(arrive, 'Arrive')
full_arrive.head()

In [None]:
full_arrive.tail()

In [None]:
full_arrive.shape[0]

#### Below are the number of rows in each df that are omitted in the database 

In [None]:
#full_arrive.loc[(full_arrive['Origin Year'] == 2010)].shape[0]

In [None]:
#full_depart.loc[(full_depart['Origin Year'] == 2010)].shape[0]

### Create CSV files by year to break down data into smaller chunks
* Ignore any data from 2010, this is only 23 rows in the departure and arrival dataframes combined (due to trains that were retrieved with the web request starting 1/1/2011 but originated in 2010). 
* Subset into files by arrival and departure by year


In [None]:
#years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

#for year in years:
#    depart_subset = full_depart.loc[(full_depart['Origin Year'] == year)]
#    arrive_subset = full_arrive.loc[(full_arrive['Origin Year'] == year)]
#    print(depart_subset.shape[0], arrive_subset.shape[0])
#    depart_filestring = './data/trains/processed_depart_' + str(year) + '.csv'
#    arrive_filestring = './data/trains/processed_arrive_' + str(year) + '.csv'
#    depart_subset.to_csv(depart_filestring, line_terminator='\n', index=False)
#    arrive_subset.to_csv(arrive_filestring, line_terminator='\n', index=False)

In [None]:
#prev_arrive_2021 = pd.read_csv('./data/trains/processed_arrive_2021.csv')
#prev_depart_2021 = pd.read_csv('./data/trains/processed_depart_2021.csv')

#new_arrive_2021 = pd.concat([prev_arrive_2021, full_arrive], ignore_index=True, axis=0)
#new_depart_2021 = pd.concat([prev_depart_2021, full_depart], ignore_index=True, axis=0)

#new_arrive_2021.to_csv('./data/trains/processed_arrive_2021.csv', line_terminator='\n', index=False)
#new_depart_2021.to_csv('./data/trains/processed_depart_2021.csv', line_terminator='\n', index=False)

## C. Postgres Database 

In [None]:
import psycopg2
import csv
import os
import sys 

DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)

In [None]:
assert os.environ.get('DB_PASS') != None , 'empty password!'

### Create Tables

In [None]:
def create_tables(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS train_info CASCADE;
        CREATE TABLE train_info (
            train_info_id SERIAL PRIMARY KEY,
            train_num int UNIQUE,
            operating_direction text,
            reg_operates_on_mon boolean,
            reg_operates_on_tues boolean,
            reg_operates_on_wed boolean,
            reg_operates_on_thurs boolean,
            reg_operates_on_fri boolean,
            reg_operates_on_sat boolean,
            reg_operates_on_sun boolean,
            depart_origin_time text,
            depart_NY_time text,
            arrive_dest_time text
            
        );
        """,
        """ 
        DROP TABLE IF EXISTS arrivals CASCADE;
        CREATE TABLE arrivals (
            dataset_id SERIAL PRIMARY KEY,
            train_num int REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_arr_datetime timestamp,
            sched_arr_date date,
            sched_arr_week_day text,
            sched_arr_time time,
            act_arr_time time,
            arrive_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """,
        """ 
        DROP TABLE IF EXISTS departures CASCADE;
        CREATE TABLE departures (
            dataset_id SERIAL PRIMARY KEY,
            train_num int REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_dep_datetime timestamp,
            sched_dep_date date,
            sched_dep_week_day text,
            sched_dep_time time,
            act_dep_time time,
            depart_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """
    ]
    try:
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
    finally:
        if conn is not None:
            conn.close()

conn = psycopg2.connect(DSN)
create_tables(conn)

### Add to Database

In [None]:
import csv
from sqlalchemy import text
from psycopg2 import sql 

def update_train_info_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO train_info (train_num, operating_direction, reg_operates_on_mon, 
                   reg_operates_on_tues, reg_operates_on_wed, reg_operates_on_thurs, 
                   reg_operates_on_fri, reg_operates_on_sat, reg_operates_on_sun, 
                   depart_origin_time, depart_NY_time, arrive_dest_time)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""]                
                
    with open(csv_file, newline='') as file:
        info_reader = csv.reader(file, delimiter=',')
        next(info_reader) # skip header                                                                          
        for row in info_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                conn.rollback()
        conn.commit() 

def update_arrive_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO arrivals (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_arr_datetime, sched_arr_date, sched_arr_week_day,
                               sched_arr_time, act_arr_time, arrive_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""]                        
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)     # skip header                                                                         
        for row in train_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

def update_depart_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO departures (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_dep_datetime, sched_dep_date, sched_dep_week_day,
                               sched_dep_time, act_dep_time, depart_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""]                        
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)   # skip header                                                                           
        for row in train_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()


In [None]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
depart_filestrings_list = []
arrive_filestrings_list = []
for year in years:
    depart_filestring = './data/trains/processed_depart_' + str(year) + '.csv'
    arrive_filestring = './data/trains/processed_arrive_' + str(year) + '.csv'
    depart_filestrings_list.append(depart_filestring) 
    arrive_filestrings_list.append(arrive_filestring)

In [None]:
import time

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
conn = psycopg2.connect(DSN)
create_tables(conn)
begin_everything = time.time()
update_train_info_table(conn, './data/trains/train_nums.csv')
for i in range(len(years)):
    start = time.time()
    arrive_csv = arrive_filestrings_list[i]
    depart_csv = depart_filestrings_list[i]
    update_arrive_table(conn, arrive_csv)
    update_depart_table(conn, depart_csv)
    print("DONE WITH", years[i], 'in', time.time() - start)
conn.close()

print("COMPLETE in", time.time() - begin_everything)

## Weather Data

In [1]:
import psycopg2
import os
import sys 
from sqlalchemy import text
from psycopg2 import sql
import time
import csv
import pandas as pd
import numpy as np

### To get all the data

In [None]:
dates_list = [('2011-01-01','2011-12-31'), # Original dates list used to get all historical data
              ('2012-01-01','2012-12-31'),
              ('2013-01-01','2013-12-31'),
              ('2014-01-01','2014-12-31'),
              ('2015-01-01','2015-12-31'),
              ('2016-01-01','2016-12-31'),
              ('2017-01-01','2017-12-31'),
              ('2018-01-01','2018-12-31'),
              ('2019-01-01','2019-12-31'),
              ('2020-01-01','2020-11-31')]

dates_list = [('2011-01-01','2011-12-31'), # New dates list needed for DB
              ('2012-01-01','2012-12-31'),
              ('2013-01-01','2013-12-31'),
              ('2014-01-01','2014-12-31'),
              ('2015-01-01','2015-12-31'),
              ('2016-01-01','2016-12-31'),
              ('2017-01-01','2017-12-31'),
              ('2018-01-01','2018-12-31'),
              ('2019-01-01','2019-12-31'),
              ('2020-01-01','2021-02-18')]

locations = ['Boston,MA', 'Providence,RI', 'Kingston,RI', 'New%20London,CT', 'New%20Haven,CT', 'Stamford,CT', 
             'Manhattan,NY', 'Newark,NJ', 'Trenton,NJ', 'Philadelphia,PA', 'Wilmington,DE', 'Baltimore,MD', 
             'Baltimore%20BWI%20Airport,MD', 'New%20Carrollton,MD', 'Washington,DC']

location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT', 'New_Haven_CT', 'Stamford_CT', 
             'Manhattan_NY', 'Newark_NJ', 'Trenton_NJ', 'Philadelphia_PA', 'Wilmington_DE', 'Baltimore_MD', 
             'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 'Washington_DC']

In [None]:
url_base = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=1&startDateTime='

for location, filename in zip(locations, location_names_for_files):
    print('Running urls for', location)
    for startdate, enddate in dates_list:
        url = url_base + startdate + 'T00:00:00&endDateTime=' + enddate + 'T00:00:00&unitGroup=us&contentType=csv&location=' + location + '&key='+os.environ.get('VC_TOKEN')
        csv_bytes = requests.get(url).content
        filestring = './data/weather_original/' + filename + '_weather_data_' + startdate + '_' + enddate + '.csv'
        with open(filestring, 'w', newline='\n') as csvfile:
            csvfile.write(csv_bytes.decode())
        csvfile.close()

In [None]:
dates_list = [('2020-01-01','2020-11-31'), 
              ('2020-12-01', '2021-02-18')] # replace w curr date
for location in location_names_for_files:
    print('Fixing data for ', location)
    startdate1, enddate1 = dates_list[0]
    startdate2, enddate2 = dates_list[1]
    weather_2020_part1 = pd.read_csv('./data/weather_original/' + filename + '_weather_data_' + startdate1 + '_' + enddate1 + '.csv')
    weather_2020_2021_part2 = pd.read_csv('./data/weather_original/' + filename + '_weather_data_' + startdate2 + '_' + enddate2 + '.csv')
    full_weather = pd.concat([weather_2020_part1, weather_2020_2021_part2], ignore_index=True, axis=0)
    full_weather_new = full_weather[['Address', 'Date time', 'Temperature', 'Precipitation', 'Cloud Cover', 
                                    'Latitude', 'Longitude', 'Conditions']].iloc[:]
    nona_df = full_weather_new.replace('', np.nan).dropna()
    print(nona_df.shape[0]/full_weather_new.shape[0])
    full_weather_new.to_csv('./data/weather/' +  location + '_weather_data_' + startdate1 + '_' + enddate2 + '_col_subset.csv', index=False)

### Raw weather data comes well-formatted in CSV already

In [None]:
filestring = './data/weather_original/Boston_MA_weather_data_2011-01-01_2011-12-31.csv'
df_sample = pd.read_csv(filestring)
df_sample.head()

### Drop NA values (very few rows are actually dropped)

In [None]:
for location in location_names_for_files:
    for startdate, enddate in dates_list:
        filestring = './data/weather_original/' + location + '_weather_data_' + startdate + '_' + enddate + '.csv'
        df = pd.read_csv(filestring, usecols=['Address', 'Date time', 'Temperature', 'Precipitation', 'Cloud Cover', 
                                    'Latitude', 'Longitude', 'Conditions'])
        nona_df = df.replace('', np.nan).dropna()
        print(nona_df.shape[0]/df.shape[0])
        nona_df.to_csv('./data/weather/' + location + '_weather_data_' + startdate + '_' + enddate + '_col_subset.csv', index=False)
        

In [None]:
def create_tables(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS weather_hourly CASCADE;
        CREATE TABLE weather_hourly (
            weather_id SERIAL PRIMARY KEY,
            location text DEFAULT NULL,
            date_time timestamp DEFAULT NULL,
            temperature real DEFAULT NUll,
            precipitation real DEFAULT NULL,
            cloud_cover real DEFAULT NULL,
            latitude real DEFAULT NULL,
            longitude real DEFAULT NULL,
            conditions text DEFAULT NULL
        );
        """
    ]
    try:
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
    finally:
        if conn is not None:
            conn.close()

DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)
create_tables(conn)

In [None]:
def update_weather_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO weather_hourly (location, date_time, temperature, precipitation, 
                   cloud_cover, latitude, longitude, conditions)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""]                
                   
    with open(csv_file, newline='') as file: 
        data_reader = csv.reader(file, delimiter=',')
        next(data_reader, None)   # skip header                                                                           
        for row in data_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()


In [None]:
conn = psycopg2.connect(DSN)
create_tables(conn)
begin_everything = time.time()
for location in location_names_for_files:
    start = time.time()
    for startdate, enddate in dates_list:
        csv_file = './data/weather/' + location + '_weather_data_' + startdate + '_' + enddate + '_col_subset.csv'
        update_weather_table(conn, csv_file)
    print('Finished adding location', location, 'to the database')
print("COMPLETE in", time.time() - begin_everything)

In [2]:
%load_ext sql

In [3]:
%sql postgresql://appuser:test@localhost:5432/amtrakproject

In [4]:
%%sql

SELECT COUNT(*) from weather_hourly;


 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
1327576


In [5]:
%%sql

SELECT COUNT(*) from departures;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
783486


In [6]:
%%sql

SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'departures'

 * postgresql://appuser:***@localhost:5432/amtrakproject
18 rows affected.


column_name
dataset_id
train_num
station_code
direction
origin_date
origin_year
origin_quarter
origin_month
origin_day
origin_week_day


In [7]:
%%sql

SELECT table_name 
FROM INFORMATION_SCHEMA.TABLES
WHERE table_schema='public'
AND table_type='BASE TABLE';

 * postgresql://appuser:***@localhost:5432/amtrakproject
4 rows affected.


table_name
weather_hourly
train_info
arrivals
departures


In [8]:
%%sql

SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'train_info'

 * postgresql://appuser:***@localhost:5432/amtrakproject
13 rows affected.


column_name
train_info_id
train_num
operating_direction
reg_operates_on_mon
reg_operates_on_tues
reg_operates_on_wed
reg_operates_on_thurs
reg_operates_on_fri
reg_operates_on_sat
reg_operates_on_sun


In [9]:
%%sql

SELECT d.train_num, AVG(depart_diff) from departures d
INNER JOIN (
    SELECT *
    FROM train_info 
    WHERE depart_origin_time = 'OVERNIGHT'
    ) ti
ON d.train_num = ti.train_num
GROUP BY d.train_num
ORDER BY AVG(depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
2 rows affected.


train_num,avg
170,7.298460363913984
150,5.971375685859721


In [10]:
%%sql

SELECT d.train_num, AVG(depart_diff) from departures d
INNER JOIN (
    SELECT *
    FROM train_info 
    WHERE depart_origin_time = 'AM'
    ) ti
ON d.train_num = ti.train_num
GROUP BY d.train_num
ORDER BY AVG(depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
12 rows affected.


train_num,avg
83,13.51632425545469
86,13.471540162947615
93,11.39393656716418
161,10.552868199408149
171,9.9943882483318
172,9.38800313643492
195,9.252204866645416
164,9.087953672381172
95,9.073601915762785
99,8.177022821576763


In [11]:
%%sql

SELECT d.train_num, AVG(depart_diff) from departures d
INNER JOIN (
    SELECT *
    FROM train_info 
    WHERE depart_origin_time = 'PM'
    ) ti
ON d.train_num = ti.train_num
GROUP BY d.train_num
ORDER BY AVG(depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
8 rows affected.


train_num,avg
94,19.981540543152605
166,10.571344270034226
175,10.009828497679383
139,9.130337078651683
165,7.9497769133966365
167,7.658604155567395
168,6.944851862716339
132,6.122840429873487


In [12]:
%%sql

SELECT d.train_num, AVG(depart_diff) from departures d
INNER JOIN (
    SELECT *
    FROM train_info 
    WHERE depart_origin_time = 'MID'
    ) ti
ON d.train_num = ti.train_num
GROUP BY d.train_num
ORDER BY AVG(depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
7 rows affected.


train_num,avg
88,14.003408852527796
82,13.616908850726553
163,10.903897344891131
173,9.634414079413187
135,8.865182897609287
174,8.724925971917088
137,8.271906886120084


In [13]:
%%sql

SELECT d.train_num, AVG(depart_diff)
FROM departures d
INNER JOIN (
    SELECT * 
    FROM train_info
    WHERE reg_operates_on_sat = 't' AND reg_operates_on_sun = 't'
    ) ti
ON d.train_num = ti.train_num
GROUP BY d.train_num
ORDER BY AVG(depart_diff) DESC;


 * postgresql://appuser:***@localhost:5432/amtrakproject
13 rows affected.


train_num,avg
88,14.003408852527796
163,10.903897344891131
161,10.552868199408149
66,10.121705693504795
195,9.252204866645416
164,9.087953672381172
135,8.865182897609287
99,8.177022821576763
165,7.9497769133966365
150,5.971375685859721


In [14]:
%%sql

SELECT  d.train_Num, ti.depart_origin_time, AVG(d.depart_diff)
FROM train_info ti
INNER JOIN(
    SELECT d.train_num, d.depart_diff 
    FROM departures d
) AS d
ON ti.train_num = d.train_num
WHERE reg_operates_on_mon = 't' AND reg_operates_on_thurs = 't'
GROUP BY d.train_num, ti.depart_origin_time
ORDER BY AVG(d.depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
13 rows affected.


train_num,depart_origin_time,avg
94,PM,19.981540543152605
86,AM,13.471540162947615
93,AM,11.39393656716418
66,EVENING,10.121705693504795
175,PM,10.009828497679383
171,AM,9.9943882483318
173,MID,9.634414079413187
172,AM,9.38800313643492
95,AM,9.073601915762785
174,MID,8.724925971917088


In [15]:
%%sql

SELECT  d.train_Num, ti.depart_origin_time, AVG(d.depart_diff)
FROM train_info ti
INNER JOIN(
    SELECT d.train_num, d.depart_diff 
    FROM departures d
) AS d
ON ti.train_num = d.train_num
WHERE reg_operates_on_sat = 't' AND reg_operates_on_sun = 't'
GROUP BY d.train_num, ti.depart_origin_time
ORDER BY AVG(d.depart_diff) DESC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
13 rows affected.


train_num,depart_origin_time,avg
88,MID,14.003408852527796
163,MID,10.903897344891131
161,AM,10.552868199408149
66,EVENING,10.121705693504795
195,AM,9.252204866645416
164,AM,9.087953672381172
135,MID,8.865182897609287
99,AM,8.177022821576763
165,PM,7.9497769133966365
150,OVERNIGHT,5.971375685859721


In [16]:
%%sql

SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'weather_hourly'

 * postgresql://appuser:***@localhost:5432/amtrakproject
9 rows affected.


column_name
weather_id
location
date_time
temperature
precipitation
cloud_cover
latitude
longitude
conditions


In [17]:
%%sql

#### IN PROGRESS
SELECT * 
FROM departures d
INNER JOIN (
    SELECT * 
    FROM weather_hourly
    WHERE location = 'Providence,RI'
) wh
ON DATE(d.full_sched_dep_datetime) = DATE(wh.date_time)
AND 
WHERE (d.full_sched_dep_datetime - interval '30 minutes', d.full_sched_dep_datetime + interval '30 minutes') 
OVERLAPS (wh.date_time, wh.date_time + interval '1 hour')


 * postgresql://appuser:***@localhost:5432/amtrakproject
(psycopg2.errors.SyntaxError) syntax error at or near "####"
LINE 1: #### IN PROGRESS SELECT * 
        ^

[SQL: #### IN PROGRESS SELECT * 
FROM departures d
INNER JOIN (
    SELECT * 
    FROM weather_hourly
    WHERE location = 'Providence,RI'
) wh
ON DATE(d.full_sched_dep_datetime) = DATE(wh.date_time)
AND 
WHERE (d.full_sched_dep_datetime - interval '30 minutes', d.full_sched_dep_datetime + interval '30 minutes') 
OVERLAPS (wh.date_time, wh.date_time + interval '1 hour')]
(Background on this error at: http://sqlalche.me/e/13/f405)


In [None]:
def create_test_table(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS test_table CASCADE;
        CREATE TABLE test_table (
            weather_id SERIAL PRIMARY KEY,
            location text DEFAULT NULL,
            date_time timestamp DEFAULT NULL,
            temperature real DEFAULT NUll,
            precipitation real DEFAULT NULL,
            cloud_cover real DEFAULT NULL,
            latitude real DEFAULT NULL,
            longitude real DEFAULT NULL,
            conditions text DEFAULT NULL
        );
        """
    ]
    try:
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
    finally:
        if conn is not None:
            conn.close()

In [None]:
def update_test_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO test_table (location, date_time, temperature, precipitation, 
                   cloud_cover, latitude, longitude, conditions)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""]                
                   
    with open(csv_file, newline='') as file: 
        data_reader = csv.reader(file, delimiter=',')
        next(data_reader, None)   # skip header                                                                           
        for row in data_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

In [None]:
DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)
create_test_table(conn)
for location in ['Boston_MA']:
    start = time.time()
    for startdate, enddate in dates_list:
        csv_file = './data/weather/' + location + '_weather_data_' + startdate + '_' + enddate + '_col_subset.csv'
        update_test_table(conn, csv_file)
print('Finished adding location', location, 'to the database')


In [18]:
%%sql

UPDATE weather_hourly
SET location = REPLACE(location, ',' , ', ');

 * postgresql://appuser:***@localhost:5432/amtrakproject
1327576 rows affected.


[]

In [19]:
%%sql

SELECT location FROM weather_hourly
GROUP BY location;

 * postgresql://appuser:***@localhost:5432/amtrakproject
15 rows affected.


location
"Baltimore BWI Airport, MD"
"Baltimore, MD"
"Boston, MA"
"Kingston, RI"
"Manhattan, NY"
"New Carrollton, MD"
"New Haven, CT"
"New London, CT"
"Newark, NJ"
"Philadelphia, PA"


In [20]:
%%sql 

SELECT  d.train_num, d.origin_quarter, d.station_code, ROUND( CAST( AVG(d.depart_diff) as numeric), 1) as avg_delay
FROM departures d
GROUP BY d.origin_quarter, d.train_num, d.station_code
ORDER BY ROUND( CAST( AVG(d.depart_diff) as numeric), 2) DESC
LIMIT 30;

 * postgresql://appuser:***@localhost:5432/amtrakproject
30 rows affected.


train_num,origin_quarter,station_code,avg_delay
83,3,NCR,34.8
83,2,NCR,34.4
94,3,STM,33.3
94,3,PVD,31.9
94,3,NLC,31.8
94,2,PVD,31.0
83,3,BWI,30.9
94,3,KIN,30.9
93,3,NCR,30.4
94,2,NLC,30.1


In [21]:
%%sql 

SELECT  d.train_num, d.origin_month, d.station_code, ROUND( CAST( AVG(d.depart_diff) as numeric), 1) as avg_delay
FROM departures d
GROUP BY d.origin_month, d.train_num, d.station_code
ORDER BY ROUND( CAST( AVG(d.depart_diff) as numeric), 2) DESC
LIMIT 30;

 * postgresql://appuser:***@localhost:5432/amtrakproject
30 rows affected.


train_num,origin_month,station_code,avg_delay
94,7,STM,40.6
94,7,PVD,40.5
94,7,NLC,39.5
83,6,NCR,38.4
94,7,KIN,38.3
83,7,NCR,38.3
83,9,NCR,38.0
166,6,KIN,37.9
139,9,NCR,37.3
166,6,PVD,37.1


In [22]:
%%sql 

SELECT  d.train_num, d.origin_year, d.station_code, ROUND( CAST( AVG(d.depart_diff) as numeric), 1) as avg_delay
FROM departures d
GROUP BY d.origin_year, d.train_num, d.station_code
ORDER BY ROUND( CAST( AVG(d.depart_diff) as numeric), 2) DESC
LIMIT 30;

 * postgresql://appuser:***@localhost:5432/amtrakproject
30 rows affected.


train_num,origin_year,station_code,avg_delay
166,2012,KIN,119.0
66,2018,TRE,84.9
166,2011,PVD,62.0
164,2014,RTE,53.5
94,2014,PVD,50.2
94,2015,STM,49.9
94,2015,PVD,49.7
164,2012,BBY,48.3
94,2014,STM,48.0
94,2014,KIN,45.9


In [23]:
%%sql 

SELECT  d.train_num, d.origin_year, d.station_code, ROUND( CAST( AVG(d.depart_diff) as numeric), 1) as avg_delay
FROM departures d
GROUP BY d.origin_year, d.train_num, d.station_code
ORDER BY ROUND( CAST( AVG(d.depart_diff) as numeric), 2) DESC
LIMIT 30;

 * postgresql://appuser:***@localhost:5432/amtrakproject
30 rows affected.


train_num,origin_year,station_code,avg_delay
166,2012,KIN,119.0
66,2018,TRE,84.9
166,2011,PVD,62.0
164,2014,RTE,53.5
94,2014,PVD,50.2
94,2015,STM,49.9
94,2015,PVD,49.7
164,2012,BBY,48.3
94,2014,STM,48.0
94,2014,KIN,45.9
