# EDA and ETL Notebook


In [None]:
import pandas as pd
import numpy as np
import time
from datetime import date, timedelta
import re
import requests
import lxml.html as lh
from fetch_data import construct_urls, fetch_data_from_urls

## A. Helper Functions for loading/requesting and processing raw data

In [None]:
def get_data(option='from_saved', start=date.today()-timedelta(days=1), end=date.today()):
    """
    Function to retrieve new data from the website for specified dates, or else skip to next step.
    """
    if option == 'request':
        northbound = [[66, 82, 86, 88, 94], [132, 150, 160, 162, 164, 166], [168, 170, 172, 174]]
        southbound = [[67, 83, 93, 95, 99], [135, 137, 139, 161, 163, 165], [167, 171, 173, 175, 195]]
        urls = construct_urls(northbound, southbound, start, end)
        data = fetch_data_from_urls(urls)
    elif option == 'from_saved':
        data = None
        print("Skip this section and go to part B!")
    return data

### Choose an option
* If `option = 'from_saved'`, go to section on Raw Data.
* Otherwise, uncomment the other line and wait for request to complete.


In [None]:
#data = get_data(option='from_saved')
start =  date(2021,3,15)
end = date(2021,3,31) 
data = get_data(option='request', start=start, end=end)

### Helper Functions for Data Cleaning

In [None]:
def get_direction(num):
    """
    Return direction of the train (odd = Southbound, even = Northbound).
    """
    if num % 2 == 0:
        return 'Northbound'
    else:
        return 'Southbound'


def get_num(re_match):
    """
    Assuming input contains a match , extract and return the numerical data from input.
    """
    num_match = re.search('(?P<num>[0-9]+)', re_match)
    return int(num_match.group('num'))


def make_dict_from_cols(col_names):
    """
    Create dictionary from a list of column names
    """
    dictionary = { col_name: [] for col_name in col_names }
    return dictionary


def get_html_col_names(raw_data, arrive_or_depart):
    """
    Using NYP (station with both arrival times and departure times), 
    retrieve column names from the HTML table, located in the 2nd row.
    """
    data_list = raw_data[arrive_or_depart]['NYP']
    page_content = data_list[0]
    doc = lh.fromstring(page_content)
    tr_elements = doc.xpath('//tr')
    html_col_names = [entry.text_content().strip() for entry in tr_elements[1]]        
    return html_col_names

In [None]:
def raw_data_to_raw_df(raw_data, arrive_or_depart):
    """
    Function to put the raw html data in a dataframe for ease of processing.
    """
    col_names = get_html_col_names(raw_data, arrive_or_depart)
    N = 7
    data_dict = make_dict_from_cols(['Direction', 'Station'] + col_names)
    for station in raw_data[arrive_or_depart].keys():
        data_list = raw_data[arrive_or_depart][station]
        L = len(data_list)
        for i in range(L):
            page_content = data_list[i]
            doc = lh.fromstring(page_content)
            tr_elements = doc.xpath('//tr')
            if len(tr_elements) > 3:
                title = tr_elements[0].text_content()
                direction = get_direction(get_num(title))
                for j in range(2, len(tr_elements)):
                    table_row = tr_elements[j] 
                    if len(table_row) == N:
                        data_dict['Direction'].append(direction)
                        data_dict['Station'].append(station)
                        for col_name, entry in zip(col_names, table_row):
                            data = entry.text_content()
                            data_dict[col_name].append(data)
                    else:
                        continue
                        
            else:
                print("Potentially no data for this time period, or an error occurred", station, arrive_or_depart)
    return pd.DataFrame.from_dict(data_dict)

### Process raw data to better format

In [None]:
start_time = time.time()
depart =  raw_data_to_raw_df(data, 'Depart')
print('elapsed:', time.time() - start_time)
depart.head()

In [None]:
arrive = data['Arrive']
start_time = time.time()
arrive = raw_data_to_raw_df(data, 'Arrive')
print('elapsed:', time.time() - start_time)
arrive.head()

In [None]:

arrive_filestring = './data/trains/raw_arrive_' + str(start) + '_' + str(end) + '.csv'
depart_filestring = './data/trains/raw_depart_' + str(start) + '_' +  str(end) + '.csv'
print(arrive_filestring)
print(depart_filestring)

In [None]:
arrive.to_csv(arrive_filestring, line_terminator='\n', index=False)
depart.to_csv(depart_filestring, line_terminator='\n', index=False)

## Raw Train Data - Scraped and Loaded into Pandas DF

The data is scraped from an HTML table, so the raw data doesn't look nice after scraping until it's put back in a dataframe. The data was then processed into an initial dataframe and saved as a CSV for later processing.


In [None]:
arrive = pd.read_csv(arrive_filestring, lineterminator='\n', keep_default_na=False)
depart = pd.read_csv(depart_filestring, lineterminator='\n', keep_default_na=False)

In [None]:
arrive.head()

In [None]:
arrive.shape[0]

In [None]:
depart.head()

In [None]:
depart.shape[0]

In [None]:
def get_col_names(arrive_or_depart):
    if arrive_or_depart == 'Arrive':
        return ['Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year', 'Origin Quarter', 
                 'Origin Month', 'Origin Day', 'Origin Week Day', 'Full Sch Ar Date', 'Sch Ar Date', 
                 'Sch Ar Day', 'Sch Ar Time','Act Ar Time', 'Arrive Diff', 'Service Disruption', 'Cancellations']
    elif arrive_or_depart == 'Depart':
        return [ 'Train Num',  'Station', 'Direction', 'Origin Date', 'Origin Year', 'Origin Quarter', 
                 'Origin Month', 'Origin Day', 'Origin Week Day', 'Full Sch Dp Date','Sch Dp Date', 
                 'Sch Dp Day', 'Sch Dp Time','Act Dp Time', 'Depart Diff', 'Service Disruption', 'Cancellations']

    
def get_key_names(arrive_or_depart):
    if arrive_or_depart == 'Arrive':
        return {'Sch Full Date': 'Full Sch Ar Date', 'Sch Abbr': 'Sch Ar', 'Act Abbr': 'Act Ar', 'Diff': 'Arrive Diff'}
    
    elif arrive_or_depart == 'Depart':
        return {'Sch Full Date': 'Full Sch Dp Date', 'Sch Abbr': 'Sch Dp', 'Act Abbr': 'Act Dp', 'Diff': 'Depart Diff'}


def process_columns(df, arrive_or_depart):
    ad_keys = get_key_names(arrive_or_depart) # the specific keys depending on if new_df is for arr or dep data
    
    new_df = pd.DataFrame()
    new_df['Train Num'] = pd.to_numeric(df['Train #'])
    new_df['Station'] = df['Station']
    new_df['Direction'] = df['Direction']
    
    origin_date = pd.to_datetime(df['Origin Date'], format="%m/%d/%Y", exact=False, errors='coerce')    
    new_df['Origin Date'] = origin_date
    new_df['Origin Year'] = origin_date.dt.year
    new_df['Origin Quarter'] = origin_date.dt.quarter
    new_df['Origin Month'] = origin_date.dt.month
    new_df['Origin Day'] = origin_date.dt.day
    new_df['Origin Week Day'] = origin_date.dt.day_name()
    
    sched_full_date = pd.to_datetime(df[ad_keys['Sch Abbr']], format='%m/%d/%Y %I:%M %p', exact=False, errors='coerce')
    new_df[ad_keys['Sch Full Date']] = sched_full_date
    new_df[ad_keys['Sch Abbr'] + ' Date'] = sched_full_date.dt.date
    new_df[ad_keys['Sch Abbr'] + ' Day'] = sched_full_date.dt.day_name()
    new_df[ad_keys['Sch Abbr'] + ' Time'] = sched_full_date.dt.time
    act_time = pd.to_datetime(df[ad_keys['Act Abbr']], format='%I:%M%p', exact=False, errors='coerce')
    new_df[ad_keys['Act Abbr'] + ' Time'] = act_time.dt.time
    
    df['Sched Date'] = sched_full_date 
    df['Act Date'] = pd.to_datetime(sched_full_date.dt.date.astype(str) + " " + df[ad_keys['Act Abbr']].astype(str),exact=False, errors='coerce')
    max_expected_delay = pd.Timedelta(hours=10)
    delta = df['Act Date'] - df['Sched Date']
    m_late = (delta < max_expected_delay) & (-1*max_expected_delay > delta)
    m_early = (-1*delta < max_expected_delay) & (-1*max_expected_delay > -1*delta)
    df.loc[m_late, 'Act Date'] += pd.Timedelta(days=1)
    df.loc[m_early, 'Act Date'] -= pd.Timedelta(days=1)
    new_df[ad_keys['Diff']] = np.rint((df['Act Date'] - df['Sched Date']).dt.total_seconds()/60).astype(int)
    new_df['Service Disruption'] = df['Service Disruption'].replace('SD', 1).replace('', 0)
    new_df['Cancellations'] =  df['Cancellations'].replace('C', 1).replace('', 0)
    return new_df.replace('', np.nan).dropna()


In [None]:
full_depart = process_columns(depart, "Depart")
full_depart.head()

In [None]:
full_depart.tail()

In [None]:
full_depart.shape[0]

In [None]:
full_arrive = process_columns(arrive, 'Arrive')
full_arrive.head()

In [None]:
full_arrive.tail()

In [None]:
full_arrive.shape[0]

### Create CSV files by year to break down data into smaller chunks
* Ignore any data from 2010, this is only 23 rows in the departure and arrival dataframes combined (due to trains that were retrieved with the web request starting 1/1/2011 but originated in 2010). 
* Subset into files by arrival and departure by year


In [None]:
#years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

#for year in years:
#    depart_subset = full_depart.loc[(full_depart['Origin Year'] == year)]
#    arrive_subset = full_arrive.loc[(full_arrive['Origin Year'] == year)]
#    print(depart_subset.shape[0], arrive_subset.shape[0])
#    depart_filestring = './data/trains/processed_depart_' + str(year) + '.csv'
#    arrive_filestring = './data/trains/processed_arrive_' + str(year) + '.csv'
#    depart_subset.to_csv(depart_filestring, line_terminator='\n', index=False)
#    arrive_subset.to_csv(arrive_filestring, line_terminator='\n', index=False)

In [None]:
#prev_arrive_2020 = pd.read_csv('./data/trains/processed_arrive_2020.csv')
#prev_depart_2020 = pd.read_csv('./data/trains/processed_depart_2020.csv')

In [None]:
#prev_depart_2020['Origin Month'].unique()
#prev_arrive_2020['Origin Month'].unique()

In [None]:
# prev_depart_2020['Sch Dp Date'].unique().shape[0] 

# 2020 was a leap year !!!
# and it's 366 + 1 = 367 because the overnight train starts in 2020 and goes until 2021 
# fun times 

In [None]:
prev_arrive_2021 = pd.read_csv('./data/trains/processed_arrive_2021.csv')
prev_depart_2021 = pd.read_csv('./data/trains/processed_depart_2021.csv')

In [None]:
new_arrive_2021 = pd.concat([prev_arrive_2021, full_arrive], ignore_index=True, axis=0)
new_depart_2021 = pd.concat([prev_depart_2021, full_depart], ignore_index=True, axis=0)

In [None]:
new_depart_2021['Sch Dp Date'].unique()

In [None]:
new_arrive_2021.to_csv('./data/trains/processed_arrive_2021.csv', line_terminator='\n', index=False)
new_depart_2021.to_csv('./data/trains/processed_depart_2021.csv', line_terminator='\n', index=False)

## C. Postgres Database 

In [14]:
import psycopg2
import csv
import os
import sys 
assert os.environ.get('DB_PASS') != None , 'empty password!'

In [None]:
## NOTE TO SELF: RUN IF USING INTEL CONDA ENV
## username is "appuser" # no longer
# DSN = "dbname='amtrakproject' user='appuser' password={}".format(os.environ.get('DB_PASS'))
# conn = psycopg2.connect(DSN)

In [15]:
## RUN IF USING arm64 CONDA ENV
## username is "ecc"
# DSN = "dbname='amtrakproject' user='ecc' password={}".format(os.environ.get('DB_PASS'))
#conn = psycopg2.connect(DSN)

In [None]:
## RUN IF FOR HEROKU
assert os.environ.get('DATABASE_URL') != None, 'need to set DATABASE_URL config var'
DATABASE_URL = os.environ['DATABASE_URL']
conn = psycopg2.connect(DATABASE_URL, sslmode='require')

### Create Tables

In [17]:
def create_train_tables(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS train_info CASCADE;
        CREATE TABLE train_info (
            train_info_id SERIAL PRIMARY KEY,
            train_num text UNIQUE,
            operating_direction text,
            reg_operates_on_mon boolean,
            reg_operates_on_tues boolean,
            reg_operates_on_wed boolean,
            reg_operates_on_thurs boolean,
            reg_operates_on_fri boolean,
            reg_operates_on_sat boolean,
            reg_operates_on_sun boolean,
            depart_origin_time text,
            depart_NY_time text,
            arrive_dest_time text
            
        );
        """,
        """ 
        DROP TABLE IF EXISTS arrivals CASCADE;
        CREATE TABLE arrivals (
            dataset_id SERIAL PRIMARY KEY,
            train_num text REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_arr_datetime timestamp,
            sched_arr_date date,
            sched_arr_week_day text,
            sched_arr_time time,
            act_arr_time time,
            arrive_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """,
        """ 
        DROP TABLE IF EXISTS departures CASCADE;
        CREATE TABLE departures (
            dataset_id SERIAL PRIMARY KEY,
            train_num text REFERENCES train_info (train_num),
            station_code text, 
            direction text,
            origin_date date,
            origin_year int,
            origin_quarter int,
            origin_month int,
            origin_day int,
            origin_week_day text,
            full_sched_dep_datetime timestamp,
            sched_dep_date date,
            sched_dep_week_day text,
            sched_dep_time time,
            act_dep_time time,
            depart_diff numeric,
            service_disruption boolean,
            cancellations boolean     
        );
        """
    ]
    try:
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
    finally:
        if conn is not None:
            conn.close()

In [None]:
create_train_tables(conn)

### Add to Database

In [18]:
import csv
from sqlalchemy import text
from psycopg2 import sql 

def update_train_info_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO train_info (train_num, operating_direction, reg_operates_on_mon, 
                   reg_operates_on_tues, reg_operates_on_wed, reg_operates_on_thurs, 
                   reg_operates_on_fri, reg_operates_on_sat, reg_operates_on_sun, 
                   depart_origin_time, depart_NY_time, arrive_dest_time)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""]                
                
    with open(csv_file, newline='') as file:
        info_reader = csv.reader(file, delimiter=',')
        next(info_reader) # skip header                                                                          
        for row in info_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                conn.rollback()
        conn.commit() 

def update_arrive_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO arrivals (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_arr_datetime, sched_arr_date, sched_arr_week_day,
                               sched_arr_time, act_arr_time, arrive_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""]                        
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)     # skip header                                                                         
        for row in train_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()

def update_depart_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO departures (train_num, station_code, direction, origin_date, origin_year, origin_quarter, origin_month, 
                               origin_day, origin_week_day, full_sched_dep_datetime, sched_dep_date, sched_dep_week_day,
                               sched_dep_time, act_dep_time, depart_diff, service_disruption, cancellations) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"""]                        
    with open(csv_file, newline='') as file: 
        train_reader = csv.reader(file, delimiter=',')
        next(train_reader, None)   # skip header                                                                           
        for row in train_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()


In [19]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
depart_filestrings_list = []
arrive_filestrings_list = []
for year in years:
    depart_filestring = './data/trains/processed_depart_' + str(year) + '.csv'
    arrive_filestring = './data/trains/processed_arrive_' + str(year) + '.csv'
    depart_filestrings_list.append(depart_filestring) 
    arrive_filestrings_list.append(arrive_filestring)

In [20]:
print(depart_filestrings_list)
print(arrive_filestrings_list)

['./data/trains/processed_depart_2011.csv', './data/trains/processed_depart_2012.csv', './data/trains/processed_depart_2013.csv', './data/trains/processed_depart_2014.csv', './data/trains/processed_depart_2015.csv', './data/trains/processed_depart_2016.csv', './data/trains/processed_depart_2017.csv', './data/trains/processed_depart_2018.csv', './data/trains/processed_depart_2019.csv', './data/trains/processed_depart_2020.csv', './data/trains/processed_depart_2021.csv']
['./data/trains/processed_arrive_2011.csv', './data/trains/processed_arrive_2012.csv', './data/trains/processed_arrive_2013.csv', './data/trains/processed_arrive_2014.csv', './data/trains/processed_arrive_2015.csv', './data/trains/processed_arrive_2016.csv', './data/trains/processed_arrive_2017.csv', './data/trains/processed_arrive_2018.csv', './data/trains/processed_arrive_2019.csv', './data/trains/processed_arrive_2020.csv', './data/trains/processed_arrive_2021.csv']


In [22]:
## Ran on Apple Silicon arch
## New output
import time

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
conn = psycopg2.connect(DSN)
create_train_tables(conn)

begin_everything = time.time()
update_train_info_table(conn, './data/trains/train_nums.csv')
for i in range(len(years)):
    start = time.time()
    arrive_csv = arrive_filestrings_list[i]
    depart_csv = depart_filestrings_list[i]
    update_arrive_table(conn, arrive_csv)
    update_depart_table(conn, depart_csv)
    print('DONE WITH', years[i], 'in', time.time() - start)
conn.close()

print('COMPLETE in', time.time() - begin_everything)

DONE WITH 2011 in 3.4802911281585693
DONE WITH 2012 in 3.4518070220947266
DONE WITH 2013 in 3.6265180110931396
DONE WITH 2014 in 3.7650251388549805
DONE WITH 2015 in 3.7895209789276123
DONE WITH 2016 in 3.8809731006622314
DONE WITH 2017 in 3.8615291118621826
DONE WITH 2018 in 3.7825160026550293
DONE WITH 2019 in 3.8574459552764893
DONE WITH 2020 in 5.321774959564209
DONE WITH 2021 in 0.8470830917358398
COMPLETE in 39.67019510269165


In [23]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [24]:
%sql postgresql://ecc:test@localhost:5432/amtrakproject

In [25]:
%%sql

SELECT COUNT(*) from departures;

 * postgresql://ecc:***@localhost:5432/amtrakproject
1 rows affected.


count
791497


In [26]:
%%sql

SELECT COUNT(*) from arrivals;

 * postgresql://ecc:***@localhost:5432/amtrakproject
1 rows affected.


count
104047


In [2]:
print('num train rows presently 4/3', 104047 + 791497)

num train rows presently 4/3 895544


In [3]:
print('num weather rows presently 4/3', 1347202)

num weather rows presently 4/3 1347202


In [5]:
print('total rows presntly 4/3', 104047 + 791497 + 1347202)

total rows presntly 4/3 2242746


In [4]:
num_weekly_train =  1300
num_weekly_weather = 15 * 24 * 7
num_yearly = 52 * (num_weekly_train + num_weekly_weather)
print('num yearly added rows', num_yearly)

num yearly added rows 198640


In [6]:
## Ran on Intel emulated arch
## Old output

#import time

#years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
#conn = psycopg2.connect(DSN)
#create_tables(conn)

#begin_everything = time.time()
#update_train_info_table(conn, './data/trains/train_nums.csv')
#for i in range(len(years)):
#    start = time.time()
#    arrive_csv = arrive_filestrings_list[i]
#    depart_csv = depart_filestrings_list[i]
#    update_arrive_table(conn, arrive_csv)
#    update_depart_table(conn, depart_csv)
#    print("DONE WITH", years[i], 'in', time.time() - start)
#conn.close()

#print("COMPLETE in", time.time() - begin_everything)

DONE WITH 2011 in 5.648728132247925
DONE WITH 2012 in 5.568975925445557
DONE WITH 2013 in 5.822044849395752
DONE WITH 2014 in 6.143963813781738
DONE WITH 2015 in 6.068666934967041
DONE WITH 2016 in 6.223670959472656
DONE WITH 2017 in 6.284543991088867
DONE WITH 2018 in 6.0637428760528564
DONE WITH 2019 in 6.20758581161499
DONE WITH 2020 in 8.508411169052124
DONE WITH 2021 in 1.3244810104370117
COMPLETE in 63.870940923690796


## Weather Data

In [1]:
import psycopg2
import os
import sys 
from sqlalchemy import text
from psycopg2 import sql
import time
import csv
import requests
import pandas as pd
import numpy as np

### To get all the data

In [8]:
locations = ['Boston,MA', 'Providence,RI', 'Kingston,RI', 'New%20London,CT', 'New%20Haven,CT', 'Stamford,CT', 
             'Manhattan,NY', 'Newark,NJ', 'Trenton,NJ', 'Philadelphia,PA', 'Wilmington,DE', 'Baltimore,MD', 
             'Baltimore%20BWI%20Airport,MD', 'New%20Carrollton,MD', 'Washington,DC']

location_names_for_files = ['Boston_MA', 'Providence_RI', 'Kingston_RI', 'New_London_CT', 'New_Haven_CT', 
                            'Stamford_CT', 'Manhattan_NY', 'Newark_NJ', 'Trenton_NJ', 'Philadelphia_PA', 
                            'Wilmington_DE', 'Baltimore_MD', 'Baltimore_BWI_Airport_MD', 'New_Carrollton_MD', 
                            'Washington_DC']

In [None]:
OLD_DATES_LIST = """
dates_list = [('2011-01-01','2012-01-01'), 
              ('2012-01-01','2013-01-01'),
              ('2013-01-01','2014-01-01'),
              ('2014-01-01','2015-01-01'),
              ('2015-01-01','2016-01-01'),
              ('2016-01-01','2017-01-01'),
              ('2017-01-01','2018-01-01'),
              ('2018-01-01','2019-01-01'),
              ('2019-01-01','2020-01-01'),
              ('2020-01-01','2021-01-01')]"""

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

dates_list = [('2021-04-02', '2021-04-04')]

# HELLO! Uncomment the requests.get() line when you have quadruple checked the dates!!

In [18]:
url_base = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=1&startDateTime='

for location, filename in zip(locations, location_names_for_files):
    print('Running urls for', location)
    for startdate, enddate in dates_list:
        url = url_base + startdate + 'T00:00:00&endDateTime=' + enddate + 'T00:00:00&unitGroup=us&contentType=csv&location=' + location + '&key='+os.environ.get('VC_TOKEN')
    #    csv_bytes = requests.get(url).content
        filestring = './data/weather_original/' + filename + '_weather_data_' + startdate + '_' + enddate + '.csv'
        with open(filestring, 'w', newline='\n') as csvfile:
            csvfile.write(csv_bytes.decode())
        csvfile.close()

Running urls for Boston,MA
Running urls for Providence,RI
Running urls for Kingston,RI
Running urls for New%20London,CT
Running urls for New%20Haven,CT
Running urls for Stamford,CT
Running urls for Manhattan,NY
Running urls for Newark,NJ
Running urls for Trenton,NJ
Running urls for Philadelphia,PA
Running urls for Wilmington,DE
Running urls for Baltimore,MD
Running urls for Baltimore%20BWI%20Airport,MD
Running urls for New%20Carrollton,MD
Running urls for Washington,DC


# NOW! Re-comment the line out so you don't make a mistake next time!!

## Data cleaning process
* Dropping NA values (very small fraction actually dropped)
* Rename city name to include a space after the comma 
* Select chosen columns

In [144]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

for location in location_names_for_files:
    for year in years:
        filename = './data/weather_original/' + location + '_weather_' + str(year) + '.csv'
        full_weather = pd.read_csv(filename)
        full_weather_new = full_weather[['Address', 'Date time', 'Temperature', 'Precipitation', 'Cloud Cover', 
                                    'Latitude', 'Longitude', 'Conditions']]
        dropna_weather = full_weather_new.replace('', np.nan).dropna()
        dropna_weather['Address'] = dropna_weather['Address'].str.replace(',', ', ')
        print(location, year, 'fraction of data kept: ', dropna_weather.shape[0]/full_weather_new.shape[0])
        dropna_weather.to_csv('./data/weather/' + location + '_weather_' + str(year) + '_subset.csv', index=False)

Boston_MA 2011 fraction of data kept:  0.9996575342465753
Boston_MA 2012 fraction of data kept:  0.9995446265938069
Boston_MA 2013 fraction of data kept:  1.0
Boston_MA 2014 fraction of data kept:  1.0
Boston_MA 2015 fraction of data kept:  1.0
Boston_MA 2016 fraction of data kept:  0.9994307832422586
Boston_MA 2017 fraction of data kept:  0.9997717155575847
Boston_MA 2018 fraction of data kept:  0.9997716894977169
Boston_MA 2019 fraction of data kept:  0.9998858577787924
Boston_MA 2020 fraction of data kept:  1.0
Providence_RI 2011 fraction of data kept:  0.9992009132420091
Providence_RI 2012 fraction of data kept:  0.9992030965391621
Providence_RI 2013 fraction of data kept:  0.9998858577787924
Providence_RI 2014 fraction of data kept:  0.9958904109589041
Providence_RI 2015 fraction of data kept:  0.9993151466727542
Providence_RI 2016 fraction of data kept:  0.9990892531876139
Providence_RI 2017 fraction of data kept:  0.9965757333637713
Providence_RI 2018 fraction of data kept:  0.9

Washington_DC 2018 fraction of data kept:  0.9997716894977169
Washington_DC 2019 fraction of data kept:  1.0
Washington_DC 2020 fraction of data kept:  1.0


In [145]:
for location in location_names_for_files:
    filename = './data/weather_original/' + location + '_weather_data_2021-01-01_2021-04-02.csv'
    full_weather = pd.read_csv(filename)
    full_weather_new = full_weather[['Address', 'Date time', 'Temperature', 'Precipitation', 'Cloud Cover', 
                                'Latitude', 'Longitude', 'Conditions']]
    dropna_weather = full_weather_new.replace('', np.nan).dropna()
    dropna_weather['Address'] = dropna_weather['Address'].str.replace(',', ', ')
    print(location, year, 'fraction of data kept: ', dropna_weather.shape[0]/full_weather_new.shape[0])
    dropna_weather.to_csv('./data/weather/' +  location + '_weather_2021_subset.csv', index=False)

Boston_MA 2020 fraction of data kept:  1.0
Providence_RI 2020 fraction of data kept:  1.0
Kingston_RI 2020 fraction of data kept:  1.0
New_London_CT 2020 fraction of data kept:  1.0
New_Haven_CT 2020 fraction of data kept:  1.0
Stamford_CT 2020 fraction of data kept:  1.0
Manhattan_NY 2020 fraction of data kept:  1.0
Newark_NJ 2020 fraction of data kept:  1.0
Trenton_NJ 2020 fraction of data kept:  1.0
Philadelphia_PA 2020 fraction of data kept:  1.0
Wilmington_DE 2020 fraction of data kept:  1.0
Baltimore_MD 2020 fraction of data kept:  1.0
Baltimore_BWI_Airport_MD 2020 fraction of data kept:  1.0
New_Carrollton_MD 2020 fraction of data kept:  1.0
Washington_DC 2020 fraction of data kept:  0.9995511669658886


### Raw weather data comes well-formatted in CSV already

In [132]:
filestring = './data/weather_original/Boston_MA_weather_2011.csv'
df_sample = pd.read_csv(filestring)
df_sample.head()

Unnamed: 0,Address,Date time,Minimum Temperature,Maximum Temperature,Temperature,Dew Point,Relative Humidity,Heat Index,Wind Speed,Wind Gust,...,Visibility,Cloud Cover,Sea Level Pressure,Weather Type,Latitude,Longitude,Resolved Address,Name,Info,Conditions
0,"Boston,MA",2011-01-01 00:00:00,41.1,41.1,41.1,29.0,61.93,,13.9,,...,9.9,0.0,1017.0,,42.3587,-71.0567,"Boston, MA, United States","Boston, MA, United States",,Clear
1,"Boston,MA",2011-01-01 01:00:00,39.8,39.8,39.8,29.9,67.46,,11.4,,...,9.9,0.0,1017.0,,42.3587,-71.0567,"Boston, MA, United States","Boston, MA, United States",,Clear
2,"Boston,MA",2011-01-01 02:00:00,37.1,37.1,37.1,29.0,72.27,,8.1,,...,8.7,30.0,1017.4,,42.3587,-71.0567,"Boston, MA, United States","Boston, MA, United States",,Partially cloudy
3,"Boston,MA",2011-01-01 03:00:00,35.0,35.0,35.0,29.0,78.72,,5.8,,...,8.1,30.0,1017.6,,42.3587,-71.0567,"Boston, MA, United States","Boston, MA, United States",,Partially cloudy
4,"Boston,MA",2011-01-01 04:00:00,37.1,37.1,37.1,29.9,74.97,,6.9,,...,8.7,90.0,1017.5,,42.3587,-71.0567,"Boston, MA, United States","Boston, MA, United States",,Overcast


### Add to database

In [3]:
def create_weather_table(conn):
    """Create tables in the PostgreSQL database"""
    commands = [  
        """
        DROP TABLE IF EXISTS weather_hourly CASCADE;
        CREATE TABLE weather_hourly (
            weather_id SERIAL PRIMARY KEY,
            location text DEFAULT NULL,
            date_time timestamp DEFAULT NULL,
            temperature real DEFAULT NUll,
            precipitation real DEFAULT NULL,
            cloud_cover real DEFAULT NULL,
            latitude real DEFAULT NULL,
            longitude real DEFAULT NULL,
            conditions text DEFAULT NULL
        );
        """
    ]
    try:
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for command in commands:
            cur.execute(command)
        cur.close()
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        err_type, err_obj, traceback = sys.exc_info()
        line_num = traceback.tb_lineno
        print ("\npsycopg2 ERROR:", error, "on line number:", line_num)
        print ("psycopg2 traceback:", traceback, "-- type:", err_type)
    finally:
        if conn is not None:
            conn.close()

DSN = "dbname='amtrakproject' user='ecc' password={}".format(os.environ.get('DB_PASS'))
conn = psycopg2.connect(DSN)
create_weather_table(conn)

In [4]:
def update_weather_table(conn, csv_file):
    c = conn.cursor()
    commands = ["""INSERT INTO weather_hourly (location, date_time, temperature, precipitation, 
                   cloud_cover, latitude, longitude, conditions)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
                   ON CONFLICT DO NOTHING"""]                
                   
    with open(csv_file, newline='') as file: 
        data_reader = csv.reader(file, delimiter=',')
        next(data_reader, None)   # skip header                                                                           
        for row in data_reader:                                           
            try:
                c.execute(commands[0], tuple(row))
            except (Exception, psycopg2.DatabaseError) as error:
                print(error)
                print(row)
                conn.rollback()
        conn.commit()


In [9]:
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
conn = psycopg2.connect(DSN)
create_weather_table(conn)
begin_everything = time.time()
for location in location_names_for_files:
    start = time.time()
    for year in years:
        csv_file = './data/weather/' + location + '_weather_' + str(year) + '_subset.csv'
        update_weather_table(conn, csv_file)
    print('Finished adding location', location, 'to the database in', time.time() - start, 'seconds')
print("COMPLETE in", time.time() - begin_everything)

Finished adding location Boston_MA to the database in 2.488786220550537 seconds
Finished adding location Providence_RI to the database in 2.4389419555664062 seconds
Finished adding location Kingston_RI to the database in 2.436068058013916 seconds
Finished adding location New_London_CT to the database in 2.431364059448242 seconds
Finished adding location New_Haven_CT to the database in 2.425219774246216 seconds
Finished adding location Stamford_CT to the database in 2.478576898574829 seconds
Finished adding location Manhattan_NY to the database in 2.423409938812256 seconds
Finished adding location Newark_NJ to the database in 2.4260599613189697 seconds
Finished adding location Trenton_NJ to the database in 2.422894239425659 seconds
Finished adding location Philadelphia_PA to the database in 2.486855983734131 seconds
Finished adding location Wilmington_DE to the database in 2.4337100982666016 seconds
Finished adding location Baltimore_MD to the database in 2.458324909210205 seconds
Finis

### Above ^ all data from 2011 - April 2,  2021 was inserted into the DB

In [10]:
%load_ext sql

In [12]:
%sql postgresql://ecc:test@localhost:5432/amtrakproject

In [13]:
%%sql

SELECT COUNT(*)
FROM weather_hourly;

 * postgresql://ecc:***@localhost:5432/amtrakproject
1 rows affected.


count
1347202
