In [1]:
import dask.dataframe as dd
import psycopg2
import csv
import os
import sys 
import time
import pandas as pd
import numpy as np
# Fail fast when the database password is unset *or* empty. A truthiness check
# rejects both None and '', which is what the 'empty password!' message promises
# (the previous `!= None` comparison let an empty string through).
assert os.environ.get('DB_PASS'), 'empty password!'

In [2]:
# Pass the connection parameters as keyword arguments instead of formatting
# them into a DSN string: the formatted version left the password unquoted, so
# a password containing spaces, quotes or backslashes would corrupt the DSN.
conn = psycopg2.connect(
    dbname='amtrakproject',
    user=os.environ.get('USER'),
    password=os.environ.get('DB_PASS'),
)
# psycopg2.connect raises on failure instead of returning None, so this assert
# is only a cheap guard kept from the original cell.
assert conn is not None, 'need to fix conn!!'

In [3]:
def execute_and_get_results(conn, command):
    """Run ``command`` on ``conn`` and return all fetched rows.

    Parameters
    ----------
    conn : DB-API connection
        Must provide ``cursor()`` and ``rollback()``.
    command : str
        SQL statement to execute.

    Returns
    -------
    The value of ``cursor.fetchall()`` on success, or ``None`` when any
    exception is raised. In that case the error is printed and the
    transaction is rolled back so the connection stays usable. Callers
    must check for ``None`` before using the result.
    """
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(command)
        return cur.fetchall()
    # psycopg2.DatabaseError is a subclass of Exception, so catching Exception
    # alone covers exactly what the original two-element tuple did.
    except Exception as error:
        print(error)
        conn.rollback()
        return None
    finally:
        # Release the cursor on both the success and the failure path.
        if cur is not None:
            cur.close()

In [4]:
# Category code (0, 1 or 2) for every station code this notebook knows about.
# Built once here rather than inside the function, because make_categories is
# applied row by row and would otherwise rebuild three sets on every call.
_CATEGORY_BY_STATION = {
    **dict.fromkeys(('KIN', 'WLY', 'MYS'), 0),
    **dict.fromkeys(('BOS', 'BBY', 'RTE', 'PVD', 'NLC',
                     'OSB', 'NYP', 'NWK', 'EWR',
                     'MET', 'TRE', 'PHL', 'WIL', 'ABE',
                     'BAL', 'BWI', 'NCR', 'WAS'), 1),
    **dict.fromkeys(('NHV', 'BRP', 'STM', 'NRO'), 2),
}


def make_categories(station_code, default=None):
    """Return the shared-track category (0, 1 or 2) for ``station_code``.

    Parameters
    ----------
    station_code : hashable
        Station code such as ``'BOS'``.
    default : optional
        Value returned for a code that is in none of the three groups.
        Defaults to ``None``, which is what the function returned implicitly
        before. Pass ``default=1`` to get the same fallback as a SQL
        ``CASE ... ELSE 1`` expression.

    Returns
    -------
    int or ``default``
    """
    return _CATEGORY_BY_STATION.get(station_code, default)

In [5]:
# Load one processed CSV into an in-memory pandas DataFrame
# (the file name suggests 2021 departures — confirm against the data).
df = pd.read_csv('./data/trains/depart_2021_processed.csv')

In [6]:
# Pull out the 'Station' column as a Series.
station_code = df['Station']

In [7]:
# Element-wise lookup: one category value per row of the 'Station' column.
shared_tracks = station_code.map(make_categories)

In [8]:
# Preview the first five category values as a quick sanity check.
shared_tracks.head()

0    1
1    1
2    1
3    1
4    1
Name: Station, dtype: int64

In [9]:
# Average category value across every row of the loaded file.
shared_tracks.mean()

1.0841695554339232

In [10]:
# Read every CSV matching depart*.csv into a single Dask DataFrame.
# NOTE(review): this glob also matches depart_2021_processed.csv in the same
# directory (loaded separately with pandas earlier) — confirm that is intended.
departures = dd.read_csv('./data/trains/depart*.csv')

In [11]:
# Read every CSV matching arrive*.csv into a single Dask DataFrame.
arrivals = dd.read_csv('./data/trains/arrive*.csv')

In [12]:
# Map each departure row's station code to its shared-track category.
# NOTE(review): meta declares dtype 'object', although make_categories returns
# ints for known codes (and None otherwise) — confirm this dtype is intended.
dep_shared_tracks = departures['Station'].apply(make_categories, meta=('Station', 'object'))

In [13]:
# Map each arrival row's station code to its shared-track category.
# NOTE(review): meta declares dtype 'object', although make_categories returns
# ints for known codes (and None otherwise) — confirm this dtype is intended.
arr_shared_tracks = arrivals['Station'].apply(make_categories, meta=('Station', 'object'))

In [14]:
# Check what kind of object the Dask apply call produced.
type(arr_shared_tracks)

dask.dataframe.core.Series

In [15]:
# Preview the first rows and the column layout of the departures data.
departures.head()

Unnamed: 0,Train Num,Station,Direction,Origin Date,Origin Year,Origin Month,Origin Week Day,Full Sch Dp Date,Sch Dp Date,Sch Dp Day,Sch Dp Time,Act Dp Time,Full Act Dp Date,Depart Diff,Service Disruption,Cancellations
0,99,BOS,Southbound,2011-01-01,2011,1,Saturday,2011-01-01 08:40:00,2011-01-01,Saturday,08:40:00,08:40:00,2011-01-01 08:40:00,0,0,0
1,67,BOS,Southbound,2011-01-01,2011,1,Saturday,2011-01-01 21:30:00,2011-01-01,Saturday,21:30:00,21:30:00,2011-01-01 21:30:00,0,0,0
2,99,BOS,Southbound,2011-01-02,2011,1,Sunday,2011-01-02 08:40:00,2011-01-02,Sunday,08:40:00,08:40:00,2011-01-02 08:40:00,0,0,0
3,67,BOS,Southbound,2011-01-02,2011,1,Sunday,2011-01-02 21:30:00,2011-01-02,Sunday,21:30:00,21:30:00,2011-01-02 21:30:00,0,0,0
4,95,BOS,Southbound,2011-01-03,2011,1,Monday,2011-01-03 06:15:00,2011-01-03,Monday,06:15:00,06:15:00,2011-01-03 06:15:00,0,0,0


In [16]:
# Attach the category series to each frame as 'Sharing_Tracks_Other_RR'.
# Rebinding through assign keeps the cell safe to re-run.
departures = departures.assign(Sharing_Tracks_Other_RR=dep_shared_tracks)
arrivals = arrivals.assign(Sharing_Tracks_Other_RR=arr_shared_tracks)

In [17]:
# On a Dask DataFrame the row count stays lazy (shown as a Delayed object);
# only the column count is available without computing.
departures.shape

(Delayed('int-b4bb0921-2c12-4dea-92b2-e3fd447dd3a4'), 17)

In [18]:
# Write the departures frame to disk without the index column.
# NOTE(review): as the output below shows, the path becomes a *directory*
# named 'dask-df-departures.csv' holding numbered .part files, not a single
# CSV — confirm that downstream consumers expect this layout.
departures.to_csv('dask-df-departures.csv', index=False)

['/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/00.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/01.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/02.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/03.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/04.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/05.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/06.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/07.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/08.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/09.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-departures.csv/10.part']

In [19]:
# Write the arrivals frame to disk without the index column.
# NOTE(review): as the output below shows, the path becomes a *directory*
# named 'dask-df-arrivals.csv' holding numbered .part files, not a single
# CSV — confirm that downstream consumers expect this layout.
arrivals.to_csv('dask-df-arrivals.csv', index=False)

['/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/00.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/01.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/02.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/03.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/04.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/05.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/06.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/07.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/08.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/09.part',
 '/Users/elizabethchen/Documents/amtrak-project/dask-df-arrivals.csv/10.part']

In [22]:
# SQL version of the station categorisation: one value per row of all_trains.
# Unlike make_categories, which returns None for a station code outside its
# three groups, the ELSE branch here maps every other code to 1.
query = """
          SELECT 
              CASE 
                  WHEN station_code IN ('KIN', 'WLY', 'MYS') THEN 0
                  WHEN station_code in ('NHV', 'BRP', 'STM', 'NRO') THEN 2
                  ELSE 1
              END Num_Other_RR_On_Tracks
          FROM all_trains;
"""

In [23]:
# Run the query and keep every fetched row.
# NOTE: execute_and_get_results returns None when the query fails, so the
# len() and np.mean() cells that follow would raise in that case.
num_other_RR = execute_and_get_results(conn, query)

In [26]:
# Number of rows returned by the query.
len(num_other_RR)

1238035

In [31]:
import numpy

In [32]:
# Average of the SQL-derived category over all returned rows; compare it with
# the pandas mean computed earlier from the single CSV file.
np.mean(num_other_RR)

1.062661394871712