In [1]:
import csv
import os
import sys
import time
from urllib.parse import quote_plus

import psycopg2
# Fail fast when the database password is not configured. A truthiness check
# also rejects an empty string, which matches the 'empty password!' message.
assert os.environ.get('DB_PASS'), 'empty password!'

In [2]:
# Open the psycopg2 connection used by execute_command().
# Keyword arguments are used instead of a hand-formatted DSN string so that a
# user name or password containing spaces or quotes is passed through intact.
conn = psycopg2.connect(
    dbname='amtrakproject',
    user=os.environ.get('USER'),
    password=os.environ.get('DB_PASS'),
)
assert conn is not None, 'need to fix conn!!'

### Table names for reference:
* `station_info`
* `all_trains`
* `weather_hourly`
* `regional_route`

In [3]:
def execute_command(conn, command):
    """Run a single SQL command on ``conn`` and commit it.

    Parameters
    ----------
    conn : DB-API connection (e.g. the psycopg2 connection created above)
        Must provide ``cursor()``, ``commit()`` and ``rollback()``.
    command : str
        SQL text handed to ``cursor.execute``.

    Returns
    -------
    None

    Notes
    -----
    Best-effort by design: if creating the cursor, executing, or committing
    raises, the error is printed and the transaction is rolled back instead
    of propagating. The cursor is always closed afterwards so repeated calls
    do not accumulate open cursors.
    """
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(command)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is a subclass of Exception
        print(error)
        conn.rollback()
    finally:
        # Release the cursor on both the success and the failure path.
        if cur is not None:
            cur.close()

In [4]:
# Load the `sql` IPython extension so the %sql / %%sql magics used in later cells are available.
%load_ext sql

In [5]:
%sql postgresql://elizabethchen:test@localhost:5432/amtrakproject

### See how many unique values there are for `act_arr_dep_time`

In [6]:
%%sql
-- Number of distinct act_arr_dep_time values stored in all_trains.
SELECT COUNT(DISTINCT act_arr_dep_time)
FROM all_trains;

 * postgresql://elizabethchen:***@localhost:5432/amtrakproject
1 rows affected.


count
1440


In [7]:
%%sql
-- Total count of non-NULL act_arr_dep_time values in all_trains.
SELECT COUNT(act_arr_dep_time)
FROM all_trains;

 * postgresql://elizabethchen:***@localhost:5432/amtrakproject
1 rows affected.


count
1394064


### Create index on `act_arr_dep_time` for trains table

In [11]:
# SQL that creates an index named `act_time` on all_trains(act_arr_dep_time).
# This cell only defines the statement; the call that executes it is left
# commented out below because the index has already been created.
create_arr_dep_index = """
                       CREATE INDEX act_time ON all_trains (act_arr_dep_time);
                       """
# ALREADY RAN execute_command(conn, create_arr_dep_index)

### ADD COLUMNS TO `all_trains`: weather hourly round up/down weights
* Weights are determined by the following formulas, where `num_mins` is the extracted minutes part of the actual arrival/departure time at a given station
    * `round_up_weight` (round up to nearest hour): $$\frac{\texttt{num_mins}}{60}$$
    * `round_down_weight` (round down to nearest hour): $$\frac{60 - \texttt{num_mins}}{60}$$
    

In [12]:
# SQL that adds four columns to all_trains: the two interpolation weights
# (`real`) and the actual datetime rounded down / up to the hour (`timestamp`).
# The columns are created empty here; the two UPDATE statements defined in the
# following cells fill them in.
add_weights_and_round_hours = """
                              ALTER TABLE all_trains
                              ADD COLUMN round_down_weight real,
                              ADD COLUMN round_up_weight real,
                              ADD COLUMN full_act_datetime_round_down timestamp,
                              ADD COLUMN full_act_datetime_round_up timestamp;
                              """

# ALREADY RAN execute_command(conn, add_weights_and_round_hours)

In [13]:
# Part 1 of 2: populate round_up_weight and full_act_datetime_round_down.
# The subquery computes, per dataset_id, the minutes component of
# full_act_arr_dep_datetime divided by 60 and that timestamp truncated to the
# hour; the outer UPDATE copies them onto the row with the same dataset_id.
# NOTE(review): the self-join presumably relies on dataset_id being a unique,
# non-null key of all_trains -- confirm. If so, the values could be SET
# directly from the row's own columns without the subquery.
set_weights_and_round_hours_part1 = """
                                    UPDATE all_trains 
                                    SET round_up_weight = subquery.up_weight,
                                        full_act_datetime_round_down = subquery.trunc_hour
                                    FROM (
                                       SELECT
                                           t.dataset_id AS dataset_id,
                                           EXTRACT (MINUTES FROM t.full_act_arr_dep_datetime)/60 AS up_weight,
                                           DATE_TRUNC('hour', t.full_act_arr_dep_datetime) AS trunc_hour
                                        FROM all_trains t
                                   ) AS subquery
                                   WHERE all_trains.dataset_id = subquery.dataset_id;
                               """
# ALREADY RAN execute_command(conn, set_weights_and_round_hours_part1)

In [14]:
# Part 2 of 2: populate round_down_weight and full_act_datetime_round_up.
# Both are derived from columns this statement reads back from all_trains:
# round_down_weight = 1 - round_up_weight, and full_act_datetime_round_up =
# full_act_datetime_round_down + 1 hour. Those inputs are filled by part 1,
# so part 1 must have been executed before this statement.
set_weights_and_round_hours_part2 = """
                                    UPDATE all_trains 
                                    SET round_down_weight = subquery.down_weight,
                                        full_act_datetime_round_up = subquery.round_up_hour
                                    FROM (
                                       SELECT
                                           t.dataset_id AS dataset_id,
                                           1 - t.round_up_weight AS down_weight,
                                           t.full_act_datetime_round_down +  INTERVAL '1 hour' AS round_up_hour
                                        FROM all_trains t
                                   ) AS subquery
                                   WHERE all_trains.dataset_id = subquery.dataset_id;
                                   """

# ALREADY RAN execute_command(conn, set_weights_and_round_hours_part2)

### Test out the results

In [15]:
%%sql
-- Spot-check the new weight and rounded-hour columns on a handful of rows.
SELECT act_arr_dep_time,
       round_down_weight,
       round_up_weight,
       full_act_datetime_round_down,
       full_act_datetime_round_up
FROM all_trains
LIMIT 15;

 * postgresql://elizabethchen:***@localhost:5432/amtrakproject
15 rows affected.


act_arr_dep_time,round_down_weight,round_up_weight,full_act_datetime_round_down,full_act_datetime_round_up
22:42:00,0.3,0.7,2012-02-05 22:00:00,2012-02-05 23:00:00
11:09:00,0.85,0.15,2012-03-21 11:00:00,2012-03-21 12:00:00
19:03:00,0.95,0.05,2012-03-21 19:00:00,2012-03-21 20:00:00
19:05:00,0.9166667,0.083333336,2012-03-23 19:00:00,2012-03-23 20:00:00
03:37:00,0.38333333,0.6166667,2012-03-24 03:00:00,2012-03-24 04:00:00
19:04:00,0.93333334,0.06666667,2012-04-02 19:00:00,2012-04-02 20:00:00
19:01:00,0.98333335,0.016666668,2012-04-12 19:00:00,2012-04-12 20:00:00
14:39:00,0.35000002,0.65,2012-04-16 14:00:00,2012-04-16 15:00:00
19:10:00,0.8333333,0.16666667,2012-04-17 19:00:00,2012-04-17 20:00:00
19:03:00,0.95,0.05,2012-04-24 19:00:00,2012-04-24 20:00:00


### Create index on `date_time` for weather table

In [16]:
# SQL that creates an index named `dt_index` on weather_hourly(date_time).
# This cell only defines the statement; the call that executes it is left
# commented out below because the index has already been created.
create_dt_index_weather = """
                          CREATE INDEX dt_index ON weather_hourly (date_time);
                          """
# ALREADY RAN execute_command(conn, create_dt_index_weather)

### View the weather columns and data types we will need to join

In [18]:
%%sql
-- List every weather_hourly column together with its declared data type.
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'weather_hourly';

 * postgresql://elizabethchen:***@localhost:5432/amtrakproject
10 rows affected.


column_name,data_type
weather_id,integer
location,text
date_time,timestamp without time zone
temperature,real
precipitation,real
cloud_cover,real
conditions,text
weather_type,text
latitude,real
longitude,real


### Plan for joining/merging weighted average weather for train data:
* `temperature`, `precipitation`, `cloud_cover` are all numeric values and can easily be averaged using the `up` and `down` weights previously calculated for the trains table
* `conditions` is non-null in over 99% of rows; `weather_type` is mostly null but gives additional information when it is present
    * `DISTINCT VALUES`: `Partially cloudy`, `Overcast`, `Rain, Partially cloudy`, `Rain, Overcast`, `Rain`, `Clear`
    * (`Snow` is included/mentioned in the `weather_type` column)
    * Test #1: take set intersection of previous and next hour `conditions`
        * If no results in some places, continue to next Test
    * Test #2: take set union of previous and next hour `conditions`, order by the assigned `weight`
        * Options for combining/summarizing:
            * Reduce the quantity of observation types by converting very specific weather types to more general ones
                * E.g. "Freezing Drizzle/Freezing Rain" could be converted to a "Rain" column on some sort of scale TBD (Ex: `1 = "Mist", 2 = "Drizzle", ... , K = "Heavy Freezing Rain", ..., Z = "Hail"`)
            * If two variations of the same precipitation or condition type, e.g. "Heavy Rain" and "Freezing Drizzle/Freezing Rain" appear in the same entry, take the one with the higher weight and add associated precipitation value to precip_marker column
            
### Idea: Assign numeric scale to all distinct values attributes, then average the numeric valued combinations of attributes over the set and use this numeric