In [1]:
import csv
import os
import sys
import time
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
# Abort early when the database password was never exported to the environment.
assert 'DB_PASS' in os.environ, 'empty password!'

In [2]:
# Pass the credentials as keyword arguments instead of formatting them into a
# DSN string: the original left the password unquoted, so a password containing
# a space, quote or backslash produced a malformed connection string.
conn = psycopg2.connect(
    dbname='amtrakproject',
    user=os.environ.get('USER'),
    password=os.environ.get('DB_PASS'),
)
# psycopg2.connect raises on failure rather than returning None; the assert is
# kept only as a cheap sanity check before the cells that use `conn`.
assert conn is not None, 'need to fix conn!!'

### Table names for reference:
* `station_info`
* `stops`
* `weather_hourly`
* `regional_route`
* to create: `trips`

In [3]:
def execute_command(conn, command):
    """Execute one SQL command on ``conn`` and commit it.

    Parameters
    ----------
    conn
        An open DB-API connection (a psycopg2 connection in this notebook).
    command : str
        SQL text passed unchanged to ``cursor.execute``.

    Returns
    -------
    None

    Notes
    -----
    Any exception raised while executing or committing is printed rather than
    re-raised, and the current transaction is rolled back.
    """
    try:
        # The context manager closes the cursor even if execute() raises;
        # the original left the cursor open on every call.
        with conn.cursor() as cur:
            cur.execute(command)
        conn.commit()
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so catching Exception
        # alone covers everything the original tuple did.
        print(error)
        conn.rollback()

In [4]:
# Load the `sql` IPython extension; the %sql / %%sql magics used in later cells depend on it
# (presumably provided by the ipython-sql package -- confirm against the environment).
%load_ext sql

In [5]:
%sql postgresql://elizabethchen:test@localhost:5432/amtrakproject

### See how many unique values there are for `act_arr_dep_time`

In [7]:
%%sql

SELECT COUNT(DISTINCT act_arr_dep_time)
FROM stops;

 * postgresql://elizabethchen:***@localhost:5432/amtrakproject
1 rows affected.


count
1440


### Create track sharing column
* Number of other railroads sharing the tracks between stations

In [None]:
# SQL text (only defined here, not executed in this cell) that derives, for each
# row of `all_trains`, a count of other railroads from its station code:
# KIN/WLY/MYS -> 0, NHV/BRP/STM/NRO -> 2, every other station code -> 1.
track_query = """
              SELECT 
                  CASE 
                      WHEN station_code IN ('KIN', 'WLY', 'MYS') THEN 0
                      WHEN station_code in ('NHV', 'BRP', 'STM', 'NRO') THEN 2
                      ELSE 1
                  END Num_Other_RR_On_Tracks
              FROM 
                  all_trains;
              """

In [None]:
# SQL that adds the track-sharing column to `all_trains` and fills it in.
# Fixes relative to the earlier draft:
#   * the UPDATE now targets `all_trains`, the table that receives the new
#     column (it previously updated `stops`, which has no such column);
#   * the value is a plain CASE expression evaluated per row, replacing a
#     scalar subquery that returned one row per stop and carried a stray ';'
#     inside the parentheses (a syntax error);
#   * IF NOT EXISTS makes the ALTER safe to re-run.
# Mapping: KIN/WLY/MYS -> 0, NHV/BRP/STM/NRO -> 2, every other station -> 1.
add_shared_tracks = """
                    ALTER TABLE all_trains
                    ADD COLUMN IF NOT EXISTS Num_Other_RR_Sharing integer;
                    UPDATE all_trains
                    SET Num_Other_RR_Sharing =
                        CASE
                            WHEN station_code IN ('KIN', 'WLY', 'MYS') THEN 0
                            WHEN station_code IN ('NHV', 'BRP', 'STM', 'NRO') THEN 2
                            ELSE 1
                        END;
                    """

### Create index on `date_time` for weather table

In [None]:
# DDL text that creates an index named `dt_index` on weather_hourly.date_time.
# The statement is only defined in this cell; per the note below it was already
# executed once, and it has no IF NOT EXISTS guard, so re-running it against
# the same database would be rejected because the index already exists.
create_dt_index_weather = """
                          CREATE INDEX dt_index ON weather_hourly (date_time);
                          """
# ALREADY RAN execute_command(conn, create_dt_index_weather)

### View the weather columns and data types we will need to join

In [None]:
%%sql

SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'weather_hourly';

### Plan for joining/merging weighted average weather for train data:
* `temperature`, `precipitation`, `cloud_cover` are all numeric values and can easily be averaged using the `up` and `down` weights previously calculated for the trains table
* `conditions` is 99%+ non-null; `weather_type` is mostly null but gives additional information when it is present
    * `DISTINCT VALUES`: `Partially cloudy`, `Overcast`, `Rain, Partially cloudy`, `Rain, Overcast`, `Rain`, `Clear`
    * (`Snow` is included/mentioned in the `weather_type` column)
    * Test #1: take set intersection of previous and next hour `conditions`
        * If no results in some places, continue to next Test
    * Test #2: take set union of previous and next hour `conditions`, order by the assigned `weight`
        * Options for combining/summarizing:
            * Reduce the quantity of observation types by converting very specific weather types to more general ones
                * E.g. "Freezing Drizzle/Freezing Rain" could be converted to a "Rain" column on some sort of scale TBD (Ex: `1 = "Mist", 2 = "Drizzle", ... , K = "Heavy Freezing Rain", ..., Z = "Hail"`)
            * If two variations of the same precipitation or condition type, e.g. "Heavy Rain" and "Freezing Drizzle/Freezing Rain" appear in the same entry, take the one with the higher weight and add associated precipitation value to precip_marker column
            
### Idea: Assign a numeric scale to every distinct attribute value, then average the numeric values of the attribute combinations over the set and use this numeric average

In [None]:
# SQL text for timing an UPDATE without keeping its changes: the statement is
# wrapped in BEGIN ... ROLLBACK, and EXPLAIN ANALYZE runs the UPDATE to report
# its plan and timings. The UPDATE joins all_trains to a subquery over itself
# on dataset_id and sets
#   round_up_weight              = minutes of full_act_arr_dep_datetime / 60
#   full_act_datetime_round_down = full_act_arr_dep_datetime truncated to the hour
test_explainA = """
                BEGIN;
                EXPLAIN ANALYZE UPDATE all_trains 
                SET round_up_weight = subquery.up_weight,
                    full_act_datetime_round_down = subquery.trunc_hour
                FROM (
                   SELECT
                       t.dataset_id AS dataset_id,
                       EXTRACT (MINUTES FROM t.full_act_arr_dep_datetime)/60 AS up_weight,
                       DATE_TRUNC('hour', t.full_act_arr_dep_datetime) AS trunc_hour
                    FROM all_trains t
               ) AS subquery
               WHERE all_trains.dataset_id = subquery.dataset_id;
               ROLLBACK;
               """

# NOTE(review): the variable named below is not defined in the visible cells;
# presumably it was the committed (non-EXPLAIN) version of this update -- confirm.
# ALREADY RAN execute_command(conn, set_weights_and_round_hours_part1)

In [None]:
def execute_and_get_results(conn, command):
    """Execute a SQL command on ``conn`` and return every row it produced.

    Parameters
    ----------
    conn
        An open DB-API connection (a psycopg2 connection in this notebook).
    command : str
        SQL text passed unchanged to ``cursor.execute``.

    Returns
    -------
    list or None
        The result of ``cursor.fetchall()``, or ``None`` when executing or
        fetching raised. In that case the error is printed and the
        transaction is rolled back.

    Notes
    -----
    Nothing is committed here; the function is meant for statements that are
    only read back.
    """
    try:
        # The context manager closes the cursor on both the success and the
        # failure path; the original never closed it.
        with conn.cursor() as cur:
            cur.execute(command)
            return cur.fetchall()
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so catching Exception
        # alone covers everything the original tuple did.
        print(error)
        conn.rollback()
        return None

In [None]:
# SQL text that pairs train stops with hourly weather observations:
#   * inner subquery `si` maps each station_code to its weather_station (plus
#     crew_change and sb_mile) from station_info;
#   * subquery `wh` joins weather_hourly to `si` on location = weather_station
#     and keeps only weather rows whose date_time falls on 2021-04-25;
#   * all_trains is joined to `wh` on station_code, keeping rows where the
#     actual arrival/departure time truncated to the hour equals the weather
#     row's date_time and train_num is '135';
#   * rows are ordered by the actual arrival/departure datetime.
query = """
        SELECT
            train_num, 
            arr_or_dep,
            t.station_code,
            full_sched_arr_dep_datetime,
            full_act_arr_dep_datetime,
            timedelta_from_sched,
            temperature, 
            date_time,
            precipitation, 
            cloud_cover, 
            conditions, 
            weather_type, 
            crew_change, 
            sb_mile
        FROM
            all_trains t
        INNER JOIN (
            SELECT
                si.station_code,
                location,
                date_time,
                temperature, 
                precipitation, 
                cloud_cover, 
                conditions, 
                weather_type, 
                crew_change, 
                sb_mile
            FROM 
                weather_hourly
            INNER JOIN (
                SELECT
                    weather_station,
                    station_code,
                    crew_change,
                    sb_mile
                FROM
                    station_info
            ) AS si  
            ON location = si.weather_station
            WHERE DATE_TRUNC('day', date_time) = '2021-04-25'
        ) AS wh
        ON t.station_code = wh.station_code
        WHERE
            DATE_TRUNC('hour', full_act_arr_dep_datetime) = wh.date_time
        AND 
            train_num = '135'
        ORDER BY
            full_act_arr_dep_datetime
        """

In [None]:
# Run the train/weather join through the open connection and load it into a DataFrame.
results = pd.read_sql(sql=query, con=conn)

In [None]:
# Display the DataFrame loaded by pd.read_sql as this cell's output.
results

In [None]:
# Save the joined rows as a CSV on the Desktop, leaving out the DataFrame index column.
results.to_csv(path_or_buf='~/Desktop/results.csv', index=False)