# SQL Queries / Data Exploration

### Setup

In [1]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics used by the cells below.
%load_ext sql

In [2]:
# Open the database connection that the %%sql cells in this notebook run against.
# NOTE(review): the password is written in plain text in this URL and is saved with the
# notebook; consider supplying it from outside the notebook (e.g. an environment variable).
%sql postgresql://appuser:test@localhost:5432/amtrakproject

### Current row counts for each table

In [3]:
%%sql

/* Total number of rows currently stored in departures. */
SELECT COUNT(*)
FROM departures;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
793126


In [4]:
%%sql

/* Total number of rows currently stored in arrivals. */
SELECT COUNT(*)
FROM arrivals;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
104255


In [5]:
%%sql

/* Total number of rows currently stored in weather_hourly. */
SELECT COUNT(*)
FROM weather_hourly;

 * postgresql://appuser:***@localhost:5432/amtrakproject
1 rows affected.


count
1350082


### Distinct weather condition values and their row counts

In [6]:
%%sql

/*
 * Frequency of each weather condition label in weather_hourly.
 * GROUP BY yields exactly one row per label, so no DISTINCT is needed.
 * COUNT(*) counts every row in the group, so a NULL label, should one
 * exist, is reported with its real row count instead of 0.
 */
SELECT
    conditions,
    COUNT(*)
FROM
    weather_hourly
GROUP BY
    conditions
ORDER BY
    conditions ASC;

 * postgresql://appuser:***@localhost:5432/amtrakproject
9 rows affected.


conditions,count
Clear,655470
Overcast,335367
Partially cloudy,261152
Rain,22498
"Rain, Overcast",54015
"Rain, Partially cloudy",12788
Snow,3620
"Snow, Overcast",3480
"Snow, Partially cloudy",1692


### Column names for each table

In [7]:
%%sql

/*
 * Column names of the departures table, listed in the order in which the
 * columns are defined (ordinal_position) so the listing is repeatable.
 * NOTE(review) there is no table_schema filter, so a table with the same
 * name in another visible schema would contribute rows as well.
 */
SELECT
    column_name
FROM
    information_schema.columns
WHERE
    table_name = 'departures'
ORDER BY
    ordinal_position;

 * postgresql://appuser:***@localhost:5432/amtrakproject
17 rows affected.


column_name
dataset_id
train_num
station_code
direction
origin_date
origin_year
origin_month
origin_week_day
full_sched_dep_datetime
sched_dep_date


In [8]:
%%sql

/*
 * Column names of the train_info table, listed in the order in which the
 * columns are defined (ordinal_position) so the listing is repeatable.
 * NOTE(review) there is no table_schema filter, so a table with the same
 * name in another visible schema would contribute rows as well.
 */
SELECT
    column_name
FROM
    information_schema.columns
WHERE
    table_name = 'train_info'
ORDER BY
    ordinal_position;

 * postgresql://appuser:***@localhost:5432/amtrakproject
13 rows affected.


column_name
train_info_id
train_num
operating_direction
reg_operates_on_mon
reg_operates_on_tues
reg_operates_on_wed
reg_operates_on_thurs
reg_operates_on_fri
reg_operates_on_sat
reg_operates_on_sun


In [9]:
%%sql

/*
 * Column names of the weather_hourly table, listed in the order in which
 * the columns are defined (ordinal_position) so the listing is repeatable.
 * NOTE(review) there is no table_schema filter, so a table with the same
 * name in another visible schema would contribute rows as well.
 */
SELECT
    column_name
FROM
    information_schema.columns
WHERE
    table_name = 'weather_hourly'
ORDER BY
    ordinal_position;

 * postgresql://appuser:***@localhost:5432/amtrakproject
10 rows affected.


column_name
weather_id
location
date_time
latitude
longitude
temperature
precipitation
cloud_cover
conditions
precip_type


In [10]:
%%sql

/*
 * Column names of the station_info table, listed in the order in which
 * the columns are defined (ordinal_position) so the listing is repeatable.
 * NOTE(review) there is no table_schema filter, so a table with the same
 * name in another visible schema would contribute rows as well.
 */
SELECT
    column_name
FROM
    information_schema.columns
WHERE
    table_name = 'station_info'
ORDER BY
    ordinal_position;

 * postgresql://appuser:***@localhost:5432/amtrakproject
10 rows affected.


column_name
station_info_id
station_code
station_name
state
amtrak_city
weather_loc
longitude
latitude
nb_mile
sb_mile


### Show train #, average depart offset, average temperature, weather conditions, and number of instances
* Largest 20 average depart offsets
* For Providence, RI only
* Rather simplistic join w.r.t. timing (each departure is matched to the weather reading for the hour of its scheduled departure), but it is a starting point
* Group by train numbers and weather conditions

In [11]:
%%sql

/*
 * Twenty largest average departure offsets at Providence (station PVD),
 * one row per train number and weather condition.
 *
 * Each departure is paired with the Providence weather row whose date_time
 * equals the scheduled departure truncated to the hour.
 *
 * Ranking uses the unrounded average, so groups that only tie after being
 * rounded to an integer keep their true order, and train_num plus
 * conditions break exact ties so the LIMIT cut is repeatable.
 */
SELECT
    d.train_num,
    CAST(AVG(d.depart_diff) AS INTEGER) AS avg_depart_diff,
    CAST(AVG(wh.temperature) AS INTEGER) AS avg_temp,
    wh.conditions,
    COUNT(*) AS n_instances
FROM
    departures d
    INNER JOIN weather_hourly wh
        ON wh.date_time = DATE_TRUNC('hour', d.full_sched_dep_datetime)
        AND wh.location = 'Providence, RI'
WHERE
    d.station_code = 'PVD'
GROUP BY
    d.train_num,
    wh.conditions
ORDER BY
    AVG(d.depart_diff) DESC,
    d.train_num,
    wh.conditions
LIMIT
    20;

 * postgresql://appuser:***@localhost:5432/amtrakproject
20 rows affected.


train_num,avg_depart_diff,avg_temp,conditions,n_instances
168,171,26,"Snow, Partially cloudy",1
99,144,11,"Snow, Partially cloudy",2
88,83,25,"Snow, Partially cloudy",6
167,68,35,Snow,6
94,63,28,"Snow, Partially cloudy",6
166,61,56,"Rain, Partially cloudy",1
139,61,21,"Snow, Overcast",3
161,60,30,Snow,7
166,60,30,Snow,2
82,60,29,"Snow, Partially cloudy",2


### Show direction, average depart offset, average temperature, weather conditions, and number of instances
* Same as previous query but group by direction of travel (and weather conditions) rather than train number

In [12]:
%%sql

/*
 * Average departure offset and temperature at Providence (station PVD),
 * broken down by travel direction and weather condition.
 * Departures are matched to the Providence weather row for the hour of
 * their scheduled departure.
 */
SELECT
    d.direction,
    CAST(AVG(d.depart_diff) AS INTEGER) AS avg_depart_diff,
    CAST(AVG(w.temperature) AS INTEGER) AS avg_temp,
    w.conditions,
    COUNT(*) AS n_instances
FROM departures AS d
JOIN weather_hourly AS w
    ON w.date_time = DATE_TRUNC('hour', d.full_sched_dep_datetime)
WHERE d.station_code = 'PVD'
  AND w.location = 'Providence, RI'
GROUP BY d.direction, w.conditions
ORDER BY avg_depart_diff DESC, w.conditions;

 * postgresql://appuser:***@localhost:5432/amtrakproject
18 rows affected.


direction,avg_depart_diff,avg_temp,conditions,n_instances
Northbound,33,26,"Snow, Partially cloudy",95
Northbound,31,27,"Snow, Overcast",188
Northbound,25,29,Snow,127
Northbound,23,49,"Rain, Partially cloudy",88
Northbound,22,52,"Rain, Overcast",917
Southbound,22,26,"Snow, Overcast",184
Southbound,19,25,"Snow, Partially cloudy",107
Northbound,17,57,Overcast,5628
Northbound,16,53,Clear,10416
Northbound,16,59,Partially cloudy,5232


### Show direction, station, average depart offset, weather conditions, and number of instances
* All stations, each matched to the weather readings of its own weather location
* Only clear weather or weather conditions that start with rain
* Group by direction of travel, station, and weather conditions; ordered by number of instances, largest first

In [13]:
%%sql

/*
 * Average departure offset per direction, station and weather condition,
 * restricted to clear weather and to conditions starting with Rain.
 *
 * station_info links each station_code to the weather_loc whose hourly
 * observations are joined to it, and each departure is matched to the
 * observation for the hour of its scheduled departure.
 *
 * Groups with the same number of instances are ordered by station,
 * direction and conditions so the listing is repeatable.
 */
SELECT
    d.direction,
    d.station_code,
    CAST(AVG(d.depart_diff) AS INTEGER) AS avg_depart_diff,
    wh.conditions,
    COUNT(*) AS n_instances
FROM
    departures d
    INNER JOIN station_info si
        ON si.station_code = d.station_code
    INNER JOIN weather_hourly wh
        ON wh.location = si.weather_loc
        AND wh.date_time = DATE_TRUNC('hour', d.full_sched_dep_datetime)
WHERE
    wh.conditions = 'Clear'
    OR wh.conditions LIKE 'Rain%'
GROUP BY
    d.direction,
    d.station_code,
    wh.conditions
ORDER BY
    n_instances DESC,
    d.station_code,
    d.direction,
    wh.conditions;

 * postgresql://appuser:***@localhost:5432/amtrakproject
128 rows affected.


direction,station_code,avg_depart_diff,conditions,n_instances
Southbound,KIN,5,Clear,25952
Southbound,BAL,13,Clear,23520
Northbound,KIN,16,Clear,23450
Northbound,BAL,7,Clear,22069
Southbound,NLC,7,Clear,17816
Southbound,NYP,7,Clear,17317
Northbound,NLC,15,Clear,16139
Northbound,NYP,8,Clear,15969
Southbound,WIL,13,Clear,14304
Southbound,NCR,18,Clear,13516
