In [1]:
import json
import pandas as pd
import psycopg2 as pg
import datetime as dt

### set query inputs
Helpful link for looking up latitude/longitude coordinates: https://www.latlong.net/

In [2]:
# Inputs
# Coordinates are strings of the form "<lat>° N, <lon>° W"; the helper functions
# in this notebook strip exactly those suffixes and negate the longitude.
input_radius = 0.5 # radius in miles around input_poi (used by the sql_radius filter)
input_poi = "32.7077° N, 117.1569° W" # padres stadium
input_nw_corner = "32.725064° N, 117.176791° W" # north-west corner of the bounding box
input_se_corner = "32.707094° N, 117.146235° W" # south-east corner of the bounding box
input_street = None # filter to specific street (None = no street filter)

### get data from postgres

In [3]:
# Database connection -- replace the inputs below as appropriate.
# The password is read from a local file so it never appears in the notebook;
# do not paste credentials into cells or comments.
postgres_password_path = '/Users/joshwilson/Documents/DSE/capstone-cohort3-group3/Team/Josh/notebooks/postgres_password.txt'

# `with` closes the file handle; strip() drops the trailing newline that
# editors usually append to the file.
with open(postgres_password_path, 'r') as password_file:
    postgres_password = password_file.read().strip()

conn_str = "host={} dbname={} user={} password={}".format(
    'localhost', 'waze_schema', 'postgres', postgres_password)

conn = pg.connect(conn_str)

In [4]:
def mile_to_meter(mile):
    """Convert a distance in miles to whole meters.

    The result is rounded to the nearest integer and returned as a string.
    """
    meters_per_mile = 1609.344
    whole_meters = int(round(mile * meters_per_mile))
    return '{}'.format(whole_meters)

def bounding_box(nw, se):
    """Build a SQL WHERE fragment selecting geometries inside a bounding box.

    nw, se: corner strings of the form "<lat>° N, <lon>° W". The "° N" / "° W"
    suffixes are stripped and each longitude is prefixed with '-' in the output.

    Returns a string combining a `geom @ ST_MakeEnvelope (...)` test with
    `ST_Length(geom) > 0`.
    """
    north_west, south_east = [
        corner.replace("° N", "").replace("° W", "").split(', ')
        for corner in (nw, se)
    ]
    template = "geom @ ST_MakeEnvelope (-{}, {}, -{}, {}) and ST_Length(geom) > 0"
    return template.format(north_west[1], north_west[0], south_east[1], south_east[0])

def sql_radius(google_lat_lon, mile):
    """Build a SQL WHERE fragment selecting geometries within a radius of a point.

    google_lat_lon: string of the form "<lat>° N, <lon>° W"; the suffixes are
        stripped and the longitude is prefixed with '-' in the output.
    mile: radius in miles; converted to whole meters with mile_to_meter.

    Returns an `ST_DWithin(geom, ST_MakePoint(...)::geography,<meters>)` string.
    """
    parts = google_lat_lon.replace("° N", "").replace("° W", "").split(', ')
    return "ST_DWithin(geom, ST_MakePoint(-{},{})::geography,{})".format(
        parts[1], parts[0], mile_to_meter(mile))

In [5]:
# sanity check: preview the bounding-box WHERE clause built from the inputs
bounding_box(input_nw_corner, input_se_corner)

'geom @ ST_MakeEnvelope (-117.176791, 32.725064, -117.146235, 32.707094) and ST_Length(geom) > 0'

In [6]:
# set input_poi, input_radius, input_street at top of notebook

select_attributes = "segment_id, street, road_type, ST_AsGeoJSON(geom) AS geometry"

# Spatial filter: bounding box. To filter by radius around the point of
# interest instead, use: where_conditions = sql_radius(input_poi, input_radius)
where_conditions = bounding_box(input_nw_corner, input_se_corner)

# Optional street filter. Single quotes inside the name are doubled so that a
# street such as "O'Farrell St" yields a valid SQL string literal.
if input_street is None:
    where_conditions_street = ""
else:
    where_conditions_street = " AND street = '" + input_street.replace("'", "''") + "'"

segments_sql = "SELECT " + select_attributes + " FROM segments WHERE " + where_conditions + where_conditions_street
segments_sql

'SELECT segment_id, street, road_type, ST_AsGeoJSON(geom) AS geometry FROM segments WHERE geom @ ST_MakeEnvelope (-117.176791, 32.725064, -117.146235, 32.707094) and ST_Length(geom) > 0'

In [7]:
# Load the segments matched by segments_sql
segments_df = pd.read_sql(segments_sql, conn)
print("{} rows".format(len(segments_df)))
segments_df.head()

1608 rows


Unnamed: 0,segment_id,street,road_type,geometry
0,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
2,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
3,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
4,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682..."


In [8]:
# Time slices on all days of week (5 minute intervals)
time_sql = 'select time_id, date, time, day_of_week from time;'
time_df = pd.read_sql(time_sql, conn)
print("{} rows".format(len(time_df)))
time_df.head()

36000 rows


Unnamed: 0,time_id,date,time,day_of_week
0,1,2017-02-08,00:00:00,4
1,2,2017-02-08,00:05:00,4
2,3,2017-02-08,00:10:00,4
3,4,2017-02-08,00:15:00,4
4,5,2017-02-08,00:20:00,4


In [9]:
# Cartesian product of segments and times.
# Both frames get a constant 'tmp' key; merging on it pairs every time slice
# with every segment. The 'tmp' column is kept in the merged result.
for frame in (time_df, segments_df):
    frame['tmp'] = 1

segments_time_df = time_df.merge(segments_df, how='outer', on='tmp')
print("{} rows".format(len(segments_time_df)))
segments_time_df.head()

57888000 rows


Unnamed: 0,time_id,date,time,day_of_week,tmp,segment_id,street,road_type,geometry
0,1,2017-02-08,00:00:00,4,1,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
1,1,2017-02-08,00:00:00,4,1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
2,1,2017-02-08,00:00:00,4,1,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
3,1,2017-02-08,00:00:00,4,1,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682..."
4,1,2017-02-08,00:00:00,4,1,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682..."


In [10]:
# Congestion level for each congestion event, joined with its segment and time information
level_sql = '''select m.segment_id, m.time_id, s.street, t.date, t.time, t.day_of_week, u.level
                        from matrix m, time t, uuid u, segments s
                        where m.time_id = t.time_id and m.uuid_instance_id = u.uuid_instance_id and s.segment_id = m.segment_id'''
level_df = pd.read_sql(level_sql, conn)

print("{} rows".format(len(level_df)))
level_df.head()

15146101 rows


Unnamed: 0,segment_id,time_id,street,date,time,day_of_week,level
0,13407,4830,E Palomar St,2017-02-24,18:25:00,6,2
1,13407,16631,E Palomar St,2017-04-06,17:50:00,5,2
2,13407,20643,E Palomar St,2017-04-20,16:10:00,5,3
3,13407,26293,E Palomar St,2017-05-10,07:00:00,4,2
4,13407,30759,E Palomar St,2017-05-25,19:10:00,5,2


In [11]:
# Left join level onto the cartesian product of all time slices and segments of interest.
# A (segment_id, time_id) pair that appears more than once in level_df produces
# more than one row here, so the result can have more rows than segments_time_df.
level_columns = level_df[['segment_id', 'time_id', 'level']]
segments_time_level_df = segments_time_df.merge(
    level_columns, how='left', on=['segment_id', 'time_id'])
print("{} rows".format(len(segments_time_level_df)))
segments_time_level_df.head()

57906628 rows


Unnamed: 0,time_id,date,time,day_of_week,tmp,segment_id,street,road_type,geometry,level
0,1,2017-02-08,00:00:00,4,1,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",
1,1,2017-02-08,00:00:00,4,1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682...",
2,1,2017-02-08,00:00:00,4,1,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",
3,1,2017-02-08,00:00:00,4,1,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",
4,1,2017-02-08,00:00:00,4,1,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",


In [12]:
# Replace nulls with 0 for assumption of no congestion.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: the in-place form acts on an intermediate object
# and leaves the frame unchanged when pandas Copy-on-Write is active.
segments_time_level_df['level'] = segments_time_level_df['level'].fillna(0)
segments_time_level_df.head()

Unnamed: 0,time_id,date,time,day_of_week,tmp,segment_id,street,road_type,geometry,level
0,1,2017-02-08,00:00:00,4,1,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0
1,1,2017-02-08,00:00:00,4,1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0
2,1,2017-02-08,00:00:00,4,1,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0
3,1,2017-02-08,00:00:00,4,1,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0
4,1,2017-02-08,00:00:00,4,1,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0


### add is_traffic binary indicator

In [13]:
# is_traffic is the row-wise minimum of 'tmp' and 'level'. 'tmp' was set to 1
# for the cross join, so any level of 1 or more is capped at 1.
capped_level = segments_time_level_df[['tmp', 'level']].min(axis=1)
segments_time_level_df['is_traffic'] = capped_level
segments_time_level_df.head()

Unnamed: 0,time_id,date,time,day_of_week,tmp,segment_id,street,road_type,geometry,level,is_traffic
0,1,2017-02-08,00:00:00,4,1,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0
1,1,2017-02-08,00:00:00,4,1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0
2,1,2017-02-08,00:00:00,4,1,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0
3,1,2017-02-08,00:00:00,4,1,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0
4,1,2017-02-08,00:00:00,4,1,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0


### add time interval

In [14]:
# add column with interval time
time_interval = 30  # interval width in minutes


def _floor_to_interval(t):
    """Return t rounded down to the start of its time_interval-minute window."""
    floored_minute = time_interval * (t.minute // time_interval)
    return dt.time(t.hour, floored_minute, 0)


segments_time_level_df['time_interval'] = segments_time_level_df['time'].apply(_floor_to_interval)

In [15]:
# preview the frame with the new time_interval column
segments_time_level_df.head()

Unnamed: 0,time_id,date,time,day_of_week,tmp,segment_id,street,road_type,geometry,level,is_traffic,time_interval
0,1,2017-02-08,00:00:00,4,1,295,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0,00:00:00
1,1,2017-02-08,00:00:00,4,1,9132,India St,7,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0,00:00:00
2,1,2017-02-08,00:00:00,4,1,11944,W Cedar St,2,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0,00:00:00
3,1,2017-02-08,00:00:00,4,1,296,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0,00:00:00
4,1,2017-02-08,00:00:00,4,1,17326,W Beech St,1,"{""type"":""LineString"",""coordinates"":[[-117.1682...",0.0,0.0,00:00:00


### group data and calculate geojson properties

In [16]:
# For each segment, date and time interval, take the max of is_traffic:
# the result is 1 if any time slice inside the interval had traffic on that date.
date_interval_keys = ['segment_id', 'date', 'day_of_week', 'time_interval']
grouped_by_date = (
    segments_time_level_df
    .groupby(date_interval_keys, as_index=False)['is_traffic']
    .max()
)

In [17]:
# Average the per-date indicator over each day of week to get the share of
# days (a fraction between 0 and 1) with traffic in each time interval,
# and name that column pct_traffic.
grouped_by_dow_pct = (
    grouped_by_date
    .groupby(['segment_id', 'day_of_week', 'time_interval'], as_index=False)['is_traffic']
    .mean()
    .rename(columns={'is_traffic': 'pct_traffic'})
)

# check head
grouped_by_dow_pct.head()

Unnamed: 0,segment_id,day_of_week,time_interval,pct_traffic
0,295,1,00:00:00,0.0
1,295,1,00:30:00,0.0
2,295,1,01:00:00,0.0
3,295,1,01:30:00,0.0
4,295,1,02:00:00,0.0


In [18]:
# Every interval start time in a day, so no interval is left out downstream
interval_starts = [
    dt.time(hour, minute, 0)
    for hour in range(24)
    for minute in range(0, 60, time_interval)
]
ts_df = pd.DataFrame({'time_interval': interval_starts})

# day-of-week lookup: keys are the strings '1'..'7', with '1' = Sunday
day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
dow_dict = {str(number): name for number, name in enumerate(day_names, 1)}

In [19]:
# function to extract segment data by day of week
def extract_segment_data(seg):
    """Collect pct_traffic per time interval for one segment, grouped by weekday.

    seg: segment_id to look up in grouped_by_dow_pct.

    Returns a dict keyed by day name (the values of dow_dict). Each value is
    {'pct_traffic': [...]} with one entry per row of ts_df, in ts_df order;
    intervals with no data for that segment and day are filled with 0.
    """
    # Filter to this segment once instead of rescanning the full table for every day.
    seg_df = grouped_by_dow_pct.loc[
        grouped_by_dow_pct['segment_id'] == seg,
        ['day_of_week', 'time_interval', 'pct_traffic']]

    # dow_dict keys are strings; compare as strings so the match also works
    # if the day_of_week column holds integers rather than text.
    seg_dow_keys = seg_df['day_of_week'].astype(str)

    seg_data_dict = {}
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for dow, dow_name in dow_dict.items():
        # extract pct_traffic for given segment and dow
        seg_dow_df = seg_df.loc[seg_dow_keys == str(dow), ['time_interval', 'pct_traffic']]

        # join with all timestamps to make sure none are left out
        seg_dow_df = (
            ts_df.merge(seg_dow_df, how='left', on='time_interval')
            .drop('time_interval', axis=1)
            .fillna(0)
        )
        seg_data_dict[dow_name] = seg_dow_df.to_dict(orient='list')
    return seg_data_dict

In [20]:
# spot check: the result for one segment is keyed by day-of-week name
extract_segment_data(3596).keys()

['Monday', 'Tuesday', 'Friday', 'Wednesday', 'Thursday', 'Sunday', 'Saturday']

### create geojson dict

In [21]:
import ast

In [22]:
# The geometry column holds GeoJSON text; ast.literal_eval turns that text
# into a dict so it is not carried along as a string.
sample_geometry = segments_df.loc[segments_df['segment_id'] == 3596, 'geometry'].values[0]
ast.literal_eval(sample_geometry)

{'coordinates': [[-117.149432, 32.708365], [-117.149451, 32.70941]],
 'type': 'LineString'}

In [23]:
def _to_native(value):
    """Return a numpy scalar as the equivalent built-in Python value; pass others through."""
    return value.item() if hasattr(value, 'item') else value


# create dict to fill in with geojson
geojson_dict = {
    'type': 'FeatureCollection',
    'features': []
}

# One row per segment_id (first occurrence), indexed for direct lookup so
# segments_df is not re-filtered several times for every segment.
segments_by_id = segments_df.drop_duplicates(subset='segment_id').set_index('segment_id')

# fill geojson
for seg in segments_time_level_df['segment_id'].unique():
    seg_row = segments_by_id.loc[seg]
    features_dict = {
        'type': 'Feature',
        # the geometry column holds GeoJSON text; parse it into a dict
        'geometry': ast.literal_eval(seg_row['geometry']),
        'properties': {
            # built-in ints keep the dict JSON-serializable
            'segment_id': _to_native(seg),
            'street': seg_row['street'],
            'road_type': _to_native(seg_row['road_type']),
            'data': extract_segment_data(seg)
        }
    }
    geojson_dict['features'].append(features_dict)

### write geojson dict to geojson and js file

In [24]:
# serialize the geojson dict and write it to the data folder
with open('../data/segments.geojson', 'w') as geojson_file:
    geojson_file.write(json.dumps(geojson_dict))

In [25]:
# write geojson as js file to js folder
# json.dumps emits a valid JavaScript literal. str() would emit a Python repr
# instead (None, nan, u'' prefixes, byte escapes), which JavaScript may
# reject or misread.
geojson_prefix = 'var segments_geojson = '
with open('../js/segments_geojson.js', 'w') as f:
    f.write(geojson_prefix + json.dumps(geojson_dict))

### create timestamp lookup js file

In [26]:
# function to format string of type 'hh:mm:ss' to 'h:mm a.m.' / 'h:mm p.m.'
def ts_conversion(ts_str):
    """Convert a 24-hour 'hh:mm:ss' string to a 12-hour clock label.

    The hour is not zero-padded and midnight/noon are shown as 12,
    e.g. '00:30:00' -> '12:30 a.m.' and '13:00:00' -> '1:00 p.m.'.
    """
    hour_24 = int(ts_str[:2])
    minutes = ts_str[3:5]
    suffix = 'p.m.' if hour_24 >= 12 else 'a.m.'
    hour_12 = hour_24 % 12 or 12
    return '{}:{} {}'.format(hour_12, minutes, suffix)

In [27]:
# convert time_interval to display strings, keyed by ts_df row position.
# ts_df itself is left untouched: extract_segment_data merges on its
# time_interval values, so overwriting them with strings would break that
# merge if earlier cells were run again.
timestamp_labels = ts_df['time_interval'].astype(str).map(ts_conversion)
timestamp_dict = timestamp_labels.to_dict()

In [28]:
# write timestamp_lookup as js file to js folder
ts_prefix = 'var timestamp_lookup = '
with open('../js/timestamp_lookup.js', 'w') as lookup_file:
    lookup_file.write('{}{}'.format(ts_prefix, timestamp_dict))

### write boundingbox to js file

In [29]:
# write bounding box bounds as js file to js folder
bb_prefix = 'var bounds = '


def _corner_to_lat_lon(corner):
    """Parse a '<lat>° N, <lon>° W' string into [lat, lon] floats, longitude negated."""
    parts = corner.replace("° N", "").replace("° W", "").split(', ')
    parts[1] = '-' + parts[1]  # add negative
    # a real list (not a lazy map object) so the values can be indexed below
    return [float(part) for part in parts]


nw = _corner_to_lat_lon(input_nw_corner)
se = _corner_to_lat_lon(input_se_corner)

bounds = [[nw[0], nw[1]], [se[0], se[1]]]

with open('../js/boundingbox.js', 'w') as f:
    f.write(bb_prefix + str(bounds))

### create colormaps
Adapted from https://stackoverflow.com/questions/33596491/extract-matplotlib-colormap-in-hex-format

from pylab import *

cmaps = ['Reds','YlOrBr','magma_r','inferno_r','Greys','OrRd','hot_r']

# Sample each named colormap at 9 levels and store the colors as hex strings,
# keyed by level index.
cmaps_dict = {}
for c in cmaps:
    cmap = cm.get_cmap(c, 9)
    cmap_dict = {}
    for i in range(cmap.N):
        rgb = cmap(i)[:3] # will return rgba, we take only first 3 so we get rgb
        # keep the hex value as text; encoding it to bytes would be written
        # out as b'#...' on Python 3
        cmap_dict[i] = matplotlib.colors.rgb2hex(rgb)
    cmaps_dict[c] = cmap_dict

# write colormaps as js file to js folder
# json.dumps emits a valid JavaScript object literal (integer keys become strings)
cm_prefix = 'var colormaps = '
with open('../js/colormaps.js', 'w') as f:
    f.write(cm_prefix + json.dumps(cmaps_dict))