# Data Transformation
The purpose of this notebook is to transform the raw waze.csv data into a format matching our data schema.

## Load Raw Data

In [1]:
import datetime
import numpy as np
import pandas as pd
import psycopg2 as pg

In [2]:
# read password file
postgres_password = open('postgres_password.txt', 'r').read()

# replace database inputs as appropriate
conn_str = "host={} dbname={} user={} password={}".format(
    'localhost', 'waze', 'postgres', postgres_password)

conn = pg.connect(conn_str)

In [3]:
waze_raw_df = pd.read_sql('select id, uuid, waze_timestamp, street, \
                          start_node, end_node, city, length, delay, \
                          speed, level, road_type, geom, \
                          ST_AsText(geom) as linestring, \
                          ST_NumPoints(geom) as linestring_length \
                          from congestion', con=conn)

In [4]:
waze_raw_df.columns

Index([u'id', u'uuid', u'waze_timestamp', u'street', u'start_node',
       u'end_node', u'city', u'length', u'delay', u'speed', u'level',
       u'road_type', u'geom', u'linestring', u'linestring_length'],
      dtype='object')

## Create Processed DataFrame with Additional Columns

In [5]:
# make a copy of waze_raw_df
waze_processed_df = waze_raw_df.copy()

In [6]:
# extract day of week, date, time, and timestamp rounded to 15 minute interval
waze_processed_df['dow'] = waze_processed_df['waze_timestamp'].dt.dayofweek
waze_processed_df['month'] = waze_processed_df['waze_timestamp'].dt.month
waze_processed_df['date'] = waze_processed_df['waze_timestamp'].dt.date
waze_processed_df['time'] = waze_processed_df['waze_timestamp'].dt.time
waze_processed_df['timestamp_round'] = waze_processed_df['waze_timestamp'].apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour,15*(dt.minute // 15)))
waze_processed_df['time_round'] = waze_processed_df['timestamp_round'].dt.time

In [7]:
# TODO - add is_weekend, is_holiday, is_rushhour
# NOTE:  I think Ken calculated these in his timeseries exploration notebook
waze_processed_df['is_weekend'] = None
waze_processed_df['is_holiday'] = None
waze_processed_df['is_rushhour'] = None

In [8]:
waze_processed_df.head()

Unnamed: 0,id,uuid,waze_timestamp,street,start_node,end_node,city,length,delay,speed,...,linestring_length,dow,month,date,time,timestamp_round,time_round,is_weekend,is_holiday,is_rushhour
0,912774,2146712834,2017-04-14 14:16:35,Poway Rd,,Pomerado Rd,"Poway, CA",2275,776,2.541667,...,41,4,4,2017-04-14,14:16:35,2017-04-14 14:15:00,14:15:00,,,
1,912775,3218080,2017-04-14 14:16:35,I-15 N,,I-15 N,"San Diego, CA",6970,189,17.219444,...,59,4,4,2017-04-14,14:16:35,2017-04-14 14:15:00,14:15:00,,,
2,912776,2145691065,2017-04-14 14:16:35,Paseo Delicias,,El Montevideo,"Rancho Santa Fe, CA",441,97,3.488889,...,5,4,4,2017-04-14,14:16:35,2017-04-14 14:15:00,14:15:00,,,
3,912777,2367323,2017-04-14 14:16:35,Camino Santa Fe,,,"San Diego, CA",277,90,2.230556,...,8,4,4,2017-04-14,14:16:35,2017-04-14 14:15:00,14:15:00,,,
4,912778,1022642,2017-04-14 14:21:22,Orange Ave,,4th St,"Coronado, CA",532,235,1.9,...,4,4,4,2017-04-14,14:21:22,2017-04-14 14:15:00,14:15:00,,,


## Create Time Table

In [9]:
time_cols = ['waze_timestamp','timestamp_round','date','time','time_round','dow','month','is_weekend','is_holiday','is_rushhour']
time_df = waze_processed_df.loc[:,time_cols].drop_duplicates().sort_values(by='waze_timestamp').reset_index()
time_df.drop('index', axis=1, inplace=True)
time_df['time_id'] = time_df.index + 1
time_df.head()

Unnamed: 0,waze_timestamp,timestamp_round,date,time,time_round,dow,month,is_weekend,is_holiday,is_rushhour,time_id
0,2017-02-08 16:12:54,2017-02-08 16:00:00,2017-02-08,16:12:54,16:00:00,2,2,,,,1
1,2017-02-08 16:16:29,2017-02-08 16:15:00,2017-02-08,16:16:29,16:15:00,2,2,,,,2
2,2017-02-08 16:21:14,2017-02-08 16:15:00,2017-02-08,16:21:14,16:15:00,2,2,,,,3
3,2017-02-08 16:27:11,2017-02-08 16:15:00,2017-02-08,16:27:11,16:15:00,2,2,,,,4
4,2017-02-08 16:31:57,2017-02-08 16:30:00,2017-02-08,16:31:57,16:30:00,2,2,,,,5


## Create Segment Table

In [10]:
# helper function
def extract_segments(ls):
    '''function to extract individual lonlat segments from a linestring'''
    ls_0 = ls.split('(')[-1:][0]
    lonlats_str = ls_0[:-1].split(',')
    lons = [float(ll.split()[0]) for ll in lonlats_str]
    lats = [float(ll.split()[1]) for ll in lonlats_str]
    lonlats = list(zip(lons, lats))
    segments = [(lonlats[i],lonlats[i+1]) for i in range(len(lonlats)-1)]
    return segments

In [11]:
# extract segments from linestrings
linestrings = np.array(waze_processed_df['linestring'].values)
waze_segments = map(extract_segments, linestrings)

In [12]:
# check to make sure lengths align
segment_lengths = np.array(map(len, waze_segments)) + 1
linestring_lengths = np.array(waze_raw_df['linestring_length'].values)

print 'should be zero: {}'.format(np.average(linestring_lengths - segment_lengths))

should be zero: 0.0


In [13]:
# add segments to processed dataframe
waze_processed_df['segments'] = waze_segments

In [14]:
# check random rows to make sure linestrings and segments match
nonmatching_row_count = 0
for random_row in np.random.choice(waze_processed_df.index.values, size=1000, replace=False):
    rand_linestring = waze_processed_df.iloc[random_row]['linestring']
    rand_segments = waze_processed_df.iloc[random_row]['segments']
    if rand_segments == extract_segments(rand_linestring):
        continue
    else:
        nonmatching_row_count+=1
        print 'row {} does not match'.format(random_row)

print 'nonmatching rows: {}'.format(nonmatching_row_count)

nonmatching rows: 0


In [15]:
# get list of individual segments
segments_list = waze_processed_df['segments'].values

In [16]:
# flatten list
flat_segments_list = [segment for segments in segments_list for segment in segments]

In [17]:
# get unique segments
unique_segments = np.array(list(set(flat_segments_list)))

In [18]:
len(unique_segments)

174604

In [19]:
unique_segments

array([[[-117.220002,   32.875825],
        [-117.219893,   32.875811]],

       [[-116.978804,   32.616346],
        [-116.978212,   32.614897]],

       [[-117.029244,   33.005363],
        [-117.029237,   33.005589]],

       ..., 
       [[-117.079921,   32.641281],
        [-117.080565,   32.642981]],

       [[-117.127177,   32.747425],
        [-117.127169,   32.748075]],

       [[-117.039935,   32.782489],
        [-117.040041,   32.7824  ]]])

In [20]:
# create dict for dataframe
segments_dict = {
    'segment_id': np.array(range(len(unique_segments))) + 1,
    'segment': [s for s in unique_segments],
    'lat1': [s[0][1] for s in unique_segments],
    'lon1': [s[0][0] for s in unique_segments],
    'lat2': [s[1][1] for s in unique_segments],
    'lon2': [s[1][0] for s in unique_segments]
}

In [21]:
# create segment dataframe
segment_df = pd.DataFrame(segments_dict)

In [22]:
# TODO - add additional columns
segment_df['street'] = None
segment_df['city'] = None
segment_df['road_type'] = None
segment_df['geom'] = None
segment_df['length'] = None

In [23]:
segment_df.head()

Unnamed: 0,lat1,lat2,lon1,lon2,segment,segment_id,street,city,road_type,geom,length
0,32.875825,32.875811,-117.220002,-117.219893,"[[-117.220002, 32.875825], [-117.219893, 32.87...",1,,,,,
1,32.616346,32.614897,-116.978804,-116.978212,"[[-116.978804, 32.616346], [-116.978212, 32.61...",2,,,,,
2,33.005363,33.005589,-117.029244,-117.029237,"[[-117.029244, 33.005363], [-117.029237, 33.00...",3,,,,,
3,32.896659,32.896772,-117.190593,-117.190444,"[[-117.190593, 32.896659], [-117.190444, 32.89...",4,,,,,
4,32.778413,32.778399,-116.985483,-116.985316,"[[-116.985483, 32.778413], [-116.985316, 32.77...",5,,,,,


## Create UUID Table

In [24]:
# TODO

## Create Time/Segment/UUID Mapping Table

In [25]:
# TODO

## Write DataFrames to csv

In [26]:
# TODO