In [1]:
# basic imports
import pandas as pd
import numpy as np

# other
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('modules/')

# custom functions
import process
import route
from credentials import google_api

# plot imports
from bokeh.io import show

# DROMOS

This project focuses on finding the shortest routes in Manhattan for pretty much anyone who doesn't like hills. 
To use the code below, download the datasets from the attached links and insert your Google Dev API in a credentials.py file stored in the same directory as the notebook.

Using data from [NYC Open Data](https://data.cityofnewyork.us/Transportation/Elevation-points/szwg-xci6/data).

#### **Table of Contents**:
1. **Data Cleaning**:
We start off by reading our two dataframes, Elevation and Centerline, clean them and merge them together to create our final dataset. 
    - 1.1 **Reading Dataframe 1**: Elevation.
    - 1.2 **Reading Dataframe 2**: Centerline.
        - Bike Lane.
        - Coordinates.
        - Traffic Direction.
    - 1.3 **Merge**: Merging the two original dataframes.
        - Calculating the Distance.
        - Elevation for Missing Data.
        - Converting: Feet to Meters.
        - Save our Dataframe.
2. **Route Search**:
Our route search will take two addresses as input and will output a map displaying the shortest route between those two addresses. 
    - 2.1 **Input**: Our input has to be converted from addresses to coordinates.
        - Define Origin and find Closest Point
        - Define Destination and find Closest Point
    - 2.2 **Graph**: We need to run the algorithm to find the shortest path and display that on a map. 
        - Narrow down dataframe search 
        - Construct Graph with Shortest Path to every Point from Origin.
        - Find path from Origin to Destination.
        - Display Path on a Map
    
# 1. Data Cleaning

## 1.1 Elevation

In [2]:
df_ele = pd.read_csv('data/ELEVATION.csv')
df_ele.head(2)

Unnamed: 0,FEAT_CODE,ELEVATION,the_geom,SOURCE_ID,SUB_CODE,STATUS
0,3020,129.74,POINT (-73.98256951739029 40.70191431011495),21302000001,302000,Unchanged
1,3000,120.586263,POINT (-73.98777990215136 40.70192587201795),21300000002,300020,Unchanged


In [3]:
# we only select the values with subcode 300000 - road points
df_ele = df_ele[df_ele.SUB_CODE == 300000]
df_ele.reset_index(drop = True, inplace = True)

# parse out our coordinates
df_ele['longitude'], df_ele['latitude'] = process.coords_parser(df_ele.the_geom)

# round the values in our dataframe
df_ele = df_ele.round(8)

# turn our coordinates into a tuple
df_ele['coords_first'] = list(zip(df_ele.latitude, df_ele.longitude))

# rename elevation and select the only variables we need
df_ele['elevation'] = df_ele.ELEVATION
df_ele = df_ele[['coords_first', 'elevation']]

# creating a duplicate dataframe for later use
df_ele_last = df_ele[['elevation']]
df_ele_last['coords_last'] = df_ele.coords_first
df_ele_last = df_ele_last[['coords_last', 'elevation']]

In [4]:
df_ele.head(2)

Unnamed: 0,coords_first,elevation
0,"(40.70197472, -73.9867178)",30.089456
1,"(40.7020084, -73.97983484)",8.51156


## 1.2 Centerline

This dataset gives us information on traffic direction, bike lanes, traffic direction, street width and street name.

#### Dictionary

**Traffic Direction**: code indicating the flow of traffic relative to the street segment's address range.
1. FT - With
2. TF - Against
3. TW - Two-way
4. NV - Non-vehicular

**Coordinates**: our latitude and longitude variables indicate the first and last set of coordinates on each street.

**Full Street**: Street Name

**Physical ID**: unique ID for intersections (might not be necessary)

**Bike Lane**: binary variable s.t 1 = bike lane, 0 = no bike lane

**Street Width**: indicates street width in feet

In [5]:
df_cline = pd.read_csv('data/Centerline.csv')
df_cline.head(2)

Unnamed: 0,L_LOW_HN,the_geom,L_HIGH_HN,PHYSICALID,R_LOW_HN,R_HIGH_HN,L_ZIP,R_ZIP,L_BLKFC_ID,R_BLKFC_ID,...,PRE_MODIFI,PRE_DIRECT,PRE_TYPE,POST_TYPE,POST_DIREC,POST_MODIF,FULL_STREE,ST_NAME,BIKE_TRAFD,SHAPE_Leng
0,,MULTILINESTRING ((-73.87861544017795 40.861915...,,164809,,,10458.0,10458.0,0,0,...,,,,TRL,,,MITSUBISHI WILD WETLAND TRL,MITSUBISHI WILD WETLAND,,1026.077523
1,215-001,MULTILINESTRING ((-73.7729030190404 40.7778042...,215-027,6110,215-001,215-026,11360.0,11360.0,112261166,112262650,...,,,,AVE,,,28 AVE,28,,258.85974


In [6]:
# limit our data for Manhattan
df_cline = df_cline[df_cline.BOROCODE == 1]

# remove driveways
df_cline = df_cline[df_cline.FULL_STREE != 'DRIVEWAY']

#### Bikelane

In [7]:
# edit bike_lane variable s.t 1 = bike lane, 0 = no bike lane
df_cline['bike_lane'] = np.where(df_cline.BIKE_LANE.notnull(), 1, 0)

#### Coordinates

In [8]:
# parse out the coordinates from the_geom using our custom function
df_cline.reset_index(drop=True, inplace=True)
df_cline['lat_f'], df_cline['lon_f'], df_cline['lat_l'], df_cline['lon_l'] = process.coords_parser(df_cline.the_geom)
df_cline = df_cline.round(8)

# zip the coordinates together as tuples
df_cline['coords_first'] = list(zip(df_cline.lat_f, df_cline.lon_f))
df_cline['coords_last'] = list(zip(df_cline.lat_l, df_cline.lon_l))

# filter out observations which point to the same coordinates
df_cline = df_cline[df_cline.coords_first != df_cline.coords_last]

# rename some of our variables
df_cline['full_street'] = df_cline.FULL_STREE
df_cline['street_width'] = df_cline.ST_WIDTH
df_cline['traffic_dir'] = df_cline.TRAFDIR

# select and rearrange our variables of interest
df_cline.reset_index(drop = True, inplace = True)
df_cline = df_cline[['coords_first', 'coords_last', 'full_street', 'street_width', 'traffic_dir', 'bike_lane']]

#### Traffic Direction

We would like to add the reverse direction for two-way streets to our dataframe.

In [9]:
# create a new dataset made up of all of our datapoints with either two way streets or streets facing the other way
cline_rev = df_cline[(df_cline.traffic_dir == 'TF') | (df_cline.traffic_dir == 'TW')]
# create a placeholder for our coordinates and then switch them
cline_rev['_'] = cline_rev.coords_first
cline_rev['coords_first'] = cline_rev.coords_last
cline_rev['coords_last'] = cline_rev._
cline_rev.drop('_', axis = 1, inplace = True)
# reverse the direction
cline_rev.traffic_dir = np.where(cline_rev.traffic_dir == 'TF', 'FT', cline_rev.traffic_dir)

# append our new dataset of reversed directions
df_cline = df_cline.append(cline_rev)

# remove all duplicates
df_cline = df_cline[df_cline.traffic_dir != 'TF']
# specify direction for all datapoints
df_cline['traffic_dir'] = np.where(df_cline.traffic_dir == 'TW', 'FT', df_cline.traffic_dir)

# drop traffic direction as we are no longer interested in it
df_cline.drop('traffic_dir', axis = 1, inplace = True)
df_cline.reset_index(drop = True, inplace = True)

In [10]:
df_cline.head(2)

Unnamed: 0,coords_first,coords_last,full_street,street_width,bike_lane
0,"(40.70454591, -74.01006968)","(40.70477525, -74.00974639)",STONE ST,14,0
1,"(40.84671052, -73.93191507)","(40.84680263, -73.9318501)",AMSTERDAM AVE,60,1


In [11]:
print('Centerline data has {} datapoints and we have {} datapoints on elevation.'.format(df_cline.shape[0], df_ele.shape[0]))

Centerline data has 15848 datapoints and we have 376575 datapoints on elevation.


## 1.3 Merge

At the moment our elevation dataframe, df_ele, stores information on the elevation for a set of coordinates. Our other dataframe, df_cline, stores information for the distance between two sets of coordinates. To proceed with our project, we need to merge the two together, and eventually calculate the elevation difference between the two sets of coordinates. 

In [12]:
# merging our dataframes
df = df_cline.merge(df_ele, on = 'coords_first', how = 'left')
df = df.merge(df_ele_last, on = 'coords_last', how = 'left')

#### Measuring the distance

To find the distance between our two set of coordinates, we use the custom function 'meters_dist' -- we apply that to our dataset to construct the 'distance' variable.

In [13]:
# creating the new distance variable -- represents the distance between our two coordinates in meters
df['distance'] = df.apply(process.meters_dist, axis = 1)

# sorting our values in ascending order and resetting the index
df.sort_values(by = 'coords_first', ascending = True, inplace = True)
df.reset_index(drop = True, inplace = True)

#### Elevation: missing data

For a subset of our df_cline dataset, we don't have the equivalent set of coordinates in the elevation dataset so we will find the closest one. Once we do that, we will add the corresponding elevation to our dataframe.

In [14]:
df.loc[df.elevation_x.isnull(), 'elevation_x'] = df[df.elevation_x.isnull()].coords_first.apply(process.closest_elevation, args=(df_ele,))

In [21]:
df.loc[df.elevation_y.isnull(), 'elevation_y'] = df[df.elevation_y.isnull()].coords_last.apply(process.closest_elevation, args=(df_ele,))

#### Elevation: Feet to Meters

Our elevation data is in feet, so we would like to convert it to meters.

In [22]:
df['elevation_x'] = df.elevation_x * 0.3048
df['elevation_y'] = df.elevation_y * 0.3048

#### Elevation: Difference

In [23]:
# calculate the elevation difference
df['elevation_diff'] = df.elevation_y - df.elevation_x

#### Elevation: Steepness

In [24]:
df['steep'] = abs(df.elevation_diff) / df.distance

In [25]:
df.tail(2)

Unnamed: 0,coords_first,coords_last,full_street,street_width,bike_lane,elevation_x,elevation_y,distance,elevation_diff,steep
15846,"(40.87848113, -73.92535763)","(40.87888837, -73.92490327)",AMTRAK BRG,0,0,18.858403,18.858403,59.30982,0.0,0.0
15847,"(40.87903805, -73.91033194)","(40.87773675, -73.91200665)",TIBBETT AVE,38,0,2.840004,2.518014,202.122844,-0.321991,0.001593


In [30]:
df.to_pickle('data/dataframe.pkl')

# 2. Route Search

Once our data is finally ready -- we will start setting up our program to find the optimal route between two points by using **Dijkstra's** algorithm. 

Route Search:
- Input: Starting Address + Target Address
- Output: Map outlying the ideal route.

**Steps**:
1. Enter the input as an address, convert that to coordinates. 
2. Construct the graph outlying shortest path from origin to other points in dataframe using Dijkstra's.
3. Find the path from the origin to the destination.
4. Display the path on a map.


In [14]:
df = pd.read_pickle('data/dataframe.pkl')

In [15]:
def parser(x, i):
    return x.split('(')[-1].split(')')[0].split()[i].split(',')[0]

# our set of coordinates are loaded as strings so we convert them to ints
df['first_x'] = pd.to_numeric(df.coords_first.apply(parser, args=(0,)))
df['first_y'] = pd.to_numeric(df.coords_first.apply(parser, args=(1,)))
df['last_x'] = pd.to_numeric(df.coords_last.apply(parser, args=(0,)))
df['last_y'] = pd.to_numeric(df.coords_last.apply(parser, args=(1,)))
                              
df['coords_first'] = list(zip(df.first_x, df.first_y))
df['coords_last'] = list(zip(df.last_x, df.last_y))

In [16]:
df['actual_distance'] = df.distance

# construct a new distance variable that also takes the steepness into account
df['distance'] = df.actual_distance * round(df.steep + 1)

In [17]:
df.head(2)

Unnamed: 0,coords_first,coords_last,full_street,street_width,bike_lane,elevation_x,elevation_y,distance,elevation_diff,steep,first_x,first_y,last_x,last_y,actual_distance
0,"(40.67984161, -74.01984812)","(40.69244388, -74.00685151)",MARTHA'S VYD HIGHLANDS FERRY RTE,0,0,1.863303,1.734286,1780.912448,-0.129018,7.2e-05,40.679842,-74.019848,40.692444,-74.006852,1780.912448
1,"(40.67986109, -74.01941948)","(40.67974306, -74.0195594)",IKEA WALL STREET FERRY RTE,0,0,1.863303,1.863303,17.668083,0.0,0.0,40.679861,-74.019419,40.679743,-74.019559,17.668083


## Step 1

We input our origin and destination addresses in the coords_finder function we constructed to find each location's coordinates.

In [18]:
beginning = route.coords_finder('Columbia University New York', google_api)
beginning = (round(beginning[0], 4), round(beginning[1], 4))
if not df[df.coords_first == beginning].any()[0]:
    beginning = route.closest_point(beginning, list(df.coords_first))

end = route.coords_finder('Levain Cookies Upper West Side', google_api)
end = (round(end[0], 4), round(end[1], 4))
if not df[df.coords_last == end].any()[1]:
    end = route.closest_point(end, list(df.coords_last))

## Step 2

We pass the corresponding coordinates as arguments to our d_graph function that uses Dikjstra's to find the shortest path to every point from our origin.

In [19]:
result = route.d_graph(beginning, end, df)
result.head(2)

Unnamed: 0,vertex,short_distance,prev_vertex
0,"(40.80796161, -73.96380137)",0.0,
1,"(40.80804636, -73.96399728)",19.012837,"(40.80796161, -73.96380137)"


## Step 3

We retrace our algorithm to find the shortest path to our destination. 

In [25]:
path = route.route_construction(result, beginning, end)
df_route = pd.Series(path)
df_route = df_route.apply(pd.Series)
df_route.head(2)

Unnamed: 0,0,1
0,40.779925,-73.980674
1,40.780235,-73.981409


## Step 4

Displaying the path:

In [26]:
plot = route.map_plot(df_route, google_api)
show(plot)