In [1]:
import pandas as pd
import numpy as np
from dromos_functions import coords_finder, ele_finder, d_graph, route_construction, map_plot, closest_point, match_value, meters_dist
from bokeh.io import output_file, output_notebook, show

import warnings
warnings.filterwarnings('ignore')

from scipy.spatial.distance import cdist

# DROMOS

This project focuses on finding the shortest routes in Manhattan for cyclists/pedestrians. 
To use the code below, download the datasets from the attached links and insert your Google Dev API in the code block below. 

Using data from [NYC Open Data](https://data.cityofnewyork.us/Transportation/Elevation-points/szwg-xci6/data).

In [14]:
google_api = '___'

#### **Table of Contents**:
1. Data Cleaning.
    - 1.1 Reading Dataframe 1: Elevation.
    - 1.2 Reading Dataframe 2: Centerline.
        - Bike Lane.
        - Coordinates.
        - Traffic Direction.
    - 1.3 Merge: Merging the two original dataframes.
        - Calculating the Distance.
        - Elevation for Missing Data.
        - Converting: Feet to Meters.
        - Save our Dataframe.
2. Route Search.
    - 2.1 Input
        - Define Origin and find Closest Point
        - Define Destination and find Closest Point
    - 2.2 Graph 
        - Narrow down dataframe search 
        - Construct Graph with Shortest Path to every Point from Origin.
        - Find path from Origin to Destination.
        - Display Path on a Map
    
# 1. Data Cleaning

## 1.1 Elevation

In [2]:
df = pd.read_csv('ELEVATION.csv')

# we only select the values with subcode 300000 - road points
df = df[df.SUB_CODE == 300000]
df.reset_index(drop = True, inplace = True)

# parse out our coordinates
df['longitude'] = df.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split()[0])
df['latitude'] = df.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split()[1])
df['longitude'] = pd.to_numeric(df.longitude)
df['latitude'] = pd.to_numeric(df.latitude)
df = df.round(4)

# turn our coordinates into a tuple
df['coords_first'] = list(zip(df.latitude, df.longitude))

# rename elevation
df['elevation'] = df.ELEVATION
df = df[['coords_first', 'elevation']]

# creating a duplicate dataframe for later use
df_last = df[['elevation']]
df_last['coords_last'] = df.coords_first
df_last = df_last[['coords_last', 'elevation']]

## 1.2 Centerline

This dataset gives us information on traffic direction, bike lanes, traffic direction, street width and street name.

#### Dictionary

**Traffic Direction**: code indicating the flow of traffic relative to the street segment's address range.
1. FT - With
2. TF - Against
3. TW - Two-way
4. NV - Non-vehicular

**Coordinates**: our latitude and longitude variables indicate the first and last set of coordinates on each street.

**Full Street**: Street Name

**Physical ID**: unique ID for intersections (might not be necessary)

**Bike Lane**: binary variable s.t 1 = bike lane, 0 = no bike lane

** Street Width**: indicates street width in feet

In [4]:
cline = pd.read_csv('Centerline.csv')

# limit our data for Manhattan
cline = cline[cline.BOROCODE == 1]

# select variables of interest
cline = cline[['the_geom', 'PHYSICALID', 'TRAFDIR', 'ST_WIDTH', 'BIKE_LANE', 'FULL_STREE']]
cline.reset_index(drop = True, inplace = True)

#### Bikelane

In [5]:
# edit bike_lane variable s.t 1 = bike lane, 0 = no bike lane
cline['bike_lane'] = np.where(cline.BIKE_LANE.notnull(), 1, 0)
cline.drop('BIKE_LANE', inplace = True, axis = 1)

#### Coordinates

In [6]:
# transform the_geom and extract the first and last set of coordinates
cline['latitude_first'] = cline.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split(',')[0].split()[1])
cline['longitude_first'] = cline.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split(',')[0].split()[0])
cline['latitude_last'] = cline.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split(',')[-1].split()[1])
cline['longitude_last'] = cline.the_geom.apply(lambda x: x.split('(')[-1].split(')')[0].split(',')[-1].split()[0])
cline['latitude_first'] = pd.to_numeric(cline.latitude_first)
cline['longitude_first'] = pd.to_numeric(cline.longitude_first)
cline['latitude_last'] = pd.to_numeric(cline.latitude_last)
cline['longitude_last'] = pd.to_numeric(cline.longitude_last)
cline = cline.round(8)

# drop the_geom variable
cline.drop('the_geom', axis = 1, inplace = True)
# rename some of our variables
cline['full_street'] = cline.FULL_STREE
cline['street_width'] = cline.ST_WIDTH
cline['traffic_dir'] = cline.TRAFDIR
# select and rearrange our variables of interest
cline = cline[['latitude_first', 'longitude_first', 'latitude_last', 'longitude_last', 'full_street', 'street_width', 'traffic_dir', 'bike_lane']]

# convert our lats and longs into tuples of coordinates for easier data cleaning
cline['coords_first'] = list(zip(cline.latitude_first, cline.longitude_first))
cline['coords_last'] = list(zip(cline.latitude_last, cline.longitude_last))

# drop our individual lats and longs
cline = cline[['coords_first', 'coords_last', 'full_street', 'street_width', 'traffic_dir', 'bike_lane']]

#### Traffic Direction

In [7]:
# create a new dataset made up of all of our datapoints with either two way streets or streets facing the other way
cline_rev = cline[(cline.traffic_dir == 'TF') | (cline.traffic_dir == 'TW')]

# create a placeholder for our coordinates and then switch our coordinates
cline_rev['_'] = cline_rev.coords_first
cline_rev['coords_first'] = cline_rev.coords_last
cline_rev['coords_last'] = cline_rev._
# drop our placeholder
cline_rev.drop('_', axis = 1, inplace = True)

# revert our direction
cline_rev.traffic_dir = np.where(cline_rev.traffic_dir == 'TF', 'FT', cline_rev.traffic_dir)

# append our new dataset of reverted directions
cline = cline.append(cline_rev)

# remove all duplicates
cline = cline[cline.traffic_dir != 'TF']

# specify direction for all datapoints
cline['traffic_dir'] = np.where(cline.traffic_dir == 'TW', 'FT', cline.traffic_dir)
# drop traffic direction as we are no longer interested in it
cline.drop('traffic_dir', axis = 1, inplace = True)


cline.reset_index(drop = True, inplace = True)

In [9]:
print('Centerline data has {} datapoints and we have {} datapoints on elevation.'.format(cline.shape[0], df.shape[0]))

Centerline data has 15927 datapoints and we have 376575 datapoints on elevation.


## 1.3 Merge

At the moment our original dataframe, df, stores information on the elevation for set of coordinates. Our other dataframe, cline, stores information for the distance between two sets of coordinates. To proceed with our project, we need to merge our two dataframes, and eventually calculate the elevation difference between two sets of coordinates. 

In [11]:
# merging our dataframes
data = cline.merge(df, on = 'coords_first', how = 'left')
data = data.merge(df_last, on = 'coords_last', how = 'left')

#### Measuring the distance

To find the distance between our two set of coordinates, we wrote a function called 'meters_dist' -- we apply that to our dataset to construct the 'distance' variable.

In [12]:
# parsing out the latitude and longitude from our dataset 
data_ = data.copy()
data_[['coords_first_x', 'coords_first_y']] = data.coords_first.apply(pd.Series)
data_[['coords_last_x', 'coords_last_y']] = data.coords_last.apply(pd.Series)

# creating the new distance variable -- represents the distance between our two coordinates in meters
data['distance'] = data_.apply(meters_dist, axis = 1)

# sorting our values in ascending order and resetting the index
data.sort_values(by = 'coords_first', ascending = True, inplace = True)
data.reset_index(drop = True, inplace = True)

### Inferring elevation for missing data

For a subset of our cline dataset, we don't have the equivalent set of coordinates in the elevation dataset so we will find the closest one. Once we do that, we will add the corresponding elevation to our dataframe.

In [14]:
# find the closest coordinates in our elevation dataframe for coordinates that we were unable to find the elevation for
closest_x_list = [closest_point(x, list(df['coords_first'])) for x in data[data.elevation_x.isnull()]['coords_first']]
closest_df = pd.DataFrame(data = closest_x_list, index = data[data.elevation_x.isnull()].index, columns = ['x', 'y'])
closest_df['closest_x'] = list(zip(closest_df.x, closest_df.y))
closest_df = closest_df[['closest_x']]
# add the equivalent elevation for each coordinate
closest_df['elevation_x'] = [match_value(df, 'coords_first', x, 'elevation') for x in closest_df['closest_x']]

# do the above for y, too.
closest_y_list = [closest_point(x, list(df_last['coords_last'])) for x in data[data.elevation_y.isnull()]['coords_last']]
closest_df_y = pd.DataFrame(data = closest_y_list, index = data[data.elevation_y.isnull()].index, columns = ['x', 'y'])
closest_df_y['closest_y'] = list(zip(closest_df_y.x, closest_df_y.y))
closest_df_y = closest_df_y[['closest_y']]
# add the equivalent elevation for each coordinate
closest_df_y['elevation_y'] = [match_value(df_last, 'coords_last', x, 'elevation') for x in closest_df_y['closest_y']]

# append the above to our dataframe
data = data.merge(closest_df, right_index = True, left_index = True, how = 'left')
data['elevation_x'] = np.where(data.elevation_x_x.isnull(), data.elevation_x_y, data.elevation_x_x)
data = data.merge(closest_df_y, right_index = True, left_index = True, how = 'left')
data['elevation_y'] = np.where(data.elevation_y_x.isnull(), data.elevation_y_y, data.elevation_y_x)
data = data[['coords_first', 'coords_last', 'full_street', 'street_width', 'bike_lane', 'distance', 'elevation_x', 'elevation_y', 'closest_x', 'closest_y']]

#### Elevation: Feet to Meters

Our elevation data is in feet, so we would like to convert it to meters.

In [15]:
data['elevation_x'] = data.elevation_x * 0.3048
data['elevation_y'] = data.elevation_y * 0.3048

#### Evaluating our accuracy 

If the closest set of coordinates to a point is greater than X, then the elevation might not necessarily be accurate.

In [16]:
data_x_ = data.copy()
data_x_[['coords_first_x', 'coords_first_y']] = data_x_.coords_first.apply(pd.Series)
data_x_[['coords_last_x', 'coords_last_y']] = data_x_.closest_x.apply(pd.Series)

data['x_cl_distance'] = data_x_.apply(meters_dist, axis = 1)

data_x_ = data.copy()
data_x_[['coords_first_x', 'coords_first_y']] = data_x_.coords_last.apply(pd.Series)
data_x_[['coords_last_x', 'coords_last_y']] = data_x_.closest_y.apply(pd.Series)

data['y_cl_distance'] = data_x_.apply(meters_dist, axis = 1)

In [437]:
data[data.x_cl_distance > 100].shape[0] / data[data.x_cl_distance.notnull()].shape[0]

0.07033465683494045

It seems that approximately 7% of our estimated coordinates, have been mapped to another set of coords that are further than 100m away. We're not going to worry about this at the moment, since if the closest point is more than 100m away, then it's likely that the specific set of coords will never be used. 

Since our dataframes are finally merged together, we can overwrite the original df dataset.

In [448]:
df = data.copy()
df = df[['coords_first', 'coords_last', 'full_street', 'street_width', 'bike_lane', 'distance', 'elevation_x', 'elevation_y']]

In [449]:
df.head()

Unnamed: 0,coords_first,coords_last,full_street,street_width,bike_lane,distance,elevation_x,elevation_y
0,"(40.6798, -74.0198)","(40.6924, -74.0069)",MARTHA'S VYD HIGHLANDS FERRY RTE,0,0,1775.70117,1.863303,1.734282
1,"(40.6799, -74.0194)","(40.6797, -74.0196)",IKEA WALL STREET FERRY RTE,0,0,27.942002,1.863303,1.863303
2,"(40.68, -74.0204)","(40.6916, -74.0079)",HIGHLANDS-MIDTOWN EAST FERRY RTE,0,0,1667.585796,1.863303,2.014515
3,"(40.6809, -74.0179)","(40.6799, -74.0194)",IKEA WALL STREET FERRY RTE,0,0,168.603586,1.944837,1.863303
4,"(40.6827, -74.0283)","(40.6904, -74.0247)",RED HOOK-MIDTOWN EAST FERRY RTE,0,0,909.43321,2.121713,2.220407


In [450]:
df.to_csv('merged_dataframe.csv')

# 2. Route Search

Once our data is finally ready -- only steps left are finding the elevation difference between our two sets of coordinates -- we will start setting up our program to find the optimal route between two points by using **Dijkstra's** algorithm. 

Route Search:
- Input: Starting Address + Target Address
- Output: Map outlying the ideal route.

**Steps**:
1. Enter the input as an address, convert that to coordinates. 
2. Construct the graph outlying shortest path from origin to other points in dataframe using Dijkstra's.
3. Find the path from the origin to the destination.
4. Display the path on a map.


In [2]:
df = pd.read_csv('merged_dataframe.csv', index_col = 'Unnamed: 0')

In [3]:
# calculate the elevation difference
df['elevation_diff'] = df.elevation_y - df.elevation_x

# our set of coordinates are loaded as strings so we convert them to ints
df['first_x'] = pd.to_numeric(df.coords_first.apply(lambda x: x.split('(')[-1].split(')')[0].split()[0].split(',')[0]))
df['first_y'] = pd.to_numeric(df.coords_first.apply(lambda x: x.split('(')[-1].split(')')[0].split()[1].split(',')[0]))
df['last_x'] = pd.to_numeric(df.coords_last.apply(lambda x: x.split('(')[-1].split(')')[0].split()[0].split(',')[0]))
df['last_y'] = pd.to_numeric(df.coords_last.apply(lambda x: x.split('(')[-1].split(')')[0].split()[1].split(',')[0]))
df['coords_first'] = list(zip(df.first_x, df.first_y))
df['coords_last'] = list(zip(df.last_x, df.last_y))

# removing coordinates in driveways (closed component)
df = df[df.full_street != 'DRIVEWAY']
# remove rows where they point to same set of coordinates
df = df[df.coords_first != df.coords_last]

In [4]:
df['steep'] = abs(df.elevation_diff) / df.distance
df['steep'] = round(df.steep + 1)
df.head(2)

Unnamed: 0,coords_first,coords_last,full_street,street_width,bike_lane,distance,elevation_x,elevation_y,elevation_diff,first_x,first_y,last_x,last_y,steep
0,"(40.6798, -74.0198)","(40.6924, -74.0069)",MARTHA'S VYD HIGHLANDS FERRY RTE,0,0,1775.70117,1.863303,1.734282,-0.129022,40.6798,-74.0198,40.6924,-74.0069,1.0
1,"(40.6799, -74.0194)","(40.6797, -74.0196)",IKEA WALL STREET FERRY RTE,0,0,27.942002,1.863303,1.863303,0.0,40.6799,-74.0194,40.6797,-74.0196,1.0


In [5]:
df['actual_distance'] = df.distance
df['distance'] = df.actual_distance * df.steep

## Step 1

We input our origin and destination addresses in the coords_finder function we constructed to find each location's coordinates.

In [9]:
beginning = coords_finder('Columbia University New York', google_api)
beginning = (round(beginning[0], 4), round(beginning[1], 4))
if not df[df.coords_first == beginning].any()[0]:
    beginning = closest_point(beginning, list(df.coords_first))

end = coords_finder('Levain Cookies Upper West Side', google_api)
end = (round(end[0], 4), round(end[1], 4))
if not df[df.coords_last == end].any()[1]:
    end = closest_point(end, list(df.coords_last))

## Step 2

We pass the corresponding coordinates as arguments to our d_graph function that uses Dikjstra's to find the shortest path to every point from our origin.

In [10]:
result = d_graph(beginning, end, df)
result.head(2)

Unnamed: 0,vertex,short_distance,prev_vertex
0,"(40.808, -73.9638)",0.0,
1,"(40.808, -73.964)",16.851629,"(40.808, -73.9638)"


## Step 3

We retrace our algorithm to find the shortest path to our destination. 

In [11]:
route = route_construction(result, beginning, end)
df_route = pd.Series(route)
df_route = df_route.apply(pd.Series)
df_route.head(2)

Unnamed: 0,0,1
0,40.7818,-73.9793
1,40.7812,-73.9797


## Step 4

Displaying the path:

In [12]:
plot = map_plot(df_route, google_api)
show(plot)