In [50]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

### 1. Load data

To calculate the 'walking distance between employee homes and the closest bus stop', I first extracted the latitude and longitude of the employee homes and the bus stops using Google Sheets and the add-on 'Geocode by Awesome Table'.

Reference (geocoding instructions): https://www.youtube.com/watch?v=ozjSCoJGgVc

In [51]:
# Directory holding the geocoded CSV files. Set the BUS_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original author's local checkout.
DATA_DIR = Path(os.environ.get('BUS_DATA_DIR', '/Users/ChunyanHao/desktop/github/ds_take_home/data/Bus'))

# Bus stops (two street names + coordinates) and employee home addresses (+ coordinates).
stops = pd.read_csv(DATA_DIR / 'Bus_Stops_with_geocode.csv')
addresses = pd.read_csv(DATA_DIR / 'address_with_geocode.csv')

In [52]:
# Preview the first three bus stops: each has two street names and a latitude/longitude pair.
stops.head(3)

Unnamed: 0,Street_One,Street_Two,Latitude,Longitude
0,MISSION ST,ITALY AVE,37.718478,-122.439536
1,MISSION ST,NEW MONTGOMERY ST,37.787456,-122.400523
2,MISSION ST,01ST ST,37.789768,-122.397597


In [61]:
# Row/column count of the bus-stop table.
# NOTE(review): this cell's execution count (61) is later than that of the de-duplication
# cell below (53), so the displayed shape presumably reflects the table *after* duplicates
# were dropped — restart and run all cells in order to confirm the "before" size.
stops.shape

(118, 4)

In [53]:
# Keep only the first stop seen at each (Latitude, Longitude) pair; listing the same
# location multiple times adds nothing.
stops = stops[~stops.duplicated(subset=['Latitude', 'Longitude'], keep='first')]

In [60]:
# Shape after dropping stops with duplicate coordinates; compare with the shape shown
# before the de-duplication step to see how many rows were removed.
stops.shape

(118, 4)

In [54]:
# Preview the first three employee records: address, coordinates and employee_id.
addresses.head(3)

Unnamed: 0,address,Latitude,Longitude,employee_id
0,"98 Edinburgh St, San Francisco, CA 94112, USA",37.72847,-122.426581,206
1,"237 Accacia St, Daly City, CA 94014, USA",37.704205,-122.415878,2081
2,"1835 Folsom St, San Francisco, CA 94103, USA",37.767932,-122.415181,178


In [62]:
# Row/column count of the employee address table.
addresses.shape

(2191, 4)

### 2. Calculate walking distance between employee address and each bus stop

In [55]:
# Pair every employee address with every bus stop (Cartesian product) so that a distance
# can be computed for each address/stop combination. The coordinate columns exist in both
# tables, so they come out with the _x (address) and _y (stop) suffixes.
data = addresses.merge(stops, how='cross')

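To calculate the distance between two locations, I use the haversine (great-circle) distance. Note that this is the straight-line distance between two coordinates, used here as an approximation of the actual walking distance.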

Refer to the instruction: https://towardsdatascience.com/calculating-distance-between-two-geolocations-in-python-26ad3afe287b 

In [56]:
import haversine as hs


def distance(row):
    """Return the haversine (great-circle) distance between a home and a bus stop.

    Parameters
    ----------
    row : pandas.Series or dict
        One row of the cross-joined ``data`` table. The home coordinates are read from
        ``Latitude_x``/``Longitude_x`` and the stop coordinates from
        ``Latitude_y``/``Longitude_y``.

    Returns
    -------
    float
        Result of ``hs.haversine`` for the two (latitude, longitude) points
        (kilometres with the haversine package's default unit).
    """
    # Look the coordinates up by column label: integer keys on a label-indexed Series rely on
    # a deprecated positional fallback and would silently pick the wrong values if the
    # column order of `data` ever changed.
    home = (row['Latitude_x'], row['Longitude_x'])
    stop = (row['Latitude_y'], row['Longitude_y'])
    return hs.haversine(home, stop)


# Iterate over plain records of the four coordinate columns instead of materialising every
# row with data.iloc[i, :], which builds a new Series for each address/stop pair.
coordinate_columns = ['Latitude_x', 'Longitude_x', 'Latitude_y', 'Longitude_y']
distance_column = [distance(record) for record in data[coordinate_columns].to_dict('records')]
data['distance'] = distance_column

In [57]:
# Preview the cross-joined table: the _x columns hold the home coordinates, the _y columns
# the bus-stop coordinates, and `distance` the computed haversine distance.
data.head(3)

Unnamed: 0,address,Latitude_x,Longitude_x,employee_id,Street_One,Street_Two,Latitude_y,Longitude_y,distance
0,"98 Edinburgh St, San Francisco, CA 94112, USA",37.72847,-122.426581,206,MISSION ST,ITALY AVE,37.718478,-122.439536,1.591478
1,"98 Edinburgh St, San Francisco, CA 94112, USA",37.72847,-122.426581,206,MISSION ST,NEW MONTGOMERY ST,37.787456,-122.400523,6.947443
2,"98 Edinburgh St, San Francisco, CA 94112, USA",37.72847,-122.426581,206,MISSION ST,01ST ST,37.789768,-122.397597,7.27671


### 3. Get the closest bus stop for each employee

In [65]:
# For every employee keep the single row of `data` with the smallest distance.
# Sorting by distance and keeping the first row per (employee_id, address) returns exactly
# one stop per employee even when two stops are equally close. Joining the per-employee
# minimum back on the float `distance` value would instead return one row per tied stop and
# inflate the per-stop employee counts computed from this table.
closest_stop = (
    data
    .sort_values('distance', kind='mergesort')  # stable sort: ties keep their original row order
    .drop_duplicates(subset=['employee_id', 'address'], keep='first')
    .sort_values(['employee_id', 'address'])  # same row order as grouping on these keys
    .rename(columns={'distance': 'closed_distance'})  # existing column name kept unchanged
    .loc[:, ['employee_id', 'address', 'closed_distance', 'Street_One', 'Street_Two']]
    .reset_index(drop=True)
)
closest_stop.head(5)

Unnamed: 0,employee_id,address,closed_distance,Street_One,Street_Two
0,1,"220 Surrey St, San Francisco, CA 94131, USA",0.785148,MISSION ST,TRUMBULL ST
1,2,"110 Madrid St, San Francisco, CA 94112, USA",0.26982,MISSION ST,CASTLE MANOR AVE
2,3,"1224 Athens St, San Francisco, CA 94112, USA",0.587206,MISSION ST,POPE ST
3,4,"171 Accacia St, Daly City, CA 94014, USA",2.563807,MISSION ST,SENECA AVE
4,5,"98 Brosnan St, San Francisco, CA 94103, USA",0.388062,MISSION ST,ERIE ST


In [63]:
# Expect one row per employee (compare the row count with addresses.shape).
# NOTE(review): this cell was last executed (63) before the cell that builds closest_stop
# (65), and the 6 columns shown do not match the 5 columns selected there — the output
# looks stale; restart and run all cells in order to confirm.
closest_stop.shape

(2191, 6)

### 4. Get the top 10 most popular bus stops

In [68]:
# Count how many employees have each stop (identified by its two street names) as their
# closest stop, then show the ten stops serving the most employees.
result = (
    closest_stop
    .groupby(['Street_One', 'Street_Two'])['employee_id']
    .count()
    .reset_index(name='employee_count')
)
result.sort_values('employee_count', ascending=False).head(10)

Unnamed: 0,Street_One,Street_Two,employee_count
86,MISSION ST,TRUMBULL ST,121
9,MISSION ST,15TH ST,116
6,MISSION ST,12TH ST,110
10,MISSION ST,16TH ST,102
44,MISSION ST,GENEVA AVE,79
37,MISSION ST,ERIE ST,70
26,MISSION ST,BOSWORTH ST,64
8,MISSION ST,14TH ST,64
4,MISSION ST,10TH ST,59
18,MISSION ST,ADMIRAL AVE,49
