In [20]:
import pandas as pd
import pickle
from haversine import haversine

In [21]:
# Load the pickled Foursquare POI DataFrame.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('foursquare_POIs_only.pickle', 'rb') as poi_file:
    fsq_df = pickle.load(poi_file)

In [22]:
# Keep only the venue identity/category and its coordinates, and prefix the
# coordinate columns with 'poi_' so they stay distinct from station coordinates.
poi_columns = ['venueId', 'venueCategory', 'latitude', 'longitude']
poi_renames = {'latitude': 'poi_lat', 'longitude': 'poi_lon'}
fsq_df = fsq_df[poi_columns].rename(columns=poi_renames)

In [23]:
# (rows, columns) of the POI observations after trimming to four columns
fsq_df.shape

(7523, 4)

In [24]:
# Reduce observations to one row per venue/POI for the distance calculation.
# first() keeps each venue's first non-null value per column, so for complete
# records the category, latitude and longitude all come from the same original
# row. max() would instead take column-wise maxima, which can combine values
# from different rows into a record that never existed.
fsq_df_unique = fsq_df.groupby(by='venueId', as_index=False).first()
fsq_df_unique.shape

(2553, 4)

In [25]:
# Load the pickled per-station weekly-average entries table.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('weekly_avg_by_station_clean.pickle', 'rb') as entries_file:
    entry_df = pickle.load(entries_file)

In [26]:
# Rank stations by ENTRIES (highest first) and keep only the top 10
entry_df = entry_df.sort_values('ENTRIES', ascending=False).head(10)

In [27]:
# Start the distance table from just the station names, preserving the
# ranked row order; the column is lower-cased to 'station'.
distance_df = entry_df[['STATION']].rename(columns={'STATION': 'station'})

In [28]:
# Manually entered (latitude, longitude) pairs for the top 10 stations.
# Order matters: these are attached to the station rows by position.
stat_lat_lons = [
    (40.750326, -73.988067),  # 34 St - Herald Square
    (40.752962, -73.977219),  # Grand Central - 42 St
    (40.750497, -73.990877),  # 34 St - Penn Station
    (40.741623, -73.989365),  # 23 St - actually 5th and 23rd St
    (40.755453, -73.987285),  # Times Square - 42nd St
    (40.709641, -74.008283),  # Fulton St
    (40.758923, -73.981350),  # 47-50 Sts Rock
    (40.735019, -73.990698),  # 14 St - Union Square
    (40.768868, -73.980959),  # 59 St Columbus
    (40.712925, -74.009928),  # PATH New WTC
]
                

In [29]:
# Unzip the coordinate pairs into separate latitude / longitude lists and
# attach them positionally to the station rows.
station_lats, station_lons = (list(axis) for axis in zip(*stat_lat_lons))
distance_df['station_lat'] = station_lats
distance_df['station_lon'] = station_lons
distance_df

Unnamed: 0,station,station_lat,station_lon
58,34 ST-HERALD SQ,40.750326,-73.988067
232,GRD CNTRL-42 ST,40.752962,-73.977219
60,34 ST-PENN STA,40.750497,-73.990877
45,23 ST,40.741623,-73.989365
352,TIMES SQ-42 ST,40.755453,-73.987285
225,FULTON ST,40.709641,-74.008283
71,47-50 STS ROCK,40.758923,-73.98135
14,14 ST-UNION SQ,40.735019,-73.990698
85,59 ST COLUMBUS,40.768868,-73.980959
314,PATH NEW WTC,40.712925,-74.009928


In [30]:
# Pair every station with every unique POI so all distances can be computed.
# A constant helper key makes an ordinary merge behave like a cross join.
# The key is added with assign() on temporary copies, so fsq_df_unique is not
# left carrying a stray 'merge_column', and the key is dropped from the result.
distance_df = (
    distance_df.assign(merge_column=1)
    .merge(right=fsq_df_unique.assign(merge_column=1), on='merge_column')
    .drop('merge_column', axis=1)
)

In [31]:
# Preview the station/POI pairs produced by the merge
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon
0,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648
2,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607
3,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128
4,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529


In [32]:
def distance_to_station(df_row):
    """Return the haversine distance in miles between a row's station and POI.

    Parameters
    ----------
    df_row : object with ``station_lat``, ``station_lon``, ``poi_lat`` and
        ``poi_lon`` attributes (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The value returned by ``haversine`` for the two (lat, lon) points,
    requested in miles.
    """
    stat_coords = (df_row.station_lat, df_row.station_lon)
    poi_coords = (df_row.poi_lat, df_row.poi_lon)
    try:
        # haversine >= 2.0 selects the output unit through `unit`
        return haversine(stat_coords, poi_coords, unit='mi')
    except TypeError:
        # older haversine releases only understand the legacy `miles` flag
        return haversine(stat_coords, poi_coords, miles=True)

In [33]:
# Evaluate the station-to-POI distance (miles) row by row, then store it
pair_distances = distance_df.apply(distance_to_station, axis=1)
distance_df['distance'] = pair_distances

In [34]:
# Preview the pairs again, now including the computed 'distance' column
distance_df.head()

Unnamed: 0,station,station_lat,station_lon,venueId,venueCategory,poi_lat,poi_lon,distance
0,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200eea1ee3,Sushi Restaurant,40.727771,-74.000337,1.685613
1,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5200fea1ee3,Salad Place,40.73998,-73.986648,0.718688
2,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a52010e41ee3,Molecular Gastronomy Restaurant,40.719649,-73.984607,2.127333
3,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201ae61ee3,Sushi Restaurant,40.728898,-73.998128,1.571412
4,34 ST-HERALD SQ,40.750326,-73.988067,3fd66200f964a5201be61ee3,Clothing Store,40.722283,-73.999529,2.028399


In [35]:
# Keep only the station/POI pairs whose distance is strictly below 1
# (the distance column was requested in miles)
is_nearby = distance_df['distance'] < 1
distance_df = distance_df[is_nearby]
distance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4343 entries, 1 to 25517
Data columns (total 8 columns):
station          4343 non-null object
station_lat      4343 non-null float64
station_lon      4343 non-null float64
venueId          4343 non-null object
venueCategory    4343 non-null object
poi_lat          4343 non-null float64
poi_lon          4343 non-null float64
distance         4343 non-null float64
dtypes: float64(5), object(3)
memory usage: 305.4+ KB


In [36]:
# Persist the filtered station/POI distance table for later use
with open('distance_df_v2.pickle', 'wb') as out_file:
    pickle.dump(distance_df, out_file)

In [37]:
# Count rows per (category, station) pair. count() reports non-null counts
# for every remaining column; 'poi_lat' is used as the count to rank
# stations (descending) within each alphabetically ordered category.
category_station_counts = distance_df.groupby(
    by=['venueCategory', 'station'], as_index=False
).count()
category_station_counts.sort_values(
    ['venueCategory', 'poi_lat'], ascending=[True, False]
)

Unnamed: 0,venueCategory,station,station_lat,station_lon,venueId,poi_lat,poi_lon,distance
4,Antique Shop,47-50 STS ROCK,4,4,4,4,4,4
5,Antique Shop,59 ST COLUMBUS,4,4,4,4,4,4
7,Antique Shop,TIMES SQ-42 ST,4,4,4,4,4,4
0,Antique Shop,14 ST-UNION SQ,3,3,3,3,3,3
1,Antique Shop,23 ST,3,3,3,3,3,3
6,Antique Shop,GRD CNTRL-42 ST,3,3,3,3,3,3
2,Antique Shop,34 ST-HERALD SQ,1,1,1,1,1,1
3,Antique Shop,34 ST-PENN STA,1,1,1,1,1,1
10,Bridal Shop,34 ST-HERALD SQ,5,5,5,5,5,5
8,Bridal Shop,14 ST-UNION SQ,4,4,4,4,4,4
