In [35]:
import pandas as pd
import zipfile
import geopandas as gpd
import numpy as np

from shapely.geometry import Point

In [6]:
# Feed catalogue: keep only the New York City entries and list the download
# URL of each provider's latest feed.
pd.set_option('display.max_colwidth', None)  # show the long URLs untruncated
df = pd.read_csv("data/raw/public/sources.csv")
is_nyc = df['location.municipality'] == 'New York City'
df = df.loc[is_nyc, ['provider', 'name', 'urls.latest']]
df

Unnamed: 0,provider,name,urls.latest
208,Academy Bus,,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-academy-bus-gtfs-209.zip?alt=media
509,MTA New York City Transit (MTA),NYC Bus Company,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-mta-gtfs-510.zip?alt=media
510,MTA New York City Transit (MTA),NYC Subway Supplemented,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-mta-gtfs-511.zip?alt=media
511,MTA New York City Transit,Brooklyn Bus,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-gtfs-512.zip?alt=media
512,MTA New York City Transit (MTA),Manhattan Bus,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-mta-gtfs-513.zip?alt=media
513,MTA New York City Transit (MTA),Staten Island Bus,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-mta-gtfs-514.zip?alt=media
514,NYC Ferry,,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-nyc-ferry-gtfs-515.zip?alt=media
515,MTA New York City Transit (MTA),NYC Subway,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-mta-new-york-city-transit-mta-gtfs-516.zip?alt=media
517,Staten Island Ferry,,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-staten-island-ferry-gtfs-518.zip?alt=media
518,Downtown Alliance,,https://storage.googleapis.com/storage/v1/b/mdb-latest/o/us-new-york-downtown-alliance-gtfs-519.zip?alt=media


### Preprocess and merge

In [12]:
zip_path = "data/raw/public/us-new-york-mta-new-york-city-transit-mta-gtfs-510.zip"

# Read the two tables used below straight out of the feed archive, without
# extracting it to disk.
with zipfile.ZipFile(zip_path, mode="r") as gtfs_archive:
    # One row per scheduled stop event of every bus trip.
    with gtfs_archive.open("stop_times.txt") as stop_times_file:
        bus = pd.read_csv(stop_times_file)

    # One row per stop, with its name and coordinates.
    with gtfs_archive.open("stops.txt") as stops_file:
        stops = pd.read_csv(stops_file)


In [13]:
# Preview the first stop-time records.
bus.head(n=5)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,timepoint
0,35671183-BPPB3-BP_B3-Weekday-02,00:00:00,00:00:00,551840,1,0,0,1
1,35671183-BPPB3-BP_B3-Weekday-02,00:00:29,00:00:29,551841,2,0,0,0
2,35671183-BPPB3-BP_B3-Weekday-02,00:01:00,00:01:00,551843,3,0,0,1
3,35671183-BPPB3-BP_B3-Weekday-02,00:01:24,00:01:24,551844,4,0,0,0
4,35671183-BPPB3-BP_B3-Weekday-02,00:01:58,00:01:58,551834,5,0,0,0


In [30]:
# Preview the first stop locations.
stops.head(n=5)

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon
0,100025,GRAND CONCOURSE/E 161 ST,GRAND CONCOURSE & E 161 ST,40.826617,-73.922643
1,100027,GRAND CONCOURSE/E 165 ST,GRAND CONCOURSE & E 165 ST,40.83132,-73.91994
2,100033,GRAND CONCOURSE/MT EDEN AV,GRAND CONCOURSE & MT EDEN AV,40.843405,-73.911639
3,100039,GRAND CONCOURSE/E BURNSIDE AV,GRAND CONCOURSE & E BURNSIDE AV,40.852124,-73.90358
4,100045,GRAND CONCOURSE/E FORDHAM RD,GRAND CONCOURSE & E FORDHAM RD,40.862958,-73.896434


In [23]:
# Cut within 24 hours.
# GTFS writes times after midnight of the service day with hours >= 24
# (e.g. "25:10:00"); keep only departures whose hour field is below 24.
# Rows with a missing or unparseable departure_time become NaN, compare False,
# and are dropped instead of raising.
departure_hour = pd.to_numeric(
    bus['departure_time'].str.split(':').str[0],
    errors='coerce',
)
# .copy() makes the result an independent frame, so later cells can add
# columns to `bus` without SettingWithCopyWarning.
bus = bus[departure_hour < 24].copy()

In [26]:
# Width of one time-of-day bin, in minutes.
interval = 15

# Parse with an explicit format: without it pandas warns that it cannot infer
# the format and falls back to slow element-by-element parsing.
timestamp = pd.to_datetime(bus['departure_time'].str.strip(), format='%H:%M:%S')

# `time` is the index of the `interval`-minute bin counted from midnight
# (with 15-minute bins: 0 = 00:00-00:14, 1 = 00:15-00:29, ...).
# `assign` builds a new frame, so nothing is written into a filtered slice.
bus = bus.assign(time=(timestamp.dt.hour*60 + timestamp.dt.minute)//interval)

  timestamp = pd.to_datetime(bus['departure_time'])


In [42]:
# Attach each stop's name and coordinates to its (time bin, stop) records.
# The left join keeps records whose stop_id has no entry in `stops`.
bus_merged = pd.merge(bus[['time', 'stop_id']], stops, on='stop_id', how='left')

### Match bus stops to taxi zones

In [56]:
# Path to the taxi-zone shapefile. Deliberately NOT named `zipfile`: that name
# is the imported standard-library module used by the GTFS-loading cell, and
# rebinding it to a string breaks `zipfile.ZipFile(...)` on any re-run.
zones_path = 'data/taxi_zones.shp'
zones = gpd.read_file(zones_path)

# Reproject to EPSG:4326 so zone polygons can be compared with the stops'
# stop_lon / stop_lat values (assumed to be WGS84 degrees - confirm).
zones_wgs84 = zones.to_crs('EPSG:4326')

# Truncate long geometry strings in the preview below.
pd.set_option('display.max_colwidth', 199)
zones_wgs84.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.69510, -74.18450 40.69519, -74.18438 40.69588, -74.18428 40.69621, -74.18402 40.69708, -74.18391 40.69751, -74.18375 40.69780, -74.18363 40.69833, -74.1..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 40.63558, -73.82265 40.63537, -73.82254 40.63516, -73.82243 40.63495, -73.82234 40.63473, -73.82225 40.63451, -73.82217 40.63429, -73.82210 40.63407,..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.87099, -73.84699 40.87085, -73.84641 40.87055, -73.84609 40.87039, -73.84578 40.87023, -73.84517 40.86991, -73.84488 40.86976, -73.84442 40.86952, -73.8..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.72581, -73.97182 40.72581, -73.97186 40.72558, -73.97187 40.72550, -73.97188 40.72542, -73.97191 40.72529, -73.97193 40.72518, -73.97194 40.72516, -73.9..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.56227, -74.17226 40.56170, -74.17192 40.56155, -74.17185 40.56152, -74.17223 40.56114, -74.17245 40.56089, -74.17322 40.56008, -74.17359 40.55970, -74.1..."


In [67]:
def match(lon,lat):
    """Match a point to the taxi zone that contains it.

    Parameters
    ----------
    lon, lat : float
        Point coordinates, in the same CRS as ``zones_wgs84``.

    Returns
    -------
    The ``LocationID`` of the first zone (in table order) whose geometry
    contains the point, or ``None`` when a coordinate is missing or no zone
    contains the point. ``contains`` is strict, so a point lying exactly on a
    zone boundary is not matched.
    """
    # Rows that found no stop in the left merge carry NaN coordinates.
    if pd.isna(lon) or pd.isna(lat):
        return None

    point = Point(lon, lat)

    # One vectorised predicate over all zone geometries replaces the
    # Python-level iterrows loop, which built a Series per zone on every call.
    inside = zones_wgs84.geometry.contains(point)
    if not inside.any():
        return None

    return zones_wgs84.loc[inside, 'LocationID'].iloc[0]

In [68]:
# Number of leading rows of `bus_merged` to match; set to None to match all rows.
MATCH_ROW_LIMIT = 10000

rows_to_match = bus_merged.iloc[:MATCH_ROW_LIMIT]

# Many stop-time rows share a stop, and the coordinates were joined in by
# stop_id, so each distinct stop is resolved once and the result is broadcast
# back to its rows. This avoids one point-in-polygon search per row.
unique_stops = rows_to_match.drop_duplicates('stop_id')
zone_by_stop = {
    stop_id: match(lon, lat)
    for stop_id, lon, lat in zip(
        unique_stops['stop_id'], unique_stops['stop_lon'], unique_stops['stop_lat']
    )
}

# Same index as `rows_to_match`, so it aligns with `bus_merged` on assignment.
matched = rows_to_match['stop_id'].map(zone_by_stop).rename('zone_id')

In [69]:
# Add the matched zone as a column. Assignment aligns on the row index, so
# rows of `bus_merged` with no entry in `matched` get NaN.
bus_merged = bus_merged.assign(zone_id=matched)

0       121
1       121
2       121
3       121
4       135
       ... 
9995    130
9996    130
9997    130
9998    130
9999    130
Length: 10000, dtype: int64