In [149]:
import geopandas as gpd
import pandas as pd
import os

In [150]:
# Load the raw subway station records from the landing area.
# The path has a trailing slash, so it is presumably a directory of parquet
# files that pandas reads as one dataset -- confirm against the ingest step.
df_pd = pd.read_parquet(path='../data/landing/subway_data/')

In [151]:
# Turn each station row into a point geometry (x = longitude, y = latitude)
# and tag the frame with a WGS84 longitude/latitude CRS at construction time.
# The coordinate columns are object dtype in the source data.
df_gpd = gpd.GeoDataFrame(
    df_pd,
    geometry=gpd.points_from_xy(
        x=df_pd["gtfs_longitude"],
        y=df_pd["gtfs_latitude"],
    ),
    crs="+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs",
)
df_gpd.dtypes

gtfs_stop_id                     object
station_id                       object
complex_id                       object
division                         object
line                             object
stop_name                        object
borough                          object
cbd                              object
daytime_routes                   object
structure                        object
gtfs_latitude                    object
gtfs_longitude                   object
north_direction_label            object
south_direction_label            object
ada                              object
ada_northbound                   object
ada_southbound                   object
georeference                     object
:@computed_region_yamh_8v7k      object
:@computed_region_wbg7_3whc      object
:@computed_region_kjdx_g34t      object
ada_notes                        object
geometry                       geometry
dtype: object

In [152]:
# Taxi zone polygons (shapefile) and the zone lookup table (CSV).
sf = gpd.read_file("../data/taxi_zones/taxi_zones.shp")
zones = pd.read_csv("../data/taxi_zones/taxi+_zone_lookup.csv")

# Reproject the polygons to WGS84 longitude/latitude, the same CRS string
# that is assigned to the station points.
sf = sf.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

# Attach the lookup attributes to each polygon; only LocationIDs present in
# both tables are kept (inner join).
gdf = gpd.GeoDataFrame(zones.merge(sf, on='LocationID', how='inner'))

In [153]:
# Spatially join every station point to the taxi zone polygon it falls in.
# Inner join: stations that match no zone are dropped.
# NOTE: this rebinds df_gpd, so the cell is not idempotent -- re-running it
# joins an already-joined frame. Re-run from the cell that builds df_gpd.
df_gpd = gpd.sjoin(left_df=df_gpd, right_df=gdf, how="inner")

In [154]:
# Preview the first five joined rows (station attributes plus zone attributes).
df_gpd.head(n=5)

Unnamed: 0,gtfs_stop_id,station_id,complex_id,division,line,stop_name,borough_left,cbd,daytime_routes,structure,...,index_right,LocationID,Borough,Zone,service_zone,OBJECTID,Shape_Leng,Shape_Area,zone,borough_right
0,R01,1,1,BMT,Astoria,Astoria-Ditmars Blvd,Q,False,N W,Elevated,...,222,223,Queens,Steinway,Boro Zone,223,0.166022,0.000764,Steinway,Queens
1,R03,2,2,BMT,Astoria,Astoria Blvd,Q,False,N W,Elevated,...,222,223,Queens,Steinway,Boro Zone,223,0.166022,0.000764,Steinway,Queens
2,R04,3,3,BMT,Astoria,30 Av,Q,False,N W,Elevated,...,6,7,Queens,Astoria,Boro Zone,7,0.107417,0.00039,Astoria,Queens
3,R05,4,4,BMT,Astoria,Broadway,Q,False,N W,Elevated,...,6,7,Queens,Astoria,Boro Zone,7,0.107417,0.00039,Astoria,Queens
4,R06,5,5,BMT,Astoria,36 Av,Q,False,N W,Elevated,...,6,7,Queens,Astoria,Boro Zone,7,0.107417,0.00039,Astoria,Queens


In [155]:
# Missing-value count per column after the spatial join.
df_gpd.isna().sum()

gtfs_stop_id                     0
station_id                       0
complex_id                       0
division                         0
line                             0
stop_name                        0
borough_left                     0
cbd                              0
daytime_routes                   0
structure                        0
gtfs_latitude                    0
gtfs_longitude                   0
north_direction_label            0
south_direction_label            0
ada                              0
ada_northbound                   0
ada_southbound                   0
georeference                     0
:@computed_region_yamh_8v7k      0
:@computed_region_wbg7_3whc      9
:@computed_region_kjdx_g34t      3
ada_notes                      488
geometry                         0
index_right                      0
LocationID                       0
Borough                          0
Zone                             0
service_zone                     0
OBJECTID            

In [156]:
# Per station: how many whitespace-separated route tokens are listed in
# daytime_routes (e.g. "N W" -> 2).
df_gpd["num_connected_lines"] = df_gpd["daytime_routes"].map(
    lambda routes: len(routes.split())
)
df_gpd["num_connected_lines"].value_counts()

num_connected_lines
1    292
2    154
3     33
4     17
Name: count, dtype: int64

In [157]:
def get_num_connected_lines(lines):
    """Count the distinct routes named across a collection of route strings.

    Parameters
    ----------
    lines : iterable
        Route strings such as ``"N W"``; each string is split on whitespace
        into individual route tokens. Entries that are not strings (for
        example ``None`` or NaN for a missing value) are skipped instead of
        raising ``AttributeError``.

    Returns
    -------
    int
        Number of unique route tokens found; 0 for an empty input.
    """
    connected_lines = set()
    for line in lines:
        # A missing value in an object-dtype column shows up as None/NaN,
        # which has no .split(); it contributes no routes.
        if not isinstance(line, str):
            continue
        # Update in place rather than building a new set on every iteration.
        connected_lines.update(line.split())
    return len(connected_lines)

In [158]:
# One row per taxi zone (LocationID):
#   num_stations   - count of non-null station_id values in the zone
#   daytime_routes - number of distinct routes across the zone's stations
#                    (the column keeps its source name but holds a count)
df_location_aggregated = df_gpd.groupby("LocationID").agg(
    num_stations=("station_id", "count"),
    daytime_routes=("daytime_routes", get_num_connected_lines),
)
df_location_aggregated.head()

Unnamed: 0_level_0,num_stations,daytime_routes
LocationID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2,1
6,1,1
7,5,4
14,4,1
17,4,2


In [159]:
# Destination directory for the aggregated subway features.
output_relative_dir = '../data/raw/subway_data/'
# exist_ok=True creates the directory (and any missing parents) without a
# separate existence check, so there is no window between check and create;
# it still raises if the path exists but is not a directory.
os.makedirs(output_relative_dir, exist_ok=True)

In [160]:
# Persist the per-zone aggregates; LocationID is the index and is written
# with the frame. os.path.join avoids the doubled separator that the
# trailing slash in output_relative_dir would otherwise produce.
df_location_aggregated.to_parquet(os.path.join(output_relative_dir, "data.parquet"))