# Preprocessing of the Legal Operating Business Data
Read in data from landing zone, join with taxi zone data, and aggregate by location for usage in modelling.
Store the aggregated data in the raw zone.

In [1]:
import geopandas as gpd
import pandas as pd
import os

In [2]:
df_pd = pd.read_parquet('../data/landing/lob_data/')

In [3]:
df_gpd = gpd.GeoDataFrame(df_pd, geometry=gpd.points_from_xy(df_pd["longitude"], df_pd["latitude"]))
df_gpd.crs = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
df_gpd.dtypes

license_nbr                      object
license_type                     object
lic_expir_dd                     object
license_status                   object
license_creation_date            object
industry                         object
business_name                    object
business_name_2                  object
address_building                 object
address_street_name              object
address_city                     object
address_state                    object
address_zip                      object
contact_phone                    object
address_borough                  object
detail_2                         object
detail                           object
community_board                  object
council_district                 object
bin                              object
bbl                              object
nta                              object
census_tract                     object
longitude                        object
latitude                         object


In [4]:
sf = gpd.read_file("../data/taxi_zones/taxi_zones.shp")
zones = pd.read_csv("../data/taxi_zones/taxi_zone_lookup.csv")
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(
    pd.merge(zones, sf, on='LocationID', how='inner')
)

In [5]:
df_gpd = gpd.sjoin(df_gpd, gdf)

In [6]:
df_gpd.head()

Unnamed: 0,license_nbr,license_type,lic_expir_dd,license_status,license_creation_date,industry,business_name,business_name_2,address_building,address_street_name,...,index_right,LocationID,Borough,Zone,service_zone,OBJECTID,Shape_Leng,Shape_Area,zone,borough
18,0365714-DCA,Business,2005-03-31T00:00:00.000,Inactive,1997-04-08T00:00:00.000,Garage,THE GARDENS 75TH STREET OWNERS CORP.,,3538,75TH ST,...,128,129,Queens,Jackson Heights,Boro Zone,129,0.093273,0.000474,Jackson Heights,Queens
19,1076184-DCA,Business,2004-12-31T00:00:00.000,Inactive,2001-04-02T00:00:00.000,Tobacco Retail Dealer,"JALIL, MOHAMMED ABDUL",SUNRISE DELICATESSEN,2017,CHURCH AVE,...,88,89,Brooklyn,Flatbush/Ditmas Park,Boro Zone,89,0.122795,0.000448,Flatbush/Ditmas Park,Brooklyn
23,2061591-DCA,Business,2021-12-31T00:00:00.000,Inactive,2017-11-25T00:00:00.000,Laundries,SING WAH LAUNDROMAT INC.,,6914,4TH AVE,...,13,14,Brooklyn,Bay Ridge,Boro Zone,14,0.175214,0.001382,Bay Ridge,Brooklyn
24,2009334-DCA,Business,2020-05-23T00:00:00.000,Inactive,2014-06-09T00:00:00.000,Sidewalk Cafe,DCB DELANCEY CORPORATION,SEL ROSE,1,DELANCEY ST,...,147,148,Manhattan,Lower East Side,Yellow Zone,148,0.039131,7e-05,Lower East Side,Manhattan
26,2038840-DCA,Business,2021-07-31T00:00:00.000,Inactive,2016-06-09T00:00:00.000,Secondhand Dealer - General,MD BROADWAY ELECTRONICS LLC,,1888,JEROME AVE,...,168,169,Bronx,Mount Hope,Boro Zone,169,0.060105,0.000146,Mount Hope,Bronx


In [7]:
df_gpd.isnull().sum()

license_nbr                          0
license_type                         0
lic_expir_dd                     40879
license_status                       0
license_creation_date                0
industry                             0
business_name                       40
business_name_2                1180670
address_building                  9388
address_street_name                  0
address_city                       120
address_state                     2077
address_zip                        188
contact_phone                    62390
address_borough                     11
detail_2                       1362582
detail                            2112
community_board                   7719
council_district                  3053
bin                              25462
bbl                              25492
nta                             309912
census_tract                    309912
longitude                            0
latitude                             0
location                 

In [8]:
df_gpd.value_counts("industry")

industry
Tobacco Retail Dealer             383519
Home Improvement Contractor       320231
Secondhand Dealer - General       161957
Electronics Store                 145728
Stoop Line Stand                   95334
Electronic & Appliance Service     67854
Laundry                            67503
Laundries                          46185
Laundry Jobber                     42840
Sidewalk Cafe                      37209
Garage                             33536
Electronic Cigarette Dealer        31023
Secondhand Dealer - Auto           28179
Dealer In Products                 26587
Parking Lot                        16691
Tow Truck Company                  15015
Employment Agency                  13640
Amusement Device Temporary         10952
Pawnbroker                         10374
Special Sale                       10153
Amusement Device Portable           9707
Pedicab Business                    7875
Debt Collection Agency              7483
Newsstand                           6468
Cabaret

In [9]:
df_location_aggregated = df_gpd.groupby("LocationID").agg(
    count=("license_nbr", "count"),
)

df_location_aggregated = df_location_aggregated.rename(columns={
    "license_nbr": "num_businesses",
})
df_location_aggregated

Unnamed: 0_level_0,count
LocationID,Unnamed: 1_level_1
3,5001
4,3140
5,1758
6,2794
7,22048
...,...
259,6480
260,9772
261,2198
262,4089


In [10]:
output_relative_dir = '../data/raw/lob_data/'
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)

In [11]:
df_location_aggregated.to_parquet(f"{output_relative_dir}/data.parquet")