# Load and Enrich DOHMH 311 Data



## Set env variables

This notebook assumes that `SOCRATA_APP_TOKEN`, `SOCRATA_API_KEY_ID`, and `SOCRATA_API_KEY_SECRET` are set in a `.env` file.

In [1]:
# Load the variables defined in the local `.env` file (see the note above for
# the Socrata settings expected there). The call is the cell's last
# expression, so its return value is rendered as the cell output.
from dotenv import load_dotenv

load_dotenv()


True

## Load Packages

In [2]:
import os
import sys

# Absolute path of the `src` directory one level above the notebook's working
# directory; the `ingestion` package imported below is resolved from there.
INGESTION_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))

# Prepend it to the import search path, but only when it is not already
# present, so re-running this cell does not stack duplicate sys.path entries.
if INGESTION_PATH not in sys.path:
    sys.path.insert(0, INGESTION_PATH)

from ingestion import fetch
from ingestion import config



In [3]:
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path

import geopandas as gpd
from shapely.geometry import Point

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, date

## Load Data

In [4]:
# Root directory of the landed 311 service-request parquet files.
data_path = Path("../data/landing/311-service-requests")

# Recursive glob matching every parquet file below that root.
parquet_glob = data_path / "**/*.parquet"

# Lazy scan: nothing is read until the query is collected.
# hive_partitioning=True lets polars parse partition directories in the path;
# presumably that is where the `year` / `month` columns selected later come
# from (verify against the landing layout).
lf = pl.scan_parquet(str(parquet_glob), hive_partitioning=True)

## Subset Data to DOHMH

In [5]:
# Columns carried forward from the raw 311 data, in output order.
DOHMH_COLUMNS = [
    "due_date",
    "landmark",
    "intersection_street_1",
    "intersection_street_2",
    "facility_type",
    "cross_street_2",
    "cross_street_1",
    "bbl",
    "location_type",
    "street_name",
    "incident_address",
    "address_type",
    "longitude",
    "latitude",
    "x_coordinate_state_plane",
    "y_coordinate_state_plane",
    "city",
    "incident_zip",
    "resolution_description",
    "closed_date",
    "resolution_action_updated_date",
    "descriptor",
    "community_board",
    "park_borough",
    "borough",
    "unique_key",
    "created_date",
    "agency_name",
    "agency",
    "complaint_type",
    "status",
    "open_data_channel_type",
    "park_facility_name",
    "year",
    "month",
]

# Keep only requests whose agency is DOHMH, project onto the columns above,
# and materialize the lazy query into an in-memory polars DataFrame.
is_dohmh = pl.col("agency") == "DOHMH"
dohmh_data_only = lf.filter(is_dohmh).select(DOHMH_COLUMNS).collect()

# pandas copy of the subset, used by the geopandas / merge steps below.
df_orig = dohmh_data_only.to_pandas()

## Merge Census Data

In [6]:
# Population table written under <LOCAL_OUTPUT_DIR>/acs_population/.
data_path = os.path.abspath(os.path.join(config.LOCAL_OUTPUT_DIR, "acs_population", "combined_population_data.csv"))

# Parse GEOID as text at read time instead of casting afterwards: a numeric
# parse followed by astype(str) drops leading zeros and, if any value is
# missing, turns every id into a float-looking string such as "360610101001.0",
# which would silently break the GEOID join against the block-group layer.
df_pop = pd.read_csv(data_path, dtype={'GEOID': str})


In [7]:
# Block-group boundary layer stored in the `resources` folder under
# INGESTION_PATH (the name suggests TIGER/Line 2022 for state FIPS 36 —
# confirm against the data source).
resources_dir = os.path.join(INGESTION_PATH, 'resources')
bg_shapefile = os.path.abspath(os.path.join(resources_dir, "tl_2022_36_bg"))

# Read the layer into a GeoDataFrame.
gdf_bg = gpd.read_file(bg_shapefile)

In [8]:
# Build the point geometries from the lon/lat columns in one vectorized call
# instead of constructing a shapely Point per row in a Python-level loop.
geometry = gpd.points_from_xy(df_orig.longitude, df_orig.latitude)
gdf_orig = gpd.GeoDataFrame(df_orig, geometry=geometry, crs="EPSG:4326")

# Reproject the block groups so both layers share the same CRS before joining.
gdf_bg = gdf_bg.to_crs("EPSG:4326")

# Left spatial join: every request row is kept and picks up the GEOID of the
# block group it falls in (NaN when no polygon matches). sjoin also adds an
# `index_right` column holding the matched block-group row index.
gdf_orig_bg = gpd.sjoin(gdf_orig, gdf_bg[['GEOID', 'geometry']], how="left")

# Attach population by block group and year; unmatched rows keep NaN.
# NOTE(review): the rendered preview shows empty `population` for the 2010 and
# 2025 rows — confirm the population table covers those years.
df_orig_merged = gdf_orig_bg.merge(df_pop, on=['GEOID', 'year'], how='left')


In [9]:
# Show the census-merged frame as the cell output.
df_orig_merged

Unnamed: 0,due_date,landmark,intersection_street_1,intersection_street_2,facility_type,cross_street_2,cross_street_1,bbl,location_type,street_name,...,complaint_type,status,open_data_channel_type,park_facility_name,year,month,geometry,index_right,GEOID,population
0,2010-02-07 08:20:22,,,,,WEST 33 STREET,WEST 32 STREET,1007810002,Restaurant/Bar/Deli/Bakery,PENN PLAZA,...,Food Establishment,Pending,PHONE,Unspecified,2010,1,POINT (-73.99135 40.75002),3930.0,360610101001,
1,2010-01-31 08:29:58,,,,,THROOP AVENUE,TOMPKINS AVENUE,3017760036,3+ Family Apt. Building,PULASKI STREET,...,Rodent,Pending,PHONE,Unspecified,2010,1,POINT (-73.94377 40.69299),3090.0,360470261002,
2,2010-02-07 09:59:50,,,,,WARWICK STREET,JEROME STREET,3040920004,Building (Non-Residential),NEW LOTS AVENUE,...,Non-Residential Heat,Assigned,PHONE,Unspecified,2010,1,POINT (-73.88422 40.66573),3057.0,360471124001,
3,2010-01-15 10:35:38,,BROADWAY,WEST 108 STREET,,,,,Restaurant/Bar/Deli/Bakery,,...,Food Poisoning,Pending,ONLINE,Unspecified,2010,1,POINT (-73.96757 40.80293),5435.0,360610195003,
4,2010-01-31 10:43:52,,,,,20 AVENUE,19 AVENUE,3054620071,1-2 Family Dwelling,51 STREET,...,Unsanitary Animal Pvt Property,Closed,PHONE,Unspecified,2010,1,POINT (-73.97901 40.6253),11414.0,360470464001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030032,NaT,EVERGREEN AVENUE,MADISON STREET,PUTNAM AVENUE,,PUTNAM AVENUE,MADISON STREET,3033680007,Residential Building,EVERGREEN AVENUE,...,Smoking or Vaping,In Progress,ONLINE,Unspecified,2025,9,POINT (-73.91667 40.69096),3017.0,360470415002,
1030033,NaT,GRAND CONCOURSE,EAST 153 STREET,EAST 156 STREET,,EAST 156 STREET,EAST 153 STREET,2024587501,Residential Building,GRAND CONCOURSE,...,Smoking or Vaping,In Progress,ONLINE,Unspecified,2025,9,POINT (-73.92514 40.82196),2120.0,360050063014,
1030034,NaT,WEST 35 STREET,7 AVENUE,8 AVENUE,,8 AVENUE,7 AVENUE,1007840054,Commercial Building,WEST 35 STREET,...,Smoking or Vaping,In Progress,PHONE,Unspecified,2025,9,POINT (-73.99093 40.75198),2823.0,360610109001,
1030035,NaT,WEST 41 STREET,BROADWAY,7 AVENUE,,7 AVENUE,BROADWAY,,Mobile Food Vendor,WEST 41 STREET,...,Mobile Food Vendor,In Progress,MOBILE,Unspecified,2025,9,POINT (-73.98732 40.75533),3975.0,360610113001,


## Merge Weather Data

In [24]:
# Weather table written under <LOCAL_OUTPUT_DIR>/noaa-nclimgrid-daily/.
data_path = os.path.abspath(os.path.join(config.LOCAL_OUTPUT_DIR, "noaa-nclimgrid-daily", "nyc_fips_weather_data.csv"))

# Parse `fips` as text at read time instead of casting afterwards: a numeric
# parse followed by astype(str) drops leading zeros and renders the codes as
# "36061.0"-style strings whenever a value is missing, which would break the
# string join on `fips` below.
df_weather = pd.read_csv(data_path, dtype={'fips': str})

In [None]:
# Weather measures attached to each request.
WEATHER_COLS = ['tmax', 'tmin', 'tavg', 'prcp']

# Start from a frame without any columns added by a previous run of this cell,
# so re-executing it does not produce suffixed duplicates (tmax_x / tmax_y).
requests_base = df_orig_merged.drop(columns=['fips', *WEATHER_COLS], errors='ignore')

# The first five characters of GEOID are used as the `fips` join key. Rows
# without a GEOID (no spatial match) keep a missing key instead of the literal
# string 'nan'.
geoid = requests_base['GEOID']
requests_base = requests_base.assign(
    fips=geoid.astype(str).str[:5].where(geoid.notna())
)

# Left join on fips/year/month. validate='many_to_one' makes the merge raise
# if the weather table has more than one row per key, rather than silently
# multiplying request rows.
df_orig_merged = requests_base.merge(
    df_weather[['fips', 'year', 'month', *WEATHER_COLS]],
    on=['fips', 'year', 'month'],
    how='left',
    validate='many_to_one',
)

In [None]:
# Show the frame again after the weather merge as the cell output.
df_orig_merged