# Load Data



### Set env variables

Assumes that `SOCRATA_APP_TOKEN`, `SOCRATA_API_KEY_ID`, and `SOCRATA_API_KEY_SECRET` are set in `.env`.

In [1]:
from dotenv import load_dotenv

# Read `.env` so the Socrata credentials named in the note above are available
# as environment variables to the ingestion code.
load_dotenv()


True

### Load Packages

In [2]:
import os
import sys

# The `ingestion` package lives in the sibling `src` directory of the current
# working directory; put that directory first on the import path so the
# imports below resolve to it.
_src_dir = os.path.join(os.getcwd(), os.pardir, "src")
INGESTION_PATH = os.path.abspath(_src_dir)
sys.path.insert(0, INGESTION_PATH)

from ingestion import fetch, config



### Full Pull of Data
Takes about 48 minutes

In [None]:
await fetch.fetch_all_data(save = True)

### Incremental Pull of Data

In [None]:
# await fetch.fetch_current_month(save = True)

Fetching current month: 2025-09
Fetching 2025-09 ...
Saved c:\Users\gorav\GitHub\nyc-311-service-requests\data\landing\311-service-requests\year=2025/month=09/part-0000.parquet (280,722 rows)
Current month data updated successfully!


### Function

In [5]:
# def _create_socrata_client(api_endpoint, app_token, username, password):
#     """Create a Socrata client for `api_endpoint` using the given credentials."""
#     client = Socrata(api_endpoint, app_token=app_token, username=username, password=password, timeout = 600)
#     return client


In [6]:
# def _fetch_and_save_month_sync(year: int, month: int):
#     """Runs in a worker thread. Creates its own Socrata client (no sharing)."""
#     # per-thread client
#     # client = Socrata("data.cityofnewyork.us", APP_TOKEN)
#     client = _create_socrata_client(api_endpoint="data.cityofnewyork.us", app_token=APP_TOKEN, username=API_KEY_ID, password=API_KEY_SECRET)

#     start = f"{year}-{month:02d}-01T00:00:00"
#     if month == 12:
#         end = f"{year+1}-01-01T00:00:00"
#     else:
#         end = f"{year}-{month+1:02d}-01T00:00:00"

#     where_clause = f"{date_column} >= '{start}' AND {date_column} < '{end}'"
#     print(f"Fetching {year}-{month:02d} ...")

#     # client.get_all handles paging internally (blocking)
#     results = list(client.get_all(dataset_id, where=where_clause))

#     if results:
#         df = pd.DataFrame.from_records(results)
#         # write to year/month partition
#         file_path = os.path.join(output_dir, f"year={year}/month={month:02d}/part-0000.parquet")
#         os.makedirs(os.path.dirname(file_path), exist_ok=True)
#         # choose your engine; fastparquet avoids some pyarrow quirks in notebooks
#         df.to_parquet(file_path, index=False, engine="fastparquet")
#         print(f"Saved {file_path} ({len(df):,} rows)")

#         # clean up memory
#         del df
#         del results
#         gc.collect()
#     else:
#         print(f"No data for {year}-{month:02d}")

# async def fetch_and_save(year: int, month: int, sem: asyncio.Semaphore):
#     async with sem:
#         # run the sync worker in a thread
#         await asyncio.to_thread(_fetch_and_save_month_sync, year, month)

# async def pull_latest_data():
#     sem = asyncio.Semaphore(MAX_CONCURRENCY)
#     tasks = [fetch_and_save(y, m, sem) for y in years for m in months]
#     await asyncio.gather(*tasks)


### Apply Function

In [7]:
# dataset_id = "erm2-nwe9"
# date_column = "created_date"
# output_dir = "../data/311-service-requests"
# os.makedirs(output_dir, exist_ok=True)

# years = range(2010, 2026)   # 2010 through 2025 (range end is exclusive)
# months = range(1, 13)       # 1..12
# MAX_CONCURRENCY = 20         # tune for your machine / API limits

In [8]:
# await pull_latest_data()

### Load Data

In [7]:
import polars as pl
import numpy as np
from pathlib import Path


## Exploratory Data Analysis with Polars


### Load all parquet files using Polars lazy API


In [9]:
# Root of the landing zone written by the ingestion step.
data_path = Path("../data/landing/311-service-requests")

# Match every parquet file at any depth below the landing root.
parquet_glob = str(data_path / "**/*.parquet")

# Lazy scan: nothing is read until `.collect()` is called. Hive partitioning
# exposes the `year=` / `month=` directory names as columns.
# (An explicit schema, `schema=config.SCHEMA`, is intentionally left disabled.)
lf = pl.scan_parquet(parquet_glob, hive_partitioning=True)

### Basic Information


In [10]:
# Report the dataset's dimensions and the dtype of every column.
print("Dataset Shape:")

# Row count requires a (cheap) collect; the schema is resolved once and reused.
row_count = lf.select(pl.len()).collect().item()
print(f"Rows: {row_count:,}")

schema = lf.collect_schema()
print(f"Columns: {len(schema)}")

print("\nColumn Names and Types:")
for name, dtype in schema.items():
    print(f"  {name}: {dtype}")


Dataset Shape:
Rows: 280,722
Columns: 43

Column Names and Types:
  unique_key: String
  created_date: Datetime(time_unit='ms', time_zone=None)
  closed_date: Datetime(time_unit='ms', time_zone=None)
  agency: String
  agency_name: String
  complaint_type: String
  descriptor: String
  location_type: String
  incident_zip: String
  incident_address: String
  street_name: String
  cross_street_1: String
  cross_street_2: String
  intersection_street_1: String
  intersection_street_2: String
  address_type: String
  city: String
  landmark: String
  facility_type: String
  status: String
  due_date: Datetime(time_unit='ms', time_zone=None)
  resolution_description: String
  resolution_action_updated_date: Datetime(time_unit='ms', time_zone=None)
  community_board: String
  bbl: String
  borough: String
  x_coordinate_state_plane: Float64
  y_coordinate_state_plane: Float64
  open_data_channel_type: String
  park_facility_name: String
  park_borough: String
  vehicle_type: String
  taxi_c

### Preview the Data


In [11]:
# Materialize only the first 10 rows for a quick look at the data.
preview = lf.head(10).collect()
preview


unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,location_type,incident_zip,incident_address,street_name,cross_street_1,cross_street_2,intersection_street_1,intersection_street_2,address_type,city,landmark,facility_type,status,due_date,resolution_description,resolution_action_updated_date,community_board,bbl,borough,x_coordinate_state_plane,y_coordinate_state_plane,open_data_channel_type,park_facility_name,park_borough,vehicle_type,taxi_company_borough,taxi_pick_up_location,bridge_highway_name,bridge_highway_direction,road_ramp,bridge_highway_segment,latitude,longitude,location,year,month
str,datetime[ms],datetime[ms],str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,datetime[ms],str,datetime[ms],str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,f64,f64,str,i64,i64
"""66023011""",2025-09-01 00:00:00,2025-09-02 12:27:00,"""DSNY""","""Department of Sanitation""","""Derelict Vehicles""","""Derelict Vehicles""","""Street""","""11421""","""91-50 90 STREET""","""90 STREET""","""91 AVENUE""","""ATLANTIC AVENUE""",,,"""ADDRESS""","""WOODHAVEN""",,"""DSNY Garage""","""Closed""",,"""The Department of Sanitation (…",2025-09-02 12:00:00,"""09 QUEENS""","""4089830030""","""QUEENS""",1025078.0,189996.0,"""PHONE""","""Unspecified""","""QUEENS""",,,,,,,,40.688078,-73.852782,"""{'latitude': '40.6880777339179…",2025,9
"""66008529""",2025-09-01 00:00:10,2025-09-01 00:26:39,"""NYPD""","""New York City Police Departmen…","""Noise - Vehicle""","""Car/Truck Music""","""Street/Sidewalk""","""11412""","""116 ROAD""","""116 ROAD""","""205 STREET""","""FRANCIS LEWIS BOULEVARD""","""205 STREET""","""FRANCIS LEWIS BOULEVARD""","""BLOCKFACE""",,,,"""Closed""",,"""The Police Department responde…",2025-09-01 00:26:45,"""Unspecified QUEENS""",,"""QUEENS""",,,"""PHONE""","""Unspecified""","""QUEENS""","""Car""",,,,,,,,,,2025,9
"""66003967""",2025-09-01 00:00:11,2025-09-01 03:08:50,"""NYPD""","""New York City Police Departmen…","""Noise - Residential""","""Banging/Pounding""","""Residential Building/House""","""10467""","""3320 KOSSUTH AVENUE""","""KOSSUTH AVENUE""","""EAST 208 STREET""","""EAST 210 STREET""","""EAST 208 STREET""","""EAST 210 STREET""","""ADDRESS""","""BRONX""","""KOSSUTH AVENUE""",,"""Closed""",,"""The Police Department responde…",2025-09-01 03:08:54,"""07 BRONX""","""2033260064""","""BRONX""",1016890.0,259709.0,"""ONLINE""","""Unspecified""","""BRONX""",,,,,,,,40.879455,-73.881968,"""{'latitude': '40.8794550894661…",2025,9
"""66006371""",2025-09-01 00:00:13,2025-09-01 04:22:30,"""NYPD""","""New York City Police Departmen…","""Noise - Street/Sidewalk""","""Loud Music/Party""","""Street/Sidewalk""","""10461""","""1370 BLONDELL AVENUE""","""BLONDELL AVENUE""","""PONTON AVENUE""","""ROBERTS AVENUE""","""PONTON AVENUE""","""ROBERTS AVENUE""","""ADDRESS""","""BRONX""","""BLONDELL AVENUE""",,"""Closed""",,"""The Police Department responde…",2025-09-01 04:22:34,"""11 BRONX""","""2041390106""","""BRONX""",1027821.0,246255.0,"""PHONE""","""Unspecified""","""BRONX""",,,,,,,,40.842481,-73.842527,"""{'latitude': '40.8424807759071…",2025,9
"""66006304""",2025-09-01 00:00:15,2025-09-01 16:40:30,"""NYPD""","""New York City Police Departmen…","""Illegal Parking""","""Blocked Hydrant""","""Street/Sidewalk""","""11370""","""21-38 80 STREET""","""80 STREET""","""21 AVENUE""","""DITMARS BOULEVARD""","""21 AVENUE""","""DITMARS BOULEVARD""","""ADDRESS""","""EAST ELMHURST""","""80 STREET""",,"""Closed""",,"""The Police Department responde…",2025-09-01 16:40:34,"""01 QUEENS""","""4009750023""","""QUEENS""",1014745.0,219840.0,"""ONLINE""","""Unspecified""","""QUEENS""",,,,,,,,40.770034,-73.889906,"""{'latitude': '40.7700337862021…",2025,9
"""66004821""",2025-09-01 00:00:17,2025-09-01 00:30:55,"""NYPD""","""New York City Police Departmen…","""Noise - Street/Sidewalk""","""Loud Music/Party""","""Street/Sidewalk""","""11102""","""1-05 ASTORIA BOULEVARD""","""ASTORIA BOULEVARD""","""1 STREET""","""DEAD END""","""1 STREET""","""DEAD END""","""ADDRESS""","""ASTORIA""","""ASTORIA BOULEVARD""",,"""Closed""",,"""The Police Department responde…",2025-09-01 00:31:00,"""01 QUEENS""","""4004900101""","""QUEENS""",1001827.0,221234.0,"""MOBILE""","""Unspecified""","""QUEENS""",,,,,,,,40.773895,-73.936539,"""{'latitude': '40.7738950751967…",2025,9
"""66011673""",2025-09-01 00:00:18,2025-09-01 01:19:43,"""NYPD""","""New York City Police Departmen…","""Illegal Parking""","""Double Parked Blocking Traffic""","""Street/Sidewalk""","""10451""","""825 GERARD AVENUE""","""GERARD AVENUE""","""EAST 157 STREET""","""EAST 158 STREET""","""EAST 157 STREET""","""EAST 158 STREET""","""ADDRESS""","""BRONX""","""GERARD AVENUE""",,"""Closed""",,"""The Police Department responde…",2025-09-01 01:19:46,"""04 BRONX""","""2024830015""","""BRONX""",1004754.0,240171.0,"""MOBILE""","""Unspecified""","""BRONX""","""Car""",,,,,,,40.825866,-73.925913,"""{'latitude': '40.8258656451982…",2025,9
"""66005759""",2025-09-01 00:00:21,2025-09-01 01:12:14,"""NYPD""","""New York City Police Departmen…","""Noise - Residential""","""Banging/Pounding""","""Residential Building/House""","""10454""","""518 EAST 138 STREET""","""EAST 138 STREET""","""BROOK AVENUE""","""ST ANNS AVENUE""","""BROOK AVENUE""","""ST ANNS AVENUE""","""ADDRESS""","""BRONX""","""EAST 138 STREET""",,"""Closed""",,"""The Police Department responde…",2025-09-01 01:12:18,"""01 BRONX""","""2022650029""","""BRONX""",1006740.0,233427.0,"""ONLINE""","""Unspecified""","""BRONX""",,,,,,,,40.80735,-73.91876,"""{'latitude': '40.8073504307348…",2025,9
"""66008008""",2025-09-01 00:00:21,2025-09-01 01:03:26,"""NYPD""","""New York City Police Departmen…","""Illegal Parking""","""Posted Parking Sign Violation""","""Street/Sidewalk""","""10039""","""348 WEST 145 STREET""","""WEST 145 STREET""","""EDGECOMBE AVENUE""","""ST NICHOLAS AVENUE""","""EDGECOMBE AVENUE""","""ST NICHOLAS AVENUE""","""ADDRESS""","""NEW YORK""","""WEST 145 STREET""",,"""Closed""",,"""The Police Department responde…",2025-09-01 01:03:31,"""09 MANHATTAN""","""1020510056""","""MANHATTAN""",999711.0,239400.0,"""MOBILE""","""Unspecified""","""MANHATTAN""",,,,,,,,40.82376,-73.944137,"""{'latitude': '40.8237597367640…",2025,9
"""66007110""",2025-09-01 00:00:26,2025-09-01 01:05:30,"""NYPD""","""New York City Police Departmen…","""Noise - Residential""","""Loud Music/Party""","""Residential Building/House""","""11229""","""2148 EAST 13 STREET""","""EAST 13 STREET""","""AVENUE U""","""AVENUE V""","""AVENUE U""","""AVENUE V""","""ADDRESS""","""BROOKLYN""","""EAST 13 STREET""",,"""Closed""",,"""The Police Department responde…",2025-09-01 01:05:33,"""15 BROOKLYN""","""3073450126""","""BROOKLYN""",995895.0,156995.0,"""MOBILE""","""Unspecified""","""BROOKLYN""",,,,,,,,40.597584,-73.958067,"""{'latitude': '40.5975836087298…",2025,9


### Missing Values Analysis


In [None]:
# The total row count is the denominator for the null percentages.
total_rows = lf.select(pl.len()).collect().item()

# Single-row frame holding the number of nulls in each column.
column_names = lf.collect_schema().names()
null_counts = lf.select(
    [pl.col(name).is_null().sum().alias(name) for name in column_names]
).collect()

# Reshape the wide single row into one (Column, Null_Count) row per column,
# add the percentage of rows that are null, and list the sparsest columns first.
null_percentage = (
    (pl.col("Null_Count") / total_rows * 100).round(2).alias("Null_Percentage")
)
null_summary = (
    pl.DataFrame(
        {
            "Column": null_counts.columns,
            "Null_Count": null_counts.row(0),
        }
    )
    .with_columns([null_percentage])
    .sort("Null_Count", descending=True)
)

print(f"Missing Values Summary (Total Rows: {total_rows:,})")
null_summary


Missing Values Summary (Total Rows: 280,722)


Column,Null_Count,Null_Percentage
str,i64,f64
"""vehicle_type""",267183,95.18


In [None]:
# Look up the null statistics for a single column of interest.
is_vehicle_type = pl.col("Column") == "vehicle_type"
null_summary.filter(is_vehicle_type)


### Top Complaint Types


In [None]:
# Top complaint types.
# Uses `lf`, the LazyFrame produced by `pl.scan_parquet`; the previous `df`
# was never defined in this notebook and raised NameError on a fresh kernel.
top_complaints = (
    lf.group_by("complaint_type")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .limit(20)
    .collect()
)

print("Top 20 Complaint Types:")
top_complaints


### Temporal Analysis


In [None]:
# Requests over time (by year-month).
# Uses `lf`, the scanned LazyFrame (`df` was never defined). `created_date` is
# already a Datetime column in the scanned schema, so the `.dt` accessors are
# applied directly; parsing it with `.str.to_datetime()` fails on a non-string
# column.
temporal = (
    lf.with_columns(
        # These overwrite the hive-partition `year` / `month` columns with
        # values derived from the request's own creation timestamp.
        pl.col("created_date").dt.year().alias("year"),
        pl.col("created_date").dt.month().alias("month"),
    )
    .group_by(["year", "month"])
    .agg(pl.len().alias("num_requests"))
    .sort(["year", "month"])
    .collect()
)

print("Service Requests Over Time:")
temporal


### Geographic Distribution


In [None]:
# Requests by borough.
# Uses `lf`, the LazyFrame produced by `pl.scan_parquet`; the previous `df`
# was never defined in this notebook and raised NameError on a fresh kernel.
by_borough = (
    lf.group_by("borough")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .collect()
)

print("Service Requests by Borough:")
by_borough


### Agency Analysis


In [None]:
# Top agencies handling requests.
# Uses `lf`, the LazyFrame produced by `pl.scan_parquet`; the previous `df`
# was never defined in this notebook and raised NameError on a fresh kernel.
top_agencies = (
    lf.group_by("agency")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .limit(15)
    .collect()
)

print("Top 15 Agencies:")
top_agencies


### Status Distribution


In [None]:
# Request status distribution.
# Uses `lf`, the LazyFrame produced by `pl.scan_parquet`; the previous `df`
# was never defined in this notebook and raised NameError on a fresh kernel.
status_dist = (
    lf.group_by("status")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .collect()
)

print("Status Distribution:")
status_dist


### Response Time Analysis


In [None]:
# Calculate response times where both created_date and closed_date exist.
# Uses `lf`, the scanned LazyFrame (`df` was never defined). Both date columns
# are already Datetime in the scanned schema, so they are subtracted directly
# rather than parsed with `.str.to_datetime()`, which fails on non-string
# columns.
#
# The duration is converted via seconds so that sub-hour response times keep
# their fractional part; `dt.total_hours()` yields whole hours only, which
# would collapse every request closed within the hour to 0.
response_hours_expr = (
    (pl.col("closed_date") - pl.col("created_date")).dt.total_seconds() / 3600
).alias("response_hours")

response_times = (
    lf.filter(
        pl.col("closed_date").is_not_null() & pl.col("created_date").is_not_null()
    )
    .with_columns(response_hours_expr)
    # Keep non-negative durations only: drops records whose close timestamp
    # precedes their creation timestamp.
    .filter(pl.col("response_hours") >= 0)
    .select(
        pl.col("response_hours").mean().alias("mean_hours"),
        pl.col("response_hours").median().alias("median_hours"),
        pl.col("response_hours").std().alias("std_hours"),
        pl.col("response_hours").min().alias("min_hours"),
        pl.col("response_hours").max().alias("max_hours"),
        pl.col("response_hours").quantile(0.25).alias("q25_hours"),
        pl.col("response_hours").quantile(0.75).alias("q75_hours"),
        pl.col("response_hours").quantile(0.90).alias("q90_hours"),
        pl.len().alias("count_with_close_date"),
    )
    .collect()
)

print("Response Time Statistics (in hours):")
response_times


### Year-over-Year Growth


In [None]:
# Yearly trends.
# Uses `lf`, the scanned LazyFrame (`df` was never defined). `created_date` is
# already a Datetime column in the scanned schema, so `.dt.year()` is applied
# directly instead of first parsing it with `.str.to_datetime()`.
yearly = (
    lf.with_columns(
        # Overwrites the hive-partition `year` column with the year taken from
        # the request's own creation timestamp.
        pl.col("created_date").dt.year().alias("year")
    )
    .group_by("year")
    .agg(pl.len().alias("num_requests"))
    .sort("year")
    .collect()
)

print("Year-over-Year Requests:")
yearly


### Top Complaint Types by Borough


In [None]:
# Top complaints by borough.
# Uses `lf`, the LazyFrame produced by `pl.scan_parquet`; the previous `df`
# was never defined in this notebook and raised NameError on a fresh kernel.
complaints_by_borough = (
    lf.group_by(["borough", "complaint_type"])
    .agg(pl.len().alias("count"))
    # Boroughs ascending, and within each borough the most frequent complaint first.
    .sort(["borough", "count"], descending=[False, True])
    .collect()
)

# Get top 5 complaints per borough. Rows are already ordered by count within
# each borough, so the first 5 rows of each group are its top 5.
# `maintain_order=True` keeps the boroughs in the sorted order established
# above; without it the order of the groups in the result is not guaranteed.
top_per_borough = complaints_by_borough.group_by(
    "borough", maintain_order=True
).head(5)

print("Top 5 Complaint Types per Borough:")
top_per_borough
