# Step 5: Feature Engineering for NYC 311 Modeling

This notebook demonstrates the complete feature engineering pipeline for three modeling tracks:
1. **Forecast** - Time-series forecasting of ticket arrivals
2. **Triage** - Ticket prioritization at creation time
3. **Duration** - Survival modeling for time-to-close

All features are **leakage-safe** and use **H3-based spatial grouping**.


In [1]:
# --- Imports: standard library, then third-party ---
import os
import sys
from importlib import reload
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the parent of the notebook's working directory importable so that the
# project-local `src` package resolves. The membership check keeps repeated
# executions of this cell from stacking duplicate entries on sys.path.
PACKAGE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PACKAGE_PATH not in sys.path:
    sys.path.insert(0, PACKAGE_PATH)

# --- Project-local modules (must come after the sys.path setup above) ---
from src import preprocessing
from src import features
from src import utils

# --- Display configuration ---
pd.set_option('display.max_columns', 50)
sns.set_style('whitegrid')

## Usage Instructions

This notebook uses the feature engineering module from `src/features.py`.

To run this notebook:
1. Ensure you have data in `data/landing/311-service-requests/`
2. Run `pip install -r requirements.txt` to install dependencies
3. Execute cells sequentially from a fresh kernel (Restart Kernel → Run All)

For detailed documentation, see `src/FEATURE_ENGINEERING_README.md`


In [13]:
# Load and preprocess the DOHMH data, then merge census and weather data
# (the stages and resulting shapes are reported in the log output below).
# The result is kept as `df_orig` so later cells can start from it again.
df_orig = preprocessing.preprocess_and_merge_external_data()

Loading DOHMH data...
Data Shape: (1029643, 27)
Preprocessing DOHMH data...
Data Shape: (614831, 44)
Merging census data...
Data Shape: (614831, 48)
Merging weather data...
Data Shape: (601913, 59)

Final Data Shape: (601913, 59)


In [15]:
# Work on a copy: later cells reassign `df`, and keeping `df_orig` untouched
# lets them be re-run from here without repeating the preprocessing step.
df = df_orig.copy()

In [16]:
# Re-import the project modules so that edits made to src/features.py and
# src/utils.py are picked up without restarting the kernel.
# NOTE(review): if `features` imports names from `utils`, `utils` should be
# reloaded first — confirm against src/features.py.
# The trailing ';' suppresses the echoed module repr, which would otherwise
# print an absolute local filesystem path into the saved notebook.
reload(features)
reload(utils);


<module 'src.utils' from 'c:\\Users\\gorav\\GitHub\\nyc-311-service-requests\\src\\utils.py'>

In [17]:
# H3 resolution passed to `add_h3_keys` when attaching spatial keys.
H3_RESOLUTION = 8

# NOTE(review): `df` is overwritten with the keyed frame; if this cell is
# re-executed, re-run the `df = df_orig.copy()` cell first — confirm whether
# `add_h3_keys` is safe to apply twice.
df = features.add_h3_keys(
    df,
    lat='latitude',
    lon='longitude',
    res=H3_RESOLUTION,
)

In [18]:
# Build the panel for the forecast track from the H3-keyed records.
forecast_panel = features.build_forecast_panel(df)

  panel = panel.groupby(['hex', 'complaint_family'], group_keys=False).apply(


In [19]:
# Preview the forecast panel ordered by day, then hex, then complaint family.
forecast_panel.sort_values(by=['day', 'hex', 'complaint_family'])

Unnamed: 0,hex,complaint_family,day,y,dow,month,lag1,lag7,roll7,roll28,momentum,days_since_last,tavg,prcp,heating_degree,cooling_degree,rain_3d,rain_7d,log_pop,nbr_roll7,nbr_roll28
19509,882a10013dfffff,vector_control,2010-01-01,1.0,4,1,,,,,,,33.206,0.457480,31.794,0.000,0.457480,0.457480,8.854522,0.0,0.0
57976,882a100889fffff,food_safety,2010-01-01,1.0,4,1,,,,,,,33.170,0.345276,31.830,0.000,0.345276,0.345276,10.712883,0.0,0.0
81552,882a1008c3fffff,vector_control,2010-01-01,1.0,4,1,,,,,,,33.170,0.345276,31.830,0.000,0.345276,0.345276,10.553832,0.0,0.0
115570,882a100a91fffff,animal_control,2010-01-01,1.0,4,1,,,,,,,33.206,0.457480,31.794,0.000,0.457480,0.457480,10.294211,0.0,0.0
121686,882a100a9dfffff,vector_control,2010-01-01,1.0,4,1,,,,,,,33.206,0.457480,31.794,0.000,0.457480,0.457480,10.424659,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516366,882a107739fffff,food_safety,2025-07-31,1.0,3,7,0.0,2.0,4.0,13.0,0.307692,2.0,87.314,0.000000,0.000,22.314,0.000000,0.417323,10.414813,6.0,19.0
517450,882a10773dfffff,animal_control,2025-07-31,1.0,3,7,2.0,0.0,5.0,6.0,0.833333,1.0,87.314,0.000000,0.000,22.314,0.000000,0.417323,10.238709,5.0,6.0
517805,882a10773dfffff,food_safety,2025-07-31,1.0,3,7,0.0,0.0,2.0,6.0,0.333333,5.0,87.314,0.000000,0.000,22.314,0.000000,0.417323,10.238709,6.0,19.0
522625,882a107749fffff,food_safety,2025-07-31,1.0,3,7,0.0,0.0,3.0,7.0,0.428571,2.0,87.314,0.000000,0.000,22.314,0.000000,0.417323,10.287593,5.0,14.0


In [20]:
# Build the triage-track outputs. The call returns three objects, unpacked
# here as a feature table, a TF-IDF matrix and the vectorizer.
# NOTE(review): these roles are taken from the variable names — confirm
# against src/features.py.
triage_features, tfidf_matrix, vectorizer = features.build_triage_features(df)

  history_panel = history_panel.groupby(['hex', 'complaint_family'], group_keys=False).apply(compute_history)
  site_panel = site_panel.groupby('site_key', group_keys=False).apply(compute_site_history)


In [21]:
# Build the labels for the duration (time-to-close) survival track.
duration_labels = features.build_duration_survival_labels(df)


In [22]:
# Build the duration-track features; the triage features computed earlier are
# passed in as a second input alongside the H3-keyed records.
duration_features = features.build_duration_features(df, triage_features)

  df = df.groupby(group_cols, group_keys=False).apply(rolling_count)
  df = df.groupby(group_cols, group_keys=False).apply(rolling_count)


In [23]:
# Display the resulting duration feature table.
duration_features

Unnamed: 0,unique_key,hour,dow,month,is_created_at_midnight,is_weekend,due_gap_hours,due_crosses_weekend,tavg,prcp,heat_flag,freeze_flag,geo_family_roll7,geo_family_roll28,days_since_last_geo_family,repeat_site_14d,repeat_site_28d,complaint_family_air_smoke_mold,complaint_family_animal_control,complaint_family_food_safety,complaint_family_vector_control,open_data_channel_type_MOBILE,open_data_channel_type_ONLINE,open_data_channel_type_OTHER,open_data_channel_type_PHONE,open_data_channel_type_UNKNOWN,location_type_Restaurant/Bar/Deli/Bakery,location_type_3+ Family Apt. Building,location_type_1-2 Family Dwelling,location_type_3+ Family Apartment Building,location_type_Other (Explain Below),location_type_Commercial Building,location_type_Residential Building,location_type_3+ Family Mixed Use Building,location_type_Mobile Food Vendor,location_type_Public/Unfenced Area,borough_BRONX,borough_BROOKLYN,borough_MANHATTAN,borough_QUEENS,borough_STATEN ISLAND,borough_Unspecified,borough__missing,facility_type_N/A,facility_type__missing,intake_6h,intake_24h,open_7d_geo_family
0,15633315,21,4,1,False,0,720.0,1,33.170,0.345276,0,1,1.0,1.0,999.0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,0,0,0
1,15633066,8,4,1,False,0,888.0,1,33.170,0.345276,0,1,1.0,1.0,999.0,0.0,0.0,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,0,0,0
2,15634668,13,4,1,False,0,336.0,1,33.170,0.345276,0,1,2.0,2.0,999.0,0.0,0.0,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,2,2,0
3,15634638,23,4,1,False,0,1440.0,1,33.170,0.345276,0,1,1.0,1.0,999.0,0.0,0.0,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,1,6,0
4,15633386,17,4,1,False,0,1440.0,1,33.170,0.345276,0,1,1.0,1.0,999.0,0.0,0.0,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,2,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601908,65715962,9,3,7,False,0,0.0,0,87.008,0.000000,1,0,8.0,31.0,767.0,87.0,188.0,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,0,1,0
601909,65722718,12,3,7,False,0,0.0,0,86.648,0.000000,1,0,8.0,34.0,1.0,87.0,188.0,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,3,15,0
601910,65717056,9,3,7,False,0,0.0,0,87.260,0.000000,1,0,10.0,33.0,5.0,87.0,188.0,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,2,18,0
601911,65723886,11,3,7,False,0,0.0,0,87.314,0.000000,1,0,7.0,29.0,43.0,87.0,188.0,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,0,0,0
