# Step 5: Feature Engineering for NYC 311 Modeling

This notebook demonstrates the complete feature engineering pipeline for three modeling tracks:
1. **Forecast** - Time-series forecasting of ticket arrivals
2. **Triage** - Ticket prioritization at creation time
3. **Duration** - Survival modeling for time-to-close

All features are **leakage-safe** and use **H3-based spatial grouping**.


In [3]:
import os
import sys

# Put the parent of the current working directory at the front of sys.path so the
# project-local `src` and `models` packages imported below can be resolved.
# NOTE(review): assumes the kernel's working directory is one level below the
# project root — confirm before running from elsewhere.
PACKAGE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, PACKAGE_PATH)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Project-local modules; these rely on the sys.path entry added above.
from src import preprocessing
from src import features
from src import utils
from src import config
from models import forecast




# Display configuration: show up to 50 columns when rendering DataFrames and use
# seaborn's 'whitegrid' style for plots.
pd.set_option('display.max_columns', 50)
sns.set_style('whitegrid')
# `reload` is used further down to re-import `forecast` after the module is edited.
from importlib import reload

## Usage Instructions

This notebook uses the feature engineering module from `src/features.py`.

To run this notebook:
1. Ensure you have data in `data/landing/311-service-requests/`
2. Run `pip install -r requirements.txt` to install dependencies
3. Execute cells sequentially

For detailed documentation, see `src/FEATURE_ENGINEERING_README.md`


In [4]:
# Build the base modeling frame with the project's preprocessing helper. Per the log
# printed by this cell, it loads DOHMH data, preprocesses it, and merges census and
# weather data.
df_orig = preprocessing.preprocess_and_merge_external_data()

Loading DOHMH data...
Data Shape: (1029643, 27)
Preprocessing DOHMH data...
Data Shape: (614831, 44)
Merging census data...
Data Shape: (614831, 48)
Merging weather data...
Data Shape: (601913, 59)

Final Data Shape: (601913, 59)


In [5]:
# Work on a copy so `df_orig` stays untouched and the cells below can be re-run
# without repeating the preprocessing step.
df = df_orig.copy()

In [6]:
# Attach H3 keys derived from the latitude/longitude columns (res=8, presumably the
# H3 resolution — TODO confirm against features.add_h3_keys).
df = features.add_h3_keys(df, lat='latitude', lon='longitude', res=8)
# Build the forecasting panel from the keyed frame.
# NOTE(review): the warning captured below this cell points at a
# groupby(['hex', 'complaint_family']).apply(...) call inside the panel builder.
forecast_panel = features.build_forecast_panel(df)



  panel = panel.groupby(['hex', 'complaint_family'], group_keys=False).apply(


### Explore Feature Interactions

In [22]:
# Re-import models.forecast so edits to the module are picked up without
# restarting the kernel.
reload(forecast)
# Work on a copy so `forecast_panel` itself is left unmodified by later cells.
panel = forecast_panel.copy()

In [None]:
def transform_data(X):
    """
    Apply transformation needed to create features for fitting

    The input frame is left untouched: every derived column is added to a copy,
    so calling this from a sklearn FunctionTransformer does not change the
    caller's training frame as a side effect.

        Parameters:
            X: DataFrame of Features. Must contain "renovyear" and "yearbuilt";
               "size" is also read when "totalassessedvalue" is present.
        Returns:
            X: copy of X with calculated features
    """
    # Imported locally because nothing in the visible notebook imports `datetime`.
    import datetime

    # Copy first so the columns added below never leak into the caller's frame.
    X = X.copy()

    # Read the clock once so both age columns share the same reference year.
    current_year = datetime.datetime.now().year

    # Use renovyear unless it is 0, in which case fall back to yearbuilt.
    X["effective_year_built"] = np.where(X["renovyear"] != 0, X["renovyear"], X["yearbuilt"])
    X["property_age"] = current_year - X["yearbuilt"]
    X["effective_property_age"] = current_year - X["effective_year_built"]

    if "totalassessedvalue" in X.columns:
        X["totalassessedvalue_per_size"] = X["totalassessedvalue"] / X["size"]

    if set(config.CENSUSSTATSINPUTCOLS).issubset(set(config.INPUTCOLUMNS)):
        # create census stats features
        # NOTE(review): create_census_stat_features is neither defined nor imported
        # in the visible notebook — confirm where it is expected to come from.
        X = create_census_stat_features(X)

    return X


def select_features(X, feature_list):
    """
    Return an independent copy of X restricted to the requested columns

        Parameters:
            X: DataFrame produced by the feature transformation step
            feature_list: list of column names to keep, in the desired order
        Returns:
            DataFrame holding only the columns in feature_list
    """
    subset = X[feature_list]
    return subset.copy()


def filter_data(X, y):
    """
    Drop every row of X that contains a null, and the matching entries of y

        Parameters:
            X: DataFrame of features
            y: Series of target values
        Returns:
            X_transformed: rows of X with no null in any column
            y_transformed: entries of y for those same rows
    """
    # A row is kept only when all of its values are non-null.
    complete_rows = pd.notnull(X).all(axis=1)
    X_transformed, y_transformed = X[complete_rows], y[complete_rows]
    print("X shape post-filtering:", X_transformed.shape)
    return X_transformed, y_transformed


def fit_pipeline(
    df_input,
    regressor,
    target_column,
    input_columns,
    numerical_columns,
    categorical_columns,
    test_size=0.2,
    random_state=42,
):
    """
    Fit pipeline on data given features, target and model

    Rows with a null in any input column are dropped before the train/test
    split. The pipeline runs transform_data, then select_features, then a
    ColumnTransformer (one-hot encoding for categoricals, standard scaling for
    numericals), and finally the supplied regressor.

    NOTE(review): train_test_split, ColumnTransformer, OneHotEncoder,
    StandardScaler, Pipeline and FunctionTransformer are not imported anywhere
    in the visible notebook; they must be in scope before this is called.

        Parameters:
            df_input: input DataFrame
            regressor: regression model compatible with sklearn
            target_column: string of target name
            input_columns: list of columns needed for transform_data and feature_list
            numerical_columns: list of numerical columns to standardize
            categorical_columns: list of categorical columns to OHE
            test_size: share of rows held out for testing (default 0.2)
            random_state: seed passed to the train/test split (default 42)
        Returns:
            pipeline: sklearn pipeline object
            X_train: DataFrame of features for training
            X_test: DataFrame of features for testing
            y_train: Series of target for training
            y_test: Series of target for testing
    """
    # list(...) lets callers pass tuples or other sequences as well as lists.
    feature_list = list(numerical_columns) + list(categorical_columns)
    X = df_input[input_columns].copy()
    print("X shape pre-filtering:", X.shape)
    y = df_input[target_column].copy()
    X, y = filter_data(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print("X training shape:", X_train.shape)

    preprocessor = ColumnTransformer(
        transformers=[
            (
                "one_hot_encoder",
                OneHotEncoder(handle_unknown="ignore", drop="first"),
                categorical_columns,
            ),
            ("scaler", StandardScaler(), numerical_columns),
        ]
    )

    pipeline = Pipeline(
        [
            ("transform_data", FunctionTransformer(transform_data)),
            (
                "select_features",
                FunctionTransformer(select_features, kw_args={"feature_list": feature_list}),
            ),
            ("preprocessor", preprocessor),
            ("regressor", regressor),
        ],
        memory=None,
    )
    pipeline.fit(X_train, y_train)

    return pipeline, X_train, X_test, y_train, y_test

In [23]:
# Train forecast models for every complaint family listed in config.COMPLAINT_FAMILIES.
# Per the log printed by this cell, validation metrics (RMSE, MAE, Poisson deviance)
# are reported for each family and horizon.
bundles = forecast.train_all_families(
    panel, 
    families=config.COMPLAINT_FAMILIES,
    horizons=range(1, 8),  # horizons 1 through 7 (days ahead)
    val_days=30  # Last 30 days for validation
)

Training forecast for vector_control: 221962 rows
  h=1: RMSE=0.229, MAE=0.128, Poisson Dev=0.038
  h=2: RMSE=0.321, MAE=0.196, Poisson Dev=0.071
  h=3: RMSE=0.330, MAE=0.209, Poisson Dev=0.075
  h=4: RMSE=0.480, MAE=0.384, Poisson Dev=0.152
Training forecast for food_safety: 162987 rows
  h=1: RMSE=0.691, MAE=0.386, Poisson Dev=0.211
  h=2: RMSE=0.775, MAE=0.435, Poisson Dev=0.251
  h=3: RMSE=0.802, MAE=0.480, Poisson Dev=0.266
  h=4: RMSE=0.855, MAE=0.527, Poisson Dev=0.309
  h=5: RMSE=0.953, MAE=0.579, Poisson Dev=0.362
  h=6: RMSE=1.022, MAE=0.638, Poisson Dev=0.401
  h=7: RMSE=1.140, MAE=0.720, Poisson Dev=0.488
Training forecast for air_smoke_mold: 111981 rows
  h=1: RMSE=0.350, MAE=0.160, Poisson Dev=0.083
  h=2: RMSE=0.419, MAE=0.200, Poisson Dev=0.115
  h=3: RMSE=0.406, MAE=0.208, Poisson Dev=0.113
  h=4: RMSE=0.490, MAE=0.246, Poisson Dev=0.160
  h=5: RMSE=0.547, MAE=0.285, Poisson Dev=0.195
  h=6: RMSE=0.508, MAE=0.269, Poisson Dev=0.164
  h=7: RMSE=0.133, MAE=0.112, Poisson

In [24]:
# Take the most recent *actual* row of every (hex, complaint_family) series.
# groupby(...).last() is deliberately not used: it returns the last non-null value of
# each column independently and depends on `panel` already being ordered by day, so a
# single output row could combine values taken from different days.
series_keys = ['hex', 'complaint_family']
last_rows = (
    panel
    .dropna(subset=series_keys)  # groupby skips rows with missing keys by default
    .sort_values(series_keys + ['day'])
    .drop_duplicates(subset=series_keys, keep='last')
    .reset_index(drop=True)
)
# Keep the key columns first, matching the layout groupby(...).reset_index() produced.
last_rows = last_rows[series_keys + [col for col in last_rows.columns if col not in series_keys]]

# Predict 7 days ahead
predictions = forecast.predict_forecast(bundles, last_rows, horizon=7)

In [27]:
# Inspect the per-(hex, complaint_family) rows that were passed to the forecaster.
last_rows

Unnamed: 0,hex,complaint_family,day,y,dow,month,lag1,lag7,roll7,roll28,momentum,days_since_last,tavg,prcp,heating_degree,cooling_degree,rain_3d,rain_7d,log_pop,nbr_roll7,nbr_roll28
0,882a100003fffff,animal_control,2015-05-21,1.0,3,5,0.0,0.0,1.0,1.0,0.999999,34.0,57.254,0.000000,7.746,0.000,0.226378,1.540551,0.000000,1.0,1.0
1,882a100003fffff,vector_control,2017-07-19,1.0,2,7,0.0,0.0,1.0,1.0,0.999999,46.0,81.356,0.000000,0.000,16.356,0.000000,1.099606,0.000000,1.0,1.0
2,882a100005fffff,air_smoke_mold,2020-06-06,1.0,5,6,0.0,0.0,1.0,1.0,0.999999,159.0,75.398,0.372441,0.000,10.398,0.949213,1.191732,6.735780,1.0,1.0
3,882a100005fffff,food_safety,2024-02-27,1.0,1,2,0.0,0.0,1.0,1.0,0.999999,861.0,44.744,0.011417,20.256,0.000,0.011417,0.147244,6.735780,1.0,1.0
4,882a100005fffff,vector_control,2019-05-21,1.0,1,5,0.0,0.0,1.0,1.0,0.999999,251.0,71.708,0.027165,0.000,6.708,0.268504,0.541339,6.735780,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,882a10776bfffff,vector_control,2016-06-20,1.0,0,6,0.0,0.0,1.0,1.0,0.999999,45.0,74.750,0.000000,0.000,9.750,0.000000,0.075197,0.000000,1.0,1.0
4205,882a10776dfffff,air_smoke_mold,2024-10-18,1.0,4,10,0.0,0.0,1.0,1.0,0.999999,33.0,52.484,0.000000,12.516,0.000,0.000000,0.000000,7.706613,1.0,1.0
4206,882a10776dfffff,animal_control,2020-02-27,1.0,3,2,0.0,0.0,1.0,1.0,0.999999,321.0,44.240,0.426772,20.760,0.000,0.781496,0.781496,7.706613,1.0,1.0
4207,882a10776dfffff,food_safety,2025-05-26,1.0,0,5,0.0,0.0,1.0,2.0,0.500000,24.0,59.396,0.000000,5.604,0.000,0.074409,1.226772,7.706613,1.0,2.0


In [25]:
# Inspect the 7-day-ahead forecasts; the rendered table has p50, p10 and p90 columns.
predictions

Unnamed: 0,hex,complaint_family,day,p50,p10,p90
0,882a100003fffff,vector_control,2017-07-26,1.395854,0.0,3.0
1,882a100005fffff,vector_control,2019-05-28,1.068246,0.0,2.0
2,882a100007fffff,vector_control,2021-07-14,1.082451,0.0,2.0
3,882a100025fffff,vector_control,2023-09-12,1.086483,0.0,2.0
4,882a10002dfffff,vector_control,2021-11-25,1.335202,0.0,3.0
...,...,...,...,...,...,...
4204,882a107759fffff,animal_control,2024-12-22,1.047336,0.0,2.0
4205,882a10775bfffff,animal_control,2025-06-06,1.093199,0.0,2.0
4206,882a10775dfffff,animal_control,2023-05-29,0.841666,0.0,2.0
4207,882a107769fffff,animal_control,2019-10-07,1.350560,0.0,3.0


In [8]:
# TODO: the triage and duration feature builders are currently disabled; in the cells
# shown here only the forecast track is actually executed. Re-enable or remove.
# triage_features, tfidf_matrix, vectorizer = features.build_triage_features(df)
# duration_labels = features.build_duration_survival_labels(df)
# duration_features = features.build_duration_features(df, triage_features)
