# notebooks/pulse\_score.ipynb

**Overview**
This notebook ingests raw Buffalo data from Athena, spatially joins it to 2020 Census tracts, computes tract-level metrics (crime, vacancy, permits, licences, 311 calls), derives a composite score, and stores the results for visualization and LLM narration.

---

## 1. Setup & Imports


In [None]:
import os
import awswrangler as wr
import pandas as pd
import geopandas as gpd
import shapely.geometry as geom
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import shap


Configure AWS:


In [None]:
# Region for every AWS client created in this notebook.
# boto3 (which awswrangler builds its default session on) resolves the region
# from AWS_DEFAULT_REGION; AWS_REGION is kept as well for tools that read it.
os.environ['AWS_REGION'] = 'us-east-1'
os.environ['AWS_DEFAULT_REGION'] = os.environ['AWS_REGION']

# awswrangler's global Athena workgroup setting is named `workgroup`.
wr.config.workgroup = 'primary'

# Athena / Glue database that holds the raw Buffalo tables.
DATABASE = 'civic_pulse'


---

## 2. Load Raw Tables from Athena


In [None]:
# helper: read the trailing n-day window of a raw table
def read_last_n_days(table, n=7):
    """Return the rows of ``DATABASE.table`` pulled within the last ``n`` days.

    Parameters
    ----------
    table : str
        Table name inside ``DATABASE``.
    n : int, default 7
        Size of the trailing window in days, measured back from Athena's
        ``current_timestamp``. Must be a non-negative whole number.

    Returns
    -------
    The result of ``wr.athena.read_sql_query`` for the window query.

    Raises
    ------
    ValueError
        If ``n`` is not a non-negative whole number. ``n`` is interpolated
        into the SQL text, so it is validated first; a negative value would
        otherwise render as ``--<n>``, which starts a SQL comment.
    """
    try:
        days = int(n)
    except (TypeError, ValueError):
        raise ValueError(
            f"n must be a non-negative whole number of days, got {n!r}"
        ) from None
    # `days != n` rejects numeric strings and fractional values such as 7.5.
    if days < 0 or days != n:
        raise ValueError(
            f"n must be a non-negative whole number of days, got {n!r}"
        )

    query = f"""
      SELECT *
      FROM {DATABASE}.{table}
      WHERE pulled_utc >= date_add('day', -{days}, current_timestamp)
    """
    return wr.athena.read_sql_query(query, database=DATABASE)

# Reference date for this run: yesterday (UTC). Its year/month/day parts are
# reused when the S3 output key is built in section 7.
yesterday = pd.Timestamp.now(tz='UTC').normalize() - pd.Timedelta(days=1)
y, m, d = yesterday.year, f"{yesterday.month:02}", f"{yesterday.day:02}"

# Pull the trailing 7-day window of each raw table with the helper defined
# above (the notebook defines no per-date `read_table` helper).
crime_df = read_last_n_days('raw_buf_crime', n=7)
viol_df  = read_last_n_days('raw_buf_viol', n=7)
perm_df  = read_last_n_days('raw_buf_permits', n=7)
biz_df   = read_last_n_days('raw_buf_biz', n=7)
calls_df = read_last_n_days('raw_buf_311', n=7)


---

## 3. Load 2020 Census Tract Boundaries (TIGER/Line 2024)


In [None]:
# NY State tracts (state FIPS 36) from TIGER 2024
url = ("https://www2.census.gov/geo/tiger/TIGER2024/TRACT/"
       "tl_2024_36_tract.zip")

# The tract identifier is read from the file's GEOID column, but every later
# cell groups and merges on GEOID20, so rename it once here at load time.
tracts = (
    gpd.read_file(url)[["GEOID", "geometry"]]
    .rename(columns={"GEOID": "GEOID20"})
    .to_crs(epsg=4326)
)

---

## 4. Spatial Join Points → Tracts


In [None]:
def join_points(df, lon='longitude', lat='latitude'):
    """Attach tract attributes to each row of ``df`` by point-in-polygon join.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows, one per event.
    lon, lat : str or None
        Names of the longitude / latitude columns (EPSG:4326). Pass ``None``
        for a table that carries no coordinates.

    Returns
    -------
    geopandas.GeoDataFrame
        With coordinates: the left spatial join of the points against the
        module-level ``tracts`` (rows outside every tract are kept, with
        null tract columns). Without coordinates: a copy of ``df`` with a
        null geometry and null tract columns, so downstream per-tract
        ``groupby`` calls still find the tract column and yield zero counts.
    """
    if lon is None or lat is None:
        # No coordinates to build points from: keep the rows, leave them
        # unassigned to any tract instead of failing on df[None].
        unlocated = gpd.GeoDataFrame(
            df.copy(),
            geometry=[None] * len(df),
            crs='EPSG:4326',
        )
        for col in tracts.columns.drop(tracts.geometry.name):
            unlocated[col] = None
        return unlocated

    gdf = gpd.GeoDataFrame(df,
        geometry=gpd.points_from_xy(df[lon], df[lat]),
        crs='EPSG:4326')
    return gpd.sjoin(gdf, tracts, how='left', predicate='within')

# Tag every raw table with its containing tract.
# Tables with default longitude/latitude columns:
crime_gdf = join_points(crime_df)
viol_gdf = join_points(viol_df)
# Permits are passed without coordinate columns (if no coords):
perm_gdf = join_points(perm_df, lon=None, lat=None)
biz_gdf = join_points(biz_df)
calls_gdf = join_points(calls_df)




---

## 5. Compute Tract-Level Metrics


In [None]:

# Tract identifier column as it appears on `tracts` (and therefore on the
# spatially joined frames): prefer GEOID20, fall back to GEOID.
tract_key = 'GEOID20' if 'GEOID20' in tracts.columns else 'GEOID'

# initialize metrics DataFrame with all tracts
metrics = pd.DataFrame({'tract': tracts[tract_key]})


def _add_tract_count(frame, joined, name):
    """Left-merge per-tract row counts of `joined` into `frame` as column `name`.

    Tracts with no rows get 0. Only the new column is filled, so missing
    values in other columns are left untouched.
    """
    counts = joined.groupby(tract_key).size().rename(name)
    merged = frame.merge(counts, left_on='tract', right_index=True, how='left')
    merged[name] = merged[name].fillna(0).astype(int)
    return merged


# 5.1 Crime per 1k
metrics = _add_tract_count(metrics, crime_gdf, 'crime_count')
# TODO: load tract population from ACS baseline
if 'population' in metrics.columns:
    # Mask zero / negative populations so the rate is NaN rather than inf.
    population = metrics['population'].where(metrics['population'] > 0)
else:
    print("WARNING: tract population not loaded; "
          "crime_per_1k is left as NaN until the ACS load is implemented")
    population = pd.Series(float('nan'), index=metrics.index)
metrics['crime_per_1k'] = metrics['crime_count'] / (population / 1000)

# 5.2 Vacant cases
metrics = _add_tract_count(metrics, viol_gdf, 'open_vacant_cases')

# 5.3 Permits count
metrics = _add_tract_count(metrics, perm_gdf, 'permit_count')

# 5.4 New licences
metrics = _add_tract_count(metrics, biz_gdf, 'new_licences')

# 5.5 311 volume
metrics = _add_tract_count(metrics, calls_gdf, '311_volume')


---

## 6. Composite Score & SHAP Explanation

In [None]:
# composite score = row-wise sum of standardized features
def compute_score(df, features):
    """Add a ``score`` column to ``df`` and return ``df``.

    Each column named in ``features`` is standardized with a freshly fitted
    ``StandardScaler``; ``score`` is the per-row sum of those standardized
    values. ``df`` is modified in place and the same object is returned.
    """
    standardized = StandardScaler().fit_transform(df[features])
    df['score'] = standardized.sum(axis=1)
    return df

# Columns that feed the composite score, one per tract-level metric.
features = [
    'crime_per_1k',
    'open_vacant_cases',
    'permit_count',
    'new_licences',
    '311_volume',
]
metrics = compute_score(metrics, features)

# optional: train XGBoost to predict next-week 311 and compute SHAP values
# TODO: build training set
# fit, compute shap values, store per tract
model = xgb.XGBRegressor()



---

## 7. Save Results to S3


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import boto3

# Resolve the destination bucket up front so a missing setting fails with a
# clear message instead of passing None to put_object.
bucket = os.getenv('BUCKET')
if not bucket:
    raise RuntimeError(
        "Set the BUCKET environment variable to the destination S3 bucket"
    )

# Stamp every row with the (UTC) time of this run.
metrics['run_date'] = pd.Timestamp.now(tz='UTC')

# Serialize to an in-memory Parquet buffer, then upload it to S3.
table = pa.Table.from_pandas(metrics)
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression='zstd')
# Partition the output by the y/m/d reference date computed in section 2.
key = f"analytics/buf_pulse_score/{y}/{m}/{d}/pulse_score.parquet"

boto3.client('s3').put_object(
    Bucket=bucket,
    Key=key,
    Body=buf.getvalue().to_pybytes()
)
print("Wrote composite metrics → s3://{}/{}".format(bucket, key))



---

*Next:* Fill in the ACS population load, build the training data for the SHAP explanation, and refine model fitting.
