# 02 — Peer finder & residuals (MVP)

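Computes a simple **PIP index** (a weighted blend of z‑scored belonging, civic activity and crime rate), finds each place's **K‑nearest peers** (up to 5, by cosine similarity over standardized context features; same nation, within ±2 IMD deciles),
and writes `peers.csv`, `peer_benchmark.csv`, and `features_with_peers.csv` to `data/processed/`.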


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.neighbors import NearestNeighbors

# Locate the project directories relative to the current working directory.
# NOTE(review): presumably the notebook is run from a folder one level below
# the repository root — confirm before running from elsewhere.
REPO = Path('..').resolve()
SEED = REPO.joinpath('data', 'seed')
OUT = REPO.joinpath('data', 'processed')
# Create the output folder (and any missing parents); no error if it exists.
OUT.mkdir(exist_ok=True, parents=True)

# Load the seed tables.
places = pd.read_csv(SEED.joinpath('places_seed_v0_0_1.csv'))
feats = pd.read_csv(SEED.joinpath('features_template_v0_0_2.csv'))
# Attach each place's nation to its feature row. A left join keeps every
# feature row even when the place is absent from the places table.
df = pd.merge(feats, places.loc[:, ['place', 'nation']], how='left', on='place')

def zscore(s: pd.Series) -> pd.Series:
    """Standardize a series to zero mean and unit (population) std.

    Values that cannot be parsed as numbers become NaN and are ignored when
    computing the mean and standard deviation (``ddof=0``). When the standard
    deviation is zero the centred values are divided by 1.0 instead, so a
    constant series maps to all zeros rather than dividing by zero.
    """
    numeric = pd.to_numeric(s, errors='coerce')
    centred = numeric - numeric.mean(skipna=True)
    spread = numeric.std(ddof=0, skipna=True)
    if not spread:
        # Zero spread: fall back to a unit divisor.
        spread = 1.0
    return centred / spread

def pip_index_fn(df: pd.DataFrame) -> pd.Series:
    """Build the composite PIP index for every row of ``df``.

    The index is ``0.4 * belonging + 0.3 * civic + 0.3 * (-crime)``, where each
    component is z-scored across the rows of ``df``:

    * belonging — ``belonging_baseline_value``
    * civic — ``charities_active_count`` plus ``recent_grants_24m_gbp``
      (each coerced to numeric, with missing values counted as 0)
    * crime — ``crime_rate_per_1k`` (negated, so a higher rate lowers the index)

    A row whose belonging or crime value is missing gets a NaN index.
    """
    def _numeric_or_zero(column: str) -> pd.Series:
        # Unparseable / missing entries contribute nothing to the civic sum.
        return pd.to_numeric(df.get(column), errors='coerce').fillna(0)

    belonging_z = zscore(df.get('belonging_baseline_value'))

    # NOTE(review): the count and the GBP amount are summed raw before
    # scaling, so the larger-magnitude column presumably dominates — confirm
    # this is intended.
    civic_total = (_numeric_or_zero('charities_active_count')
                   + _numeric_or_zero('recent_grants_24m_gbp'))
    civic_z = zscore(civic_total)

    crime_z = zscore(df.get('crime_rate_per_1k'))

    return 0.4 * belonging_z + 0.3 * civic_z + 0.3 * (-crime_z)

# Store the composite index as a new column on the working frame.
df['pip_index'] = pip_index_fn(df)
# Cell output: preview the first five places with their index value.
df.loc[:, ['place', 'pip_index']].head(5)

In [None]:
# Context features used to measure how alike two places are.
SIM_FEATURES = ['imd2019_decile','pop_density_per_km2','pct_under_18','pct_65_plus','pct_social_rent','asb_trend_12m']
K_PEERS = 5    # maximum number of peers kept per place
IMD_BAND = 2   # a peer must sit within this many IMD deciles of the place

# Coerce to numeric *before* taking the median: computing the median on a raw
# object-dtype column raises. A column with no usable values has a NaN median,
# so fall back to 0.0 to keep NaN out of the scaler.
sim = df[SIM_FEATURES].apply(pd.to_numeric, errors='coerce')
sim = sim.fillna(sim.median()).fillna(0.0)

# Standardize each feature, then L2-normalize each row before fitting the
# cosine-distance neighbour index.
X = StandardScaler().fit_transform(sim.values)
X = normalize(X)
nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(X)

peers_rows = []
expected, residuals, lifts = [], [], []

# Loop-invariant columns, accessed positionally so the loop does not depend on
# the frame's index labels.
nation = df['nation']
imd = pd.to_numeric(df['imd2019_decile'], errors='coerce')
pip = df['pip_index']
place = df['place']
n_places = len(df)

for i in range(n_places):
    # Eligible peers: same nation and, when this place has an IMD decile,
    # within IMD_BAND deciles of it. The place itself is never its own peer.
    mask = (nation == nation.iloc[i]).to_numpy(copy=True)
    if pd.notna(imd.iloc[i]):
        mask &= ((imd - imd.iloc[i]).abs() <= IMD_BAND).to_numpy()
    mask[i] = False
    if not mask.any():
        expected.append(np.nan)
        residuals.append(np.nan)
        lifts.append(np.nan)
        continue

    # Rank *all* places by distance and keep the nearest eligible ones.
    # Querying only the K+1 globally nearest rows and filtering afterwards
    # drops valid peers whenever the closest rows are in another nation or
    # outside the IMD band.
    _, inds = nn.kneighbors(X[i:i+1], n_neighbors=n_places)
    order = [int(j) for j in inds[0] if mask[j]][:K_PEERS]
    for rank, j in enumerate(order, 1):
        peers_rows.append({'place': place.iloc[i], 'peer_place': place.iloc[j], 'rank': rank})

    own_pip = pip.iloc[i]
    peer_pip = pip.iloc[order].dropna()
    if len(peer_pip) >= 1 and pd.notna(own_pip):
        exp = peer_pip.mean()
        residual = own_pip - exp
        # With a single peer the 75th percentile is just that peer's value.
        p75 = np.percentile(peer_pip, 75) if len(peer_pip) > 1 else exp
        # Gap up to the peers' upper quartile; zero when already at or above it.
        lift = max(0.0, p75 - own_pip)
    else:
        exp = residual = lift = np.nan
    expected.append(exp)
    residuals.append(residual)
    lifts.append(lift)

# Explicit columns keep the CSV header even when no peers were found.
peers = pd.DataFrame(peers_rows, columns=['place', 'peer_place', 'rank'])
peer_bm = pd.DataFrame({'place': df['place'], 'peer_expected_pip': expected, 'pip_residual': residuals, 'peer_lift_to_p75': lifts})

# Let pandas write the files directly: UTF-8 output and correct line endings
# on every platform (write_text(to_csv()) uses the locale encoding and doubles
# carriage returns on Windows).
peers.to_csv(OUT / 'peers.csv', index=False)
peer_bm.to_csv(OUT / 'peer_benchmark.csv', index=False)
df.assign(peer_expected_pip=expected, pip_residual=residuals, peer_lift_to_p75=lifts).to_csv(
    OUT / 'features_with_peers.csv', index=False
)
peers.head(10), peer_bm.head(10)

## Visual check (optional)

In [None]:
import matplotlib.pyplot as plt
# Re-read the benchmark written above and plot how the residuals are spread.
res = pd.read_csv(OUT.joinpath('peer_benchmark.csv'))
# Places without a usable peer benchmark have NaN residuals; leave them out.
vals = res.loc[:, 'pip_residual'].dropna().to_numpy()

plt.figure()
plt.hist(vals, bins=10)
plt.ylabel('Count')
plt.xlabel('Residual (actual − peer-expected)')
plt.title('Distribution of PIP residuals')
plt.show()