In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [4]:
# Load the input table. Later cells read its `gifted_grade` and `lea` columns,
# score columns prefixed with MA / RD, and dummy columns prefixed with
# sex_ / ethnic_ / eds_.
df = pd.read_csv(filepath_or_buffer="data/final/test_demo.csv")

In [9]:

# Propensity-score matching, run separately for every (grade, LEA) cell.
#
# For each grade in `gifted_grades`, rows with `gifted_grade == grade` are the
# treated group and rows with `gifted_grade == 0` are the control pool
# (presumably students never identified as gifted -- confirm against the data
# dictionary). Within each LEA a logistic propensity model is fitted on
# standardized covariates, and every treated row is paired with the control
# that is closest in covariate space among controls whose propensity score is
# within `caliper` of the treated row's score. Controls can be reused
# (matching with replacement). Results accumulate in `matched_dfs`.

gifted_grades = list(range(4, 9))
matched_dfs = []
match_id_counter = 0
caliper = 0.05   # max |pscore_treated - pscore_control| for a control to be eligible
min_pairs = 20   # (grade, LEA) cells with fewer matched pairs are discarded


def _encode_covariates(frame, covariates):
    """Return a numeric copy of ``frame[covariates]``.

    Boolean columns become 0/1 integers, and object columns are mapped from
    True/False (or the strings 'True'/'False') to 1/0. Anything still missing
    afterwards (e.g. an object value outside that mapping) is filled with 0.
    """
    X = frame[covariates].copy()
    for col in X.select_dtypes(exclude='number').columns:
        if X[col].dtype == 'bool':
            X[col] = X[col].astype(int)
        elif X[col].dtype == 'object':
            X[col] = X[col].map({'True': 1, 'False': 0, True: 1, False: 0})
    return X.fillna(0)


def _match_within_caliper(scored, X_scaled, caliper):
    """Pair each treated row with its nearest eligible control.

    Parameters
    ----------
    scored : DataFrame with 'treatment' (0/1) and 'pscore' columns. Its index
        labels are used as row identifiers in the result.
    X_scaled : 2-D array whose k-th row holds the covariates of
        ``scored.iloc[k]``.
    caliper : maximum absolute propensity-score difference allowed between a
        treated row and a candidate control.

    Returns
    -------
    list of ``(treated_label, control_label)`` tuples. Treated rows with no
    control inside the caliper are left out. The same control may appear in
    several pairs.
    """
    # `scored` may have gaps in its index, so translate labels to the
    # positional rows of `X_scaled`.
    position = {label: pos for pos, label in enumerate(scored.index)}

    treated_rows = scored[scored['treatment'] == 1]
    control_rows = scored[scored['treatment'] == 0]

    found = []
    for idx_t in treated_rows.index:
        pscore_t = scored.at[idx_t, 'pscore']
        pool = control_rows[np.abs(control_rows['pscore'] - pscore_t) <= caliper]
        if pool.empty:
            continue

        X_t = X_scaled[position[idx_t]].reshape(1, -1)
        X_pool = X_scaled[[position[i] for i in pool.index]]

        # Nearest control in standardized covariate space among the
        # caliper-eligible pool.
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(X_pool)
        _, nn_idx = nn.kneighbors(X_t)

        found.append((idx_t, pool.index[nn_idx[0][0]]))
    return found


# These depend only on the frame's columns / on `gifted_grade == 0`, so they
# are computed once instead of once per (grade, LEA) iteration.
test_cols = [col for col in df.columns if col.startswith(('MA', 'RD'))]
dummy_cols = [col for col in df.columns if col.startswith(('sex_', 'ethnic_', 'eds_'))]
controls_all = df[df['gifted_grade'] == 0].copy()

for grade in gifted_grades:
    print(f"\nProcessing grade {grade}...")

    treated_all = df[df['gifted_grade'] == grade].copy()
    if treated_all.empty:
        print(f"  Skipping grade {grade}: no treated units.")
        continue

    # Only scores from grades strictly before `grade` are used as covariates.
    # NOTE(review): the parse assumes names like 'MA03' / 'RD05', where the
    # grade is the text right after the first '0'. A name without a '0', or a
    # two-digit grade such as 'MA10', would raise here -- confirm the schema.
    pre_cols = [col for col in test_cols if int(col.split('0')[1]) < grade]
    math_cols = [col for col in pre_cols if col.startswith('MA')]
    reading_cols = [col for col in pre_cols if col.startswith('RD')]
    covariates = pre_cols + dummy_cols

    if not covariates:
        # Without any covariate the scaler / propensity model cannot be fitted.
        print(f"  Skipping grade {grade}: no covariate columns available.")
        continue

    for lea in treated_all['lea'].unique():
        treated = treated_all[treated_all['lea'] == lea]
        controls = controls_all[controls_all['lea'] == lea]

        if treated.empty or controls.empty:
            continue

        # pd.concat returns a new frame, so adding a column below is safe.
        sub_df = pd.concat([treated, controls], axis=0)
        sub_df['treatment'] = (sub_df['gifted_grade'] == grade).astype(int)

        sub_df_clean = sub_df.dropna(subset=covariates, how='any').reset_index(drop=True)

        treated_df = sub_df_clean[sub_df_clean['treatment'] == 1]
        if treated_df.empty:
            continue

        # Floor = lowest prior score seen among treated rows, taken across all
        # prior math (resp. reading) columns at once. Rows with any prior
        # score below the floor are dropped.
        math_min = treated_df[math_cols].min().min()
        reading_min = treated_df[reading_cols].min().min()

        # Explicit copy: a 'pscore' column is assigned to this frame below, and
        # writing to a filtered slice triggers SettingWithCopyWarning.
        sub_df_clean = sub_df_clean[
            sub_df_clean[math_cols].ge(math_min).all(axis=1) &
            sub_df_clean[reading_cols].ge(reading_min).all(axis=1)
        ].copy()

        # The propensity model needs both classes present.
        n_treated = sub_df_clean['treatment'].sum()
        if n_treated == 0 or n_treated == len(sub_df_clean):
            continue

        # Prepare and standardize the covariate matrix
        X = _encode_covariates(sub_df_clean, covariates)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Estimate propensity scores
        ps_model = LogisticRegression(max_iter=5000)
        ps_model.fit(X_scaled, sub_df_clean['treatment'])
        sub_df_clean['pscore'] = ps_model.predict_proba(X_scaled)[:, 1]

        matched_pairs = _match_within_caliper(sub_df_clean, X_scaled, caliper)
        if not matched_pairs:
            continue

        # IDs are globally unique across cells. The counter advances even if
        # this cell is discarded below, so IDs may have gaps.
        match_ids = range(match_id_counter, match_id_counter + len(matched_pairs))
        match_id_counter += len(matched_pairs)

        treated_matches = sub_df_clean.loc[[i for i, _ in matched_pairs]].copy()
        control_matches = sub_df_clean.loc[[j for _, j in matched_pairs]].copy()

        treated_matches['match_id'] = match_ids
        control_matches['match_id'] = match_ids

        matched_result = pd.concat([treated_matches, control_matches])
        matched_result['match_grade'] = grade

        n_pairs = matched_result['match_id'].nunique()
        if n_pairs < min_pairs:
            print(f"  Skipping Grade {grade} | LEA {lea}: too few matched pairs ({n_pairs})")
            continue

        matched_dfs.append(matched_result)

        # Note: the figure printed is the row count (treated + control rows).
        print(f"  Grade {grade} | LEA {lea} | Matches: {len(matched_result)}")

# Final combined match set
if not matched_dfs:
    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that says what actually went wrong.
    raise ValueError(
        "No matched pairs were produced for any grade/LEA; "
        "nothing to combine. Check the matching step and its thresholds."
    )

all_matches = pd.concat(matched_dfs, axis=0).reset_index(drop=True)

# Trim rows with extreme propensity scores (bounds are inclusive).
pscore_lower, pscore_upper = 0.15, 0.85
all_matches_trimmed = all_matches[all_matches['pscore'].between(pscore_lower, pscore_upper)]

counts = all_matches_trimmed["match_id"].value_counts()

# Trimming can remove just one member of a pair. Keep only match_ids that
# still appear more than once, i.e. pairs whose both rows survived.
pairs = all_matches_trimmed[all_matches_trimmed["match_id"].isin(counts[counts > 1].index)]



Processing grade 4...
  Grade 4 | LEA 970 | Matches: 278
  Grade 4 | LEA 491 | Matches: 416
  Grade 4 | LEA 020 | Matches: 524
  Grade 4 | LEA 990 | Matches: 524
  Grade 4 | LEA 190 | Matches: 830
  Grade 4 | LEA 862 | Matches: 68
  Grade 4 | LEA 790 | Matches: 922
  Grade 4 | LEA 410 | Matches: 1944
  Grade 4 | LEA 110 | Matches: 442
  Grade 4 | LEA 590 | Matches: 410
  Grade 4 | LEA 111 | Matches: 132
  Grade 4 | LEA 030 | Matches: 94
  Grade 4 | LEA 290 | Matches: 682
  Grade 4 | LEA 300 | Matches: 516
  Grade 4 | LEA 291 | Matches: 172
  Grade 4 | LEA 760 | Matches: 550
  Grade 4 | LEA 620 | Matches: 136
  Grade 4 | LEA 340 | Matches: 360
  Grade 4 | LEA 360 | Matches: 278
  Grade 4 | LEA 250 | Matches: 798
  Grade 4 | LEA 995 | Matches: 88
  Grade 4 | LEA 270 | Matches: 82
  Grade 4 | LEA 490 | Matches: 618
  Grade 4 | LEA 800 | Matches: 668
  Grade 4 | LEA 180 | Matches: 1368
  Grade 4 | LEA 740 | Matches: 1404
  Grade 4 | LEA 070 | Matches: 224
  Grade 4 | LEA 320 | Matches: 51

In [12]:
# Write the retained matched pairs to disk; the positional index is not saved.
pairs.to_csv(path_or_buf="data/final/pairs.csv", index=False)