In [44]:
# Combine the datasets needed for matching: masterbuild and curtest
import numpy as np
import pandas as pd
import os
import re


In [45]:
# Load the interim masterbuild table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a single column cannot end up holding a mix
# of numbers and strings (which would make equal codes compare as different
# in the later per-student checks).
# NOTE(review): the saved output of this cell echoes this statement, which is
# how a warning raised on this line is reported; presumably it was a
# DtypeWarning about mixed-type columns. Confirm on the next run.
masterbuild = pd.read_csv("data/interim/masterbuild_master.csv", low_memory=False)

  masterbuild = pd.read_csv("data/interim/masterbuild_master.csv")


In [46]:
# Derive `gifted_grade`: the lowest grade at which a student has a gifted
# record (`aig` present and different from "N").

# Grades that cannot be parsed as numbers become NaN; min() below skips them.
masterbuild["grade"] = pd.to_numeric(masterbuild["grade"], errors="coerce")

# Only gifted rows. A missing `aig` compares unequal to "N", so it must be
# excluded explicitly; otherwise rows with unknown status count as gifted.
is_gifted = masterbuild["aig"].notna() & (masterbuild["aig"] != "N")
gifted = masterbuild.loc[is_gifted, ["mastid", "grade"]]

# Earliest gifted grade per student
first_gifted = (
    gifted.groupby("mastid")["grade"]
    .min()
    .reset_index()
    .rename(columns={"grade": "gifted_grade"})
)

# Merge back. Any `gifted_grade` left over from a previous run of this cell is
# dropped first; merging on top of it would produce gifted_grade_x /
# gifted_grade_y and break the fill below.
masterbuild = (
    masterbuild
    .drop(columns=["gifted_grade"], errors="ignore")
    .merge(first_gifted, on="mastid", how="left")
)

# Students without a gifted row, or whose gifted rows have no usable grade,
# get 0.
masterbuild["gifted_grade"] = masterbuild["gifted_grade"].fillna(0).astype(int)

In [47]:
# Clean-up: reduce masterbuild to one fully populated row per student.

# 1. Keep only students whose records all carry a single distinct `lea`.
consistent_lea_ids = (
    masterbuild.groupby("mastid")["lea"]
    .nunique()
    .reset_index()
    .query("lea == 1")["mastid"]
)
masterbuild = masterbuild[masterbuild["mastid"].isin(consistent_lea_ids)]

# 2. Drop every student with at least one schlcode that is missing or cannot
#    be parsed as a number. The check is a standalone boolean Series instead
#    of a helper column: assigning a new column into the filtered frame above
#    is what triggers pandas' SettingWithCopyWarning.
schlcode_is_bad = pd.to_numeric(masterbuild["schlcode"], errors="coerce").isna()
bad_ids = masterbuild.loc[schlcode_is_bad, "mastid"].unique()
masterbuild = masterbuild[~masterbuild["mastid"].isin(bad_ids)]

# 3. Keep each student's latest record: order rows newest-first within each
#    mastid, then keep the first row per mastid.
masterbuild = (
    masterbuild
    .sort_values(["mastid", "reporting_year"], ascending=[True, False])
    .drop_duplicates(subset=["mastid"], keep="first")
)

# 4. Remove the per-record columns, then drop any student whose remaining row
#    still has a missing value. Rows are already unique per mastid here, so no
#    further de-duplication is needed.
masterbuild = (
    masterbuild
    .drop(columns=["grade", "reporting_year", "schlcode"])
    .dropna()
)

In [48]:
# Encode the demographic columns.

# Indicator columns for `ethnic`, named "ethnic_<value>".
ethnic_dummies = pd.get_dummies(masterbuild["ethnic"], prefix="ethnic")

# eds ("Y"/"N") and sex ("M"/"F") become True/False flags; any other value
# maps to NaN. The three source columns are removed and the indicator columns
# are appended after the flags.
masterbuild = pd.concat(
    [
        masterbuild.assign(
            eds_Y=masterbuild["eds"].map({"Y": True, "N": False}),
            sex_M=masterbuild["sex"].map({"M": True, "F": False}),
        ).drop(columns=["ethnic", "eds", "sex"]),
        ethnic_dummies,
    ],
    axis=1,
)


In [49]:
# Load the interim curtest table and keep only the rows whose mastid also
# appears in masterbuild (inner join on mastid).
curtest = pd.read_csv("data/interim/curtest_master.csv")
merged = curtest.merge(masterbuild, how="inner", on="mastid")

In [50]:
# Remove the BIOL, SC05, SC08 and MTH1 columns from the merged table.
dropped_cols = ["BIOL", "SC05", "SC08", "MTH1"]
merged = merged.drop(dropped_cols, axis=1)

In [51]:
# Keep the rows of `merged` that have at most two missing values in total.
missing_per_row = merged.isna().sum(axis=1)
test = merged[missing_per_row.le(2)]

# Columns whose names end in "03" or "08" must be populated: flag every row
# that is missing any of them and keep only the unflagged rows.
forbidden_cols = [name for name in test.columns if name.endswith(("03", "08"))]
mask_forbidden = ~test[forbidden_cols].notna().all(axis=1)
test = test.loc[~mask_forbidden]

def impute_with_neighbors(df):
    """Fill gaps in numbered columns with the mean of the two adjacent columns.

    A column is considered when its name ends in two digits. If the columns
    with the same prefix and the numbers one lower and one higher both exist
    (e.g. ``XX03`` and ``XX05`` for ``XX04``), each missing value in the
    column is replaced by the row-wise mean of those two neighbours. When
    either neighbour is itself missing in that row, the value stays missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the imputable gaps filled.
    """
    result = df.copy()
    numbered_name = re.compile(r"(.*?)(\d{2})$")  # prefix + two trailing digits

    for col in result.columns:
        parsed = numbered_name.search(col)
        if parsed is None:
            # Name does not end in two digits: not a numbered column.
            continue

        prefix, digits = parsed.groups()
        position = int(digits)

        # Names of the columns directly below and above this one.
        lower_col = f"{prefix}{position - 1:02d}"
        upper_col = f"{prefix}{position + 1:02d}"
        if lower_col not in result.columns or upper_col not in result.columns:
            continue

        gaps = result[col].isna()
        # skipna=False: a missing neighbour makes the mean (and the fill) NaN.
        neighbour_mean = result.loc[gaps, [lower_col, upper_col]].mean(
            axis=1, skipna=False
        )
        result.loc[gaps, col] = neighbour_mean

    return result

# Fill what can be filled from neighbouring columns, then discard every row
# that still contains a missing value.
test = impute_with_neighbors(test).dropna()

In [55]:
# Write the final table. The output directory is created first because
# DataFrame.to_csv raises an error when the parent directory does not exist.
os.makedirs("data/final", exist_ok=True)
test.to_csv("data/final/test_demo.csv", index=False)