In [1]:
import pandas as pd
import numpy as np
import os
import re

In [4]:
# Load the yearly College Board SAS extracts (file suffixes 15 through 22) and
# stack them into a single DataFrame named `collegeboard`.
frames = []
for year in range(15, 23):  # range() excludes the end value, so the last suffix is 22
    filename = f"/Users/danielmatten/desktop/m/collegeboard{year}pub.sas7bdat"
    try:
        # Read SAS file
        df = pd.read_sas(filename)

        # Lower-case the column names so that columns differing only by case
        # across files end up under one name when the frames are stacked.
        df.columns = df.columns.str.lower()

        # Collect the frame; concatenation happens once, after the loop.
        frames.append(df)
        # f-string so the actual year is reported (a plain string would print
        # the literal text "{year} done").
        print(f"{year} done")

    except FileNotFoundError:
        print(f"File {filename} not found. Skipping.")
    except KeyError:
        print(f"Required columns not found in {filename}. Skipping.")

# A single concat avoids re-copying the accumulated data on every iteration.
# Fall back to an empty frame when no file could be read, matching the
# previous behaviour of starting from pd.DataFrame().
collegeboard = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        
def clean_bytes(val):
    """Return ``val`` decoded to ``str`` when it is a ``bytes`` object.

    Non-bytes values are returned unchanged, so the function can be applied
    to every cell of a mixed-type column.

    The bytes are decoded as UTF-8 first; if that fails they are decoded as
    Latin-1, which accepts every byte value and therefore never raises.
    Slicing ``str(val)`` instead would return the Python *repr* of the bytes,
    turning non-ASCII bytes, backslashes, tabs and mixed quotes into escape
    sequences such as ``\\xe9`` or ``\\'``.

    Parameters
    ----------
    val : any
        A single cell value.

    Returns
    -------
    str or the original value
        The decoded text for ``bytes`` input, otherwise ``val`` itself.
    """
    if isinstance(val, bytes):
        try:
            return val.decode("utf-8")
        except UnicodeDecodeError:
            # NOTE(review): the source encoding of the SAS files is not known
            # here; Latin-1 is used only as a lossless last resort.
            return val.decode("latin-1")
    return val
# Run clean_bytes over every column in turn, printing each column name once
# that column has been processed.
for column in collegeboard.columns:
    cleaned = collegeboard[column].apply(clean_bytes)
    collegeboard[column] = cleaned
    print(column)

# Remove all columns whose name ends with 'gr', then discard the rows that
# have no mastid value.
cols_to_drop = list(filter(lambda name: name.endswith('gr'), collegeboard.columns))
collegeboard = collegeboard.drop(columns=cols_to_drop).dropna(subset=['mastid'])
        

{year} done
{year} done
{year} done
{year} done
{year} done
{year} done
{year} done
{year} done
birthdt
sex
grad_dte
ethnic
blang
instname
psat_test_dt
psat_cr
psat_math
psat_write
sat_test_dt_mr
sat_cr_mr
sat_math_mr
sat_write_mr
sat_write_essay_mr
sat_write_mc_mr
sat_test_dt_hc
sat_cr_hc
sat_math_hc
sat_write_hc
sat_write_essay_hc
sat_write_mc_hc
arthisgr
artstdgr
artst2gr
art3dgr
biolgr
calcabgr
calcbcgr
chemgr
chinesgr
comscagr
econmagr
econmigr
englangr
englitgr
envscigr
eurhisgr
frnlangr
gerlagr
govcomgr
govusgr
humgeogr
italgr
japangr
latinvgr
musictgr
physmgr
physemgr
psychgr
spanlagr
spanltgr
statgr
ushistgr
wdhistgr
phys1gr
phys2gr
cpstnsgr
lea
schlcode
mastid
grad_date
psat_8_9_test_dt
psat_8_9_total
psat_8_9_ebrw
psat_8_9_math
psat_nmsqt_total
psat_nmsqt_ebrw
psat_10_test_dt
psat_10_total
psat_10_ebrw
psat_10_math
rsat_asmt_dt_mr
rsat_total_score_mr
rsat_ebrw_score_mr
rsat_math_score_mr
rsat_essay_reading_mr
rsat_essay_analysis_mr
rsat_essay_writing_mr
rsat_asmt_dt_hc
rsat_

In [6]:
# Build one row per mastid holding the best PSAT / SAT scores and two
# "test taken" indicator columns.

# --- Column groups ----------------------------------------------------------
# Categorical fields carried along with each student record.
cat_cols = ['lea', 'schlcode', 'sex', 'ethnic']

# Raw score fields, converted to numbers below.
num_cols = [
    'psat_8_9_total', 'psat_8_9_ebrw', 'psat_8_9_math',
    'psat_nmsqt_total', 'psat_nmsqt_ebrw', 'psat_nmsqt_math',
    'psat_10_total', 'psat_10_ebrw', 'psat_10_math',
    'sat_total_score_mr', 'sat_ebrw_score_mr', 'sat_math_score_mr',
    'sat_total_score_hc', 'sat_ebrw_score_hc', 'sat_math_score_hc',
]

# The working subset is the key, then the categoricals, then the scores.
keep = ['mastid'] + cat_cols + num_cols
sub = collegeboard.loc[:, keep].copy()

# --- Numeric coercion -------------------------------------------------------
# Values that cannot be parsed as numbers become NaN.
sub[num_cols] = sub[num_cols].apply(pd.to_numeric, errors='coerce')

# --- Collapse to one row per mastid -----------------------------------------
# Categoricals use the 'first' aggregation; score columns use 'max'.
agg_dict = {
    **dict.fromkeys(cat_cols, 'first'),
    **dict.fromkeys(num_cols, 'max'),
}
best = sub.groupby('mastid', as_index=False).agg(agg_dict)

# --- Best-of-all composites -------------------------------------------------
psat_totals = ['psat_8_9_total', 'psat_nmsqt_total', 'psat_10_total']
psat_ebrw = ['psat_8_9_ebrw', 'psat_nmsqt_ebrw', 'psat_10_ebrw']
psat_math = ['psat_8_9_math', 'psat_nmsqt_math', 'psat_10_math']

sat_totals = ['sat_total_score_mr', 'sat_total_score_hc']
sat_erw = ['sat_ebrw_score_mr', 'sat_ebrw_score_hc']
sat_math = ['sat_math_score_mr', 'sat_math_score_hc']

# Each composite is the row-wise maximum over its source columns (NaNs skipped).
composite_sources = {
    'psat_best_total': psat_totals,
    'psat_best_ebrw': psat_ebrw,
    'psat_best_math': psat_math,
    'sat_best_total': sat_totals,
    'sat_best_erw': sat_erw,
    'sat_best_math': sat_math,
}
for composite, sources in composite_sources.items():
    best[composite] = best[sources].max(axis=1, skipna=True)

# --- Fill a missing subscore as total minus the other subscore --------------
# Each entry is (total column, column to fill, known subscore column). The
# entries are processed in order, so a later fill sees values written by an
# earlier one.
# NOTE(review): the total and the known subscore are independent maxima, so
# they may not come from the same sitting - confirm the subtraction is the
# intended approximation.
fill_rules = [
    ('psat_best_total', 'psat_best_math', 'psat_best_ebrw'),
    ('psat_best_total', 'psat_best_ebrw', 'psat_best_math'),
    ('sat_best_total', 'sat_best_math', 'sat_best_erw'),
    ('sat_best_total', 'sat_best_erw', 'sat_best_math'),
]
for total_col, target_col, known_col in fill_rules:
    mask = best[target_col].isna() & best[total_col].notna() & best[known_col].notna()
    best.loc[mask, target_col] = best.loc[mask, total_col] - best.loc[mask, known_col]

# --- taken_* indicators -----------------------------------------------------
# 1 when at least one raw score column of that test family is non-null, else 0.
psat_raw = psat_totals + psat_ebrw + psat_math
sat_raw = sat_totals + sat_erw + sat_math
best['taken_psat'] = best[psat_raw].notna().any(axis=1).astype(int)
best['taken_sat'] = best[sat_raw].notna().any(axis=1).astype(int)

# --- Final feature set ------------------------------------------------------
final_cols = [
    'mastid', 'lea', 'schlcode', 'sex', 'ethnic',
    'psat_best_total', 'psat_best_ebrw', 'psat_best_math',
    'sat_best_total', 'sat_best_erw', 'sat_best_math',
    'taken_psat', 'taken_sat',
]
pivoted_final = best.loc[:, final_cols].copy()




In [7]:
# Remove the categorical columns from the feature table.
# Reassigning (rather than inplace=True) and ignoring labels that are already
# absent keeps this cell idempotent: running it a second time no longer raises
# KeyError.
pivoted_final = pivoted_final.drop(
    columns=['lea', 'schlcode', 'sex', 'ethnic'],
    errors='ignore',
)

In [10]:
# Count the NaN fields in each row, then tabulate how many rows share each
# NaN count, ordered by that count.
na_counts = pivoted_final.isnull().sum(axis='columns')
missing_summary = na_counts.value_counts().sort_index()
print(missing_summary)

0    160220
3    276517
6    183406
Name: count, dtype: int64


In [None]:
# Write the feature table to CSV without the index column.
pivoted_final.to_csv(
    "../data/collegeboard_pivoted.csv",
    index=False,
)