In [32]:
import pandas as pd
import numpy as np
import os
import re

In [33]:
# 1) Load each year's exit file.
#    `dfs` is keyed by the two-digit year (18 -> 2018); column names are
#    lower-cased so the same field matches across years.
dfs = {}
for year in range(18, 23):    # 2018–2022
    path = f"/Users/adamcartwright/ncerdc/Student Data/Misc Student Data/exit_pub20{year}.sas7bdat"
    try:
        df = pd.read_sas(path)
        df.columns = df.columns.str.lower()
        dfs[year] = df
        print(f"20{year} loaded")
    except FileNotFoundError:
        print(f"File {path} not found, skipping.")
    except Exception as e:
        # Deliberately best-effort: a single unreadable year is reported and skipped.
        print(f"Error in 20{year}: {e}, skipping.")

# Stop with a clear message instead of a bare StopIteration / empty concat below.
if not dfs:
    raise RuntimeError("No exit files could be loaded; nothing to combine.")

# 2) Find the core columns present in every year
core_cols = sorted(set.intersection(*(set(frame.columns) for frame in dfs.values())))

# 3) Trim, tag year, drop missing mastid
normalized = []
for yy, df in dfs.items():
    sub = df[core_cols].copy()
    # Use this frame's own key (`yy`). The `year` variable left over from the
    # loading loop always holds the last value (22) and would stamp every row 2022.
    sub['year'] = 2000 + yy
    sub = sub.dropna(subset=['mastid'])
    normalized.append(sub)

# NOTE: `exit` shadows the interpreter's built-in `exit`; the name is kept
# because later cells reference it.
exit = pd.concat(normalized, ignore_index=True)
def clean_bytes(val):
    """Return ``val`` as text if it is a ``bytes`` object, otherwise unchanged.

    The bytes are decoded rather than sliced out of their ``repr``: slicing
    ``str(b'...')`` leaves escape sequences in the result for any non-ASCII
    byte, backslash, control character or mixed quoting.

    Parameters
    ----------
    val : object
        A single cell value.

    Returns
    -------
    object
        The decoded ``str`` for ``bytes`` input; ``val`` itself for anything else
        (numbers, ``None``, NaN, existing strings, ...).
    """
    if isinstance(val, bytes):
        try:
            return val.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this fallback cannot fail.
            return val.decode("latin-1")
    return val
# Run every column through clean_bytes, echoing each column name once it is done.
for col in exit.columns.tolist():
    cleaned_column = exit[col].apply(clean_bytes)
    exit[col] = cleaned_column
    print(col)

# Persist the combined, cleaned table (row index omitted).
exit.to_csv(
    path_or_buf="/Users/adamcartwright/NCERDC-MODEL/data/exit_master.csv",
    index=False,
)

2018 loaded
2019 loaded
2020 loaded
2021 loaded
2022 loaded
acchome_accepted
aig
auto_xfer_school
base_school
count_row_pre
count_row_state
dc_g9year
den
eds
effective_exit_code
effective_g9year
els
els_year_exited
en_five_year
en_g9school
en_g9year
ethnic
ex_exit_code
ex_xfer_school
exit_exempt
fcs
g9year_missing
gdvr
gdvr_document_type
gdvr_school
gdvr_year
hms
in_cohort
init_collection
init_grade
init_year
last_collection
last_year
lea
mastid
mig
num
ps_g9year
rc_g9year
schlcode
sex
swd
swd_latest_school
swd_latest_year
year


In [34]:
# --- Lookup tables and column lists used by this cell -----------------------

# Human-readable label for every exit code (digits 0–9 and letters A–D).
exit_code_map = {
    # numeric codes
    '0': 'Has not left school',
    '1': 'Transferred to another school in same LEA',
    '2': 'Transferred to another system',
    '3': 'Left the state',
    '4': 'Dropped Out',
    '5': 'Temporary Leave',
    '6': 'Death',
    '7': 'Other',
    '8': 'Visiting Student',
    '9': 'Graduated',
    # letter codes
    'A': 'Transferred to a private school within the state',
    'B': 'Transferred to home school within the state',
    'C': 'Transferred to a community college',
    'D': 'Certificate recipient',
}

# Demographic/timing fields that are checked together for blank rows.
# NOTE(review): earlier notes quoted both 968 and 960 for the number of
# affected rows — confirm with mask_all_missing.sum().
vars_968 = [
    'aig', 'eds', 'effective_g9year', 'els', 'en_g9school',
    'init_collection', 'init_grade', 'init_year',
    'last_collection', 'last_year', 'mig', 'sex', 'swd',
]

# Fields described as having very high missingness (>80%) and no longer needed.
drop_columns = [
    'acchome_accepted', 'base_school', 'dc_g9year', 'ex_xfer_school',
    'els_year_exited', 'en_five_year', 'g9year_missing',
    'rc_g9year', 'auto_xfer_school',
]

# --- Transformations ---------------------------------------------------------

# Single exit-code field: take the effective code, falling back to the raw
# code wherever the effective one is missing.
exit['final_exit_code'] = exit['effective_exit_code'].fillna(exit['ex_exit_code'])

# Attach the description; codes absent from the lookup (including missing
# codes) are labelled 'Other'.
exit['exit_code_desc'] = exit['final_exit_code'].map(exit_code_map).fillna('Other')

# True for rows in which every one of the vars_968 fields is missing.
mask_all_missing = exit[vars_968].isna().all(axis=1)

# Keep the remaining rows, then remove both the two source code columns
# (now coalesced) and the sparse fields in a single pass.
exit = (
    exit.loc[~mask_all_missing]
        .drop(columns=['ex_exit_code', 'effective_exit_code'] + drop_columns)
        .copy()
)

# Write out the cleaned master file (overwrites the file of the same name).
exit.to_csv(
    path_or_buf="/Users/adamcartwright/NCERDC-MODEL/data/exit_master.csv",
    index=False,
)