In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat as stat
import seaborn as sns
from pathlib import Path

In [2]:
# Directory holding the yearly demographics extracts (also used for the output file).
# NOTE(review): absolute, machine-specific path — consider moving it to a config cell.
data_dir = Path("/Users/adamcartwright/ncerdc/Student Data/Demographics Data")

# School years to load; one file named "mergedemo_pub<year>.csv" is read per year.
DEMO_YEARS = range(2016, 2024)  # 2016 .. 2023 inclusive

# Literal strings that stand in for a missing value in the key columns.
MISSING_TOKENS = ['<NA>', 'nan']

# A row must have all of these to be kept.
KEY_COLS = ['mastid', 'year', 'lea', 'schlcode']

# Columns retained in the cleaned table (any row with a gap in one of them is dropped).
keep_cols = ['mastid', 'grade', 'ethnic', 'sex', 'eds', 'aig', 'swd', 'year']


def _read_demo_year(year):
    """Read one yearly demographics CSV and tag every row with its year.

    Column names are normalised (stripped, lower-cased, spaces -> underscores)
    per file, *before* concatenation, so that headers differing only in case or
    whitespace between years line up as one column instead of producing
    duplicate labels afterwards.
    """
    frame = pd.read_csv(data_dir / f"mergedemo_pub{year}.csv")
    frame.columns = (
        frame.columns
            .str.strip()
            .str.lower()
            .str.replace(" ", "_", regex=False)
    )
    frame["year"] = year
    return frame


def _clean_demographics(raw):
    """Return the cleaned demographics table built from the stacked yearly data.

    Steps:
      1. Coerce 'mastid' to numeric (non-numeric text, including the
         '<NA>'/'nan' sentinels, becomes NaN) and turn the sentinels in
         'lea'/'schlcode' into real NaN.
      2. Drop rows missing any key column (mastid, year, lea, schlcode).
      3. Keep only ``keep_cols`` and drop rows with a gap in any of them.
      4. Store 'mastid' as an integer and zero-pad purely numeric grades to
         two characters (e.g. '3' -> '03'); non-numeric grades are only
         whitespace-stripped.

    'lea' and 'schlcode' act purely as a row filter: they are not in
    ``keep_cols``, so no further formatting of them is performed.
    """
    keyed = raw.assign(
        mastid=pd.to_numeric(raw['mastid'], errors='coerce'),
        lea=raw['lea'].replace(MISSING_TOKENS, np.nan),
        schlcode=raw['schlcode'].replace(MISSING_TOKENS, np.nan),
    ).dropna(subset=KEY_COLS)

    trimmed = keyed.loc[:, keep_cols].dropna(subset=keep_cols)

    # Safe to cast now: every remaining mastid is a finite number.
    mastid_int = trimmed['mastid'].round(0).astype('int64')

    grade_txt = trimmed['grade'].astype(str).str.strip()
    is_numeric_grade = grade_txt.str.isdigit()
    grade_txt = grade_txt.mask(is_numeric_grade, grade_txt.str.zfill(2))

    # assign() builds a new frame, so nothing is written into a slice of `raw`.
    return trimmed.assign(mastid=mastid_int, grade=grade_txt)


demo_master = _clean_demographics(
    pd.concat([_read_demo_year(year) for year in DEMO_YEARS], ignore_index=True)
)


In [3]:
# Persist the cleaned demographics table as CSV inside data_dir (row index omitted).
clean_csv_path = data_dir / "demographics_clean.csv"
demo_master.to_csv(clean_csv_path, index=False)