In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat as stat
import seaborn as sns
from pathlib import Path

In [2]:
def fast_parse_dates(series):
    """Return the calendar year of each entry in ``series``.

    Parsing happens in up to two passes:

    1. A general ``pd.to_datetime`` parse with ``errors='coerce'``.
    2. Only if the first pass left missing values, a second parse with the
       explicit ``%m/%d/%Y`` format, used to fill those gaps.

    Parameters
    ----------
    series : pandas.Series or list-like
        Date-like values (typically strings). Anything that is not already a
        Series is wrapped in one so the ``.dt`` accessor is available.

    Returns
    -------
    pandas.Series
        The year of each parsed date, on the same index as the input. Entries
        that neither pass could parse are missing (NaN), which makes the
        result floating point whenever any entry is missing.
    """
    # pd.to_datetime returns a DatetimeIndex for plain lists, which has no
    # ``.dt`` accessor; normalising to a Series keeps one code path.
    if not isinstance(series, pd.Series):
        series = pd.Series(series)

    # General parse first; unparseable entries become NaT.
    parsed = pd.to_datetime(series, errors='coerce')

    # The fallback costs a second full parse, so skip it when nothing is missing.
    if parsed.isna().any():
        fallback = pd.to_datetime(series, format='%m/%d/%Y', errors='coerce')
        parsed = parsed.fillna(fallback)

    return parsed.dt.year

In [3]:
# Folder that holds the per-year GPA extracts.
data_dir = Path("/Users/adamcartwright/ncerdc/Student Data/GPA Data")

# Diagnostic pass: load every yearly file on its own and report, per file,
# whether each GPA column is present, how many non-null values it holds, and
# how many rows the file has before any filtering is applied.
yearly_files = {
    2018: "gpa2018.csv",
    2019: "gpa2019.csv",
    2020: "gpa2020.csv",
    2021: "gpa2021.csv",
    2022: "gpa2022.csv",
}

for yr, fname in yearly_files.items():
    df = pd.read_csv(data_dir / fname, low_memory=False)
    print(f"\n--- Year {yr} ({fname}) ---")

    # a) Presence and non-null count of each GPA column
    #    (a missing column is reported with a count of 0 rather than raising).
    for col in ("gpa_unweighted", "gpa_weighted"):
        present = col in df.columns
        filled = df[col].notna().sum() if present else 0
        print(f"  {col}: exists? {'yes' if present else 'NO'}; "
              f" non-null count = {filled}")

    # b) Raw row count of the file.
    total_rows = len(df)
    print("  total rows:", total_rows)


--- Year 2018 (gpa2018.csv) ---
  gpa_unweighted: exists? yes;  non-null count = 104783
  gpa_weighted: exists? yes;  non-null count = 105808
  total rows: 107550

--- Year 2019 (gpa2019.csv) ---
  gpa_unweighted: exists? yes;  non-null count = 105882
  gpa_weighted: exists? yes;  non-null count = 107120
  total rows: 109151

--- Year 2020 (gpa2020.csv) ---
  gpa_unweighted: exists? yes;  non-null count = 105954
  gpa_weighted: exists? yes;  non-null count = 106775
  total rows: 109157

--- Year 2021 (gpa2021.csv) ---
  gpa_unweighted: exists? yes;  non-null count = 103111
  gpa_weighted: exists? yes;  non-null count = 103969
  total rows: 106025

--- Year 2022 (gpa2022.csv) ---
  gpa_unweighted: exists? yes;  non-null count = 98401
  gpa_weighted: exists? yes;  non-null count = 99190
  total rows: 101956


In [7]:

# Build gpa_clean.csv: one row per (mastid, year) with a school identifier,
# both GPAs and both class ranks, taken from the five yearly GPA extracts.

# Folder holding the yearly extracts; the cleaned file is written back here.
# NOTE(review): this is a machine-specific absolute path - adjust before
# running on another machine.
data_dir = Path("/Users/adamcartwright/ncerdc/Student Data/GPA Data")


def _read_gpa_year(fname, year):
    """Read one yearly extract from ``data_dir`` and add a constant ``year`` column."""
    frame = pd.read_csv(data_dir / fname, low_memory=False)
    frame["year"] = year
    return frame


def _strip_bytes_repr(value):
    """Turn the text ``"b'abc'"`` into ``"abc"``; return any other value unchanged."""
    if isinstance(value, str) and value.startswith("b'") and value.endswith("'"):
        return value[2:-1]
    return value


def _code_to_str(value):
    """Render an identifier code as text without a spurious trailing ``.0``.

    pandas reads a numeric column that contains missing values as float, so a
    code such as 304 arrives as 304.0 and a plain ``str()`` would produce
    ``"304.0"``. Integral floats are rendered as integers; every other value
    is passed through ``str()`` unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# 1) Read each year's file and tag it with 'year'
gpa18 = _read_gpa_year("gpa2018.csv", 2018)
gpa19 = _read_gpa_year("gpa2019.csv", 2019)
gpa20 = _read_gpa_year("gpa2020.csv", 2020)
gpa21 = _read_gpa_year("gpa2021.csv", 2021)
gpa22 = _read_gpa_year("gpa2022.csv", 2022)

# 2) Concatenate all years
gpa_master = pd.concat([gpa18, gpa19, gpa20, gpa21, gpa22], ignore_index=True)

# 3) Decode any byte-strings: object columns may hold text of the form
#    "b'...'"; strip that wrapper and leave every other value alone
for col in gpa_master.columns:
    if gpa_master[col].dtype == object:
        gpa_master[col] = gpa_master[col].map(_strip_bytes_repr)

# 4) Drop rows missing 'lea' or 'schlcode', then zero-pad 'schlcode' to three
#    characters. assign() returns a new frame, so no column is written into
#    the result of dropna() (which can raise SettingWithCopyWarning).
gpa_master = gpa_master.dropna(subset=['lea', 'schlcode'])
gpa_master = gpa_master.assign(
    schlcode=gpa_master['schlcode'].map(_code_to_str).astype(str).str.zfill(3)
)

# 5) Create 'school_id' as "<lea>-<schlcode>"
gpa_master = gpa_master.assign(
    school_id=(
        gpa_master['lea'].map(_code_to_str).astype(str)
        + '-'
        + gpa_master['schlcode']
    )
)

# 6) Drop rows missing the absolutely required fields: 'year', 'mastid',
#    'gpa_unweighted', or 'gpa_weighted'
# 7) ...and convert 'year' and 'mastid' to integers
gpa_master = (
    gpa_master
    .dropna(subset=['year', 'mastid', 'gpa_unweighted', 'gpa_weighted'])
    .assign(
        year=lambda d: d['year'].round(0).astype(int),
        mastid=lambda d: d['mastid'].round(0).astype(int),
    )
)

# 8) Keep only rows whose GPAs are in range: unweighted in (0, 4] and
#    weighted in (0, 6]; a GPA of exactly 0.0 is dropped
gpa_in_range = (
    (gpa_master["gpa_unweighted"] >  0.0) &
    (gpa_master["gpa_weighted"]   >  0.0) &
    (gpa_master["gpa_unweighted"] <= 4.0) &
    (gpa_master["gpa_weighted"]   <= 6.0)
)
gpa_master = gpa_master[gpa_in_range]

# 9) Define core columns to keep (now excluding 'bound_for')
keep_cols = [
    'mastid',
    'school_id',
    'year',
    'gpa_weighted',
    'gpa_unweighted',
    'rank_weighted',
    'rank_unweighted'
]

gpa_core = (
    gpa_master[keep_cols]
    # 10)-11) Subset to the core columns and drop rows where any is NaN
    .dropna(subset=keep_cols)
    .assign(
        # 12) Convert rank columns from float to int
        rank_unweighted=lambda d: d['rank_unweighted'].round(0).astype(int),
        rank_weighted=lambda d: d['rank_weighted'].round(0).astype(int),
        # 13) Round GPAs to two decimals
        gpa_unweighted=lambda d: d['gpa_unweighted'].round(2),
        gpa_weighted=lambda d: d['gpa_weighted'].round(2),
    )
    # 14) Keep the first row seen for each (mastid, year) pair
    .drop_duplicates(subset=['mastid', 'year'], keep='first')
    # 15) Reset the index
    .reset_index(drop=True)
)

# 16) Save the deduplicated file next to the inputs
output_path = data_dir / "gpa_clean.csv"
gpa_core.to_csv(output_path, index=False)