In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat as stat
import seaborn as sns
from pathlib import Path

In [2]:
def final_mark_to_letter(mark):
    """
    Convert a final_mark (numeric 0–100 or a letter) into a standardized letter grade:
      - If it can be cast to float, map 90–100→"A", 80–89→"B", 70–79→"C", 60–69→"D", <60→"F"
      - Otherwise, assume it’s already a letter string (e.g. "A", "B+", "C-", etc.)
    """
    try:
        score = float(mark)
        if score >= 90:
            return "A"
        elif score >= 80:
            return "B"
        elif score >= 70:
            return "C"
        elif score >= 60:
            return "D"
        else:
            return "F"
    except:
        return str(mark).strip().upper()

base_points_map = {
    "A+": 4.0, "A": 4.0, "A-": 4.0,
    "B+": 3.0, "B": 3.0, "B-": 3.0,
    "C+": 2.0, "C": 2.0, "C-": 2.0,
    "D+": 1.0, "D": 1.0, "D-": 1.0,
    "F": 0.0
}

def extra_weight(level):
    """
    Map academic_level_desc codes to extra GPA weight:
      - 5 → Honors/Advanced/AIG   → +0.5
      - 7 → Advanced Placement     → +1.0
      - 8 → International Baccalaureate → +1.0
      - Otherwise → 0.0
    """
    try:
        lvl = int(level)
        if lvl == 5:
            return 0.5
        elif lvl in (7, 8):
            return 1.0
        else:
            return 0.0
    except:
        return 0.0




In [3]:
years = range(2017, 2024)  # 2017, 2018, 2019, 2020, 2021, 2022, 2023
all_cleaned = []

for yr in years:
    # 2.1) Load that year’s transcript file
    fname = f"/Users/adamcartwright/ncerdc/Student Data/Transcripts Data/transcripts{yr}.csv"
    df = pd.read_csv(fname)
    
    # 2.2) Normalize column names
    df.columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_")
    )
    
    # 2.3) Extract calendar year from completed_date (format “M/D/YYYY” or “MM/DD/YYYY”)
    #      Create a new integer column named 'year' with just the year.
    df["completed_date"] = pd.to_datetime(df["completed_date"], errors="coerce")
    df["year"] = df["completed_date"].dt.year.astype(int)
    
    # 2.4) Convert the student’s grade into an integer (if the transcripts file has a 'grade' column)
    #      Adjust "grade" field name if different in your CSV.
    df["grade_int"] = pd.to_numeric(df["grade"], errors="coerce").astype("Int64")
    
    # 2.5) Keep only high‐school courses (grades 9, 10, 11, 12) and courses that count toward GPA
    df = df[
        (df["grade_int"].between(9, 12)) &
        (df["include_in_gpa"] == "Y")
    ].copy()
    
    # 2.6) Create unique school_id by combining LEA + zero‐padded schlcode
    df["lea"] = df["lea"].astype(str).str.strip()
    df["schlcode_str"] = df["schlcode"].astype(str).str.zfill(3)
    df["school_id"] = df["lea"] + "-" + df["schlcode_str"]
    df.drop(columns=["schlcode_str"], inplace=True)
    
    # 2.7) Convert final_mark → letter_grade
    df["letter_grade"] = df["final_mark"].apply(final_mark_to_letter)
    
    # 2.8) Map letter_grade → base_grade_point
    df["base_grade_point"] = df["letter_grade"].map(base_points_map)
    
    # 2.9) Compute extra weighting for Honors/AP/IB
    df["extra_gpa_weight"] = df["academic_level_desc"].apply(extra_weight)
    
    # 2.10) Compute weighted grade point (cap at 5.0)
    df["weighted_grade_point"] = (
        df["base_grade_point"] + df["extra_gpa_weight"]
    ).clip(upper=5.0)
    
    # 2.11) Parse and choose credit values
    df["credit_value_earned"] = pd.to_numeric(df["credit_value_earned"], errors="coerce")
    df["credit_value"]        = pd.to_numeric(df["credit_value"], errors="coerce")
    df["credits_for_calc"]    = df["credit_value_earned"].fillna(df["credit_value"])
    
    # 2.12) Compute per‐course quality points
    df["quality_points_unweighted"] = df["base_grade_point"] * df["credits_for_calc"]
    df["quality_points_weighted"]   = df["weighted_grade_point"] * df["credits_for_calc"]
    
    # 2.13) Keep only columns needed forward
    #         (mastid, school_id, year, credits_for_calc, quality_points_unweighted, quality_points_weighted)
    df_small = df[
        ["mastid", "school_id", "year",
         "credits_for_calc",
         "quality_points_unweighted",
         "quality_points_weighted"]
    ].copy()
    
    all_cleaned.append(df_small)

# 2.14) Concatenate all years
transcripts_all = pd.concat(all_cleaned, ignore_index=True)


# --------------------------------------------------------------------------------
# 3) Compute cumulative GPA by student, school, year
# --------------------------------------------------------------------------------

# 3.1) Sort by mastid then year
transcripts_all = transcripts_all.sort_values(["mastid", "year"])

# 3.2) Compute cumulative sums for each student
transcripts_all[["csum_credits", 
                 "csum_qp_unweighted",
                 "csum_qp_weighted"]] = (
    transcripts_all
    .groupby("mastid")[
        ["credits_for_calc",
         "quality_points_unweighted",
         "quality_points_weighted"]
    ]
    .cumsum()
)

# 3.3) Compute cumulative GPA through that year
transcripts_all["gpa_unweighted_cum"] = (
    transcripts_all["csum_qp_unweighted"] / transcripts_all["csum_credits"]
).round(3)
transcripts_all["gpa_weighted_cum"] = (
    transcripts_all["csum_qp_weighted"] / transcripts_all["csum_credits"]
).round(3)

# 3.4) Collapse to one row per (mastid, school_id, year)
gpa_yearly = (
    transcripts_all
    .groupby(["mastid", "school_id", "year"], as_index=False)
    .agg({
        "gpa_unweighted_cum": "last",
        "gpa_weighted_cum":   "last",
        "csum_credits":       "last"
    })
    .rename(columns={
        "gpa_unweighted_cum": "gpa_unweighted",
        "gpa_weighted_cum":   "gpa_weighted",
        "csum_credits":       "total_credits_cum"
    })
)

# 3.5) Convert GPA columns to numeric, coercing empty/invalid strings to NaN
gpa_yearly["gpa_unweighted"] = pd.to_numeric(
    gpa_yearly["gpa_unweighted"], errors="coerce"
)
gpa_yearly["gpa_weighted"] = pd.to_numeric(
    gpa_yearly["gpa_weighted"], errors="coerce"
)

# 3.6) Drop any rows where gpa_unweighted is NaN (captures empty GPAs)
gpa_yearly = gpa_yearly.dropna(subset=["gpa_unweighted"]).copy()

# 3.7) Convert mastid from float to int
gpa_yearly["mastid"] = gpa_yearly["mastid"].astype(int)


# --------------------------------------------------------------------------------
# 4) Save the cumulative GPA table and display a sample
# --------------------------------------------------------------------------------

output_path = "/Users/adamcartwright/ncerdc/Student Data/Transcripts Data/gpa_cumulative_2017_2023.csv"
gpa_yearly.to_csv(output_path, index=False)

print(f"Saved cumulative GPA (2017–2023, grades 9–12 only) to:\n  {output_path}\n")
print("Sample rows from gpa_cumulative_2017_2023.csv:")
print(gpa_yearly.head(12))

  df["completed_date"] = pd.to_datetime(df["completed_date"], errors="coerce")
  df = pd.read_csv(fname)


Saved cumulative GPA (2017–2023, grades 9–12 only) to:
  /Users/adamcartwright/ncerdc/Student Data/Transcripts Data/gpa_cumulative_2017_2023.csv

Sample rows from gpa_cumulative_2017_2023.csv:
     mastid school_id  year  gpa_unweighted  gpa_weighted  total_credits_cum
0    414677   400-308  2017           4.000         4.000                6.0
1   1617842   340-454  2017           2.333         3.083                6.0
2   2128630   020-302  2017           4.000         4.000                8.5
3   2154422   320-365  2017           3.000         3.000                4.0
4   2156410   340-336  2017           3.667         3.667                6.0
5   2181391   600-415  2017           4.000         4.000                2.0
6   2183833   600-508  2017           3.500         3.500                2.0
7   2186086   600-302  2017           4.000         4.000                1.0
8   2193716   650-326  2017           3.400         3.400                5.0
9   3587675   080-312  2017          

In [4]:
# 1) Load the cumulative‐GPA table (grades 9–12 only)
gpa_yearly = pd.read_csv("/Users/adamcartwright/ncerdc/Student Data/GPA Data/gpa_cumulative_2017_2023.csv")

# 2) Compute how many distinct school years each student has
year_counts = (
    gpa_yearly
    .groupby("mastid")["year"]
    .nunique()
    .reset_index(name="n_years")
)

# 3) Keep only students with at least four distinct years (i.e., a full 9–12 sequence)
#    If you want exactly four calendar years, use == 4; if “at least” four, use >= 4.
full_4yr_mastids = year_counts.loc[year_counts["n_years"] == 4, "mastid"]

# 4) Filter the GPA table to only those mastids
gpa_full4 = gpa_yearly.loc[gpa_yearly["mastid"].isin(full_4yr_mastids)].copy()

# 5) (Optional) Re‐verify that each remaining student truly has four rows
check = (
    gpa_full4
    .groupby("mastid")["year"]
    .nunique()
    .reset_index(name="n_years")
)
print("Value counts of n_years among filtered students:")
print(check["n_years"].value_counts().sort_index())

# 6) Save the filtered result
gpa_full4.to_csv("/Users/adamcartwright/ncerdc/Student Data/GPA Data/gpa_trajectories.csv", index=False)


Value counts of n_years among filtered students:
n_years
4    392971
Name: count, dtype: int64
