In [31]:
import pandas as pd
import numpy as np
import os
import re

# Load one SAS transcript extract per school year (2018-2023) and stack them
# into a single `transcripts` frame.
#
# The yearly frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration.
yearly_frames = []
for year in range(18, 24):  # two-digit years 18..23 (the upper bound is exclusive)
    filename = f"/Users/danielmatten/Desktop/m/transcripts20{year}.sas7bdat"
    try:
        # Read SAS file
        df = pd.read_sas(filename)
    except FileNotFoundError:
        # A missing year is tolerated: report it and carry on with the rest.
        print(f"File {filename} not found. Skipping.")
        continue

    # Lower-case the column names so later cells can rely on one spelling.
    # (No column selection happens here, so there is no KeyError to handle.)
    df.columns = df.columns.str.lower()

    yearly_frames.append(df)
    print(f"{year} done")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file could be read.
transcripts = (
    pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()
)
def clean_bytes(val):
    """Return ``val`` as text if it is a ``bytes`` object, otherwise unchanged.

    The bytes are decoded rather than sliced out of ``str(val)``: slicing the
    repr turns every non-ASCII byte and every backslash into literal escape
    text (for example ``\\xc3\\xa9`` instead of the accented character).

    Parameters
    ----------
    val : any
        A single cell value.

    Returns
    -------
    str or the original object
        UTF-8 decoded text for ``bytes`` input; if the bytes are not valid
        UTF-8 they are decoded as Latin-1, which accepts every byte value.
        Non-bytes input is returned as-is.
    """
    if isinstance(val, bytes):
        try:
            return val.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps all 256 byte values, so this fallback cannot fail.
            return val.decode("latin-1")
    return val
# Drop columns that are not needed downstream.
# `completed_date` is intentionally kept in memory: the GPA cell further down
# derives the school year from transcripts["completed_date"]. It is excluded
# from the missing-value filter and from the CSV below, so the exported file
# has the same rows and columns as before.
transcripts = transcripts.drop(
    columns=['semester', 'how_taken_desc', 'course_length_desc', 'school', 'course_code']
)

# Drop rows where 'mastid' is NaN
transcripts = transcripts.dropna(subset=['mastid'])
print("nas")

# Character columns read from SAS hold bytes; decode them to str.
# Only object-dtype columns can contain bytes, so numeric/datetime columns are
# skipped instead of being passed through a per-element Python call.
for col in transcripts.columns:
    if transcripts[col].dtype == object:
        transcripts[col] = transcripts[col].apply(clean_bytes)
    print(col)

# Drop rows with a missing value in any exported column.
export_cols = [c for c in transcripts.columns if c != 'completed_date']
transcripts = transcripts.dropna(subset=export_cols)

transcripts.drop(columns=['completed_date']).to_csv("../data/transcripts_master.csv", index=False)

18 done
19 done
20 done
21 done
22 done
23 done
nas
lea
schlcode
course_desc
grade
credit_value
credit_value_earned
final_mark
academic_level_desc
extra_gpa_point
include_in_gpa
include_in_honour_role
include_in_ranking
mastid


In [None]:
# ─── GPA HELPER FUNCTIONS ───────────────────────────────────────────────────────

def final_mark_to_letter(mark):
    """Convert a final mark into a letter grade.

    Parameters
    ----------
    mark : number or str
        Either a numeric score (or a string that parses as one) or a letter
        grade such as "A", "b+" or "C-".

    Returns
    -------
    str or None
        * Numeric input: "A" for >= 90, "B" for >= 80, "C" for >= 70,
          "D" for >= 60, otherwise "F".
        * Non-numeric input: the value as a stripped, upper-cased string.
        * Missing input (``None`` or NaN): ``None``. A NaN score fails every
          ``>=`` comparison, so without this check it would be graded "F".
    """
    if mark is None:
        return None
    try:
        score = float(mark)
    except (TypeError, ValueError):
        # Not a number: assume it is already a letter grade.
        return str(mark).strip().upper()

    if score != score:  # NaN is the only value that is not equal to itself
        return None
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    if score >= 60:
        return "D"
    return "F"

# Unweighted grade points per letter grade. A "+" or "-" modifier does not
# change the points, so each variant shares its base letter's value.
base_points_map = {
    f"{letter}{modifier}": points
    for letter, points in (("A", 4.0), ("B", 3.0), ("C", 2.0), ("D", 1.0))
    for modifier in ("+", "", "-")
}
base_points_map["F"] = 0.0

def extra_weight(level):
    """Map an academic-level code to the extra GPA weight it earns.

      - 5 (Honors/Advanced/AIG)            -> +0.5
      - 7 (Advanced Placement)             -> +1.0
      - 8 (International Baccalaureate)    -> +1.0
      - anything else, or not a number     ->  0.0

    Parameters
    ----------
    level : number or str
        The level code. Numeric strings are accepted in either integer or
        float form ("5" and "5.0" both count as level 5).

    Returns
    -------
    float
        The weight to add to the base grade point.
    """
    try:
        # float() rather than int(): int("5.0") raises, which would silently
        # drop the weight for codes that were stored as floats.
        lvl = float(level)
    except (TypeError, ValueError):
        return 0.0

    # Comparing the float directly means NaN, infinities and fractional
    # values simply match nothing and fall through to 0.0.
    if lvl == 5:
        return 0.5
    if lvl in (7, 8):
        return 1.0
    return 0.0

# ─── 1) Copy & normalize ────────────────────────────────────────────────────────
df = transcripts.copy()
if "completed_date" not in df.columns:
    # The school year is derived from completed_date; fail with an actionable
    # message instead of an opaque KeyError on the next line.
    raise KeyError(
        "transcripts has no 'completed_date' column; it must be kept when the "
        "transcripts are loaded so the school year can be derived"
    )
# ensure datetime (unparseable dates become NaT and therefore a missing year)
df["completed_date"] = pd.to_datetime(df["completed_date"], errors="coerce")
df["year"]           = df["completed_date"].dt.year.astype("Int64")

# ─── 2) Filter to HS grades & GPA-eligible ───────────────────────────────────────
df["grade_int"] = pd.to_numeric(df["grade"], errors="coerce").astype("Int64")
df = df.loc[
    df["grade_int"].between(9, 12) &
    (df["include_in_gpa"] == "Y")
].copy()

# ─── 3) GPA points & quality points ─────────────────────────────────────────────
# letter → base points → extra weight → capped weighted points
df["letter_grade"]         = df["final_mark"].apply(final_mark_to_letter)
df["base_grade_point"]     = df["letter_grade"].map(base_points_map)
df["extra_gpa_weight"]     = df["academic_level_desc"].apply(extra_weight)
df["weighted_grade_point"] = (
    df["base_grade_point"] + df["extra_gpa_weight"]
).clip(upper=5.0)

# credits: earned credits when recorded, otherwise the course's credit value
# NOTE(review): if failed courses are recorded with 0 earned credits they get
# zero weight below and cannot lower the GPA — confirm whether `credit_value`
# (attempted credits) should be the weight instead.
df["credit_value_earned"] = pd.to_numeric(df["credit_value_earned"], errors="coerce")
df["credit_value"]        = pd.to_numeric(df["credit_value"], errors="coerce")
df["credits_for_calc"]    = df["credit_value_earned"].fillna(df["credit_value"])

# Marks with no entry in base_points_map map to NaN. Such rows contribute no
# quality points, so their credits must not be counted in the GPA denominator.
df = df.dropna(subset=["base_grade_point", "credits_for_calc"])

# quality points
df["qp_unweighted"] = df["base_grade_point"]     * df["credits_for_calc"]
df["qp_weighted"]   = df["weighted_grade_point"] * df["credits_for_calc"]

# ─── 4) Cumulative sums & GPAs ──────────────────────────────────────────────────
# Sorted by year within student so that the last row of each (student, year)
# group carries the running total through the end of that year.
df = df.sort_values(["mastid", "year"])
running = df.groupby("mastid")[["credits_for_calc", "qp_unweighted", "qp_weighted"]].cumsum()
df["csum_credits"] = running["credits_for_calc"]
df["csum_qp_uw"]   = running["qp_unweighted"]   # unweighted total uses unweighted points
df["csum_qp_w"]    = running["qp_weighted"]

df["gpa_unweighted_cum"] = (df["csum_qp_uw"] / df["csum_credits"]).round(3)
df["gpa_weighted_cum"]   = (df["csum_qp_w"]  / df["csum_credits"]).round(3)

# ─── 5) Collapse to one row per student/year ─────────────────────────────────────
gpa_yearly = (
    df.groupby(["mastid","year"], as_index=False)
      .agg({
         "gpa_unweighted_cum":"last",
         "gpa_weighted_cum":"last",
         "csum_credits":"last"
      })
      .rename(columns={
         "csum_credits":"total_credits_cum"
      })
)

# clean up types & drop empties
gpa_yearly["mastid"] = gpa_yearly["mastid"].astype(int)
gpa_yearly = gpa_yearly.dropna(subset=["gpa_unweighted_cum"])

# ─── 6) Save ────────────────────────────────────────────────────────────────────
# Written next to the other outputs; a later cell reloads it from this path.
out_path = "../data/transcripts_gpa_by_year.csv"
gpa_yearly.to_csv(out_path, index=False)

In [25]:

# Academic-level code (the first character of the raw value) → description
conversion_map = {
    '0': 'Modified Curriculum',
    '1': 'Abridged/Adapted (Remedial)',
    '2': 'Standard Version',
    '5': 'Honors/Advanced/Academically Gifted',
    '6': 'Co-op Education',
    '7': 'Advanced Placement',
    '8': 'International Baccalaureate',
    '9': 'Non-Classroom Activity'
}

def convert_to_description(val):
    """Translate an academic-level code into its description.

    The value is converted to a stripped string and its first character is
    looked up in ``conversion_map``.

    Parameters
    ----------
    val : any scalar
        The raw academic-level value.

    Returns
    -------
    The mapped description when the first character is a known code;
    otherwise the stripped string itself (including the empty string).
    Missing values (``None``/NaN) are returned unchanged so they stay
    recognisable as missing instead of becoming the text "None" or "nan".
    """
    if pd.isna(val):
        return val
    val = str(val).strip()
    return conversion_map.get(val[0], val) if val else val

# Replace the raw academic-level codes with their descriptions
transcripts['academic_level_desc'] = transcripts['academic_level_desc'].apply(convert_to_description)

# Keep only courses that count towards the GPA. The explicit .copy() makes the
# result an independent frame, so later column assignments on `transcripts`
# do not operate on a slice (SettingWithCopyWarning).
transcripts = transcripts[transcripts['include_in_gpa'] == 'Y'].copy()

# Representative numeric score for each letter grade; codes that carry no
# score map to None.
letter_to_score = {
    'A+': 98,
    'A': 95,
    'A-': 91,
    'B+': 88,
    'B': 85,
    'B-': 81,
    'C+': 78,
    'C': 75,
    'C-': 71,
    'D+': 68,
    'D': 65,
    'D-': 61,
    'F': 50,
    'P': None,   # Pass/fail — handle as missing or special flag
    'W': None,   # Withdrawn — usually not a valid mark
    'INC': None, # Incomplete
    'EX': None   # Exempt
}

def convert_final_mark(val):
    """Convert a final mark to a numeric score.

    Parameters
    ----------
    val : number, str or missing
        A numeric mark, a string holding a number, or a letter grade.

    Returns
    -------
    float, int or None
        * the value as a float when it parses as a number within 0..100;
        * the ``letter_to_score`` entry for a letter grade (matched after
          stripping whitespace and upper-casing);
        * ``None`` for missing input, numbers outside 0..100, and anything
          not found in ``letter_to_score``.
    """
    if pd.isna(val):
        return None

    try:
        # Try converting to float (for numeric values)
        num = float(val)
    except (TypeError, ValueError, OverflowError):
        # Not numeric — fall through to the letter-grade lookup. Only
        # conversion errors are caught, so unrelated failures are not hidden.
        pass
    else:
        if 0 <= num <= 100:
            return num

    # Handle letter grades (out-of-range numbers also land here and miss)
    val_str = str(val).strip().upper()
    return letter_to_score.get(val_str, None)

# Convert every mark to a numeric score, then drop the rows that have none.
# dropna returns a new frame, so its result has to be assigned; otherwise the
# rows without a usable mark stay in and take up class_rank slots below.
transcripts = transcripts.assign(
    final_mark=transcripts['final_mark'].apply(convert_final_mark)
).dropna(subset=['final_mark'])

# Maximum number of courses kept per student in the pivoted output
max_classes = 50

# Sort and rank: number each student's courses 1, 2, 3, ... in sorted order
# NOTE(review): if `grade` holds strings, "10" sorts before "9" — confirm the
# dtype or cast to a number before sorting.
transcripts_sorted = transcripts.sort_values(by=['mastid', 'grade', 'academic_level_desc'])
transcripts_sorted['class_rank'] = transcripts_sorted.groupby('mastid').cumcount() + 1

# Trim long histories
transcripts_trimmed = transcripts_sorted[transcripts_sorted['class_rank'] <= max_classes]

# Pivot helper
def pivot_feature(df, colname):
    """Spread one column of a long course table into one column per class rank.

    Parameters
    ----------
    df : DataFrame
        Must contain 'mastid', 'class_rank' and ``colname``.
    colname : str
        The column whose values fill the wide table.

    Returns
    -------
    DataFrame
        Indexed by 'mastid', with columns named ``"<colname>_<class_rank>"``.
    """
    wide = df.pivot(index='mastid', columns='class_rank', values=colname)
    labels = [f"{colname}_{rank}" for rank in wide.columns]
    return wide.set_axis(labels, axis=1)

# One wide frame per feature, keyed by the feature's column name
features = {
    name: pivot_feature(transcripts_trimmed, name)
    for name in ('course_desc', 'grade', 'academic_level_desc', 'final_mark')
}

# Per-student base info: first lea/schlcode seen in the full (untrimmed) transcripts
student_info = transcripts.groupby('mastid')[['lea', 'schlcode']].first()

# Left-join every feature frame onto the base info by the mastid index
student_df = student_info.copy()
for wide_frame in features.values():
    student_df = pd.merge(
        student_df, wide_frame, how='left', left_index=True, right_index=True
    )

# Turn mastid back into a regular column and write the ML-ready table
student_df = student_df.reset_index()
student_df.to_csv("../data/transcripts_pivoted.csv", index=False)

In [7]:
# Load the per-student, per-year GPA table from disk
# (presumably the file written by the GPA cell — confirm the location matches).
gpa_yearly = pd.read_csv(filepath_or_buffer="../data/transcripts_gpa_by_year.csv")

In [8]:
# Keep only rows whose year is after 2016
is_after_2016 = gpa_yearly['year'] > 2016
gpa_yearly = gpa_yearly[is_after_2016]

In [25]:
# Attach the exit-list columns to the yearly GPA rows. The inner join already
# keeps only students that appear in the exit list, so no further
# `isin(exit_ids['mastid'])` filter is needed; that filter changed nothing and
# only turned `filtered` into a slice of the merge result.
exit_ids = pd.read_csv('../data/exit_list.csv')
filtered = gpa_yearly.merge(exit_ids, on='mastid', how='inner')

In [26]:
# Grade level = years elapsed since the student's effective 9th-grade year, plus 9.
# The source columns (and the cumulative credit total) are dropped afterwards.
filtered = (
    filtered
    .assign(grade=filtered['year'] - filtered['effective_g9year'] + 9)
    .drop(columns=['year', 'effective_g9year', 'total_credits_cum'])
)



In [27]:
# One row per student, one column per (GPA measure, grade) pair
pivot_df = filtered.pivot(
    index='mastid',
    columns='grade',
    values=['gpa_unweighted_cum', 'gpa_weighted_cum'],
)

# Collapse the two-level column labels into "<measure>_grade_<n>" and
# bring mastid back as an ordinary column
pivot_df = pivot_df.set_axis(
    [f'{measure}_grade_{int(grade)}' for measure, grade in pivot_df.columns],
    axis=1,
).reset_index()


In [28]:
# A student is kept only if an unweighted cumulative GPA exists for every
# one of these grades.
required_grades = [9, 10, 11, 12]
required_cols = ['gpa_unweighted_cum_grade_%d' % grade for grade in required_grades]

pivot_df = pivot_df.dropna(axis=0, how='any', subset=required_cols)


In [29]:
# Keep the student id plus the cumulative GPA columns for grades 9-12.
# Selecting the wanted columns (rather than dropping a hand-written list of
# unwanted grades) does not fail when one of those other grades is absent and
# does not let an unlisted grade slip through.
hs_gpa_cols = [
    f'{measure}_grade_{grade}'
    for measure in ('gpa_unweighted_cum', 'gpa_weighted_cum')
    for grade in (9, 10, 11, 12)
]
pivot_df = pivot_df[['mastid'] + hs_gpa_cols].copy()

In [30]:
# Write the one-row-per-student GPA table, without the positional index
pivot_df.to_csv(path_or_buf="../data/transcripts_mastid.csv", index=False)