In [23]:
import pandas as pd
import numpy as np
import os
import re

# Load one SAS transcript extract per year (2018 through 2023) and stack them
# into a single DataFrame called `transcripts`.
#
# Each year's frame is collected in a list and concatenated once at the end;
# concatenating inside the loop would re-copy the accumulated rows on every
# iteration.
yearly_frames = []
for year in range(18, 24):  # 18..23 inclusive; range() excludes the stop value
    filename = f"/Users/danielmatten/Desktop/m/transcripts20{year}.sas7bdat"
    try:
        # Read SAS file
        df = pd.read_sas(filename)

        # Lower-case the column names so the cells below can refer to them
        # with a single, consistent spelling.
        df.columns = df.columns.str.lower()

        yearly_frames.append(df)
        print(f"{year} done")

    except FileNotFoundError:
        print(f"File {filename} not found. Skipping.")
    except KeyError:
        # NOTE(review): no column selection happens in this loop, so this
        # handler is kept only for backward compatibility with the original
        # best-effort behaviour.
        print(f"Required columns not found in {filename}. Skipping.")

# pd.concat() raises on an empty list, so fall back to an empty frame when no
# file could be loaded.
if yearly_frames:
    transcripts = pd.concat(yearly_frames, ignore_index=True)
else:
    transcripts = pd.DataFrame()
def clean_bytes(val):
    """Return ``val`` decoded to ``str`` if it is ``bytes``; otherwise return it unchanged.

    The bytes are decoded rather than sliced out of ``str(val)``: the repr of
    a bytes object escapes non-ASCII bytes, backslashes, tabs and some quotes,
    so slicing ``"b'...'"`` leaves those escape sequences in the data.

    Parameters
    ----------
    val : object
        A single cell value.

    Returns
    -------
    object
        The decoded text for ``bytes`` input, ``val`` itself for anything else.
    """
    if isinstance(val, bytes):
        # NOTE(review): assumes the text is UTF-8 — confirm the encoding of
        # the SAS export.
        try:
            return val.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this cannot fail.
            return val.decode("latin-1")
    return val
# Remove the columns that the rest of this notebook does not use.
unused_columns = [
    'semester',
    'how_taken_desc',
    'course_length_desc',
    'school',
    'completed_date',
    'course_code',
]
transcripts.drop(columns=unused_columns, inplace=True)

# A dedicated dropna on 'mastid' is intentionally not performed here; the
# blanket dropna() further down removes rows with a missing value in any
# column.
print("nas")

# Run every column through clean_bytes so bytes values become text, printing
# each column name as it is finished.
for col in transcripts.columns:
    cleaned_column = transcripts[col].apply(clean_bytes)
    transcripts[col] = cleaned_column
    print(col)

# Discard every row that has at least one missing value.
transcripts.dropna(inplace=True)

# Persist the cleaned long-format table.
transcripts.to_csv("../data/transcripts_master.csv", index=False)

KeyboardInterrupt: 

In [25]:

# Lookup from the first character of a raw academic-level value to its
# human-readable description. Characters without an entry ('3', '4', letters)
# are deliberately absent; convert_to_description leaves such values as-is.
conversion_map = {
    '0': 'Modified Curriculum',
    '1': 'Abridged/Adapted (Remedial)',
    '2': 'Standard Version',
    '5': 'Honors/Advanced/Academically Gifted',
    '6': 'Co-op Education',
    '7': 'Advanced Placement',
    '8': 'International Baccalaureate',
    '9': 'Non-Classroom Activity'
}


def convert_to_description(val):
    """Translate a raw academic-level value into its description.

    The value is stringified and stripped, and its first character is looked
    up in ``conversion_map``.

    Parameters
    ----------
    val : object
        A single cell value.

    Returns
    -------
    object
        * ``val`` unchanged when it is missing (``None``/``NaN``), so missing
          data is not turned into the literal text ``'nan'`` or ``'None'``;
        * the mapped description when the first character is a known key;
        * otherwise the stripped string form of ``val`` (this also makes the
          function safe to apply twice to the same column).
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if not text:
        return text
    return conversion_map.get(text[0], text)

# Replace each raw academic-level value with its description.
transcripts['academic_level_desc'] = transcripts['academic_level_desc'].apply(convert_to_description)

# Keep only the rows flagged 'Y' in include_in_gpa. The explicit .copy() makes
# the filtered result an independent frame, so the column assignments that
# follow (e.g. to 'final_mark') do not trigger SettingWithCopyWarning.
transcripts = transcripts[transcripts['include_in_gpa'] == 'Y'].copy()

# Numeric score assigned to each letter mark. Marks mapped to None carry no
# score and end up as missing values.
letter_to_score = {
    'A+': 98,
    'A': 95,
    'A-': 91,
    'B+': 88,
    'B': 85,
    'B-': 81,
    'C+': 78,
    'C': 75,
    'C-': 71,
    'D+': 68,
    'D': 65,
    'D-': 61,
    'F': 50,
    'P': None,   # Pass/fail — handle as missing or special flag
    'W': None,   # Withdrawn — usually not a valid mark
    'INC': None, # Incomplete
    'EX': None   # Exempt
}


def convert_final_mark(val):
    """Convert a raw final mark to a numeric score.

    Parameters
    ----------
    val : object
        A single cell value: a number, a numeric string, or a letter mark.

    Returns
    -------
    float, int or None
        * ``None`` for missing input;
        * ``float(val)`` when the value parses as a number in ``[0, 100]``;
        * the ``letter_to_score`` entry for a letter mark (matched after
          stripping whitespace and upper-casing);
        * ``None`` for anything else, including out-of-range numbers.
    """
    if pd.isna(val):
        return None

    try:
        num = float(val)
    except (TypeError, ValueError, OverflowError):
        # Not numeric (e.g. a letter mark): fall through to the lookup below.
        # Only conversion errors are caught, so KeyboardInterrupt and other
        # unrelated exceptions are no longer silently swallowed.
        pass
    else:
        if 0 <= num <= 100:
            return num

    # Handle letter grades
    val_str = str(val).strip().upper()
    return letter_to_score.get(val_str)

# Convert every raw mark to a numeric score; marks without a score become
# missing values.
transcripts['final_mark'] = transcripts['final_mark'].apply(convert_final_mark)

# Remove the rows whose mark has no numeric score. dropna() returns a new
# frame rather than modifying in place, so the result has to be assigned back;
# otherwise those rows would still be ranked and pivoted below.
transcripts = transcripts.dropna(subset=['final_mark'])

# Upper bound on the number of course rows kept per student.
max_classes = 50

# Sort and rank: number each student's rows 1, 2, 3, ... in sorted order.
transcripts_sorted = transcripts.sort_values(by=['mastid', 'grade', 'academic_level_desc'])
transcripts_sorted['class_rank'] = transcripts_sorted.groupby('mastid').cumcount() + 1

# Trim long histories: keep at most max_classes rows per student.
transcripts_trimmed = transcripts_sorted[transcripts_sorted['class_rank'] <= max_classes]

# Pivot helper
def pivot_feature(df, colname):
    """Spread one long-format column into a wide table with one row per student.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing 'mastid', 'class_rank' and ``colname``.
    colname : str
        Name of the column whose values fill the wide table.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'mastid', with one column per class_rank value named
        ``"<colname>_<class_rank>"``.
    """
    wide = df.pivot(index='mastid', columns='class_rank', values=colname)

    # Build the flat column labels explicitly, then swap them in.
    labels = []
    for i in wide.columns:
        labels.append(f"{colname}_{i}")
    wide.columns = labels

    return wide

# Build one wide table per feature, keyed by feature name, in this order.
feature_columns = ['course_desc', 'grade', 'academic_level_desc', 'final_mark']
features = {
    name: pivot_feature(transcripts_trimmed, name)
    for name in feature_columns
}

# Per-student school identifiers: the first 'lea' and 'schlcode' seen for each
# mastid, taken from the full transcripts frame rather than the trimmed one.
student_info = transcripts.groupby('mastid')[['lea', 'schlcode']].first()

# Attach each wide feature table to the student rows, matching on the mastid
# index and keeping every student present in student_info.
student_df = student_info.copy()
for df in features.values():
    student_df = student_df.merge(
        df,
        left_index=True,
        right_index=True,
        how='left',
    )

# Turn the mastid index back into an ordinary column.
student_df = student_df.reset_index()

# Persist the wide, one-row-per-student table.
student_df.to_csv("../data/transcripts_pivoted.csv", index=False)