## Setup and Data Loading
Load required libraries, define approved courses, and query data from the PostgreSQL database.

In [1]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# create database connection
# The connection URL embeds the database password, so it is read from the
# COSEA_DB_URL environment variable instead of being stored in the notebook.
# Expected form: postgresql://USER:PASSWORD@HOST:PORT/DBNAME
# NOTE(review): a password was previously committed on this line; it should be rotated.
db_url = os.environ.get("COSEA_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the COSEA_DB_URL environment variable to the PostgreSQL connection URL "
        "before running this notebook."
    )
engine = create_engine(db_url)

# show every column when a wide frame is displayed
pd.set_option("display.max_columns", None)

# approved computer science course titles
# (kept in lowercase: COURSE_TITLE is lowercased before it is matched against this list)
approved_courses = [
    # Advanced Placement / International Baccalaureate
    "advanced placement computer science a",
    "advanced placement computer science principles",
    "ib computer science, year one",
    "ib computer science, year two",
    # remaining approved titles
    "computer science principles",
    "programming, games, apps, and society",
    "web development",
    "embedded computing",
    "game design: animation and simulation",
    "introduction to cybersecurity",
    "advanced cybersecurity",
    "coding for fintech",
    "introduction to python",
]

# load the four source tables from the allhsgrades24 schema
approved = pd.read_sql(
    'SELECT * FROM "allhsgrades24"."tbl_approvedschools"', engine
)
fte = pd.read_sql(
    'SELECT * FROM "allhsgrades24"."fte2024-1_enroll-demog_sch"', engine
)
sc_full = pd.read_sql(
    'SELECT * FROM "allhsgrades24"."sc2024_l_comp_sci_crs_enroll_demog_sch"', engine
)
cs_teacher = pd.read_sql(
    'SELECT * FROM "allhsgrades24"."sc2024_l_comp_sci_crs_tch_roster_sch"', engine
)

# working copy of the course table; sc_full is kept alongside it for the course-level merge
sc = sc_full.copy()

# report how many records each table holds
for table_label, table in (
    ("Approved Schools", approved),
    ("FTE", fte),
    ("Computer Science Courses", sc),
    ("Computer Science Teachers", cs_teacher),
):
    print(f"{table_label}: {len(table)}")


Approved Schools: 394
FTE: 2322
Computer Science Courses: 5820
Computer Science Teachers: 7440


## Data Formatting and Cleaning
Standardize ID formats, clean certificate IDs, and normalize course titles.

In [2]:
# zero-pad system and school IDs to four characters, then concatenate them
# into a single UNIQUESCHOOLID key shared by every table
for frame in (approved, fte, sc, cs_teacher, sc_full):
    for id_col in ("SYSTEM_ID", "SCHOOL_ID"):
        frame[id_col] = frame[id_col].astype(str).str.zfill(4)
    frame["UNIQUESCHOOLID"] = frame["SYSTEM_ID"] + frame["SCHOOL_ID"]

# certificate IDs: turn placeholder strings into missing values, then coerce to nullable integers
placeholder_ids = ["n/a", "N/A", ".", ""]
certificate_ids = cs_teacher["CERTIFICATE_ID"].replace(placeholder_ids, pd.NA)
cs_teacher["CERTIFICATE_ID"] = pd.to_numeric(certificate_ids, errors="coerce").astype("Int64")

# number of distinct (non-missing) certificate IDs per school
certified = (
    cs_teacher
    .groupby("UNIQUESCHOOLID")["CERTIFICATE_ID"]
    .nunique()
    .reset_index(name="Certified_Teachers")
)

# lowercase course titles in both course frames so they compare equal to approved_courses
for course_frame in (sc, sc_full):
    course_frame["COURSE_TITLE"] = course_frame["COURSE_TITLE"].astype(str).str.lower()


## Aggregate Enrollment and Demographics
Summarize CS course taker data and clean total enrollment data from FTE.


In [3]:
# per-school totals of course takers, restricted to approved course titles
# source column in the course table -> name used in the merged output
cs_count_names = {
    "COURSE_TAKER_CT": "CS_Enrollment",
    "Race: Asian": "CS_Asian",
    "Race: Black": "CS_Black",
    "Race: White": "CS_White",
    "Ethnicity: Hispanic": "CS_Hispanic",
    "Female": "CS_Female",
    "Male": "CS_Male",
}

sc_for_enrollment = sc[sc["COURSE_TITLE"].isin(approved_courses)]
sc_agg = (
    sc_for_enrollment
    .groupby("UNIQUESCHOOLID", as_index=False)[list(cs_count_names)]
    .sum()
    .rename(columns=cs_count_names)
)

# FTE table: coerce counts to numbers and keep one renamed column per demographic group
# source column in the FTE table -> name used in the merged output
fte_total_names = {
    "Race: Asian": "Total_Asian",
    "Race: Black": "Total_Black",
    "Race: White": "Total_White",
    "Ethnicity: Hispanic": "Total_Hispanic",
    "Female": "Total_Female",
    "Male": "Total_Male",
}

# unparseable totals stay NaN here; unparseable demographic counts become 0
fte["Total_Enrollment"] = pd.to_numeric(fte["Total Student Count"], errors="coerce")
for source_col in fte_total_names:
    fte[source_col] = pd.to_numeric(fte[source_col], errors="coerce").fillna(0)

fte_clean = (
    fte[["UNIQUESCHOOLID", "Total_Enrollment", *fte_total_names]]
    .rename(columns=fte_total_names)
)


## Merge All Data and Compute Representation Index (RI)
Join all datasets and compute the RI metric for each race/gender group.


In [4]:
# merge all sources onto the approved-school list (left joins keep every approved school)
merged = (
    approved
    .merge(fte_clean, on="UNIQUESCHOOLID", how="left")
    .merge(sc_agg, on="UNIQUESCHOOLID", how="left")
    .merge(certified, on="UNIQUESCHOOLID", how="left")
)

# keep schools with a positive total enrollment (rows with NaN enrollment are dropped too);
# .copy() makes the result an independent frame so the assignments below do not
# write into a slice of the unfiltered merge
merged = merged[merged["Total_Enrollment"] > 0].copy()

# a school absent from the CS course or teacher tables has no CS students and no
# certified teachers, so its missing counts are zeros rather than NaN
count_cols = [
    "CS_Enrollment",
    "CS_Asian", "CS_Black", "CS_White", "CS_Hispanic", "CS_Female", "CS_Male",
    "Certified_Teachers",
]
merged[count_cols] = merged[count_cols].fillna(0)

# compute Representation Index (RI):
# a group's share of CS enrollment minus its share of total school enrollment
race_pairs = {
    group: (f"CS_{group}", f"Total_{group}")
    for group in ("Asian", "Black", "Hispanic", "White", "Female")
}

# NOTE(review): when a school has no CS enrollment both shares are forced to 0, so RI is 0
# and categorize_ri later counts the school as "Parity" - confirm this is intended.
no_cs_enrollment = merged["CS_Enrollment"] == 0
for group, (cs_col, total_col) in race_pairs.items():
    share_of_cs = np.where(no_cs_enrollment, 0, merged[cs_col] / merged["CS_Enrollment"])
    share_of_school = np.where(no_cs_enrollment, 0, merged[total_col] / merged["Total_Enrollment"])
    merged[f"RI_{group}"] = share_of_cs - share_of_school

# re-append approved schools that are no longer in merged,
# with zero counts and an undefined (NaN) RI
missing_ids = set(approved["UNIQUESCHOOLID"]).difference(merged["UNIQUESCHOOLID"])
if missing_ids:
    print(f"appending {len(missing_ids)} schools missing from merged")
    zero_count_cols = [
        "Total_Enrollment", "CS_Enrollment",
        "CS_Asian", "CS_Black", "CS_White", "CS_Hispanic", "CS_Female", "CS_Male",
        "Total_Asian", "Total_Black", "Total_White", "Total_Hispanic", "Total_Female", "Total_Male",
        "Certified_Teachers",
    ]
    ri_cols = [f"RI_{group}" for group in ("Asian", "Black", "Hispanic", "White", "Female")]
    extras = approved.loc[approved["UNIQUESCHOOLID"].isin(missing_ids)].assign(
        **dict.fromkeys(zero_count_cols, 0),
        **dict.fromkeys(ri_cols, np.nan),
    )
    merged = pd.concat([merged, extras], ignore_index=True)


## Course-Level Logic (2-Digit)
Build course-level logic flags based on approval and certification.


In [5]:
# lowercase the roster text columns used for matching
for text_col in ("COURSE_TITLE", "TEACHER_LAST_NAME"):
    cs_teacher[text_col] = cs_teacher[text_col].astype(str).str.lower()

# a roster row is virtual when the last-name field holds one of these labels
# or contains the marker "de:"
virtual_teachers = ["software-based instruction", "gavs virtual teacher", "virtual school (non-gavs)"]
virtual_labels = list(map(str.lower, virtual_teachers))

last_names = cs_teacher["TEACHER_LAST_NAME"]
has_virtual_label = last_names.isin(virtual_labels)
has_de_marker = last_names.str.contains("de:", na=False)
cs_teacher["is_virtual"] = has_virtual_label | has_de_marker


# pair each enrollment-table course with its teacher-roster rows; the outer join keeps
# courses that have no roster row as well as roster rows that have no enrollment row
course_keys = ["UNIQUESCHOOLID", "COURSE_NUMBER", "COURSE_TITLE"]
course_merge = pd.merge(
    sc_full[course_keys],
    cs_teacher[course_keys + ["CERTIFICATE_ID", "TEACHER_LAST_NAME", "is_virtual"]],
    on=course_keys,
    how="outer",
).drop_duplicates()

# Courses without a roster row come out of the join with is_virtual = NaN, which turns the
# column into object dtype and makes the later `~is_virtual` filters raise TypeError.
# Force a strict boolean column; unmatched courses are treated as not virtual.
# NOTE(review): confirm "not virtual" is the right default for courses with no roster row.
course_merge["is_virtual"] = course_merge["is_virtual"].eq(True)

# first digit: title is on the approved list; second digit: row carries a numeric certificate ID
course_merge["approved_flag"] = course_merge["COURSE_TITLE"].isin(approved_courses).astype(int)
course_merge["CERTIFICATE_ID"] = pd.to_numeric(course_merge["CERTIFICATE_ID"], errors="coerce")
course_merge["certified_flag"] = course_merge["CERTIFICATE_ID"].notna().astype(int)
course_merge["COURSE_LOGIC"] = (
    course_merge["approved_flag"].astype(str) + course_merge["certified_flag"].astype(str)
)

# column order of the exported course-level table
course_output = course_merge[
    [
        "UNIQUESCHOOLID", "COURSE_NUMBER", "COURSE_TITLE",
        "approved_flag", "certified_flag", "COURSE_LOGIC",
        "CERTIFICATE_ID", "TEACHER_LAST_NAME", "is_virtual",
    ]
]

# write the course-level table to the census schema, replacing any existing table
course_output.to_sql(
    name="course_logic_2024",
    con=engine,
    schema="census",
    if_exists="replace",
    index=False,
)
print("updated census.course_logic_2024")


updated census.course_logic_2024


## School-Level Logic (3-Digit) and Final Export
Compute in-person, virtual, and extra teacher flags. Save the final output to PostgreSQL.


In [6]:
school_courses = course_output.copy()

# logic flags: schools with an approved course taught in person / taught virtually
approved_rows = school_courses["approved_flag"] == 1
virtual_rows = school_courses["is_virtual"]
in_person_cs = school_courses.loc[approved_rows & ~virtual_rows, "UNIQUESCHOOLID"].unique()
virtual_cs = school_courses.loc[approved_rows & virtual_rows, "UNIQUESCHOOLID"].unique()
school_courses["Valid Course"] = approved_rows


def check_teacher_extra(group):
    """Return 1 when no row in the group is a valid (approved) course, otherwise 0."""
    return 0 if group["Valid Course"].any() else 1


# one flag per (school, certificate ID); rows without a certificate ID form no group
teacher_groups = school_courses.groupby(["UNIQUESCHOOLID", "CERTIFICATE_ID"], group_keys=False)
extra_teachers = teacher_groups.apply(
    check_teacher_extra, include_groups=False
).reset_index(name="Extra_Flag")
extra_certified_ids = extra_teachers.loc[extra_teachers["Extra_Flag"] == 1, "UNIQUESCHOOLID"].unique()

# build the 3-digit LOGIC_CLASS string: in-person digit, virtual digit, extra-teacher digit
school_ids = merged["UNIQUESCHOOLID"]
logic_digits = [
    school_ids.isin(flagged_ids).astype(int).astype(str)
    for flagged_ids in (in_person_cs, virtual_cs, extra_certified_ids)
]
merged["LOGIC_CLASS"] = logic_digits[0] + logic_digits[1] + logic_digits[2]

print(merged["LOGIC_CLASS"].value_counts().sort_index())

# export: columns of the school-level output table, in order
export_columns = (
    ["UNIQUESCHOOLID"]
    + ["CS_Enrollment", "CS_Asian", "CS_Black", "CS_White", "CS_Hispanic", "CS_Female", "CS_Male"]
    + ["Certified_Teachers"]
    + ["RI_Asian", "RI_Black", "RI_Hispanic", "RI_White", "RI_Female"]
    + ["LOGIC_CLASS"]
)
gadoe_output = merged[export_columns]

# replace the table in the census schema with the current output
gadoe_output.to_sql(
    name="gadoe2024_389",
    con=engine,
    schema="census",
    if_exists="replace",
    index=False,
)
print("updated census.gadoe2024_389 with LOGIC_CLASS")


LOGIC_CLASS
000    45
001    25
010    15
011    11
100    75
101    89
110    47
111    87
Name: count, dtype: int64
updated census.gadoe2024_389 with LOGIC_CLASS


In [7]:
# expanded course list: the approved titles plus three introductory technology titles
expanded_courses = [
    *approved_courses,
    "introduction to software technology",
    "introduction to digital technology",
    "introduction to hardware technology",
]

# recompute the approval digit and the 2-digit course logic against the expanded list
course_merge["approved_flag_2"] = course_merge["COURSE_TITLE"].isin(expanded_courses).astype(int)
course_merge["COURSE_LOGIC_2"] = (
    course_merge["approved_flag_2"].astype(str)
    + course_merge["certified_flag"].astype(str)
)

# school-level flags, this time judged against the expanded course list
school_courses_2 = course_merge.copy()
approved_rows_2 = school_courses_2["approved_flag_2"] == 1
virtual_rows_2 = school_courses_2["is_virtual"]

in_person_cs_2 = school_courses_2.loc[approved_rows_2 & ~virtual_rows_2, "UNIQUESCHOOLID"].unique()
virtual_cs_2 = school_courses_2.loc[approved_rows_2 & virtual_rows_2, "UNIQUESCHOOLID"].unique()

school_courses_2["Valid Course 2"] = approved_rows_2


def check_teacher_extra_2(group):
    """Return 1 when no row in the group is a valid course under the expanded list, otherwise 0."""
    return 0 if group["Valid Course 2"].any() else 1


# one flag per (school, certificate ID); rows without a certificate ID form no group
teacher_groups_2 = school_courses_2.groupby(["UNIQUESCHOOLID", "CERTIFICATE_ID"], group_keys=False)
extra_teachers_2 = teacher_groups_2.apply(
    check_teacher_extra_2, include_groups=False
).reset_index(name="Extra_Flag_2")

extra_certified_ids_2 = extra_teachers_2.loc[
    extra_teachers_2["Extra_Flag_2"] == 1, "UNIQUESCHOOLID"
].unique()

# build LOGIC_CLASS_2 from the expanded-list flags:
# in-person digit, virtual digit, extra-teacher digit
school_ids_2 = merged["UNIQUESCHOOLID"]
logic_digits_2 = [
    school_ids_2.isin(flagged_ids).astype(int).astype(str)
    for flagged_ids in (in_person_cs_2, virtual_cs_2, extra_certified_ids_2)
]
merged["LOGIC_CLASS_2"] = logic_digits_2[0] + logic_digits_2[1] + logic_digits_2[2]

# show distribution
print(merged["LOGIC_CLASS_2"].value_counts().sort_index())

# add LOGIC_CLASS_2 to a fresh copy of the output (values are taken positionally from merged)
# and rewrite the table in the census schema
gadoe_output = gadoe_output.assign(LOGIC_CLASS_2=merged["LOGIC_CLASS_2"].to_numpy())
gadoe_output.to_sql(
    name="gadoe2024_389",
    con=engine,
    schema="census",
    if_exists="replace",
    index=False,
)
print("updated census.gadoe2024_389 with LOGIC_CLASS_2")


LOGIC_CLASS_2
000     41
001      7
010     19
011      3
100    109
101     28
110    131
111     56
Name: count, dtype: int64
updated census.gadoe2024_389 with LOGIC_CLASS_2


## RI Tables by Locale Type

In [8]:
# load the exported school-level table and the locale of each approved school
gadoe = pd.read_sql('SELECT * FROM "census"."gadoe2024_389"', engine)

# The locale lookup gets its own name: rebinding `approved` to this two-column frame
# would break any re-run of the earlier cells that read SYSTEM_ID / SCHOOL_ID from it.
school_locales = pd.read_sql(
    'SELECT "UNIQUESCHOOLID", "Locale" FROM "allhsgrades24"."tbl_approvedschools"', engine
)

# join locale into gadoe
gadoe = gadoe.merge(school_locales, on="UNIQUESCHOOLID", how="left")

# categorize RI values
def categorize_ri(val, threshold=0.05):
    """Label one Representation Index value.

    Parameters
    ----------
    val : float
        RI value (share of CS enrollment minus share of school enrollment).
    threshold : float, default 0.05
        Half-width of the parity band; values within +/- threshold (inclusive) are "Parity".

    Returns
    -------
    str
        "Missing" for NaN/NA, "Overrepresented" above +threshold,
        "Underrepresented" below -threshold, otherwise "Parity".
    """
    if pd.isna(val):
        return "Missing"
    if val > threshold:
        return "Overrepresented"
    if val < -threshold:
        return "Underrepresented"
    return "Parity"


# function to generate formatted summary table
def formatted_disparity_table(df, ri_col, race_label):
    """Count schools per locale in each representation category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Locale" column and the column named by ``ri_col``. Not modified.
    ri_col : str
        Name of the RI column to categorize with ``categorize_ri``.
    race_label : str
        Not used in the computation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per locale (sorted) followed by a "Total" row, with the columns
        "Locale Type" and the three "(School Count)" columns. Schools whose RI is
        missing are not counted in any column. Rows with a missing Locale appear
        only in the "Total" row.
    """
    # category label -> output column holding the number of schools with that label
    count_columns = {
        "Overrepresented": "Overrepresentation (School Count)",
        "Parity": "Parity (School Count)",
        "Underrepresented": "Underrepresentation (School Count)",
    }

    categories = df[ri_col].apply(categorize_ri)

    # one boolean indicator column per category; summing booleans counts schools
    indicators = df[["Locale"]].copy()
    for category, column in count_columns.items():
        indicators[column] = categories.eq(category)
    count_names = list(count_columns.values())

    # build locale-wise summary
    summary = (
        indicators
        .groupby("Locale", as_index=False)[count_names]
        .sum()
        .rename(columns={"Locale": "Locale Type"})
    )

    # compute total values over all rows
    total_row = {"Locale Type": "Total", **indicators[count_names].sum().to_dict()}

    return pd.concat([summary, pd.DataFrame([total_row])], ignore_index=True)

# build one locale summary per race/ethnicity group from its RI column
race_tables = {
    label: formatted_disparity_table(gadoe, f"RI_{label}", label)
    for label in ("Asian", "Black", "Hispanic", "White")
}
asian_table = race_tables["Asian"]
black_table = race_tables["Black"]
hispanic_table = race_tables["Hispanic"]
white_table = race_tables["White"]

# render each table with the group name as its caption
for label, table in race_tables.items():
    display(table.style.set_caption(label))

Unnamed: 0,Locale Type,Overrepresentation (School Count),Parity (School Count),Underrepresentation (School Count)
0,City,7,54,0
1,Rural,18,119,1
2,Suburb,68,84,0
3,Town,8,35,0
4,Total,101,292,1


Unnamed: 0,Locale Type,Overrepresentation (School Count),Parity (School Count),Underrepresentation (School Count)
0,City,11,25,25
1,Rural,17,85,36
2,Suburb,18,92,42
3,Town,5,23,15
4,Total,51,225,118


Unnamed: 0,Locale Type,Overrepresentation (School Count),Parity (School Count),Underrepresentation (School Count)
0,City,9,36,16
1,Rural,23,88,27
2,Suburb,11,65,76
3,Town,3,33,7
4,Total,46,222,126


Unnamed: 0,Locale Type,Overrepresentation (School Count),Parity (School Count),Underrepresentation (School Count)
0,City,17,37,7
1,Rural,31,69,38
2,Suburb,29,97,26
3,Town,10,23,10
4,Total,87,226,81
