In [1]:
import json
import os
import random
import time
from pathlib import Path

import httpx
import plotly.express as px
import polars as pl

In [2]:
# Gate for the download loop further down: when True, the notebook POSTs to the
# nationsreportcard.gov data-table API and rewrites the JSON/CSV files under
# data/nces/hsts/; when False, the loop is skipped and the files on disk are used.
REDOWNLOAD = False

In [3]:
def _continuous_scale(code: str, label: str, text: str) -> dict:
    """Build one scale descriptor in the shape the data-table request expects.

    Only the code, short label and long text differ between scales; the other
    fields are the same for every scale used in this notebook. Key order is
    kept stable because the dict is serialised verbatim into the request body.
    """
    return {
        "text": text,
        "isComposite": False,
        "isScalar": False,
        "dimensionType": 4,
        "code": code,
        "label": label,
        "language": 1033,
    }


# Dependent measures that can be requested from the API.
SCALE_ACT_CMPT = _continuous_scale(
    "ACTCMP",
    "ACT composite score",
    "Highest ACT composite score achieved (continuous)",
)
SCALE_SAT_MATH = _continuous_scale(
    "SATMAT",
    "SAT mathematics score",
    "Highest SAT mathematics score achieved (continuous)",
)
SCALE_SAT_VERBAL = _continuous_scale(
    "SATVRB",
    "SAT verbal score",
    "Highest SAT verbal score achieved (continuous)",
)
def _yes_no_choices() -> list[dict]:
    """Return a fresh No/Yes choice list with both options selected."""
    return [
        {"code": choice_code, "label": choice_label, "selected": True}
        for choice_code, choice_label in (("1", "No"), ("2", "Yes"))
    ]


def _yes_no_variable(code: str, label: str) -> dict:
    """Build a grouping-variable descriptor whose answers are No / Yes."""
    return {"code": code, "label": label, "combValueLabels": _yes_no_choices()}


# Reference choice list; a variable whose "combValueLabels" equals this list is
# treated as a Yes/No variable when the API response is reshaped.
BOOLEAN_VARIABLE = _yes_no_choices()

# Single-category variable covering every graduate.
VARIABLE_TOTAL = {
    "code": "TOTAL",
    "label": "All graduates",
    "combValueLabels": [
        {"code": "1", "label": "All graduates", "selected": True},
    ],
}

# Yes/No course-taking variables, one per subject area.
VARIABLE_ARCH_CONS = _yes_no_variable(
    "ACOC3",
    "Earned at least 3 credits in architecture and construction courses? (SCED)",
)
VARIABLE_AGR_FOOD = _yes_no_variable(
    "ANRC3",
    "Earned at least 3 credits in agriculture, food, and natural resources courses? (SCED)",
)
VARIABLE_MANUFACTURING = _yes_no_variable(
    "MFGC3",
    "Earned at least 3 credits in manufacturing courses? (SCED)",
)
VARIABLE_TRANSPORT = _yes_no_variable(
    "TDLC3",
    "Earned at least 3 credits in transportation, distribution, and logistics courses? (SCED)",
)

def get_var_map(columns: list[str]) -> dict:
    """Map each column name to the statistic it carries.

    Columns are assumed to repeat in groups of four, so the statistic is picked
    by position modulo four: average, its standard error, percentage, and the
    percentage's standard error.
    """
    stat_names = (
        "Average",
        "Standard Error (Average)",
        "Percentage",
        "Standard Error (Percentage)",
    )
    return {
        name: stat_names[position % len(stat_names)]
        for position, name in enumerate(columns)
    }

def transform_boolean(df: pl.DataFrame, scale, variable) -> pl.DataFrame:
    """Reshape a wide Yes/No report into one row per (year, jurisdiction, answer).

    Args:
        df: Wide frame whose first two columns are "Year" and "Jurisdiction"
            and whose remaining columns hold the reported statistics as text.
        scale: Scale descriptor; only ``scale["label"]`` is read.
        variable: Variable descriptor; ``variable["label"]`` names the answer
            column in the result.

    Returns:
        A frame indexed by "Year", "Jurisdiction" and the variable label, with
        one float column per statistic name produced by ``get_var_map``.
    """
    stat_columns = df.columns[2:]
    # The answer category is derived from column POSITION: the first four
    # statistic columns are labelled "No", the next four "Yes", the next four
    # "Missing". NOTE(review): assumes the API always returns them in that
    # order -- confirm against a raw response.
    bool_cat = {
        **{col: "No" for col in stat_columns[0:4]},
        **{col: "Yes" for col in stat_columns[4:8]},
        **{col: "Missing" for col in stat_columns[8:12]},
    }
    return df.unpivot(
        on=stat_columns,
        index=["Year", "Jurisdiction"],
        variable_name="variable",
        value_name="value",
    ).with_columns(
        pl.col("variable").replace(bool_cat).alias(variable['label']),
        pl.col("variable").replace(get_var_map(stat_columns)).alias("Metric"),
        # Strip the parentheses around standard errors, then drop EVERY
        # remaining character that is not a digit or a dot. `str.replace` only
        # removes the first match, which would leave values carrying more than
        # one stray character unparseable.
        pl.col("value").str.strip_chars("()")
        .str.replace_all(r"[^\d\.]", "")
        .str.to_decimal()
        .cast(pl.Float64)
        .alias(scale['label']),
    ).select(
        "Year",
        "Jurisdiction",
        "Metric",
        variable['label'],
        scale['label'],
    ).pivot(
        on="Metric",
        index=["Year", "Jurisdiction", variable['label']],
        values=scale['label']
    )

def transform_scalar(df: pl.DataFrame, scale, variable) -> pl.DataFrame:
    """Reshape a wide single-category report into one row per (year, jurisdiction, flag).

    Args:
        df: Wide frame whose first two columns are "Year" and "Jurisdiction"
            and whose remaining columns hold the reported statistics as text.
        scale: Scale descriptor; only ``scale["label"]`` is read.
        variable: Unused here; accepted so the signature matches
            ``transform_boolean``.

    Returns:
        A frame indexed by "Year", "Jurisdiction" and "Flag", with one float
        column per statistic name produced by ``get_var_map``.
    """
    stat_columns = df.columns[2:]
    # "Flag" is derived from column POSITION: the first four statistic columns
    # get an empty flag, the next four are flagged "Missing".
    # NOTE(review): assumes the API always returns them in that order -- confirm.
    flag_cat = {
        **{col: "" for col in stat_columns[0:4]},
        **{col: "Missing" for col in stat_columns[4:8]},
    }
    return df.unpivot(
        on=stat_columns,
        index=["Year", "Jurisdiction"],
        variable_name="variable",
        value_name="value",
    ).with_columns(
        pl.col("variable").replace(flag_cat).alias("Flag"),
        pl.col("variable").replace(get_var_map(stat_columns)).alias("Metric"),
        # Strip the parentheses around standard errors, then drop EVERY
        # remaining character that is not a digit or a dot. `str.replace` only
        # removes the first match, which would leave values carrying more than
        # one stray character unparseable.
        pl.col("value").str.strip_chars("()")
        .str.replace_all(r"[^\d\.]", "")
        .str.to_decimal()
        .cast(pl.Float64)
        .alias(scale['label']),
    ).select(
        "Year",
        "Jurisdiction",
        "Metric",
        "Flag",
        scale['label'],
    ).pivot(
        on="Metric",
        index=["Year", "Jurisdiction", "Flag"],
        values=scale['label'],
    )

def hsts_api_to_polars(data: dict, scale, variable) -> pl.DataFrame:
    """Turn a data-table API response into a tidy polars frame.

    Every cell in ``data["tabularDataList"]`` is matched to its heading in
    ``data["columnHeadingsList"]`` through the number after the "colRef"
    prefix of the cell's ``fieldRef``. Column names are the heading text
    followed by the column number, which keeps repeated headings distinct.
    The frame is then reshaped by ``transform_boolean`` when the variable's
    choices equal ``BOOLEAN_VARIABLE`` and by ``transform_scalar`` otherwise.
    """
    headings = {str(heading["col"]): heading for heading in data["columnHeadingsList"]}

    def _column_name(cell: dict) -> str:
        # e.g. heading text "Year" in column 1 becomes "Year 1"
        heading = headings[cell["fieldRef"].removeprefix("colRef")]
        return heading["headingText"] + f" {heading['col']}"

    records = [
        {_column_name(cell): cell["value"] for cell in row}
        for row in data["tabularDataList"]
    ]
    frame = pl.DataFrame(records).rename(
        {"Year 1": "Year", "Jurisdiction 2": "Jurisdiction"}
    )

    is_yes_no = variable["combValueLabels"] == BOOLEAN_VARIABLE
    reshape = transform_boolean if is_yes_no else transform_scalar
    return reshape(frame, scale, variable)


def create_body(scale: dict, variable: dict) -> str:
    """Serialise the JSON request body for one (scale, variable) data-table report.

    The payload asks for averages and percentages with standard errors, for
    the national jurisdiction, Grade 12, across the 2019, 2009, 2000 and 1990
    samples, with missing values shown.

    Args:
        scale: Scale descriptor; embedded verbatim under "scales", and its
            "code" and "label" are also used for "dependent" and "scale".
        variable: Variable descriptor; embedded verbatim under "variables",
            and its "code" is used for the table layout.

    Returns:
        The request body as a JSON string (``json.dumps`` returns ``str``,
        not ``bytes``).
    """
    report_title = "Transcript, Grade 12, Misc."
    framework_label = "High School Transcript Study"

    # One entry per sample year known to the explorer. 2005 is listed here
    # although it is not among the requested "years" below.
    all_year_samples = [
        {
            "year": year,
            "sample": "R3",
            "isNonAccommodated": False,
            "framework": "1",
            "frameworkLabel": framework_label,
            "frameworkSortOrder": 1,
            "dimensionType": 5,
            "code": f"{year}R3",
            "label": year,
            "language": 0,
        }
        for year in ("2019", "2009", "2005", "2000", "1990")
    ]

    return json.dumps(
        {
            "PERCENTAGE_CODE": "RP",
            "survey": {
                "code": "hsts",
                "label": "hsts"
            },
            "requestType": "",
            "subject": {
                "code": "TRN",
                "label": "Transcript"
            },
            "cohort": {
                "code": "12",
                "cohort": "3",
                "label": "Grade 12"
            },
            "years": [
                "2019R3",
                "2009R3",
                "2000R3",
                "1990R3"
            ],
            "selectedYears": [],
            "framework": "1",
            "frameworkLabel": framework_label,
            "scales": [scale],
            "jurisdictions": [
                {
                    "code": "NT",
                    "label": "National",
                    "level": 0,
                    "isCombinedJur": False,
                    "restOf": False
                }
            ],
            "combinedJurisdictions": [],
            "selectedJurisdictions": [],
            "variables": [variable],
            "combinedVariables": [],
            "crosstabVariables": [],
            "selectedVariables": [],
            "statistics": [
                {
                    "statType": "MN",
                    "label": "Averages",
                    "statElements": [
                        {
                            "element": "MN",
                            "label": "Average",
                            "code": "MN"
                        }
                    ]
                },
                {
                    "statType": "RP",
                    "label": "Percentages",
                    "statElements": [
                        {
                            "element": "RP",
                            "label": "Percentages",
                            "code": "RP"
                        }
                    ]
                }
            ],
            "selectedStatistics": [],
            "reportTitle": {
                "label": report_title,
                "isUserAssignedName": True
            },
            "reportIndex": 0,
            "isReportEditMode": False,
            "isNoFiltersMode": False,
            "isCreateReportClicked": True,
            "newReports": False,
            "dependent": scale["code"],
            "tableOptions": {},
            "scale": scale["label"],
            "selectedChartValue": "",
            "selectedGroupedBy": "",
            "selections": [],
            "chartType": "",
            "allYearSampleData": all_year_samples,
            "MapSelectorKey": "",
            "compGridIsInitialTableLoad": True,
            "focalJurisdictionCode": "NT",
            "compGridSortColumn": 1,
            "compGridSortOrder": "asc",
            "variableLabels": "SHORT",
            "showVariableNameInTitle": True,
            "showMissing": True,
            "numDecimalPlaces": "TWO",
            "yearOrder": "DESC",
            "include": "SE",
            "useParensBrackets": True,
            "surveyForGlobalOptions": "hsts",
            "forExport": False,
            "defaultOverrides": {},
            "runReportImmediately": False,
            "wantUnicode": True,
            "rowLayouts": [
                {
                    "layoutType": "SAMPLE",
                    "position": 1,
                    "code": "SAMPLE"
                },
                {
                    "layoutType": "JURISDICTION",
                    "position": 2,
                    "code": "JURISDICTION"
                }
            ],
            "tableLayouts": [
                {
                    "position": 1,
                    "layoutType": "VARIABLE",
                    "code": variable["code"]
                }
            ],
            "dataTableName": "Data Table 1",
            "acrossYearSigTest": False,
            "shareType": "NONE",
            "selectedSigOptType": {},
            "selectedSigBtns": [],
            "activeTab": "DATATABLE",
            # "reportSpecficData" is spelled this way in the original request; keep it.
            "reportSpecficData": {
                "nameText": report_title,
                "inEditMode": False,
                "oldNameText": report_title
            },
            "showActiveTable": {
                "value": False
            },
            "showSigOpts": {
                "value": False
            }
        }
    )


if REDOWNLOAD:
    # Session cookies identify a logged-in browser session and must not be
    # stored in the notebook. Supply them through the environment instead:
    #   NDE_SESSION_ID  -> value of the "ASP.NET_SessionId" cookie
    #   NDE_AUTH_COOKIE -> value of the "NDEv20a1posxztc" cookie (URL-encoded JSON)
    # A cookie whose variable is unset is simply not sent.
    cookies = {"TOS_NCE_Accepted": "TOS_NCE123"}
    for cookie_name, env_name in (
        ("ASP.NET_SessionId", "NDE_SESSION_ID"),
        ("NDEv20a1posxztc", "NDE_AUTH_COOKIE"),
    ):
        cookie_value = os.environ.get(env_name)
        if cookie_value:
            cookies[cookie_name] = cookie_value

    # Browser-like headers sent with every request.
    headers = {
        "Content-Type": "application/json",
        "Host": "www.nationsreportcard.gov",
        "Origin": "https://www.nationsreportcard.gov",
        "Referer": "https://www.nationsreportcard.gov/ndecore/xplore/hsts",
        "sec-ch-ua": '"Not)A;Brand";v="99", "Brave";v="127", "Chromium";v="127"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }

    # Make sure both output folders exist before anything is written.
    for out_dir in ("data/nces/hsts/api", "data/nces/hsts/csv"):
        Path(out_dir).mkdir(parents=True, exist_ok=True)

    for scale in (SCALE_ACT_CMPT,): # SCALE_SAT_MATH, SCALE_SAT_VERBAL):
        for variable in (VARIABLE_ARCH_CONS, VARIABLE_AGR_FOOD, VARIABLE_MANUFACTURING, VARIABLE_TRANSPORT): #, VARIABLE_TOTAL):
            try:
                r = httpx.post(
                    "https://www.nationsreportcard.gov/ndecore/api/dataTable",
                    content=create_body(scale, variable),
                    cookies=cookies,
                    headers=headers,
                )
                r.raise_for_status()
                # Parse BEFORE opening the output file so a malformed response
                # cannot leave an empty or truncated JSON file behind.
                data = r.json()
            except httpx.HTTPStatusError as e:
                print(scale, variable)
                print(e.response.text)
            except (httpx.HTTPError, ValueError) as e:
                # Transport failure or a body that is not valid JSON.
                print(scale, variable)
                print("Error:", e)
            else:
                with open(f"data/nces/hsts/api/HSTS_API_{scale['code']}_{variable['code']}.json", "w") as f:
                    json.dump(
                        data,
                        f,
                        indent=2,
                    )
                try:
                    df = hsts_api_to_polars(data, scale, variable)
                    # Variable code comes first here: the analysis cells glob on
                    # it and read the scale code from the end of the file name.
                    df.write_csv(f"data/nces/hsts/csv/HSTS_API_{variable['code']}_{scale['code']}.csv")
                except Exception as e:
                    print(scale, variable)
                    print("Error:", e)
            finally:
                # Pause after every attempt, failed ones included, so errors do
                # not turn into back-to-back requests.
                time.sleep(random.uniform(0.25, 1.25))


In [4]:
# Standard deviation assumed for each (year, measure); joined onto the national
# averages so later cells can express differences in standard-deviation units.
_STD_DEV_ROWS = (
    # See data/nces/act/
    (1990, "ACT Composite", 4.7),  # unknown, using 1997
    (2000, "ACT Composite", 4.7),
    (2009, "ACT Composite", 5.1),
    (2019, "ACT Composite", 5.0),
    # See data/nces/sat/
    (1990, "SAT Math", 100.0),  # unknown, using test intended score
    (2000, "SAT Math", 100.0),  # unknown, using test intended score
    (2009, "SAT Math", 100.0),  # unknown, using test intended score
    (2019, "SAT Math", 117.0),
    (1990, "SAT Verbal", 100.0),  # unknown, using test intended score
    (2000, "SAT Verbal", 100.0),  # unknown, using test intended score
    (2009, "SAT Verbal", 100.0),  # unknown, using test intended score
    (2019, "SAT Verbal", 104.0),
)
_std_devs = pl.DataFrame(
    [
        {"Year": year, "Measure": measure, "Standard Deviation": std_dev}
        for year, measure, std_dev in _STD_DEV_ROWS
    ]
)

# One frame per "all graduates" export, tagged with the scale code that ends
# the file name (HSTS_API_TOTAL_<scale>.csv).
_total_parts = [
    pl.read_csv(csv_path, has_header=True).with_columns(
        MEASURE=pl.lit(csv_path.stem.split("_")[-1])
    )
    for csv_path in Path("data/nces/hsts/csv").glob("HSTS_API_TOTAL_*.csv")
]

total = (
    pl.concat(_total_parts)
    # keep only the rows that are not flagged as missing
    .filter(pl.col("Flag") != "Missing")
    .drop("Flag")
    .with_columns(
        pl.col("MEASURE")
        .replace(
            {
                "SATMAT": "SAT Math",
                "SATVRB": "SAT Verbal",
                "ACTCMP": "ACT Composite",
            }
        )
        .alias("Measure"),
    )
    .drop("Jurisdiction", "MEASURE")
    .join(_std_devs, on=["Year", "Measure"])
)

total

Year,Average,Standard Error (Average),Percentage,Standard Error (Percentage),Measure,Standard Deviation
i64,f64,f64,f64,str,str,f64
1990,19.91,0.225,100.0,,"""ACT Composite""",4.7
2000,21.66,0.209,100.0,,"""ACT Composite""",4.7
2009,21.35,0.169,100.0,,"""ACT Composite""",5.1
2019,20.58,0.294,100.0,,"""ACT Composite""",5.0
1990,485.07,7.582,100.0,,"""SAT Math""",100.0
…,…,…,…,…,…,…
2019,520.4,5.355,100.0,,"""SAT Math""",117.0
1990,436.61,6.218,100.0,,"""SAT Verbal""",100.0
2000,524.54,6.03,100.0,,"""SAT Verbal""",100.0
2009,483.55,10.49,100.0,,"""SAT Verbal""",100.0


In [5]:
def _load_acoc(csv_glob: str, answer_column: str, credit_labels: dict) -> pl.DataFrame:
    """Load the architecture-and-construction exports matching ``csv_glob``.

    Each file is tagged with the scale code that ends its file name, rows
    whose answer is "Missing" are dropped, the Yes/No answer is relabelled
    through ``credit_labels`` into "Course Credits", and the scale code gets a
    readable name in "Measure".
    """
    tagged = [
        pl.read_csv(csv_path, has_header=True).with_columns(
            MEASURE=pl.lit(csv_path.stem.split("_")[-1])
        )
        for csv_path in Path("data/nces/hsts/csv").glob(csv_glob)
    ]
    answered = pl.concat(tagged).filter(pl.col(answer_column) != "Missing")
    labelled = answered.with_columns(
        pl.col(answer_column).replace(credit_labels).alias("Course Credits"),
        pl.col("MEASURE")
        .replace(
            {
                "SATMAT": "SAT Math",
                "SATVRB": "SAT Verbal",
                "ACTCMP": "ACT Composite",
            }
        )
        .alias("Measure"),
    )
    return labelled.drop("Jurisdiction", "MEASURE")


# At least one credit vs. none
acoc1 = _load_acoc(
    "HSTS_API_ACOC1_*.csv",
    "Earned at least 1 credit in architecture and construction courses? (SCED)",
    {"Yes": "1+", "No": "0"},
)
display(acoc1)

# At least three credits vs. fewer
acoc3 = _load_acoc(
    "HSTS_API_ACOC3_*.csv",
    "Earned at least 3 credits in architecture and construction courses? (SCED)",
    {"Yes": "3+", "No": "<3"},
)
display(acoc3)


Year,Earned at least 1 credit in architecture and construction courses? (SCED),Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",522.39,5.492,94.0,1.1,"""0""","""SAT Math"""
2009,"""No""",507.19,4.867,95.0,0.9,"""0""","""SAT Math"""
2000,"""No""",532.65,6.26,93.0,1.7,"""0""","""SAT Math"""
1990,"""No""",486.72,7.591,94.0,1.2,"""0""","""SAT Math"""
2019,"""Yes""",489.38,9.057,6.0,1.1,"""1+""","""SAT Math"""
…,…,…,…,…,…,…,…
1990,"""No""",20.05,0.232,92.0,1.3,"""0""","""ACT Composite"""
2019,"""Yes""",18.23,0.474,7.0,1.0,"""1+""","""ACT Composite"""
2009,"""Yes""",19.35,0.361,6.0,0.7,"""1+""","""ACT Composite"""
2000,"""Yes""",19.76,0.379,8.0,1.5,"""1+""","""ACT Composite"""


Year,Earned at least 3 credits in architecture and construction courses? (SCED),Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",20.62,0.292,99.0,0.2,"""<3""","""ACT Composite"""
2009,"""No""",21.4,0.17,99.0,0.2,"""<3""","""ACT Composite"""
2000,"""No""",21.7,0.202,99.0,0.5,"""<3""","""ACT Composite"""
1990,"""No""",19.95,0.229,99.0,0.3,"""<3""","""ACT Composite"""
2019,"""Yes""",,,1.0,0.2,"""3+""","""ACT Composite"""
2009,"""Yes""",,,1.0,0.2,"""3+""","""ACT Composite"""
2000,"""Yes""",,,1.0,0.5,"""3+""","""ACT Composite"""
1990,"""Yes""",,,1.0,0.3,"""3+""","""ACT Composite"""


In [6]:
def _load_anrc(csv_glob: str, answer_column: str, credit_labels: dict) -> pl.DataFrame:
    """Load the agriculture, food and natural resources exports matching ``csv_glob``.

    Each file is tagged with the scale code that ends its file name, rows
    whose answer is "Missing" are dropped, the Yes/No answer is relabelled
    through ``credit_labels`` into "Course Credits", and the scale code gets a
    readable name in "Measure".
    """
    tagged = [
        pl.read_csv(csv_path, has_header=True).with_columns(
            MEASURE=pl.lit(csv_path.stem.split("_")[-1])
        )
        for csv_path in Path("data/nces/hsts/csv").glob(csv_glob)
    ]
    answered = pl.concat(tagged).filter(pl.col(answer_column) != "Missing")
    labelled = answered.with_columns(
        pl.col(answer_column).replace(credit_labels).alias("Course Credits"),
        pl.col("MEASURE")
        .replace(
            {
                "SATMAT": "SAT Math",
                "SATVRB": "SAT Verbal",
                "ACTCMP": "ACT Composite",
            }
        )
        .alias("Measure"),
    )
    return labelled.drop("Jurisdiction", "MEASURE")


# At least one credit vs. none
anrc1 = _load_anrc(
    "HSTS_API_ANRC1_*.csv",
    "Earned at least 1 credit in agriculture, food, and natural resources courses? (SCED)",
    {"Yes": "1+", "No": "0"},
)
display(anrc1)

# At least three credits vs. fewer
anrc3 = _load_anrc(
    "HSTS_API_ANRC3_*.csv",
    "Earned at least 3 credits in agriculture, food, and natural resources courses? (SCED)",
    {"Yes": "3+", "No": "<3"},
)
display(anrc3)

Year,"Earned at least 1 credit in agriculture, food, and natural resources courses? (SCED)",Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",20.82,0.309,89.0,1.5,"""0""","""ACT Composite"""
2009,"""No""",21.58,0.172,90.0,1.3,"""0""","""ACT Composite"""
2000,"""No""",21.84,0.209,91.0,1.8,"""0""","""ACT Composite"""
1990,"""No""",20.1,0.216,93.0,1.4,"""0""","""ACT Composite"""
2019,"""Yes""",18.71,0.303,11.0,1.5,"""1+""","""ACT Composite"""
…,…,…,…,…,…,…,…
1990,"""No""",436.89,6.23,99.0,0.3,"""0""","""SAT Verbal"""
2019,"""Yes""",497.19,9.936,7.0,1.5,"""1+""","""SAT Verbal"""
2009,"""Yes""",,,2.0,0.8,"""1+""","""SAT Verbal"""
2000,"""Yes""",482.87,19.011,5.0,1.5,"""1+""","""SAT Verbal"""


Year,"Earned at least 3 credits in agriculture, food, and natural resources courses? (SCED)",Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",20.65,0.302,96.0,1.1,"""<3""","""ACT Composite"""
2009,"""No""",21.41,0.172,96.0,0.7,"""<3""","""ACT Composite"""
2000,"""No""",21.72,0.209,97.0,0.8,"""<3""","""ACT Composite"""
1990,"""No""",20.02,0.214,97.0,0.7,"""<3""","""ACT Composite"""
2019,"""Yes""",18.76,0.526,4.0,1.1,"""3+""","""ACT Composite"""
2009,"""Yes""",20.05,0.249,4.0,0.7,"""3+""","""ACT Composite"""
2000,"""Yes""",19.81,0.908,3.0,0.8,"""3+""","""ACT Composite"""
1990,"""Yes""",,,3.0,0.7,"""3+""","""ACT Composite"""


In [7]:
def _load_mfgc(csv_glob: str, answer_column: str, credit_labels: dict) -> pl.DataFrame:
    """Load the manufacturing exports matching ``csv_glob``.

    Each file is tagged with the scale code that ends its file name, rows
    whose answer is "Missing" are dropped, the Yes/No answer is relabelled
    through ``credit_labels`` into "Course Credits", and the scale code gets a
    readable name in "Measure".
    """
    tagged = [
        pl.read_csv(csv_path, has_header=True).with_columns(
            MEASURE=pl.lit(csv_path.stem.split("_")[-1])
        )
        for csv_path in Path("data/nces/hsts/csv").glob(csv_glob)
    ]
    answered = pl.concat(tagged).filter(pl.col(answer_column) != "Missing")
    labelled = answered.with_columns(
        pl.col(answer_column).replace(credit_labels).alias("Course Credits"),
        pl.col("MEASURE")
        .replace(
            {
                "SATMAT": "SAT Math",
                "SATVRB": "SAT Verbal",
                "ACTCMP": "ACT Composite",
            }
        )
        .alias("Measure"),
    )
    return labelled.drop("Jurisdiction", "MEASURE")


# At least one credit vs. none
mfgc1 = _load_mfgc(
    "HSTS_API_MFGC1_*.csv",
    "Earned at least 1 credit in manufacturing courses? (SCED)",
    {"Yes": "1+", "No": "0"},
)
display(mfgc1)

# At least three credits vs. fewer
mfgc3 = _load_mfgc(
    "HSTS_API_MFGC3_*.csv",
    "Earned at least 3 credits in manufacturing courses? (SCED)",
    {"Yes": "3+", "No": "<3"},
)
display(mfgc3)


Year,Earned at least 1 credit in manufacturing courses? (SCED),Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",521.94,5.399,96.0,1.0,"""0""","""SAT Math"""
2009,"""No""",506.34,5.149,97.0,0.6,"""0""","""SAT Math"""
2000,"""No""",532.16,6.096,94.0,3.2,"""0""","""SAT Math"""
1990,"""No""",486.72,7.582,96.0,0.7,"""0""","""SAT Math"""
2019,"""Yes""",480.63,11.945,4.0,1.0,"""1+""","""SAT Math"""
…,…,…,…,…,…,…,…
1990,"""No""",19.99,0.213,94.0,1.2,"""0""","""ACT Composite"""
2019,"""Yes""",17.84,0.455,4.0,0.5,"""1+""","""ACT Composite"""
2009,"""Yes""",18.77,0.283,4.0,0.7,"""1+""","""ACT Composite"""
2000,"""Yes""",19.52,0.455,7.0,1.3,"""1+""","""ACT Composite"""


Year,Earned at least 3 credits in manufacturing courses? (SCED),Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",20.62,0.295,99.0,0.2,"""<3""","""ACT Composite"""
2009,"""No""",21.37,0.167,100.0,0.1,"""<3""","""ACT Composite"""
2000,"""No""",21.69,0.205,99.0,0.2,"""<3""","""ACT Composite"""
1990,"""No""",19.93,0.223,99.0,0.4,"""<3""","""ACT Composite"""
2019,"""Yes""",,,1.0,0.2,"""3+""","""ACT Composite"""
2009,"""Yes""",,,,0.1,"""3+""","""ACT Composite"""
2000,"""Yes""",,,1.0,0.2,"""3+""","""ACT Composite"""
1990,"""Yes""",,,1.0,0.4,"""3+""","""ACT Composite"""


In [8]:
def _load_tdlc(csv_glob: str, answer_column: str, credit_labels: dict) -> pl.DataFrame:
    """Load the transportation, distribution and logistics exports matching ``csv_glob``.

    Each file is tagged with the scale code that ends its file name, rows
    whose answer is "Missing" are dropped, the Yes/No answer is relabelled
    through ``credit_labels`` into "Course Credits", and the scale code gets a
    readable name in "Measure".
    """
    tagged = [
        pl.read_csv(csv_path, has_header=True).with_columns(
            MEASURE=pl.lit(csv_path.stem.split("_")[-1])
        )
        for csv_path in Path("data/nces/hsts/csv").glob(csv_glob)
    ]
    answered = pl.concat(tagged).filter(pl.col(answer_column) != "Missing")
    labelled = answered.with_columns(
        pl.col(answer_column).replace(credit_labels).alias("Course Credits"),
        pl.col("MEASURE")
        .replace(
            {
                "SATMAT": "SAT Math",
                "SATVRB": "SAT Verbal",
                "ACTCMP": "ACT Composite",
            }
        )
        .alias("Measure"),
    )
    return labelled.drop("Jurisdiction", "MEASURE")


# At least one credit vs. none
tdlc1 = _load_tdlc(
    "HSTS_API_TDLC1_*.csv",
    "Earned at least 1 credit in transportation, distribution, and logistics courses? (SCED)",
    {"Yes": "1+", "No": "0"},
)
display(tdlc1)

# At least three credits vs. fewer
tdlc3 = _load_tdlc(
    "HSTS_API_TDLC3_*.csv",
    "Earned at least 3 credits in transportation, distribution, and logistics courses? (SCED)",
    {"Yes": "3+", "No": "<3"},
)
display(tdlc3)

Year,"Earned at least 1 credit in transportation, distribution, and logistics courses? (SCED)",Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",523.64,5.554,94.0,1.4,"""0""","""SAT Math"""
2009,"""No""",505.47,5.135,98.0,0.4,"""0""","""SAT Math"""
2000,"""No""",533.18,6.579,97.0,0.7,"""0""","""SAT Math"""
1990,"""No""",485.27,7.684,98.0,0.6,"""0""","""SAT Math"""
2019,"""Yes""",468.18,11.593,6.0,1.4,"""1+""","""SAT Math"""
…,…,…,…,…,…,…,…
1990,"""No""",19.89,0.22,97.0,0.8,"""0""","""ACT Composite"""
2019,"""Yes""",17.19,0.464,4.0,0.6,"""1+""","""ACT Composite"""
2009,"""Yes""",18.36,0.345,4.0,0.5,"""1+""","""ACT Composite"""
2000,"""Yes""",19.52,0.585,4.0,1.1,"""1+""","""ACT Composite"""


Year,"Earned at least 3 credits in transportation, distribution, and logistics courses? (SCED)",Average,Standard Error (Average),Percentage,Standard Error (Percentage),Course Credits,Measure
i64,str,f64,f64,f64,f64,str,str
2019,"""No""",20.63,0.292,99.0,0.3,"""<3""","""ACT Composite"""
2009,"""No""",21.4,0.169,99.0,0.2,"""<3""","""ACT Composite"""
2000,"""No""",21.7,0.204,99.0,0.3,"""<3""","""ACT Composite"""
1990,"""No""",19.91,0.22,100.0,0.2,"""<3""","""ACT Composite"""
2019,"""Yes""",,,1.0,0.3,"""3+""","""ACT Composite"""
2009,"""Yes""",17.72,0.531,1.0,0.2,"""3+""","""ACT Composite"""
2000,"""Yes""",,,1.0,0.3,"""3+""","""ACT Composite"""
1990,"""Yes""",,,,0.2,"""3+""","""ACT Composite"""


In [9]:
# Columns every per-course frame contributes to the combined table.
_SCORE_COLUMNS = [
    "Year",
    "Course",
    "Course Credits",
    "Measure",
    "Average",
    "Standard Error (Average)",
]

# (frame, subject name) pairs, in the order they are stacked.
_COURSE_FRAMES = [
    (acoc1, "Architecture and Construction"),
    (acoc3, "Architecture and Construction"),
    (anrc1, "Agriculture, Food, and Natural Resources"),
    (anrc3, "Agriculture, Food, and Natural Resources"),
    (mfgc1, "Manufacturing"),
    (mfgc3, "Manufacturing"),
    (tdlc1, "Transportation, Distribution, and Logistics"),
    (tdlc3, "Transportation, Distribution, and Logistics"),
]

_national = total.select(
    "Year", "Measure", "Average", "Standard Error (Average)", "Standard Deviation"
)

_df = (
    pl.concat(
        [
            frame.with_columns(Course=pl.lit(course)).select(_SCORE_COLUMNS)
            for frame, course in _COURSE_FRAMES
        ]
    )
    .filter(
        # SAT is excluded: adjusted means and standard deviations were not
        # tracked down, and score conversions make year-over-year comparisons
        # unreliable.
        pl.col("Measure") == "ACT Composite",
        # 3+/<3 is too sparse to be useful
        pl.col("Course Credits").is_in(["0", "1+"]),
    )
    # Attach the national average and standard deviation for the same year
    # and measure; clashing column names get the "_national" suffix.
    .join(_national, on=["Year", "Measure"], suffix="_national")
    .with_columns(
        (
            (pl.col("Average") - pl.col("Average_national"))
            / pl.col("Standard Deviation")
        ).alias("Z-Score"),
        (
            pl.col("Standard Error (Average)") / pl.col("Standard Deviation")
        ).alias("Standard Error (Z-Score)"),
    )
)
px.line(
    _df.sort(
        "Year",
        "Course Credits",
        "Course",
        "Measure",
    ),
    x="Year",
    y="Z-Score",
    error_y="Standard Error (Z-Score)",
    color="Course Credits",
    color_discrete_map={
        "0": px.colors.diverging.Earth[0],
        "1+": px.colors.diverging.Earth[-1],
    },
    # color_discrete_sequence=px.colors.diverging.Fall_r[-1:] + px.colors.diverging.Fall_r[:1:2],
    facet_col="Course",
    facet_col_wrap=2,
    template="plotly_dark",
    title="Mid- and low-aptitude students are abandoning technical subjects",
    width=1200,
    height=800,
    range_y=[-.75, 0.75],
).for_each_annotation(lambda a: a.update(
        text=a.text.split("=")[-1],  # Remove "day=" from the label
    )
).for_each_xaxis(
    lambda a: a.update(title=None)
).for_each_yaxis(
    lambda a: a.update(title="Average ACT Z-Score"),
    col=1,
).update_layout(
    legend_orientation="h",
    legend_title_text="Credits in Subject",
    plot_bgcolor="#171717",
    paper_bgcolor="#171717",
).add_annotation(
    x=1.072,
    y=-0.16,
    xref="paper",
    yref="paper",
    text="<br>".join(
        (
            "Chart by Dominic Tarro | 𝕏 @dominictarro",
            "Source: NCES High School Transcript Survey (1990, 2000, 2009, 2019)",
            "Note: Bars represent normalized NCES Standard Error",
            "Z-Scores are calculated as (Course Average - Sample Average) / National Standard Deviation",
        )
    ),
    align="right",
    showarrow=False,
    font=dict(
        size=10,
        color="grey"
    ),
    opacity=0.7
).show()
_df


Year,Course,Course Credits,Measure,Average,Standard Error (Average),Average_national,Standard Error (Average)_national,Standard Deviation,Z-Score,Standard Error (Z-Score)
i64,str,str,str,f64,f64,f64,f64,f64,f64,f64
2019,"""Architecture and Construction""","""0""","""ACT Composite""",20.77,0.307,20.58,0.294,5.0,0.038,0.0614
2009,"""Architecture and Construction""","""0""","""ACT Composite""",21.48,0.173,21.35,0.169,5.1,0.02549,0.033922
2000,"""Architecture and Construction""","""0""","""ACT Composite""",21.82,0.211,21.66,0.209,4.7,0.034043,0.044894
1990,"""Architecture and Construction""","""0""","""ACT Composite""",20.05,0.232,19.91,0.225,4.7,0.029787,0.049362
2019,"""Architecture and Construction""","""1+""","""ACT Composite""",18.23,0.474,20.58,0.294,5.0,-0.47,0.0948
…,…,…,…,…,…,…,…,…,…,…
1990,"""Transportation, Distribution, …","""0""","""ACT Composite""",19.89,0.22,19.91,0.225,4.7,-0.004255,0.046809
2019,"""Transportation, Distribution, …","""1+""","""ACT Composite""",17.19,0.464,20.58,0.294,5.0,-0.678,0.0928
2009,"""Transportation, Distribution, …","""1+""","""ACT Composite""",18.36,0.345,21.35,0.169,5.1,-0.586275,0.067647
2000,"""Transportation, Distribution, …","""1+""","""ACT Composite""",19.52,0.585,21.66,0.209,4.7,-0.455319,0.124468


In [10]:
# Reshape the "CTE" sheet from wide (one column per year/statistic pair) to
# long: one row per (CTE_COURSE_SUBJECT, YEAR) carrying MEAN and SE columns.
cte = (
    pl.read_excel("data/nces/hsts/HSTS_1990-2019_1+_Credits.xlsx", sheet_name="CTE")
    .unpivot(
        index="CTE_COURSE_SUBJECT",
        on=[
            "Y_1990_MEAN", "Y_1990_SE",
            "Y_2000_MEAN", "Y_2000_SE",
            "Y_2009_MEAN", "Y_2009_SE",
            "Y_2019_MEAN", "Y_2019_SE",
        ],
    )
    # Split names such as "Y_1990_MEAN" into a YEAR field and a METRIC field.
    .with_columns(
        ATTRS=pl.col("variable").str.extract_groups(r"^Y_(?P<YEAR>\d{4})_(?P<METRIC>\w+)$"),
    )
    .unnest("ATTRS")
    # The regex captures are strings; YEAR is cast to an integer column.
    .with_columns(pl.col("YEAR").cast(pl.Int64))
    # Spread METRIC back out so each metric becomes its own column.
    .pivot(
        on="METRIC",
        index=["CTE_COURSE_SUBJECT", "YEAR"],
        values="value",
    )
    .sort(["CTE_COURSE_SUBJECT", "YEAR"])
)
cte

CTE_COURSE_SUBJECT,YEAR,MEAN,SE
str,i64,f64,f64
"""Agriculture, food, and natural…",1990,7.0,0.7
"""Agriculture, food, and natural…",2000,9.0,0.9
"""Agriculture, food, and natural…",2009,8.0,0.5
"""Agriculture, food, and natural…",2019,11.0,0.3
"""Architecture and construction""",1990,11.0,0.7
…,…,…,…
"""Public, protective, and govern…",2019,5.0,0.3
"""Transportation, distribution, …",1990,6.0,0.5
"""Transportation, distribution, …",2000,6.0,0.5
"""Transportation, distribution, …",2009,5.0,0.4


In [11]:
# Reshape the "MATH" sheet from wide (one column per year/statistic pair) to
# long: one row per (MATH_COURSE, YEAR) carrying MEAN and SE columns.
# NOTE: `math` here is a DataFrame, not the standard-library module.
math = (
    pl.read_excel("data/nces/hsts/HSTS_1990-2019_1+_Credits.xlsx", sheet_name="MATH")
    .unpivot(
        index="MATH_COURSE",
        on=[
            "Y_1990_MEAN", "Y_1990_SE",
            "Y_2000_MEAN", "Y_2000_SE",
            "Y_2009_MEAN", "Y_2009_SE",
            "Y_2019_MEAN", "Y_2019_SE",
        ],
    )
    # Split names such as "Y_1990_MEAN" into a YEAR field and a METRIC field.
    .with_columns(
        ATTRS=pl.col("variable").str.extract_groups(r"^Y_(?P<YEAR>\d{4})_(?P<METRIC>\w+)$"),
    )
    .unnest("ATTRS")
    # The regex captures are strings; YEAR is cast to an integer column.
    .with_columns(pl.col("YEAR").cast(pl.Int64))
    # Spread METRIC back out so each metric becomes its own column.
    .pivot(
        on="METRIC",
        index=["MATH_COURSE", "YEAR"],
        values="value",
    )
    .sort(["MATH_COURSE", "YEAR"])
)
math

MATH_COURSE,YEAR,MEAN,SE
str,i64,f64,f64
"""Algebra I""",1990,73.0,1.1
"""Algebra I""",2000,77.0,1.4
"""Algebra I""",2009,78.0,0.8
"""Algebra I""",2019,85.0,0.6
"""Algebra II""",1990,56.0,1.1
…,…,…,…
"""Probability and statistics""",2019,17.0,0.5
"""Trigonometry or analytical geo…",1990,10.0,1.1
"""Trigonometry or analytical geo…",2000,7.0,1.3
"""Trigonometry or analytical geo…",2009,6.0,0.8


In [12]:
# Line chart of MEAN (with SE error bars) by YEAR for four technical subjects
# and four mathematics courses, one coloured line per course/subject.

# Four technical-education subjects, relabelled to the shared COURSE_SUBJECT column.
_technical_rows = (
    cte.filter(
        pl.col("CTE_COURSE_SUBJECT").is_in(
            [
                "Agriculture, food, and natural resources",
                "Architecture and construction",
                "Manufacturing",
                "Transportation, distribution, and logistics",
            ]
        )
    )
    .rename({"CTE_COURSE_SUBJECT": "COURSE_SUBJECT"})
    .with_columns(SUBJECT_GROUP=pl.lit("Technical Education"))
)

# Four mathematics courses, relabelled to the same schema.
_math_rows = (
    math.filter(
        pl.col("MATH_COURSE").is_in(
            [
                "Algebra I",
                "Algebra II",
                "Calculus",
                "General, occupational, and technical mathematics",
            ]
        )
    )
    .rename({"MATH_COURSE": "COURSE_SUBJECT"})
    .with_columns(SUBJECT_GROUP=pl.lit("Mathematics"))
)

# MEAN and SE are divided by 100 so the ".0%" tick format below renders them as percentages.
_course_shares = pl.concat([_technical_rows, _math_rows]).with_columns(
    pl.col("MEAN", "SE") / 100,
)

# Eight colours for eight lines: the first four and the last four entries of the RdBu scale.
_rdbu = px.colors.diverging.RdBu

_credit_lines = (
    "Chart by Dominic Tarro | 𝕏 @dominictarro",
    "Source: NCES High School Transcript Survey (1990, 2000, 2009, 2019)",
)

px.line(
    _course_shares,
    x="YEAR",
    y="MEAN",
    error_y="SE",
    color="COURSE_SUBJECT",
    color_discrete_sequence=_rdbu[:4] + _rdbu[-4:],
    title="College-prep courses rise and technical courses fall<br><sup>American high schools have shifted towards college-prep courses over the past 30 years at the expense of courses that meet students where they're at.</sup>",
    template="plotly_dark",
    width=1200,
    height=600,
).update_layout(
    xaxis_title=None,
    yaxis_title="Percent of High School Students with 1+ Credits",
    # yaxis_title_font_size=10,
    yaxis_tickformat=".0%",
    legend_title_text=None,
    paper_bgcolor="#171717",
    plot_bgcolor="#171717",
).add_annotation(
    # Attribution text, positioned in paper coordinates outside the plotting area.
    text="<br>".join(_credit_lines),
    xref="paper",
    yref="paper",
    x=1.48,
    y=-0.15,
    align="right",
    showarrow=False,
    font={"size": 10, "color": "grey"},
    opacity=0.7,
)

In [13]:
# Line chart of average_credits (with standard_error bars) by year from the
# trade-and-industrial CSV, one line per value of the "group" column.

# Display labels for the raw group codes found in the CSV.
_group_labels = {
    "total": "All Graduates",
    "male": "Male",
    "female": "Female",
}

_trade_credits = pl.read_csv(
    "./data/nces/carnegie-vocational/carnegie_credits_trade_and_industrial.csv"
).with_columns(pl.col("group").replace(_group_labels))

_picnic = px.colors.diverging.Picnic

_credit_lines = (
    "Chart by Dominic Tarro | 𝕏 @dominictarro",
    "Sources: NCES High School Transcript Survey (1987, 1990, 1994, 1998, 2000, and 2005)",
    "NCES High School and Beyond Longitudinal Study (1980, 1982)",
    "NCES Digest of Education Statistics 2007, Table 141"
)

px.line(
    _trade_credits,
    x="year",
    y="average_credits",
    error_y="standard_error",
    color="group",
    color_discrete_map={
        "Male": _picnic[1],
        "Female": _picnic[8],
        "All Graduates": "white",
    },
    title="The decline of high school trades education is a male phenomenon<br><sup>The change in \"Trade and Industrial\" course enrollment is driven by male abdication.</sup>",
    template="plotly_dark",
    width=800,
    height=600,
).update_layout(
    xaxis_title=None,
    yaxis_title="Average Graduating Credits",
    legend_title_text=None,
    paper_bgcolor="#171717",
    plot_bgcolor="#171717",
).add_annotation(
    # Attribution text, positioned in paper coordinates outside the plotting area.
    text="<br>".join(_credit_lines),
    xref="paper",
    yref="paper",
    x=1.257,
    y=-0.185,
    align="right",
    showarrow=False,
    font={"size": 10, "color": "grey"},
    opacity=0.7,
)