In [107]:
import pandas as pd
import numpy as np

# Load Data

## Reading

In [108]:
# Raw CSTAAR column name -> column name used in the rest of this notebook.
data_cols = {
    # Campus 2023 Domain 1a: Meets Grade Level Std, Grade 3, All Students,
    # Staar Reading/Ela Rate
    "CDA03ARE1223R": "reading",
    "CAMPUS": "campus_number",
    "DISTNAME": "district_name",
}

# Cell contents that should be parsed as missing when the raw CSV is read.
na_values = [
    '-',
    '*',
    '**',
    'n/a',
    pd.NA,
    np.nan,
    '?',
    '-1',
    '.',
]


In [109]:
# Load only the columns named in data_cols; every marker listed in na_values
# is parsed as missing.
reading = pd.read_csv(
    "../data/raw/CSTAAR_GR3.csv",
    usecols=list(data_cols),
    na_values=na_values,
)

In [110]:
# Switch from the raw CSTAAR headers to the analysis column names.
reading = reading.rename(data_cols, axis="columns")

## Schools

In [111]:
# Spreadsheet header -> column name used in the rest of this notebook.
data_cols = {
    "School Number": "campus_number",
    "School Name": "campus_name",
    "Instruction Type": "instruction_type",
    "Charter Type": "charter_type",
    "School City": "city",
    "School State": "state",
    "School Zip": "zip_plusfour",
    "Grade Range": "grade_range",
    "School Enrollment as of Oct 2023": "enrollment",
    "School Status": "status",
}

In [112]:
# "School Zip" is read as text; a later cell slices it with the .str accessor.
schools = pd.read_excel(
    "../data/raw/School Addresses - Hiring Project.xlsx",
    dtype={"School Zip": "str"},
)

In [113]:
# Switch from the spreadsheet headers to the analysis column names.
schools = schools.rename(data_cols, axis="columns")

In [114]:
# Keep the first five characters of zip_plusfour as the plain ZIP code.
schools['zip'] = schools['zip_plusfour'].str[:5]

# Merge

In [115]:
# Left join keeps every campus that has a score row. validate="1:1" raises if
# campus_number is duplicated on either side, and indicator=True adds a
# `_merge` column recording where each row was found.
reading = pd.merge(
    reading,
    schools,
    how='left',
    on='campus_number',
    validate="1:1",
    indicator=True,
)

# Clean

## Drop non-Texas schools

In [116]:
# XXX: For now we will keep schools that did not appear in the addresses table
# (their `_merge` value is 'left_only').
in_texas = reading['state'] == 'TX'
no_address = reading['_merge'] == 'left_only'
mask = in_texas | no_address

# Report what is about to be removed before filtering.
masked_states = reading.loc[~mask, 'state'].unique()
print("Dropping {} non-TX rows".format((~mask).sum()))
print("They are all from ", masked_states)

reading = reading[mask]

Dropping 1 non-TX rows
They are all from  ['NV']


## Drop non-3rd-grade schools

In [117]:
# Sanity checks on the raw grade_range strings before they are parsed.
grade_range = reading['grade_range']
grade_range_filled = grade_range.fillna('')

# Check formatting: values are expected to begin with an apostrophe.
assert grade_range.str.startswith("'").all()

# Check EE is never the max grade. Unless it is the only grade.
not_ending_in_ee = grade_range.str.slice(-2) != 'EE'
is_only_ee = grade_range == "'EE"
assert all(not_ending_in_ee | is_only_ee)

# Check EE is never in a multi range: if EE appears, it is the very first code.
starts_with_ee = grade_range_filled.str.startswith("'EE")
mentions_ee = grade_range_filled.str.contains('EE')
assert all(starts_with_ee | ~mentions_ee)

# My understanding is that EE could actually mean anything from PK - 2nd grade.
# Given EE, if present, is ALWAYS at the beginning, we can safely make it the first category code for our range queries.
grade_levels = pd.CategoricalDtype(['EE','PK','KG','01','02','03','04','05','06','07','08','09','10','11','12','AE'], ordered=True)

# Some schools have non-contiguous grade ranges.
def ranges_to_intervals(ranges):
    """Convert a grade-range string into a list of closed integer intervals.

    Each whitespace-separated token of ``ranges`` is one range. Its first two
    characters are taken as the lowest grade code and its last two as the
    highest, so a single-grade token such as ``"03"`` yields a one-point
    interval. Codes are mapped to their position in ``grade_levels.categories``.

    Parameters
    ----------
    ranges : str or missing
        Grade-range text such as ``"'EE KG-05"``. Leading/trailing apostrophes
        are stripped before parsing.

    Returns
    -------
    list of pd.Interval
        One interval (closed on both sides) per range, in input order. Empty
        when ``ranges`` is missing or contains no tokens.

    Raises
    ------
    AssertionError
        If a token starts or ends with a code not in ``grade_levels``.
    """
    if pd.isna(ranges):
        return []

    codes = grade_levels.categories
    intervals = []
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so a doubled space does not produce an empty token.
    for singlerange in ranges.strip("'").split():
        grade_min = singlerange[:2]
        grade_max = singlerange[-2:]
        assert grade_min in codes, "Unknown grade code {!r} in {!r}".format(grade_min, ranges)
        assert grade_max in codes, "Unknown grade code {!r} in {!r}".format(grade_max, ranges)
        intervals.append(pd.Interval(codes.get_loc(grade_min),
                                     codes.get_loc(grade_max),
                                     closed='both'))
    return intervals

# Position of grade '03' within the ordered grade codes.
third_grade = grade_levels.categories.get_loc('03')

# Parse each school's grade_range, then flag schools where any parsed
# interval contains third grade.
grade_intervals = reading['grade_range'].apply(ranges_to_intervals)
reading['third_grade'] = grade_intervals.apply(
    lambda intervals: any(third_grade in interval for interval in intervals)
)

In [118]:
# XXX: A small fraction of 3rd-grade schools are not reporting.
#      An even smaller fraction of non-3rd-grade schools report 3rd-grade numbers.
has_score = reading['reading'].notna()
pd.concat(
    [
        pd.crosstab(has_score, reading['third_grade'], normalize='columns'),
        pd.crosstab(has_score, reading['third_grade']),
    ],
    keys=['percent', 'count'],
)


Unnamed: 0_level_0,third_grade,False,True
Unnamed: 0_level_1,reading,Unnamed: 2_level_1,Unnamed: 3_level_1
percent,False,0.984939,0.048354
percent,True,0.015061,0.951646
count,False,4120.0,235.0
count,True,63.0,4625.0


In [119]:
# Keep only the schools flagged as covering third grade.
mask = reading['third_grade']
print("Dropping {} non-3rd-grade schools".format((~mask).sum()))
reading = reading[mask]

Dropping 4183 non-3rd-grade schools


## Drop non-reporting schools

These schools appeared in the CSTAAR dataset but their reading scores were null or masked.

We can drop them because they do not contribute to our analysis.

If I had more time, I would look for systematic patterns of under-reporting and acknowledge them in a note.

In [120]:
# Keep only the schools that have a reading score.
mask = reading['reading'].notna()
print("Dropping {} non-reporting schools".format((~mask).sum()))
reading = reading[mask]

Dropping 235 non-reporting schools


# Export

In [121]:
# Write the filtered table to the interim data folder, without the row index.
reading.to_csv(
    "../data/interim/school_scores.csv",
    index=False,
)