In [161]:
import pandas as pd
import os
from pathlib import Path
import missingno as msno
import matplotlib.pyplot as plt
import itertools
from itertools import product

Several data sets need to be cleaned and merged.  School mean SAT & PSAT scores for each year are in separate .xlsx files.  Scores disaggregated by the desired demographic indicators are in another set of .xlsx files, also separated by year.

### 2018 data wrangling

In [162]:
# Locations and filenames of the raw score workbooks.
# Only the 2018 data is wrangled for now (COVID affected other years);
# the 2017 workbooks are declared here as well.

# Directories -- plain strings ending in '/' so a filename can be appended directly
filepath_agg = "../raw_data/aggregated/"
filepath_disagg = "../raw_data/disaggregated/"

# 2017 workbooks
file_2017_agg = "2017 SAT PSAT District and School Overall Results_final.xlsx"
file_2017_disagg = "2017 SAT PSAT10 Disaggregated Report Formatted.xlsx"

# 2018 workbooks
file_2018_agg = "2018 PSAT and SAT District and School Summary Achievement Results_FINAL.xlsx"
file_2018_disagg = "2018 PSAT and SAT State Achievement Results Disaggregated by Subgroups.xlsx"

In [163]:
###  Longest runtime ###

# The workbooks do not share a common layout.

# Aggregated results: each workbook is read from a single sheet.
df_2017_agg_raw, df_2018_agg_raw = (
    pd.read_excel(filepath_agg + workbook)
    for workbook in (file_2017_agg, file_2018_agg)
)

# Disaggregated results for 2017 are likewise read from a single sheet.
df_2017_disagg_raw = pd.read_excel(filepath_disagg + file_2017_disagg)

# Disaggregated results for 2018 are spread over several sheets.
# sheet_name=None loads every sheet, giving a dict of DataFrames keyed by sheet name.
dict_2018_disagg_raw = pd.read_excel(
    filepath_disagg + file_2018_disagg,
    sheet_name=None,
)

## Aggregated data only

In [164]:
# Discard the leading rows of descriptive text
# (the first 4 rows of the 2017 sheet, the first 3 rows of the 2018 sheet)
df_2017_agg = df_2017_agg_raw.drop(index=df_2017_agg_raw.index[:4])
df_2018_agg = df_2018_agg_raw.drop(index=df_2018_agg_raw.index[:3])

In [165]:
# The first remaining row holds the real headers (Test, District Number,
# District Name, etc.), so use it as the column labels of each frame.
for agg_frame in (df_2017_agg, df_2018_agg):
    agg_frame.columns = agg_frame.iloc[0]

In [166]:
# The header row is still present as the first data row; remove it
df_2017_agg = df_2017_agg.drop(labels=df_2017_agg.index[0], axis=0)
df_2018_agg = df_2018_agg.drop(labels=df_2018_agg.index[0], axis=0)

In [167]:
## 2017 data
# State and district summary rows are flagged through the 'School Name' column.
is_state_2017 = df_2017_agg['School Name'].eq('STATE RESULTS')
is_district_2017 = df_2017_agg['School Name'].eq('DISTRICT RESULTS')

# Keep state & district results in their own dataframes (if they exist)
state_2017_agg = df_2017_agg.loc[is_state_2017]
district_2017_agg = df_2017_agg.loc[is_district_2017]

# Every row that is neither a state nor a district summary is school data
# Note: There's something weird with the school districts here.  BOCES?
schools_2017_agg = df_2017_agg[~(is_state_2017 | is_district_2017)]


## 2018 data
# The 2018 sheet labels each row explicitly in its 'Level' column.
level_2018 = df_2018_agg['Level']

# Keep state & district results in their own dataframes (if they exist)
state_2018_agg = df_2018_agg.loc[level_2018.eq('STATE')]
district_2018_agg = df_2018_agg.loc[level_2018.eq('DISTRICT')]

# School level scores only, without the columns that are no longer needed
schools_2018_agg = (
    df_2018_agg
    .loc[level_2018.eq('SCHOOL')]
    .drop(columns=['Level', 'Grade'])
)

In [168]:
# Rename and reindex 2017 and 2018 data


def _rename_columns_by_position(frame, new_names):
    """Return a copy of ``frame`` whose columns are relabelled left to right.

    The labels are assigned by position rather than through an
    ``{old_name: new_name}`` mapping.  A mapping built with ``dict(zip(...))``
    silently stops at the shorter sequence and collapses repeated source
    headers into a single key, which would leave columns unrenamed or give
    several columns the same new label without any error.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns should be relabelled.  It is not modified.
    new_names : list of str
        One label per existing column, in column order.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` whose i-th column is named ``new_names[i]``.  The
        name of the column index is carried over from ``frame``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of columns.
    """
    if len(frame.columns) != len(new_names):
        raise ValueError(
            "Expected {} columns but found {}: {}".format(
                len(new_names), len(frame.columns), list(frame.columns)
            )
        )
    renamed = frame.copy()
    renamed.columns = pd.Index(new_names, name=frame.columns.name)
    return renamed


# Rename some columns for convenience
# (the original 2017 headers are kept in col_names1_long for reference)
col_names1_long = list(schools_2017_agg.columns)
col_names1_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean',
                 '2017 Participation Percent',
                 '2016 Valid Scores',
                 '2016 EBRW Mean',
                 '2016 Math Mean',
                 '2016 Overall Mean',
                 '2016 Participation Percent',
                 'Mean Overall Score Change']
schools_2017_agg = _rename_columns_by_position(schools_2017_agg, col_names1_short)

# Rename some columns for convenience
# (the original 2018 headers are kept in col_names2_long for reference)
col_names2_long = list(schools_2018_agg.columns)
col_names2_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2018 Valid Scores',
                 '2018 EBRW Mean',
                 '2018 Math Mean',
                 '2018 Overall Mean Score',
                 '2018 Participation Percent',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean Score',
                 '2017 Participation Percent',
                 'Mean Overall Score Change']
schools_2018_agg = _rename_columns_by_position(schools_2018_agg, col_names2_short)

# Set index to school number (currently disabled)
#schools_2017_agg['School Number'] = schools_2017_agg['School Number'].astype(int)
#schools_2017_agg = schools_2017_agg.set_index('School Number')

#schools_2018_agg['School Number'] = schools_2018_agg['School Number'].astype(int)
#schools_2018_agg = schools_2018_agg.set_index('School Number')

In [169]:
# Remove leading/trailing white space from the test labels
schools_2017_agg = schools_2017_agg.assign(Test=schools_2017_agg['Test'].str.strip())
schools_2018_agg = schools_2018_agg.assign(Test=schools_2018_agg['Test'].str.strip())

#### Create new dataframe aggregating all data from 2018 into a cleaner format

In [170]:
# Rows with a score change are the ones that have both 2017 and 2018 scores
has_score_change = schools_2018_agg['Mean Overall Score Change'].notna()

# A '*' in 'Total Students' indicates missing data
has_student_count = schools_2018_agg['Total Students'] != '*'

# Keep only the rows that satisfy both conditions
schools_2018_agg_subset = schools_2018_agg[has_score_change & has_student_count]

In [183]:
###  May not be necessary ###

# Build the column labels: three identifier columns followed by one label
# for every (year, test, score type) combination.

years_str = ['2017', '2018']
test_types = ['PSAT10', 'SAT']
score_types = [
    'Valid Scores',
    'EBRW Mean',
    'Math Mean',
    'Overall Mean Score',
    'Participation Percent',
]

# Year varies slowest and score type fastest, e.g. '2017 PSAT10 Valid Scores'
score_cols = [
    " ".join((year, test, score))
    for year in years_str
    for test in test_types
    for score in score_types
]

all_cols = ['School Number', 'District Name', 'School Name'] + score_cols

In [185]:
# Split the cleaned 2018 rows by test type
test_label = schools_2018_agg_subset["Test"]
schools_2018_agg_SAT = schools_2018_agg_subset.loc[test_label.eq("SAT")]
schools_2018_agg_PSAT = schools_2018_agg_subset.loc[test_label.eq("PSAT10")]

# Put both tests side by side, matching rows on school number; every other
# column is shared by both sides and gets the test name as a suffix
tests_combined_2018_agg = pd.merge(
    schools_2018_agg_SAT,
    schools_2018_agg_PSAT,
    on="School Number",
    suffixes=(' SAT', ' PSAT10'),
)
#list(tests_combined_2018_agg.columns)

['Test SAT',
 'District Number SAT',
 'District Name SAT',
 'School Name SAT',
 'Total Students SAT',
 '2018 Valid Scores SAT',
 '2018 EBRW Mean SAT',
 '2018 Math Mean SAT',
 '2018 Overall Mean Score SAT',
 '2018 Participation Percent SAT',
 '2017 Valid Scores SAT',
 '2017 EBRW Mean SAT',
 '2017 Math Mean SAT',
 '2017 Overall Mean Score SAT',
 '2017 Participation Percent SAT',
 'Mean Overall Score Change SAT',
 'Test PSAT10',
 'District Number PSAT10',
 'District Name PSAT10',
 'School Name PSAT10',
 'Total Students PSAT10',
 '2018 Valid Scores PSAT10',
 '2018 EBRW Mean PSAT10',
 '2018 Math Mean PSAT10',
 '2018 Overall Mean Score PSAT10',
 '2018 Participation Percent PSAT10',
 '2017 Valid Scores PSAT10',
 '2017 EBRW Mean PSAT10',
 '2017 Math Mean PSAT10',
 '2017 Overall Mean Score PSAT10',
 '2017 Participation Percent PSAT10',
 'Mean Overall Score Change PSAT10']

In [186]:
# Drop duplicated & unnecessary columns.
# Each label is listed exactly once ('Test PSAT10' used to appear twice).
# Note: this rebinds the same name, so re-running the cell raises KeyError
# because the columns are already gone.
tests_combined_2018_agg = tests_combined_2018_agg.drop(
    columns=[
        # Test labels
        'Test SAT',
        'Test PSAT10',
        # PSAT10 copies of the district/school identifiers
        # (the SAT copies are kept)
        'District Number PSAT10',
        'District Name PSAT10',
        'School Name PSAT10',
        # Score change columns
        'Mean Overall Score Change SAT',
        'Mean Overall Score Change PSAT10',
    ]
)
#list(tests_combined_2018_agg.columns)

['District Number SAT',
 'District Name SAT',
 'School Number',
 'School Name SAT',
 'Total Students SAT',
 '2018 Valid Scores SAT',
 '2018 EBRW Mean SAT',
 '2018 Math Mean SAT',
 '2018 Overall Mean Score SAT',
 '2018 Participation Percent SAT',
 '2017 Valid Scores SAT',
 '2017 EBRW Mean SAT',
 '2017 Math Mean SAT',
 '2017 Overall Mean Score SAT',
 '2017 Participation Percent SAT',
 'Total Students PSAT10',
 '2018 Valid Scores PSAT10',
 '2018 EBRW Mean PSAT10',
 '2018 Math Mean PSAT10',
 '2018 Overall Mean Score PSAT10',
 '2018 Participation Percent PSAT10',
 '2017 Valid Scores PSAT10',
 '2017 EBRW Mean PSAT10',
 '2017 Math Mean PSAT10',
 '2017 Overall Mean Score PSAT10',
 '2017 Participation Percent PSAT10']

In [187]:
# The district/school identifier columns kept from the SAT side no longer
# need their ' SAT' suffix. errors="raise" makes a missing column fail loudly.
tests_combined_2018_agg = tests_combined_2018_agg.rename(
    columns={
        'District Number SAT': "District Number",
        "District Name SAT": "District Name",
        "School Name SAT": "School Name",
    },
    errors="raise",
)

## ---Done with aggregated data---

# Data disaggregated by subgroups