In [239]:
import pandas as pd
import os
from pathlib import Path
import missingno as msno
import matplotlib.pyplot as plt

Several data sets need to be cleaned and merged.  School mean SAT & PSAT scores for each year are in separate .xlsx files.  Scores disaggregated by desired demographic indicators are in another set .xlsx files also separated by year.

In [83]:
# Filenames & paths
# Just do 2017 & 2018 data for now.  COVID affected other years

filepath_agg = '../raw_data/aggregated/'
filepath_disagg = '../raw_data/disaggregated/'

file_2017_agg = '2017 SAT PSAT District and School Overall Results_final.xlsx'
file_2018_agg = '2018 PSAT and SAT District and School Summary Achievement Results_FINAL.xlsx'
#file_2019_agg = '2019 PSAT SAT District and School Achievement Results.xlsx'
#file_2021_agg = '2021 PSAT and SAT District and School Summary Achievement Results.xlsx'
#file_2022_agg = '2022 PSAT-SAT District and School Summary Achievement Results.xlsx'


file_2017_disagg = '2017 SAT PSAT10 Disaggregated Report Formatted.xlsx'
file_2018_disagg = '2018 PSAT and SAT State Achievement Results Disaggregated by Subgroups.xlsx'
#file_2021_disagg = '2021 PSAT and SAT Total Score District and School Achievement Results Disaggregated by Subgroups.xlsx'
#file_2022_disagg = '2022 PSAT-SAT Total Score School and District Achievement Results - Disaggregated by Subgroup.xlsx'

In [84]:
# Read xlsx files

##df_2017_agg = pd.ExcelFile(filepath_agg + file_2017_agg)
##df_2018_agg = pd.ExcelFile(filepath_agg + file_2018_agg)
##df_2019_agg = pd.ExcelFile(filepath_agg + file_2019_agg)

#xls_2021_agg = pd.ExcelFile(filepath_agg + file_2021_agg)
#xls_2022_agg = pd.ExcelFile(filepath_agg + file_2022_agg)

#xls_2021_disagg = pd.ExcelFile(filepath_disagg + file_2021_disagg)
#xls_2022_disagg = pd.ExcelFile(filepath_disagg + file_2022_disagg)

In [85]:
# Files all have different formats

# Agg data is all on one sheet in xlsx file.
df_2017_agg_raw = pd.read_excel(filepath_agg + file_2017_agg)
df_2018_agg = pd.read_excel(filepath_agg + file_2018_agg)

df_2017_disagg_raw = pd.read_excel(filepath_disagg + file_2017_disagg)
# Disaggregated data for 2018 is split into separate sheets in xlsx file.
# Create dict of df's for each set of disagg data
dict_2018_disagg_raw = pd.read_excel(filepath_disagg + file_2018_disagg, sheet_name=None)

In [127]:
# Drop rows with descriptive text
df_2017_agg = df_2017_agg_raw.drop(df_2017_agg_raw.index[0:4])

In [135]:
# Set columns headers as first row containing Test, District Number, District Name, etc.
df_2017_agg.columns = df_2017_agg.iloc[0]

In [137]:
# Drop first row containing the column headers
df_2017_agg = df_2017_agg.drop(df_2017_agg.index[0])

In [233]:
# Assign state & district results to their own dataframes respectively (if they exist)
state_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'STATE RESULTS']
district_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'DISTRICT RESULTS']

# Drop those
# Note: There's something weird with the school districts here.  BOCES?

schools_2017_agg = df_2017_agg[(df_2017_agg['School Name'] != 'STATE RESULTS') & (df_2017_agg['School Name'] != 'DISTRICT RESULTS')]

In [229]:
schools_2017_agg.columns

Index(['Test', 'District Number', 'District Name', 'School Number',
       'School Name', 'Total Students', 'Valid Scores',
       'Evidence Based Reading and Writing Mean Score', 'Math Mean Score',
       'Overall Mean Score', 'Participation Percent', '2016 Valid Scores',
       '2016 Evidence Based Reading and Writing Mean Score',
       '2016 Math Mean Score', '2016 Overall Mean Score',
       '2016 Participation Percent', 'Mean Overall Score Change'],
      dtype='object', name='index')

In [232]:
list(schools_2017_agg.columns)

['Test',
 'Distr_Numb',
 'Distr_Name',
 'School_Numb',
 'School_Name',
 'Total_Stu',
 'Val_Score_2017',
 'EBRW_Mean_2017',
 'Math_Mean_2017',
 'Overall_Mean_2017',
 'Partic_Perc_2017',
 'Val_Score_2016',
 'EBRW_Mean_2016',
 'Math_Mean_2016',
 'Overall_Mean_2016',
 'Partic_Perc_2016',
 'Mean_Overall_Score_Change']

In [234]:
# Set index to school number
schools_2017_agg = schools_2017_agg.set_index('School Number')

In [235]:
# Rename some columns for convenience
col_names_long = list(schools_2017_agg.columns)
col_names_short = ['Test',
                 'Distr_Numb',
                 'Distr_Name',
                 'School_Name',
                 'Total_Stu',
                 'Val_Score_2017',
                 'EBRW_Mean_2017',
                 'Math_Mean_2017',
                 'Overall_Mean_2017',
                 'Partic_Perc_2017',
                 'Val_Score_2016',
                 'EBRW_Mean_2016',
                 'Math_Mean_2016',
                 'Overall_Mean_2016',
                 'Partic_Perc_2016',
                 'Mean_Overall_Score_Change']

schools_2017_agg.rename(columns = dict(zip(col_names_long, col_names_short)), inplace=True)

In [244]:
# Drop rows where either 2016 or 2017 data is missing.
# Assuming NaN in Mean_Overall_Score_Change indicates a missing score.
schools_2017_agg = schools_2017_agg[~schools_2017_agg['Mean_Overall_Score_Change'].isna()]

In [247]:
schools_2017_agg.head()

index,Test,Distr_Numb,Distr_Name,School_Name,Total_Stu,Val_Score_2017,EBRW_Mean_2017,Math_Mean_2017,Overall_Mean_2017,Partic_Perc_2017,Val_Score_2016,EBRW_Mean_2016,Math_Mean_2016,Overall_Mean_2016,Partic_Perc_2016,Mean_Overall_Score_Change
School Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
187,PSAT,10,MAPLETON 1,MAPLETON EXPEDITIONARY SCHOOL OF THE ARTS,105,88,411.7,429.9,841.6,83.8,95,427.9,421.2,849.1,87.2,-7.5
212,PSAT,10,MAPLETON 1,MAPLETON EARLY COLLEGE HIGH SCHOOL,62,60,409.2,408.8,818.0,96.8,57,421.9,409.6,831.6,91.9,-13.6
263,PSAT,10,MAPLETON 1,GLOBAL LEADERSHIP ACADEMY,54,50,410.0,404.2,814.2,92.6,40,403.8,424.5,828.3,93.0,-14.1
309,PSAT,10,MAPLETON 1,ACADEMY HIGH SCHOOL,121,117,410.5,417.0,827.5,96.7,102,431.1,434.0,865.1,91.9,-37.6
503,PSAT,10,MAPLETON 1,YORK INTERNATIONAL,68,62,466.3,444.2,910.5,91.2,51,438.4,434.1,872.5,96.2,38.0


In [257]:
schools_2017_agg.tail()

index,Test,Distr_Numb,Distr_Name,School_Name,Total_Stu,Val_Score_2017,EBRW_Mean_2017,Math_Mean_2017,Overall_Mean_2017,Partic_Perc_2017,Val_Score_2016,EBRW_Mean_2016,Math_Mean_2016,Overall_Mean_2016,Partic_Perc_2016,Mean_Overall_Score_Change
School Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2840,PSAT,9130,EXPEDITIONARY BOCES,ROCKY MOUNTAIN SCHOOL OF EXPEDITIONARY LEARNING,24,23,534.8,470.9,1005.7,95.8,27,464.1,433,897,100,108.7
1550,PSAT,9170,COLORADO DIGITAL BOCES,COLORADO PREP ACADEMY,78,61,500.0,469.7,969.7,78.2,49,469,431,900,47.1,69.7
6971,PSAT,9170,COLORADO DIGITAL BOCES,PIKES PEAK ONLINE SCHOOL,95,63,424.9,394.3,819.2,66.3,47,427,391.1,818.1,64.4,1.1
7449,PSAT,9170,COLORADO DIGITAL BOCES,ROCKY MOUNTAIN DIGITAL ACADEMY,*,< 16,*,*,*,*,62,482.1,442.7,924.8,86.1,*
7484,PSAT,9170,COLORADO DIGITAL BOCES,MOUNTAIN VIEW VIRTUAL,*,< 16,*,*,*,*,< 16,*,*,*,*,*


In [254]:
# Trim whitespaces from Test column

schools_2017_agg['Test'] = schools_2017_agg['Test'].str.strip()

In [256]:
schools_2017_agg[schools_2017_agg['Test'] == 'SAT']

index,Test,Distr_Numb,Distr_Name,School_Name,Total_Stu,Val_Score_2017,EBRW_Mean_2017,Math_Mean_2017,Overall_Mean_2017,Partic_Perc_2017,Val_Score_2016,EBRW_Mean_2016,Math_Mean_2016,Overall_Mean_2016,Partic_Perc_2016,Mean_Overall_Score_Change
School Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [252]:
schools_2017_agg.iloc[0]

index
Test                                                            PSAT  
Distr_Numb                                                        0010
Distr_Name                                                  MAPLETON 1
School_Name                  MAPLETON EXPEDITIONARY SCHOOL OF THE ARTS
Total_Stu                                                          105
Val_Score_2017                                                      88
EBRW_Mean_2017                                                   411.7
Math_Mean_2017                                                   429.9
Overall_Mean_2017                                                841.6
Partic_Perc_2017                                                  83.8
Val_Score_2016                                                      95
EBRW_Mean_2016                                                   427.9
Math_Mean_2016                                                   421.2
Overall_Mean_2016                                                849.1
