In [1]:
import pandas as pd
import os
from pathlib import Path
import missingno as msno
import matplotlib.pyplot as plt
import itertools
from itertools import product

Several data sets need to be cleaned and merged.  School mean SAT & PSAT scores for each year are in separate .xlsx files.  Scores disaggregated by desired demographic indicators are in another set .xlsx files also separated by year. A third data set with scores grouped by multiple categories is also prepared.

### 2018 data wrangling

In [2]:
# Filenames & paths
# Just do 2018 data for now.  COVID affected other years

filepath_agg = '../raw_data/aggregated/'
filepath_disagg = '../raw_data/disaggregated/'

file_2017_agg = '2017 SAT PSAT District and School Overall Results_final.xlsx'
file_2018_agg = '2018 PSAT and SAT District and School Summary Achievement Results_FINAL.xlsx'


#file_2017_disagg = '2017 SAT PSAT10 Disaggregated Report Formatted.xlsx'
file_2018_disagg = '2018 PSAT and SAT State Achievement Results Disaggregated by Subgroups.xlsx'
file_2018_multicat = '2018 SAT results by subgroups.xlsx'

In [3]:
# Files all have different formats

# Agg data is all on one sheet in xlsx file.
df_2017_agg_raw = pd.read_excel(filepath_agg + file_2017_agg)
df_2018_agg_raw = pd.read_excel(filepath_agg + file_2018_agg)

#df_2017_disagg_raw = pd.read_excel(filepath_disagg + file_2017_disagg)
# Disaggregated data for 2018 is split into separate sheets in xlsx file.
# Create dict of df's for each set of disagg data
dict_2018_disagg_raw = pd.read_excel(filepath_disagg + file_2018_disagg, sheet_name=None)

df_2018_multicat_raw = pd.read_excel(filepath_disagg + file_2018_multicat)

## Aggregated data only

In [4]:
df_2017_agg_raw.head(10)

Unnamed: 0,2017 SAT and PSAT10 District and School Results - EMBARGOED UNTIL 10:00 A.M. ON AUGUST 17TH,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,,,,,,,,,,,,,,,,,
1,Means are calculated using only valid scores i...,,,,,,,,,,,,,,,,
2,* Values suppressed to protect student privacy.,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,
4,Test,District Number,District Name,School Number,School Name,Total Students,Valid Scores,Evidence Based Reading and Writing Mean Score,Math Mean Score,Overall Mean Score,Participation Percent,2016 Valid Scores,2016 Evidence Based Reading and Writing Mean S...,2016 Math Mean Score,2016 Overall Mean Score,2016 Participation Percent,Mean Overall Score Change
5,PSAT,0000,STATE RESULTS,0000,STATE RESULTS,63901,58701,478.0,469.0,947.0,91.9,56753,475.6,468.4,944,88.3,3
6,PSAT,0010,MAPLETON 1,0000,DISTRICT RESULTS,729,634,437.0,428.8,865.7,87.0,549,450.3,434.2,884.4,84.7,-18.7
7,PSAT,0010,MAPLETON 1,0187,MAPLETON EXPEDITIONARY SCHOOL OF THE ARTS,105,88,411.7,429.9,841.6,83.8,95,427.9,421.2,849.1,87.2,-7.5
8,PSAT,0010,MAPLETON 1,0212,MAPLETON EARLY COLLEGE HIGH SCHOOL,62,60,409.2,408.8,818.0,96.8,57,421.9,409.6,831.6,91.9,-13.6
9,PSAT,0010,MAPLETON 1,0263,GLOBAL LEADERSHIP ACADEMY,54,50,410.0,404.2,814.2,92.6,40,403.8,424.5,828.3,93,-14.1


In [5]:
# Drop rows with descriptive text
df_2017_agg = df_2017_agg_raw.drop(df_2017_agg_raw.index[0:4])
df_2018_agg = df_2018_agg_raw.drop(df_2018_agg_raw.index[0:3])

In [6]:
# Set columns headers as first row containing Test, District Number, District Name, etc.
df_2017_agg.columns = df_2017_agg.iloc[0]
df_2018_agg.columns = df_2018_agg.iloc[0]

In [7]:
# Drop first row containing the column headers
df_2017_agg = df_2017_agg.drop(df_2017_agg.index[0])
df_2018_agg = df_2018_agg.drop(df_2018_agg.index[0])

In [8]:
## 2017 data
# Assign state & district results to their own dataframes respectively (if they exist)
state_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'STATE RESULTS']
district_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'DISTRICT RESULTS']

# Drop those & create new dataframe of school only data
# Note: There's something weird with the school districts here.  BOCES?
schools_2017_agg = df_2017_agg[(df_2017_agg['School Name'] != 'STATE RESULTS') & (df_2017_agg['School Name'] != 'DISTRICT RESULTS')]


## 2018 data
# Assign state & district results to their own dataframes respectively (if they exist)
state_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'STATE']
district_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'DISTRICT']

# Create dataframe with only school level scores
schools_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'SCHOOL']

# Drop some unneeded columns
schools_2018_agg = schools_2018_agg.drop(['Level','Grade'], axis = 1)

In [9]:
# Rename and reindex 2017 and 2018 data

# Rename some columns for convenience
col_names1_long = list(schools_2017_agg.columns)
col_names1_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean',
                 '2017 Participation Percent',
                 '2016 Valid Scores',
                 '2016 EBRW Mean',
                 '2016 Math Mean',
                 '2016 Overall Mean',
                 '2016 Participation Percent',
                 'Mean Overall Score Change']
schools_2017_agg.columns = col_names1_short

# Rename some columns for convenience
col_names2_long = list(schools_2018_agg.columns)
col_names2_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2018 Valid Scores',
                 '2018 EBRW Mean',
                 '2018 Math Mean',
                 '2018 Overall Mean Score',
                 '2018 Participation Percent',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean Score',
                 '2017 Participation Percent',
                 'Mean Overall Score Change']
schools_2018_agg.columns = col_names2_short

In [10]:
# Trim white space
schools_2017_agg['Test'] = schools_2017_agg['Test'].str.strip()
schools_2018_agg['Test'] = schools_2018_agg['Test'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2017_agg['Test'] = schools_2017_agg['Test'].str.strip()


#### Create new dataframe aggregating all data from 2018 into a cleaner format

In [11]:
# Select only rows that have both 2017 and 2018 scores
schools_2018_agg_subset = schools_2018_agg[~schools_2018_agg['Mean Overall Score Change'].isna()]

# Drop rows using '*' to indicate missing data
schools_2018_agg_subset = schools_2018_agg_subset[~(schools_2018_agg_subset['Total Students'] == '*')]

In [12]:
schools_2018_agg_SAT = schools_2018_agg_subset.loc[schools_2018_agg_subset["Test"] == "SAT"]
schools_2018_agg_PSAT = schools_2018_agg_subset.loc[schools_2018_agg_subset["Test"] == "PSAT10"]
tests_combined_2018_agg = schools_2018_agg_SAT.merge(schools_2018_agg_PSAT, on="School Number", suffixes=(' SAT', ' PSAT10'))

In [13]:
# Drop duplicated &  unnecessary columns
tests_combined_2018_agg = tests_combined_2018_agg.drop(['Test PSAT10',
                                                        'Test SAT',
                                                        'District Number PSAT10',
                                                        'District Name PSAT10',
                                                        'School Name PSAT10',
                                                        'Test PSAT10',
                                                        'Mean Overall Score Change SAT',
                                                        'Mean Overall Score Change PSAT10'
                                                       ], axis = 1)

drop_cats = ['District Name',
            'School Name',
            'Participation',
            'Total Students',
            'Valid Scores']

drop_list = [col for col in tests_combined_2018_agg.columns if any(cat in col for cat in drop_cats)]

In [14]:
tests_combined_2018_agg = tests_combined_2018_agg.drop(drop_list, axis = 1)

In [15]:
# Only keep 2018 SAT and 2017 PSAT

tests_combined_2018_agg = tests_combined_2018_agg.drop(['District Number SAT',
                                                        '2017 EBRW Mean SAT',
                                                        '2017 Math Mean SAT',
                                                        '2017 Overall Mean Score SAT',
                                                        '2018 EBRW Mean PSAT10',
                                                        '2018 Math Mean PSAT10',
                                                        '2018 Overall Mean Score PSAT10'
                                                       ], axis = 1)

In [16]:
tests_combined_2018_agg

Unnamed: 0,School Number,2018 EBRW Mean SAT,2018 Math Mean SAT,2018 Overall Mean Score SAT,2017 EBRW Mean PSAT10,2017 Math Mean PSAT10,2017 Overall Mean Score PSAT10
0,0187,472,467,939,412,430,842
1,0212,464,441,905,409,409,818
2,0263,443,438,880,410,404,814
3,0309,452,434,886,411,417,828
4,0503,495,477,972,466,444,910
...,...,...,...,...,...,...,...
338,9037,511,521,1032,480,488,968
339,6134,431,393,824,411,400,812
340,2840,570,526,1095,535,471,1006
341,1550,529,502,1031,500,470,970


Side notes

* Key of school numbers to name & district would be useful

# Scores arranged by specific subgroups

In [17]:
# Rename dict_2018_disagg_raw keys
disagg_categories = ['Gend', 'Ethn', 'FRM', 'GT','ELL','Migr', 'IEP']
dict_2018_disagg = dict(zip(disagg_categories, list(dict_2018_disagg_raw.values())))

In [18]:
# Function to clean each sheet

def clean_disagg_sheet(sheet_raw):
    
    # Drop rows with descriptive text
    sheet = sheet_raw.drop(sheet_raw.index[0:4])
    
    # Set columns headers as first row containing Test, District Number, District Name, etc.
    sheet.columns = sheet.iloc[0]

    # Drop first row containing the column headers
    sheet = sheet.drop(sheet.index[0])
    
    #Rename some columns for convenience
    col_names_short = ['Level',
                     'Test',
                     'District Number',
                     'District Name',
                     'School',
                     'School Name',
                     'Demo Group',
                     '2018 Total Records',
                     '2018 Valid Scores',
                     '2018 Participation Rate',
                     '2018 EBRW',
                     '2018 Math',
                     '2018 Overall']
    sheet.columns = col_names_short
    
    # Drop unnecessary & missing values
    sheet = sheet[sheet['Test'] == 'SAT']
    sheet = sheet[sheet['2018 Valid Scores'] != '< 16']
    sheet = sheet[sheet['2018 Overall'] != '*']
    sheet = sheet[sheet['Demo Group'] != 'Not Reported']
    
    # Drop unnecessary columns
    sheet = sheet.drop(['Test',
                      '2018 Total Records',
                      '2018 Participation Rate',
                      '2018 Valid Scores',
                       'District Name',
                        'District Number',
                       'School Name'], axis=1)
    
    # Assign state & district results to their own dataframes respectively (if they exist)
    state_sheet = sheet.loc[sheet['Level'] == 'STATE']
    district_sheet = sheet.loc[sheet['Level'] == 'DISTRICT']

    state_sheet = state_sheet.drop(['Level'], axis=1)
    district_sheet = district_sheet.drop(['Level'], axis=1)
    
    # Drop those & create new dataframe of school only data
    schools_sheet = sheet[sheet['Level'] == 'SCHOOL']
    schools_sheet = schools_sheet.drop(['Level'], axis=1)
    
    schools_sheet = schools_sheet.reset_index()
    district_sheet = district_sheet.reset_index()
    state_sheet = state_sheet.reset_index()
    
    state_sheet = state_sheet.drop(['index'], axis=1)
    district_sheet = district_sheet.drop(['index'], axis=1)
    schools_sheet = schools_sheet.drop(['index'], axis=1)
       
    return schools_sheet, district_sheet, state_sheet

In [19]:
# Generate dict of school only disagg data

school_disagg_2018 = {}

for cat in disagg_categories:
    school_disagg_2018[cat] = clean_disagg_sheet(dict_2018_disagg[cat])[0]

Sample of what datasets look like

In [20]:
school_disagg_2018['Ethn']

Unnamed: 0,School,Demo Group,2018 EBRW,2018 Math,2018 Overall
0,0695,Hispanic,478,486,964
1,1796,Hispanic,467,427,894
2,0187,Hispanic,467,464,930
3,0263,Hispanic,442,434,876
4,0212,Hispanic,452,438,890
...,...,...,...,...,...
638,0015,White,543,513,1056
639,6134,White,425,384,809
640,6971,Hispanic,457,409,866
641,6971,White,433,405,838


In [21]:
school_disagg_2018['Ethn'][school_disagg_2018['Ethn']['School'] == '6971']

Unnamed: 0,School,Demo Group,2018 EBRW,2018 Math,2018 Overall
640,6971,Hispanic,457,409,866
641,6971,White,433,405,838


# Score arranged by multiple demographic categories

In [22]:
# Delete header rows & set column headers
df_2018_multicat = df_2018_multicat_raw.drop(df_2018_multicat_raw.index[0:17])
df_2018_multicat.columns = df_2018_multicat.iloc[0]
df_2018_multicat = df_2018_multicat.drop(df_2018_multicat.index[0])
df_2018_multicat.columns.name = 'index'
df_2018_multicat = df_2018_multicat.reset_index().drop(['index'], axis=1)

# Drop unnecessary columns
df_2018_multicat = df_2018_multicat.drop(['Academic Year','Standard Deviation','% Participation','Score Count','Test Name'], axis=1)

In [23]:
# Forward fill missing metadata.  This works because of the format of the original xlsx document
ffill_cols = ['State/District/School',
 'Subject',
 'Gender',
 'Ethnicity',
 'Free and Reduced Lunch',
 'English Language Learners',
 'IEP']

df_2018_multicat[ffill_cols] = df_2018_multicat[ffill_cols].fillna(method='ffill') 

In [24]:
# Drop empty scores
schools_2018_multicat = df_2018_multicat[df_2018_multicat['Mean Scale Score'] != '-']

In [25]:
# Strip 'State/District/School' of everything except school number
schools_2018_multicat['State/District/School'] = schools_2018_multicat['State/District/School'].str.split().str[-1].str.strip('()')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2018_multicat['State/District/School'] = schools_2018_multicat['State/District/School'].str.split().str[-1].str.strip('()')


In [26]:
cols = list(schools_2018_multicat.columns)
new_cols = ['School',
             'Subject',
             'Gend',
             'Ethn',
             'FRL',
             'ELL',
             'IEP',
             'Mean Score']

schools_2018_multicat.columns = new_cols

In [27]:
# Create new dataframe containing both scores in a single line for a given demographic group

schools_2018_multicat_math = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Math']
schools_2018_multicat_math = schools_2018_multicat_math.rename(columns={'Mean Score':'Mean Score Math'})
schools_2018_multicat_EBRW = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Evidence-Based Reading And Writing']
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.rename(columns={'Mean Score':'Mean Score EBRW'})
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.reset_index()

schools_SAT_2018_multicat_combined = schools_2018_multicat_math.reset_index()
schools_SAT_2018_multicat_combined['Mean Score EBRW'] = schools_2018_multicat_EBRW['Mean Score EBRW']

schools_SAT_2018_multicat_combined = schools_SAT_2018_multicat_combined.drop(['index','Subject'],axis=1)

In [28]:
schools_SAT_2018_multicat_combined

Unnamed: 0,School,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW
0,0110,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,505.882353,560.588235
1,0076,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,537.757009,562.056075
2,0076,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,574.725275,564.395604
3,2195,Female,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,503.529412,517.058824
4,2195,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,560.444444,579.222222
...,...,...,...,...,...,...,...,...
705,9672,Male,White,FRL Eligible,Not English Learners,Students without IEPs,505,517.222222
706,9672,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,539.805825,537.087379
707,9696,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,521.219512,547.560976
708,9696,Male,White,FRL Eligible,Not English Learners,Students without IEPs,460.416667,460
