In [977]:
import pandas as pd
import os
from pathlib import Path
import missingno as msno
import matplotlib.pyplot as plt
import itertools
from itertools import product
import re
import urllib.request as ul
from bs4 import BeautifulSoup as soup
import requests

Several data sets need to be cleaned and merged.  School mean SAT & PSAT scores for each year are in separate .xlsx files.  Scores disaggregated by desired demographic indicators are in another set .xlsx files also separated by year. A third data set with scores grouped by multiple categories is also prepared.

### 2018 data wrangling

In [978]:
# Filenames & paths

filepath_agg = '../raw_data/aggregated/'
filepath_disagg = '../raw_data/disaggregated/'

file_2017_agg = '2017 SAT PSAT District and School Overall Results_final.xlsx'
file_2018_agg = '2018 PSAT and SAT District and School Summary Achievement Results_FINAL.xlsx'


file_2018_disagg = '2018 PSAT and SAT State Achievement Results Disaggregated by Subgroups.xlsx'
file_2018_multicat = '2018 SAT results by subgroups.xlsx'

In [979]:
# Files all have different formats

# Agg data is all on one sheet in xlsx file.
df_2017_agg_raw = pd.read_excel(filepath_agg + file_2017_agg)
df_2018_agg_raw = pd.read_excel(filepath_agg + file_2018_agg)

# Disaggregated data for 2018 is split into separate sheets in xlsx file.
# Create dict of df's for each set of disagg data
dict_2018_disagg_raw = pd.read_excel(filepath_disagg + file_2018_disagg, sheet_name=None)

df_2018_multicat_raw = pd.read_excel(filepath_disagg + file_2018_multicat)

## Aggregated data only

In [980]:
df_2017_agg_raw.head(10)

Unnamed: 0,2017 SAT and PSAT10 District and School Results - EMBARGOED UNTIL 10:00 A.M. ON AUGUST 17TH,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,,,,,,,,,,,,,,,,,
1,Means are calculated using only valid scores i...,,,,,,,,,,,,,,,,
2,* Values suppressed to protect student privacy.,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,
4,Test,District Number,District Name,School Number,School Name,Total Students,Valid Scores,Evidence Based Reading and Writing Mean Score,Math Mean Score,Overall Mean Score,Participation Percent,2016 Valid Scores,2016 Evidence Based Reading and Writing Mean S...,2016 Math Mean Score,2016 Overall Mean Score,2016 Participation Percent,Mean Overall Score Change
5,PSAT,0000,STATE RESULTS,0000,STATE RESULTS,63901,58701,478.0,469.0,947.0,91.9,56753,475.6,468.4,944,88.3,3
6,PSAT,0010,MAPLETON 1,0000,DISTRICT RESULTS,729,634,437.0,428.8,865.7,87.0,549,450.3,434.2,884.4,84.7,-18.7
7,PSAT,0010,MAPLETON 1,0187,MAPLETON EXPEDITIONARY SCHOOL OF THE ARTS,105,88,411.7,429.9,841.6,83.8,95,427.9,421.2,849.1,87.2,-7.5
8,PSAT,0010,MAPLETON 1,0212,MAPLETON EARLY COLLEGE HIGH SCHOOL,62,60,409.2,408.8,818.0,96.8,57,421.9,409.6,831.6,91.9,-13.6
9,PSAT,0010,MAPLETON 1,0263,GLOBAL LEADERSHIP ACADEMY,54,50,410.0,404.2,814.2,92.6,40,403.8,424.5,828.3,93,-14.1


In [981]:
# Drop rows with descriptive text
df_2017_agg = df_2017_agg_raw.drop(df_2017_agg_raw.index[0:4])
df_2018_agg = df_2018_agg_raw.drop(df_2018_agg_raw.index[0:3])

In [982]:
# Set columns headers as first row containing Test, District Number, District Name, etc.
df_2017_agg.columns = df_2017_agg.iloc[0]
df_2018_agg.columns = df_2018_agg.iloc[0]

In [983]:
# Drop first row containing the column headers
df_2017_agg = df_2017_agg.drop(df_2017_agg.index[0])
df_2018_agg = df_2018_agg.drop(df_2018_agg.index[0])

In [984]:
## 2017 data
# Assign state & district results to their own dataframes respectively (if they exist)
state_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'STATE RESULTS']
district_2017_agg = df_2017_agg.loc[df_2017_agg['School Name'] == 'DISTRICT RESULTS']

# Drop those & create new dataframe of school only data
# Note: There's something weird with the school districts here.  BOCES?
schools_2017_agg = df_2017_agg[(df_2017_agg['School Name'] != 'STATE RESULTS') & (df_2017_agg['School Name'] != 'DISTRICT RESULTS')]


## 2018 data
# Assign state & district results to their own dataframes respectively (if they exist)
state_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'STATE']
district_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'DISTRICT']

# Create dataframe with only school level scores
schools_2018_agg = df_2018_agg.loc[df_2018_agg['Level'] == 'SCHOOL']

# Drop some unneeded columns
schools_2018_agg = schools_2018_agg.drop(['Level','Grade'], axis = 1)

In [985]:
# Rename some columns for convenience
col_names1_long = list(schools_2017_agg.columns)
col_names1_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean',
                 '2017 Participation Percent',
                 '2016 Valid Scores',
                 '2016 EBRW Mean',
                 '2016 Math Mean',
                 '2016 Overall Mean',
                 '2016 Participation Percent',
                 'Mean Overall Score Change']
schools_2017_agg.columns = col_names1_short

# Rename some columns for convenience
col_names2_long = list(schools_2018_agg.columns)
col_names2_short = ['Test',
                 'District Number',
                 'District Name',
                 'School Number',
                 'School Name',
                 'Total Students',
                 '2018 Valid Scores',
                 '2018 EBRW Mean',
                 '2018 Math Mean',
                 '2018 Overall Mean Score',
                 '2018 Participation Percent',
                 '2017 Valid Scores',
                 '2017 EBRW Mean',
                 '2017 Math Mean',
                 '2017 Overall Mean Score',
                 '2017 Participation Percent',
                 'Mean Overall Score Change']
schools_2018_agg.columns = col_names2_short

In [986]:
# Trim white space
schools_2017_agg['Test'] = schools_2017_agg['Test'].str.strip()
schools_2018_agg['Test'] = schools_2018_agg['Test'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2017_agg['Test'] = schools_2017_agg['Test'].str.strip()


#### Create new dataframe aggregating all data from 2018 into a cleaner format

In [987]:
# Select only rows that have both 2017 and 2018 scores
schools_2018_agg_subset = schools_2018_agg[~schools_2018_agg['Mean Overall Score Change'].isna()]

# Drop rows using '*' to indicate missing data
schools_2018_agg_subset = schools_2018_agg_subset[~(schools_2018_agg_subset['Total Students'] == '*')]

In [988]:
schools_2018_agg_subset.dtypes

Test                          object
District Number               object
District Name                 object
School Number                 object
School Name                   object
Total Students                object
2018 Valid Scores             object
2018 EBRW Mean                object
2018 Math Mean                object
2018 Overall Mean Score       object
2018 Participation Percent    object
2017 Valid Scores             object
2017 EBRW Mean                object
2017 Math Mean                object
2017 Overall Mean Score       object
2017 Participation Percent    object
Mean Overall Score Change     object
dtype: object

In [989]:
#### schools_2018_agg_subset[schools_2018_agg_subset['2018 Overall Mean Score']=='*']

In [990]:
#### schools_2018_agg_subset[schools_2018_agg_subset['2017 Overall Mean Score']=='*']

In [991]:
schools_2018_agg_SAT = schools_2018_agg_subset.loc[schools_2018_agg_subset["Test"] == "SAT"]
schools_2018_agg_PSAT = schools_2018_agg_subset.loc[schools_2018_agg_subset["Test"] == "PSAT10"]
tests_combined_2018_agg = schools_2018_agg_SAT.merge(schools_2018_agg_PSAT, on="School Number", suffixes=(' SAT', ' PSAT10'))

In [992]:
# Drop duplicated &  unnecessary columns
tests_combined_2018_agg = tests_combined_2018_agg.drop(['Test PSAT10',
                                                        'Test SAT',
                                                        'District Number PSAT10',
                                                        'District Name PSAT10',
                                                        'School Name PSAT10',
                                                        'Test PSAT10',
                                                        'Mean Overall Score Change SAT',
                                                        'Mean Overall Score Change PSAT10'
                                                       ], axis = 1, errors='ignore')

drop_cats = ['Participation',
            'Total Students',
            'Valid Scores']

drop_list = [col for col in tests_combined_2018_agg.columns if any(cat in col for cat in drop_cats)]

In [993]:
tests_combined_2018_agg = tests_combined_2018_agg.drop(drop_list, axis = 1, errors='ignore')

In [994]:
# Only keep 2018 SAT and 2017 PSAT

tests_combined_2018_agg = tests_combined_2018_agg.drop(['2017 EBRW Mean SAT',
                                                        '2017 Math Mean SAT',
                                                        '2017 Overall Mean Score SAT',
                                                        '2018 EBRW Mean PSAT10',
                                                        '2018 Math Mean PSAT10',
                                                        '2018 Overall Mean Score PSAT10'
                                                       ], axis = 1, errors='ignore')

In [995]:
# Rename first four columns 

tests_combined_2018_agg.columns = ['District Number', 'District Name', 'School Number',
       'School Name', '2018 EBRW Mean SAT', '2018 Math Mean SAT',
       '2018 Overall Mean Score SAT', '2017 EBRW Mean PSAT10',
       '2017 Math Mean PSAT10', '2017 Overall Mean Score PSAT10']

In [996]:
# Set school & district names lowercase

tests_combined_2018_agg['District Name'] = tests_combined_2018_agg.loc[:,'District Name'].str.lower()
tests_combined_2018_agg['School Name'] = tests_combined_2018_agg.loc[:,'School Name'].str.lower()

In [997]:
# Strip whitespace
tests_combined_2018_agg['District Name'] = tests_combined_2018_agg.loc[:,'District Name'].str.strip()
tests_combined_2018_agg['School Name'] = tests_combined_2018_agg.loc[:,'School Name'].str.strip()

In [998]:
# tests_combined_2018_agg.to_csv('../data/SAT_PSAT_aggregated.csv')

In [999]:
tests_combined_2018_agg.dtypes

District Number                   object
District Name                     object
School Number                     object
School Name                       object
2018 EBRW Mean SAT                object
2018 Math Mean SAT                object
2018 Overall Mean Score SAT       object
2017 EBRW Mean PSAT10             object
2017 Math Mean PSAT10             object
2017 Overall Mean Score PSAT10    object
dtype: object

In [1000]:

tests_combined_2018_agg = tests_combined_2018_agg[tests_combined_2018_agg['2018 Overall Mean Score SAT']!='*']

In [1001]:

tests_combined_2018_agg = tests_combined_2018_agg[tests_combined_2018_agg['2017 Overall Mean Score PSAT10']!='*']

# Scores arranged by specific subgroups

In [1002]:
# Rename dict_2018_disagg_raw keys
disagg_categories = ['Gend', 'Ethn', 'FRM', 'GT','ELL','Migr', 'IEP']
dict_2018_disagg = dict(zip(disagg_categories, list(dict_2018_disagg_raw.values())))

In [1003]:
# Function to clean each sheet

def clean_disagg_sheet(sheet_raw):
    
    # Drop rows with descriptive text
    sheet = sheet_raw.drop(sheet_raw.index[0:4])
    
    # Set columns headers as first row containing Test, District Number, District Name, etc.
    sheet.columns = sheet.iloc[0]

    # Drop first row containing the column headers
    sheet = sheet.drop(sheet.index[0])
    
    #Rename some columns for convenience
    col_names_short = ['Level',
                     'Test',
                     'District Number',
                     'District Name',
                     'School Number',
                     'School Name',
                     'Demo Group',
                     '2018 Total Records',
                     '2018 Valid Scores',
                     '2018 Participation Rate',
                     '2018 EBRW',
                     '2018 Math',
                     '2018 Overall']
    sheet.columns = col_names_short
    
    # Drop unnecessary & missing values
    sheet = sheet[sheet['Test'] == 'SAT']
    sheet = sheet[sheet['2018 Valid Scores'] != '< 16']
    sheet = sheet[sheet['2018 Overall'] != '*']
    sheet = sheet[sheet['Demo Group'] != 'Not Reported']
    
    # Drop unnecessary columns
    sheet = sheet.drop(['Test',
                      '2018 Total Records',
                      '2018 Participation Rate',
                      '2018 Valid Scores'
                       ], axis=1, errors='ignore')
    
    # Assign state & district results to their own dataframes respectively (if they exist)
    state_sheet = sheet.loc[sheet['Level'] == 'STATE']
    district_sheet = sheet.loc[sheet['Level'] == 'DISTRICT']

    state_sheet = state_sheet.drop(['Level'], axis=1, errors='ignore')
    district_sheet = district_sheet.drop(['Level'], axis=1, errors='ignore')
    
    # Drop those & create new dataframe of school only data
    schools_sheet = sheet[sheet['Level'] == 'SCHOOL']
    schools_sheet = schools_sheet.drop(['Level'], axis=1, errors='ignore')
    
    schools_sheet = schools_sheet.reset_index()
    district_sheet = district_sheet.reset_index()
    state_sheet = state_sheet.reset_index()
    
    state_sheet = state_sheet.drop(['index'], axis=1, errors='ignore')
    district_sheet = district_sheet.drop(['index'], axis=1, errors='ignore')
    schools_sheet = schools_sheet.drop(['index'], axis=1, errors='ignore')
    
    #state_sheet = state_sheet.drop(['District Number'], axis=1, errors='ignore')
    #district_sheet = district_sheet.drop(['District Number'], axis=1, errors='ignore')
    #schools_sheet = schools_sheet.drop(['District Number'], axis=1, errors='ignore')
    
    #state_sheet = state_sheet.drop(['School'], axis=1, errors='ignore')
    #district_sheet = district_sheet.drop(['School'], axis=1, errors='ignore')
    #schools_sheet = schools_sheet.drop(['School'], axis=1, errors='ignore')
    
    # Set school & district names lowercase

    state_sheet['District Name'] = state_sheet.loc[:,'District Name'].str.lower()
    state_sheet['School Name'] = state_sheet.loc[:,'School Name'].str.lower()
    
    district_sheet['District Name'] = district_sheet.loc[:,'District Name'].str.lower()
    district_sheet['School Name'] = district_sheet.loc[:,'School Name'].str.lower()
    
    schools_sheet['District Name'] = schools_sheet.loc[:,'District Name'].str.lower()
    schools_sheet['School Name'] = schools_sheet.loc[:,'School Name'].str.lower()
    
    schools_sheet['District Name'] = schools_sheet.loc[:,'District Name'].str.strip()
    schools_sheet['School Name'] = schools_sheet.loc[:,'School Name'].str.strip()
       
    return schools_sheet, district_sheet, state_sheet

In [1004]:
# Generate dict of school only disagg data

school_disagg_2018 = {}

for cat in disagg_categories:
    school_disagg_2018[cat] = clean_disagg_sheet(dict_2018_disagg[cat])[0]

Sample of what datasets look like

In [1005]:
school_disagg_2018['Ethn'].head()

Unnamed: 0,District Number,District Name,School Number,School Name,Demo Group,2018 EBRW,2018 Math,2018 Overall
0,10,mapleton 1,695,big picture college and career academy,Hispanic,478,486,964
1,10,mapleton 1,1796,colorado connections academy,Hispanic,467,427,894
2,10,mapleton 1,187,mapleton expeditionary school of the arts,Hispanic,467,464,930
3,10,mapleton 1,263,global leadership academy,Hispanic,442,434,876
4,10,mapleton 1,212,mapleton early college high school,Hispanic,452,438,890


In [1006]:
school_disagg_2018['Gend'].head()

Unnamed: 0,District Number,District Name,School Number,School Name,Demo Group,2018 EBRW,2018 Math,2018 Overall
0,10,mapleton 1,212,mapleton early college high school,Female,472,438,910
1,10,mapleton 1,1796,colorado connections academy,Female,496,444,940
2,10,mapleton 1,1796,colorado connections academy,Male,504,480,984
3,10,mapleton 1,187,mapleton expeditionary school of the arts,Male,467,474,941
4,10,mapleton 1,212,mapleton early college high school,Male,454,444,898


In [1007]:
school_disagg_2018['Ethn'][school_disagg_2018['Ethn']['School Name'] == 'pikes peak online school']

Unnamed: 0,District Number,District Name,School Number,School Name,Demo Group,2018 EBRW,2018 Math,2018 Overall
640,9170,colorado digital boces,6971,pikes peak online school,Hispanic,457,409,866
641,9170,colorado digital boces,6971,pikes peak online school,White,433,405,838


In [1008]:
school_disagg_2018_df = pd.concat([df for df in school_disagg_2018.values()], ignore_index=True)

In [1009]:
#school_disagg_2018_df.to_csv('../data/SAT_2018_single_categories.csv')

# Score arranged by multiple demographic categories

In [1010]:
# Delete header rows & set column headers
df_2018_multicat = df_2018_multicat_raw.drop(df_2018_multicat_raw.index[0:17])
df_2018_multicat.columns = df_2018_multicat.iloc[0]
df_2018_multicat = df_2018_multicat.drop(df_2018_multicat.index[0])
df_2018_multicat.columns.name = 'index'
df_2018_multicat = df_2018_multicat.reset_index().drop(['index'], axis=1)

# Drop unnecessary columns
df_2018_multicat = df_2018_multicat.drop(['Academic Year','Standard Deviation','% Participation','Score Count','Test Name'], axis=1)

In [1011]:
# Forward fill missing metadata.  This works because of the format of the original xlsx document
ffill_cols = ['State/District/School',
 'Subject',
 'Gender',
 'Ethnicity',
 'Free and Reduced Lunch',
 'English Language Learners',
 'IEP']

df_2018_multicat[ffill_cols] = df_2018_multicat[ffill_cols].fillna(method='ffill') 

In [1012]:
# Drop empty scores
schools_2018_multicat = df_2018_multicat[df_2018_multicat['Mean Scale Score'] != '-']

In [1013]:
# Split 'State/District/School' col into 'District Name' and 'School Name' columns
district_and_name = schools_2018_multicat['State/District/School'].str.split(':', expand=True).iloc[:,[0,1]]
schools_2018_multicat[['District Name', 'School Name']] = district_and_name;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2018_multicat[['District Name', 'School Name']] = district_and_name;
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2018_multicat[['District Name', 'School Name']] = district_and_name;


In [1014]:
# Drop 'State/District/School' column
schools_2018_multicat = schools_2018_multicat.drop(['State/District/School'], axis=1, errors='ignore')

#Reorder columns with district and name as first columns
schools_2018_multicat = schools_2018_multicat[['District Name',
       'School Name', 'Subject', 'Gender', 'Ethnicity', 'Free and Reduced Lunch',
       'English Language Learners', 'IEP', 'Mean Scale Score']]

In [1015]:
# Create school and district number columns
schools_2018_multicat['District Number'] = schools_2018_multicat['District Name'].str.extract('.*\((.*)\).*')
schools_2018_multicat['School Number'] = schools_2018_multicat['School Name'].str.extract('.*\((.*)\).*')

In [1016]:
# Strip School and District names of numbers
schools_2018_multicat['District Name'] = schools_2018_multicat['District Name'].str.replace('\(.*$', '')
schools_2018_multicat['School Name'] = schools_2018_multicat['School Name'].str.replace('\(.*$', '')

  schools_2018_multicat['District Name'] = schools_2018_multicat['District Name'].str.replace('\(.*$', '')
  schools_2018_multicat['School Name'] = schools_2018_multicat['School Name'].str.replace('\(.*$', '')


In [1017]:
# Set school & district names lowercase

schools_2018_multicat['District Name'] = schools_2018_multicat.loc[:,'District Name'].str.lower()
schools_2018_multicat['School Name'] = schools_2018_multicat.loc[:,'School Name'].str.lower()

In [1018]:
# Strip whitespace
schools_2018_multicat['District Name'] = schools_2018_multicat.loc[:,'District Name'].str.strip()
schools_2018_multicat['School Name'] = schools_2018_multicat.loc[:,'School Name'].str.strip()

In [1019]:
cols = list(schools_2018_multicat.columns)
new_cols = ['District Name',
             'School Name',
             'Subject',
             'Gend',
             'Ethn',
             'FRL',
             'ELL',
             'IEP',
             'Mean Score',
             'District Number',
             'School Number']

schools_2018_multicat.columns = new_cols

In [1020]:
# Reorder columns
schools_2018_multicat = schools_2018_multicat[['District Number',
                       'District Name',
                       'School Number',
                       'School Name',
                       'Subject',
                       'Gend',
                       'Ethn',
                       'FRL',
                       'ELL',
                       'IEP',
                       'Mean Score']]

In [1021]:
# Create new dataframe containing both scores in a single line for a given demographic group

schools_2018_multicat_math = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Math']
schools_2018_multicat_math = schools_2018_multicat_math.rename(columns={'Mean Score':'Mean Score Math'})
schools_2018_multicat_EBRW = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Evidence-Based Reading And Writing']
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.rename(columns={'Mean Score':'Mean Score EBRW'})
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.reset_index()

schools_SAT_2018_multicat_combined = schools_2018_multicat_math.reset_index()
schools_SAT_2018_multicat_combined['Mean Score EBRW'] = schools_2018_multicat_EBRW['Mean Score EBRW']

schools_SAT_2018_multicat_combined = schools_SAT_2018_multicat_combined.drop(['index','Subject'],axis=1)

In [1022]:
#schools_SAT_2018_multicat_combined.to_csv('../data/SAT_2018_multicategories.csv')

# Check formatting of each df

In [1023]:
tests_combined_2018_agg.head()

Unnamed: 0,District Number,District Name,School Number,School Name,2018 EBRW Mean SAT,2018 Math Mean SAT,2018 Overall Mean Score SAT,2017 EBRW Mean PSAT10,2017 Math Mean PSAT10,2017 Overall Mean Score PSAT10
0,10,mapleton 1,187,mapleton expeditionary school of the arts,472,467,939,412,430,842
1,10,mapleton 1,212,mapleton early college high school,464,441,905,409,409,818
2,10,mapleton 1,263,global leadership academy,443,438,880,410,404,814
3,10,mapleton 1,309,academy high school,452,434,886,411,417,828
4,10,mapleton 1,503,york international,495,477,972,466,444,910


In [1024]:
school_disagg_2018_df.head()

Unnamed: 0,District Number,District Name,School Number,School Name,Demo Group,2018 EBRW,2018 Math,2018 Overall
0,10,mapleton 1,212,mapleton early college high school,Female,472,438,910
1,10,mapleton 1,1796,colorado connections academy,Female,496,444,940
2,10,mapleton 1,1796,colorado connections academy,Male,504,480,984
3,10,mapleton 1,187,mapleton expeditionary school of the arts,Male,467,474,941
4,10,mapleton 1,212,mapleton early college high school,Male,454,444,898


In [1025]:
schools_SAT_2018_multicat_combined.head()

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW
0,1040,academy 20,110,academy online,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,505.882353,560.588235
1,1040,academy 20,76,air academy high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,537.757009,562.056075
2,1040,academy 20,76,air academy high school,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,574.725275,564.395604
3,1040,academy 20,2195,discovery canyon campus high school,Female,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,503.529412,517.058824
4,1040,academy 20,2195,discovery canyon campus high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,560.444444,579.222222


In [1026]:
schools_SAT_2018_multicat_combined[schools_SAT_2018_multicat_combined['School Number'].isna()]

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW
256,880,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474
257,880,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263


In [1027]:
schools_SAT_2018_multicat_combined[schools_SAT_2018_multicat_combined['School Name'] == 'dsst']

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW
256,880,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474
257,880,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263


# Income data & zip code data

In [1028]:
url = 'http://www.usa.com/rank/colorado-state--median-family-income--zip-code-rank.htm'

html = requests.get(url).content
df_list = pd.read_html(html)
income_zipcodes = df_list[-1]
income_zipcodes.columns = income_zipcodes.iloc[0]
income_zipcodes = income_zipcodes.drop(income_zipcodes.index[0])
income_zipcodes = income_zipcodes.drop(['Rank'], axis=1, errors='ignore')

In [1029]:
# Split 'State/District/School' col into 'District Name' and 'School Name' columns
zip_pop = income_zipcodes['Zip / Population'].str.split('/', expand=True)
income_zipcodes[['Zip Code', 'Population']] = zip_pop;
income_zipcodes = income_zipcodes.drop(['Zip / Population'], axis=1, errors='ignore')

In [1030]:
# Rename silly column
income_zipcodes = income_zipcodes.rename(columns={'Median Family Income ▼':'Median Family Income'})

In [1031]:
# Remove non-numeric characters
income_zipcodes['Median Family Income'] = income_zipcodes['Median Family Income'].str.strip('$')
income_zipcodes['Median Family Income'] = income_zipcodes['Median Family Income'].str.replace(',','')
income_zipcodes['Population'] = income_zipcodes['Population'].str.replace(',','')

In [1032]:
# Load school physical addresses
school_addresses = pd.read_excel('../raw_data/Public School Mailing Labels 2021-2022.xlsx', sheet_name='School Physical Address')

In [1033]:
# Load school mailing addresses

school_maddresses = pd.read_excel('../raw_data/Public School Mailing Labels 2021-2022.xlsx', sheet_name='School Mailing Address ')
school_maddresses.columns = school_maddresses.iloc[0]
school_maddresses = school_maddresses.drop(school_maddresses.index[0])

In [1034]:
# Rename columns
school_zipcodes = school_addresses[['SCHOOL_CODE', 'PHYSICAL_ZIPCODE']]
school_zipcodes = school_zipcodes.rename(columns={'SCHOOL_CODE': 'School Number', 'PHYSICAL_ZIPCODE':'Zip Code'})

In [1035]:
# Rename columns
school_mzipcodes = school_maddresses[['SCHOOL_CODE', 'MAILING_ZIPCODE']]
school_mzipcodes = school_mzipcodes.rename(columns={'SCHOOL_CODE': 'School Number', 'MAILING_ZIPCODE':'Zip Code'})

In [1036]:
# Change column dtypes
school_mzipcodes = school_mzipcodes.astype({'School Number': 'str', 'Zip Code': 'Int64'})
income_zipcodes = income_zipcodes.astype({'Median Family Income': 'int', 'Zip Code': 'Int64', 'Population':'int'})

In [1037]:
#school_mzipcodes.dtypes
#income_zipcodes.dtypes
#income_zipcodes.head()
#school_mzipcodes.head()

In [1038]:
# Merge income/zipcode df with school/zipcode df

school_inc_zip = school_mzipcodes.merge(income_zipcodes, on="Zip Code", how='left')
school_inc_zip = school_inc_zip[['School Number', 'Zip Code','Median Family Income','Population']]
#school_inc_zip.dtypes
#school_inc_zip

In [1039]:
#tests_combined_2018_agg
tests_combined_2018_agg_inc = tests_combined_2018_agg.merge(school_inc_zip, how='left', on='School Number')
#tests_combined_2018_agg_inc.head()

In [1040]:
school_disagg_2018_df_inc = school_disagg_2018_df.merge(school_inc_zip, how='left', on='School Number')
#school_disagg_2018_df_inc.head()

In [1041]:
schools_SAT_2018_multicat_inc = schools_SAT_2018_multicat_combined.merge(school_inc_zip, how='left', on='School Number')
#schools_SAT_2018_multicat_inc.head()

In [1042]:
# Check for NaN
schools_SAT_2018_multicat_inc[schools_SAT_2018_multicat_inc['Median Family Income'].isna()]

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW,Zip Code,Median Family Income,Population
135,8001.0,charter school institute,75.0,animas high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,537.142857,596.666667,81302.0,,
136,8001.0,charter school institute,75.0,animas high school,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,527.894737,557.894737,81302.0,,
256,880.0,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474,,,
257,880.0,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263,,,
542,,moffat county re,2020.0,no 1,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,488.333333,524.52381,,,
543,,moffat county re,2020.0,no 1,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,467.5,476.785714,,,
546,2180.0,montrose county re-1j,6058.0,montrose high school,Female,Hispanic,FRL Eligible,Not English Learners,Students without IEPs,459.047619,470.47619,81402.0,,
547,2180.0,montrose county re-1j,6058.0,montrose high school,Female,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,469.473684,518.947368,81402.0,,
548,2180.0,montrose county re-1j,6058.0,montrose high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,540.0,562.075472,81402.0,,
549,2180.0,montrose county re-1j,6058.0,montrose high school,Male,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,515.0,506.875,81402.0,,


Several schools missing income & population data

In [1043]:
schools_SAT_2018_multicat_inc[schools_SAT_2018_multicat_inc['Zip Code'].isna()]

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score Math,Mean Score EBRW,Zip Code,Median Family Income,Population
256,880.0,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474,,,
257,880.0,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263,,,
542,,moffat county re,2020.0,no 1,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,488.333333,524.52381,,,
543,,moffat county re,2020.0,no 1,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,467.5,476.785714,,,


Four schools have mizzing zip code

In [1044]:
# Export data to csv

tests_combined_2018_agg_inc.to_csv('../data/SAT_PSAT_aggregated.csv')
school_disagg_2018_df_inc.to_csv('../data/SAT_2018_single_categories.csv')
schools_SAT_2018_multicat_inc.to_csv('../data/SAT_2018_multicategories.csv')