# SAT & PSAT score data wrangling

# Import packages & data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

In [2]:
# Filenames & paths

filepath = '../raw_data/'
file_2018_multicat = '2018 SAT results by subgroups.xlsx'
file_2017_PSAT_multicat = '2017 PSAT results by subgroups.xlsx'

In [3]:
df_2018_multicat_raw = pd.read_excel(filepath + file_2018_multicat)
df_2017_PSAT_multicat_raw = pd.read_excel(filepath + file_2017_PSAT_multicat)

In [4]:
df_2018_multicat_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17910 entries, 0 to 17909
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   13 non-null     object
 1   Unnamed: 1   490 non-null    object
 2   Unnamed: 2   480 non-null    object
 3   Unnamed: 3   959 non-null    object
 4   Unnamed: 4   2020 non-null   object
 5   Unnamed: 5   7073 non-null   object
 6   Unnamed: 6   11065 non-null  object
 7   Unnamed: 7   13943 non-null  object
 8   Unnamed: 8   17896 non-null  object
 9   Unnamed: 9   17893 non-null  object
 10  Unnamed: 10  17893 non-null  object
 11  Unnamed: 11  17896 non-null  object
 12  Unnamed: 12  17893 non-null  object
dtypes: object(13)
memory usage: 1.8+ MB


In [5]:
df_2017_PSAT_multicat_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18298 entries, 0 to 18297
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   13 non-null     object
 1   Unnamed: 1   493 non-null    object
 2   Unnamed: 2   483 non-null    object
 3   Unnamed: 3   965 non-null    object
 4   Unnamed: 4   1918 non-null   object
 5   Unnamed: 5   6915 non-null   object
 6   Unnamed: 6   10865 non-null  object
 7   Unnamed: 7   14057 non-null  object
 8   Unnamed: 8   18284 non-null  object
 9   Unnamed: 9   18281 non-null  object
 10  Unnamed: 10  18281 non-null  object
 11  Unnamed: 11  18284 non-null  object
 12  Unnamed: 12  18281 non-null  object
dtypes: object(13)
memory usage: 1.8+ MB


# Cleaning dataframe after csv import

**Scores are arranged by multiple demographic categories**

In [6]:
# SAT data
# Delete header rows & set column headers
df_2018_multicat = df_2018_multicat_raw.drop(df_2018_multicat_raw.index[0:17])
df_2018_multicat.columns = df_2018_multicat.iloc[0]
df_2018_multicat = df_2018_multicat.drop(df_2018_multicat.index[0])
df_2018_multicat.columns.name = 'index'
df_2018_multicat = df_2018_multicat.reset_index().drop(['index'], axis=1)

# Drop unnecessary columns
df_2018_multicat = df_2018_multicat.drop(['Academic Year','Standard Deviation','% Participation','Score Count','Test Name'], axis=1)

In [7]:
# PSAT data
# Delete header rows & set column headers
df_2017_PSAT_multicat = df_2017_PSAT_multicat_raw.drop(df_2017_PSAT_multicat_raw.index[0:17])
df_2017_PSAT_multicat.columns = df_2017_PSAT_multicat.iloc[0]
df_2017_PSAT_multicat = df_2017_PSAT_multicat.drop(df_2017_PSAT_multicat.index[0])
df_2017_PSAT_multicat.columns.name = 'index'
df_2017_PSAT_multicat = df_2017_PSAT_multicat.reset_index().drop(['index'], axis=1)

# Drop unnecessary columns
df_2017_PSAT_multicat = df_2017_PSAT_multicat.drop(['Academic Year','Standard Deviation','% Participation','Score Count','Test Name'], axis=1)

In [8]:
# Forward fill missing data.  This works because of the format of the original xlsx document
ffill_cols = ['State/District/School',
 'Subject',
 'Gender',
 'Ethnicity',
 'Free and Reduced Lunch',
 'English Language Learners',
 'IEP']

df_2018_multicat[ffill_cols] = df_2018_multicat[ffill_cols].fillna(method='ffill') 
df_2017_PSAT_multicat[ffill_cols] = df_2017_PSAT_multicat[ffill_cols].fillna(method='ffill') 

In [9]:
# Drop empty scores
schools_2018_multicat = df_2018_multicat[df_2018_multicat['Mean Scale Score'] != '-']
schools_2017_PSAT_multicat = df_2017_PSAT_multicat[df_2017_PSAT_multicat['Mean Scale Score'] != '-']

In [10]:
# Split 'State/District/School' col into 'District Name' and 'School Name' columns
district_and_name = schools_2018_multicat['State/District/School'].str.split(':', expand=True).iloc[:,[0,1]];
schools_2018_multicat[['District Name', 'School Name']] = district_and_name;

district_and_name = schools_2017_PSAT_multicat['State/District/School'].str.split(':', expand=True).iloc[:,[0,1]];
schools_2017_PSAT_multicat[['District Name', 'School Name']] = district_and_name;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2018_multicat[['District Name', 'School Name']] = district_and_name;
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2018_multicat[['District Name', 'School Name']] = district_and_name;
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_2017_PSAT_multicat[['District Name', 'Schoo

In [11]:
# Drop 'State/District/School' column
schools_2018_multicat = schools_2018_multicat.drop(['State/District/School'], axis=1, errors='ignore')
schools_2017_PSAT_multicat = schools_2017_PSAT_multicat.drop(['State/District/School'], axis=1, errors='ignore')

#Reorder columns with district and name as first columns
schools_2018_multicat = schools_2018_multicat[['District Name',
       'School Name', 'Subject', 'Gender', 'Ethnicity', 'Free and Reduced Lunch',
       'English Language Learners', 'IEP', 'Mean Scale Score']]
schools_2017_PSAT_multicat = schools_2017_PSAT_multicat[['District Name',
       'School Name', 'Subject', 'Gender', 'Ethnicity', 'Free and Reduced Lunch',
       'English Language Learners', 'IEP', 'Mean Scale Score']]

In [12]:
# Create school and district number columns
schools_2018_multicat['District Number'] = schools_2018_multicat['District Name'].str.extract('.*\((.*)\).*')
schools_2018_multicat['School Number'] = schools_2018_multicat['School Name'].str.extract('.*\((.*)\).*')

schools_2017_PSAT_multicat['District Number'] = schools_2017_PSAT_multicat['District Name'].str.extract('.*\((.*)\).*')
schools_2017_PSAT_multicat['School Number'] = schools_2017_PSAT_multicat['School Name'].str.extract('.*\((.*)\).*')

In [13]:
# Strip School and District names of numbers
schools_2018_multicat['District Name'] = schools_2018_multicat['District Name'].str.replace('\(.*$', '')
schools_2018_multicat['School Name'] = schools_2018_multicat['School Name'].str.replace('\(.*$', '')

schools_2017_PSAT_multicat['District Name'] = schools_2017_PSAT_multicat['District Name'].str.replace('\(.*$', '')
schools_2017_PSAT_multicat['School Name'] = schools_2017_PSAT_multicat['School Name'].str.replace('\(.*$', '')

  schools_2018_multicat['District Name'] = schools_2018_multicat['District Name'].str.replace('\(.*$', '')
  schools_2018_multicat['School Name'] = schools_2018_multicat['School Name'].str.replace('\(.*$', '')
  schools_2017_PSAT_multicat['District Name'] = schools_2017_PSAT_multicat['District Name'].str.replace('\(.*$', '')
  schools_2017_PSAT_multicat['School Name'] = schools_2017_PSAT_multicat['School Name'].str.replace('\(.*$', '')


In [14]:
# Set school & district names lowercase

schools_2018_multicat['District Name'] = schools_2018_multicat.loc[:,'District Name'].str.lower()
schools_2018_multicat['School Name'] = schools_2018_multicat.loc[:,'School Name'].str.lower()

schools_2017_PSAT_multicat['District Name'] = schools_2017_PSAT_multicat.loc[:,'District Name'].str.lower()
schools_2017_PSAT_multicat['School Name'] = schools_2017_PSAT_multicat.loc[:,'School Name'].str.lower()

In [15]:
# Strip whitespace
schools_2018_multicat['District Name'] = schools_2018_multicat.loc[:,'District Name'].str.strip()
schools_2018_multicat['School Name'] = schools_2018_multicat.loc[:,'School Name'].str.strip()

schools_2017_PSAT_multicat['District Name'] = schools_2017_PSAT_multicat.loc[:,'District Name'].str.strip()
schools_2017_PSAT_multicat['School Name'] = schools_2017_PSAT_multicat.loc[:,'School Name'].str.strip()

In [16]:
cols = list(schools_2018_multicat.columns)
new_cols = ['District Name',
             'School Name',
             'Subject',
             'Gend',
             'Ethn',
             'FRL',
             'ELL',
             'IEP',
             'Mean Score',
             'District Number',
             'School Number']

schools_2018_multicat.columns = new_cols
schools_2017_PSAT_multicat.columns = new_cols

In [17]:
# Reorder columns
schools_2018_multicat = schools_2018_multicat[['District Number',
                       'District Name',
                       'School Number',
                       'School Name',
                       'Subject',
                       'Gend',
                       'Ethn',
                       'FRL',
                       'ELL',
                       'IEP',
                       'Mean Score']]

schools_2017_PSAT_multicat = schools_2017_PSAT_multicat[['District Number',
                       'District Name',
                       'School Number',
                       'School Name',
                       'Subject',
                       'Gend',
                       'Ethn',
                       'FRL',
                       'ELL',
                       'IEP',
                       'Mean Score']]

In [18]:
# Create new dataframe containing both scores in a single line for a given demographic group

schools_2018_multicat_math = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Math']
schools_2018_multicat_math = schools_2018_multicat_math.rename(columns={'Mean Score':'Mean Score SAT Math'})
schools_2018_multicat_EBRW = schools_2018_multicat.loc[schools_2018_multicat['Subject'] == 'Evidence-Based Reading And Writing']
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.rename(columns={'Mean Score':'Mean Score SAT EBRW'})
schools_2018_multicat_EBRW = schools_2018_multicat_EBRW.reset_index()

schools_SAT_2018_multicat_combined = schools_2018_multicat_math.reset_index()
schools_SAT_2018_multicat_combined['Mean Score SAT EBRW'] = schools_2018_multicat_EBRW['Mean Score SAT EBRW']

schools_SAT_2018_multicat_combined = schools_SAT_2018_multicat_combined.drop(['index','Subject'],axis=1)


# Now for PSAT data
schools_2017_PSAT_multicat_math = schools_2017_PSAT_multicat.loc[schools_2017_PSAT_multicat['Subject'] == 'Math']
schools_2017_PSAT_multicat_math = schools_2017_PSAT_multicat_math.rename(columns={'Mean Score':'Mean Score PSAT Math'})
schools_2017_PSAT_multicat_EBRW = schools_2017_PSAT_multicat.loc[schools_2017_PSAT_multicat['Subject'] == 'Evidence-Based Reading And Writing']
schools_2017_PSAT_multicat_EBRW = schools_2017_PSAT_multicat_EBRW.rename(columns={'Mean Score':'Mean Score PSAT EBRW'})
schools_2017_PSAT_multicat_EBRW = schools_2017_PSAT_multicat_EBRW.reset_index()

schools_2017_PSAT_multicat_combined = schools_2017_PSAT_multicat_math.reset_index()
schools_2017_PSAT_multicat_combined['Mean Score PSAT EBRW'] = schools_2017_PSAT_multicat_EBRW['Mean Score PSAT EBRW']

schools_2017_PSAT_multicat_combined = schools_2017_PSAT_multicat_combined.drop(['index','Subject'],axis=1)

### Check formatting of each df

In [19]:
schools_SAT_2018_multicat_combined.head()

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score SAT Math,Mean Score SAT EBRW
0,1040,academy 20,110,academy online,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,505.882353,560.588235
1,1040,academy 20,76,air academy high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,537.757009,562.056075
2,1040,academy 20,76,air academy high school,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,574.725275,564.395604
3,1040,academy 20,2195,discovery canyon campus high school,Female,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,503.529412,517.058824
4,1040,academy 20,2195,discovery canyon campus high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,560.444444,579.222222


In [20]:
schools_2017_PSAT_multicat_combined.head()

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score PSAT Math,Mean Score PSAT EBRW
0,1040,academy 20,76,air academy high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,501.909091,535.272727
1,1040,academy 20,76,air academy high school,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,541.5625,524.583333
2,1040,academy 20,2195,discovery canyon campus high school,Female,White,Not FRL Eligible,Not English Learners,Students without IEPs,521.397849,542.580645
3,1040,academy 20,2195,discovery canyon campus high school,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,554.390244,541.219512
4,1040,academy 20,5126,liberty high school,Female,Hispanic,Not FRL Eligible,Not English Learners,Students without IEPs,440.0,466.470588


In [21]:
schools_SAT_2018_multicat_combined[schools_SAT_2018_multicat_combined['School Number'].isna()]

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score SAT Math,Mean Score SAT EBRW
256,880,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474
257,880,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263


In [22]:
schools_SAT_2018_multicat_combined[schools_SAT_2018_multicat_combined['School Name'] == 'dsst']

Unnamed: 0,District Number,District Name,School Number,School Name,Gend,Ethn,FRL,ELL,IEP,Mean Score SAT Math,Mean Score SAT EBRW
256,880,denver county 1,,dsst,Male,Hispanic,FRL Eligible,English Learners,Students without IEPs,546.842105,525.789474
257,880,denver county 1,,dsst,Male,White,Not FRL Eligible,Not English Learners,Students without IEPs,692.631579,672.105263


# Income data & zip code data

In [23]:
# Get median family income by zip code

url = 'http://www.usa.com/rank/colorado-state--median-family-income--zip-code-rank.htm'

html = requests.get(url).content
df_list = pd.read_html(html)
income_zipcodes = df_list[-1]
income_zipcodes.columns = income_zipcodes.iloc[0]
income_zipcodes = income_zipcodes.drop(income_zipcodes.index[0])
income_zipcodes = income_zipcodes.drop(['Rank'], axis=1, errors='ignore')

In [24]:
# Split 'State/District/School' col into 'District Name' and 'School Name' columns
zip_pop = income_zipcodes['Zip / Population'].str.split('/', expand=True)
income_zipcodes[['Zip Code', 'Population']] = zip_pop;
income_zipcodes = income_zipcodes.drop(['Zip / Population'], axis=1, errors='ignore')

In [25]:
# Rename silly column
income_zipcodes = income_zipcodes.rename(columns={'Median Family Income ▼':'Median Family Income'})

In [26]:
# Remove non-numeric characters
income_zipcodes['Median Family Income'] = income_zipcodes['Median Family Income'].str.strip('$')
income_zipcodes['Median Family Income'] = income_zipcodes['Median Family Income'].str.replace(',','')
income_zipcodes['Population'] = income_zipcodes['Population'].str.replace(',','')

In [27]:
# Load school physical addresses from spreadsheet

school_addresses = pd.read_excel('../raw_data/Public School Mailing Labels 2021-2022.xlsx', sheet_name='School Physical Address')

In [28]:
# Load school mailing addresses

school_maddresses = pd.read_excel('../raw_data/Public School Mailing Labels 2021-2022.xlsx', sheet_name='School Mailing Address ')
school_maddresses.columns = school_maddresses.iloc[0]
school_maddresses = school_maddresses.drop(school_maddresses.index[0])

In [29]:
# Rename columns
school_zipcodes = school_addresses[['SCHOOL_CODE', 'PHYSICAL_ZIPCODE']]
school_zipcodes = school_zipcodes.rename(columns={'SCHOOL_CODE': 'School Number', 'PHYSICAL_ZIPCODE':'Zip Code'})

In [30]:
# Rename columns
school_mzipcodes = school_maddresses[['SCHOOL_CODE', 'MAILING_ZIPCODE']]
school_mzipcodes = school_mzipcodes.rename(columns={'SCHOOL_CODE': 'School Number', 'MAILING_ZIPCODE':'Zip Code'})

In [31]:
# Change column dtypes
school_mzipcodes = school_mzipcodes.astype({'School Number': 'str', 'Zip Code': 'Int64'})
income_zipcodes = income_zipcodes.astype({'Median Family Income': 'int', 'Zip Code': 'Int64', 'Population':'int'})

In [32]:
# Merge income/zipcode df with school/zipcode df

school_inc_zip = school_mzipcodes.merge(income_zipcodes, on="Zip Code", how='left')
school_inc_zip = school_inc_zip[['School Number', 'Zip Code','Median Family Income','Population']]

In [33]:
schools_SAT_2018_multicat_inc = schools_SAT_2018_multicat_combined.merge(school_inc_zip, how='left', on='School Number')
schools_PSAT_2017_multicat_inc = schools_2017_PSAT_multicat_combined.merge(school_inc_zip, how='left', on='School Number')

# Final checks before export to csv

In [34]:
# Check for NaN
schools_SAT_2018_multicat_inc.isna().sum()

District Number          2
District Name            0
School Number            2
School Name              0
Gend                     0
Ethn                     0
FRL                      0
ELL                      0
IEP                      0
Mean Score SAT Math      0
Mean Score SAT EBRW      0
Zip Code                 4
Median Family Income    14
Population              14
dtype: int64

Only 14 schools have missing income & population data.  Dropping these since they are only 14 out of 710.

In [35]:
# Check for NaN
schools_PSAT_2017_multicat_inc.isna().sum()

District Number          2
District Name            0
School Number            5
School Name              0
Gend                     0
Ethn                     0
FRL                      0
ELL                      0
IEP                      0
Mean Score PSAT Math     0
Mean Score PSAT EBRW     0
Zip Code                 7
Median Family Income    19
Population              19
dtype: int64

In [36]:
schools_PSAT_2017_multicat_inc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 745 entries, 0 to 744
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   District Number       743 non-null    object 
 1   District Name         745 non-null    object 
 2   School Number         740 non-null    object 
 3   School Name           745 non-null    object 
 4   Gend                  745 non-null    object 
 5   Ethn                  745 non-null    object 
 6   FRL                   745 non-null    object 
 7   ELL                   745 non-null    object 
 8   IEP                   745 non-null    object 
 9   Mean Score PSAT Math  745 non-null    object 
 10  Mean Score PSAT EBRW  745 non-null    object 
 11  Zip Code              738 non-null    Int64  
 12  Median Family Income  726 non-null    float64
 13  Population            726 non-null    float64
dtypes: Int64(1), float64(2), object(11)
memory usage: 88.0+ KB


In [37]:
schools_SAT_2018_multicat_inc['Mean Score SAT EBRW'].describe()

count     710.0
unique    651.0
top       560.0
freq        6.0
Name: Mean Score SAT EBRW, dtype: float64

Only 14 schools have missing income & population data.  Dropping these since there are only 19.

In [38]:
schools_SAT_2018_multicat_inc.dropna(inplace=True)
schools_PSAT_2017_multicat_inc.dropna(inplace=True)

### Merge dataframes for csv export

In [39]:
# Generate uniqueID for each score for convenient merging.

schools_PSAT_2017_multicat_inc['UniqueID'] = schools_PSAT_2017_multicat_inc[['District Number', 'District Name', 'School Number', 'School Name',
       'Gend', 'Ethn', 'FRL', 'ELL', 'IEP']].astype(str).apply(lambda x: ', '.join(x), axis = 1)
schools_SAT_2018_multicat_inc['UniqueID'] = schools_SAT_2018_multicat_inc[['District Number', 'District Name', 'School Number', 'School Name',
       'Gend', 'Ethn', 'FRL', 'ELL', 'IEP']].astype(str).apply(lambda x: ', '.join(x), axis = 1)

In [40]:
schools_combined_multicat = pd.merge(schools_SAT_2018_multicat_inc, schools_PSAT_2017_multicat_inc[['Mean Score PSAT Math', 'Mean Score PSAT EBRW', 'UniqueID']], how='left', on='UniqueID')

schools_combined_multicat.drop('UniqueID', axis=1, inplace=True)

In [41]:
schools_combined_multicat.to_csv('../data/SAT_PSAT_multicategories_v2.csv')