In [30]:
# import files
import pandas as pd
import numpy as np

# Directory that holds the raw enrollment workbooks.
enrollment_path = './data/ma/enrollment/raw/'

# One workbook per report type exists for each of these years.
years = ['2013', '2014', '2015', '2016', '2017', '2018', '2019']

# Every listed column is read as text so that 'School Code' keeps its leading
# zeros; the count columns are converted to numbers in a later cell.
bygrade_columns = [
    'School Name', 'School Code',
    'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
    'SP', 'Total',
]
bygrade_dtypes = dict.fromkeys(bygrade_columns, str)

# Collect the yearly frames and concatenate once. DataFrame.append was removed
# in pandas 2.0 and copied the whole accumulated frame on every iteration.
bygrade_frames = []
for year in years:
    # skiprows=[0] skips the first sheet row
    # (presumably a title line above the header - confirm against the workbooks).
    temp = pd.read_excel(
        enrollment_path + 'enrollmentbygrade' + year + '.xlsx',
        dtype=bygrade_dtypes,
        skiprows=[0],
    )
    temp['year'] = year
    bygrade_frames.append(temp)

df_bygrade = pd.concat(bygrade_frames, ignore_index=True, sort=True)

print(df_bygrade.shape)

(12974, 19)


In [31]:
# Every listed column is read as text; the percentage columns are converted to
# numbers in a later cell.
racegender_columns = [
    'School Name', 'School Code',
    'African American', 'Asian', 'Hispanic', 'White', 'Native American',
    'Native Hawaiian, Pacific Islander', 'Multi-Race, Non-Hispanic',
    'Males', 'Females',
]
racegender_dtypes = dict.fromkeys(racegender_columns, str)

# Collect the yearly frames and concatenate once. DataFrame.append was removed
# in pandas 2.0 and copied the whole accumulated frame on every iteration.
racegender_frames = []
for year in years:
    # skiprows=[0] skips the first sheet row
    # (presumably a title line above the header - confirm against the workbooks).
    temp = pd.read_excel(
        enrollment_path + 'enrollmentbyracegender' + year + '.xlsx',
        dtype=racegender_dtypes,
        skiprows=[0],
    )
    temp['year'] = year
    racegender_frames.append(temp)

df_byracegender = pd.concat(racegender_frames, ignore_index=True, sort=True)

print(df_byracegender.shape)

(12978, 12)


In [32]:
# Attach the per-school, per-year 'Total' column from the by-grade data to the
# race/gender rows. The join is an inner join, so race/gender rows without a
# matching (year, School Code) pair in df_bygrade are dropped.
merge_keys = ['year', 'School Code']
grade_totals = df_bygrade[merge_keys + ['Total']]

print(df_byracegender.shape)
df_byracegender = pd.merge(df_byracegender, grade_totals, on=merge_keys, how='inner')
print(df_byracegender.shape)

(12978, 12)
(12974, 13)


In [33]:
# Each selected-population group appears as a count column ('<group> #') and a
# percentage column ('<group> %') in the workbooks.
special_groups = [
    'First Language Not English',
    'English Language Learner',
    'Students With Disabilities',
    'Low Income',
    'Free Lunch',
    'Reduced Lunch',
    'High Needs',
    'Economically Disadvantaged',
]

# Read every listed column as text. Building the mapping from the group list
# guarantees both the '#' and the '%' column of each group are covered; the
# hand-written dict listed 'High Needs #' twice and never covered 'High Needs %'.
special_dtypes = {'School Name': str, 'School Code': str}
for group in special_groups:
    special_dtypes[group + ' #'] = str
    special_dtypes[group + ' %'] = str

# Collect the yearly frames and concatenate once. DataFrame.append was removed
# in pandas 2.0 and copied the whole accumulated frame on every iteration.
special_frames = []
for year in years:
    # skiprows=[0] skips the first sheet row
    # (presumably a title line above the header - confirm against the workbooks).
    temp = pd.read_excel(
        enrollment_path + 'selectedpopulations' + year + '.xlsx',
        dtype=special_dtypes,
        skiprows=[0],
    )
    temp['year'] = year
    special_frames.append(temp)

df_byspecial = pd.concat(special_frames, ignore_index=True, sort=True)

# Drop the ' #' suffix from the count columns; the '%' columns keep their names.
df_byspecial = df_byspecial.rename(
    columns={group + ' #': group for group in special_groups}
)
print(df_byspecial.shape)

(12978, 19)


In [34]:
# Reshape bygrade from wide (one column per grade) to long
# (one row per year / school / grade).
id_vars = ['year', 'School Code', 'School Name']
value_vars = [
    '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9',
    'K', 'PK', 'SP', 'Total',
]
df_bygrade = df_bygrade.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='grade',
    value_name='num',
)
# These rows are not broken down by student group.
df_bygrade = df_bygrade.assign(group_state='All Students')
print(df_bygrade.columns)

Index(['year', 'School Code', 'School Name', 'grade', 'num', 'group_state'], dtype='object')


In [35]:
# convert num to float, then drop nulls and zeros
def toFloat(row, column):
    """Return ``row[column]`` as a float, or NaN when it cannot be converted.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``column``, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in.
    column : hashable
        Key of the value to convert.

    Returns
    -------
    float
        The converted value, or ``np.nan`` if ``float()`` rejects it.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``row``.
    """
    # Look the value up outside the try block so a misspelled column name
    # surfaces as an error instead of silently producing a column of NaN.
    value = row[column]
    try:
        return float(value)
    except (TypeError, ValueError):
        # Only conversion failures (None, NaN-like objects, non-numeric text)
        # map to NaN; a bare except would also swallow KeyboardInterrupt.
        return np.nan
# Parse the text counts; values that cannot be parsed become NaN.
df_bygrade['num'] = df_bygrade.apply(toFloat, axis=1, args=('num',))
print(df_bygrade.shape)

# Remove rows whose count is exactly zero (NaN != 0, so NaN rows survive here).
is_nonzero = df_bygrade['num'].ne(0)
df_bygrade = df_bygrade.loc[is_nonzero]
print(df_bygrade.shape)

# Remove rows whose count is missing.
df_bygrade = df_bygrade[df_bygrade['num'].notna()]
print(df_bygrade.shape)

(207584, 6)
(80207, 6)
(79018, 6)


In [36]:
# Reshape byracegender from wide (one column per group) to long
# (one row per year / school / group); 'Total' is carried along on every row.
id_vars = ['year', 'School Code', 'School Name', 'Total']
value_vars = [
    'African American', 'Asian', 'Females', 'Hispanic', 'Males',
    'Multi-Race, Non-Hispanic', 'Native American',
    'Native Hawaiian, Pacific Islander', 'White',
]
df_byracegender = df_byracegender.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='group_state',
    value_name='percent',
)
# These rows are not broken down by grade.
df_byracegender = df_byracegender.assign(grade='All')
print(df_byracegender.columns)

Index(['year', 'School Code', 'School Name', 'Total', 'group_state', 'percent',
       'grade'],
      dtype='object')


In [38]:
# convert percent and Total to float, calculate num, then drop nulls and zeros
def toFloat(row, column):
    """Return ``row[column]`` as a float, or NaN when it cannot be converted.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``column``, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in.
    column : hashable
        Key of the value to convert.

    Returns
    -------
    float
        The converted value, or ``np.nan`` if ``float()`` rejects it.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``row``.
    """
    # Look the value up outside the try block so a misspelled column name
    # surfaces as an error instead of silently producing a column of NaN.
    value = row[column]
    try:
        return float(value)
    except (TypeError, ValueError):
        # Only conversion failures (None, NaN-like objects, non-numeric text)
        # map to NaN; a bare except would also swallow KeyboardInterrupt.
        return np.nan
# Parse the text percentages and totals; unparseable values become NaN.
for numeric_column in ('percent', 'Total'):
    df_byracegender[numeric_column] = df_byracegender.apply(
        toFloat, axis=1, args=(numeric_column,)
    )

# Head count for the group = share of the school's total enrollment.
df_byracegender['num'] = (
    df_byracegender['percent'].mul(df_byracegender['Total']).div(100)
)

# The inputs of the calculation are no longer needed.
df_byracegender = df_byracegender.drop(columns=['percent', 'Total'])
print(df_byracegender.shape)

# Remove rows whose count is exactly zero (NaN != 0, so NaN rows survive here).
df_byracegender = df_byracegender.loc[df_byracegender['num'].ne(0)]
print(df_byracegender.shape)

# Remove rows whose count is missing.
df_byracegender = df_byracegender[df_byracegender['num'].notna()]
print(df_byracegender.shape)

   year School Code                                        School Name  Total  \
0  2013    04450105  Abby Kelley Foster Charter Public (District) -...    NaN   
1  2013    00010505                           Abington - Abington High  502.0   
2  2013    00010003          Abington - Beaver Brook Elementary School  685.0   
3  2013    00010002                Abington - Center Elementary School  190.0   
4  2013    00010405                    Abington - Frolio Middle School  344.0   

        group_state  percent grade     num  
0  African American     35.6   All     NaN  
1  African American      2.6   All  13.052  
2  African American      1.5   All  10.275  
3  African American      1.1   All   2.090  
4  African American      1.5   All   5.160  
(116766, 6)
(99194, 6)
(89564, 6)


In [41]:
# Reshape byspecial from wide (one count column per group) to long
# (one row per year / school / group).
id_vars = ['year', 'School Code', 'School Name']
value_vars = [
    'Economically Disadvantaged', 'English Language Learner',
    'First Language Not English', 'Free Lunch', 'High Needs', 'Low Income',
    'Reduced Lunch', 'Students With Disabilities',
]
df_byspecial = df_byspecial.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='group_state',
    value_name='num',
)
# These rows are not broken down by grade.
df_byspecial = df_byspecial.assign(grade='All')
print(df_byspecial.columns)

Index(['year', 'School Code', 'School Name', 'group_state', 'num', 'grade'], dtype='object')


In [42]:
# convert num to float, then drop nulls and zeros
def toFloat(row, column):
    """Return ``row[column]`` as a float, or NaN when it cannot be converted.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``column``, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in.
    column : hashable
        Key of the value to convert.

    Returns
    -------
    float
        The converted value, or ``np.nan`` if ``float()`` rejects it.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``row``.
    """
    # Look the value up outside the try block so a misspelled column name
    # surfaces as an error instead of silently producing a column of NaN.
    value = row[column]
    try:
        return float(value)
    except (TypeError, ValueError):
        # Only conversion failures (None, NaN-like objects, non-numeric text)
        # map to NaN; a bare except would also swallow KeyboardInterrupt.
        return np.nan
# Parse the text counts; values that cannot be parsed become NaN.
df_byspecial['num'] = df_byspecial.apply(toFloat, axis=1, args=('num',))
print(df_byspecial.shape)

# Remove rows whose count is exactly zero (NaN != 0, so NaN rows survive here).
is_nonzero = df_byspecial['num'].ne(0)
df_byspecial = df_byspecial.loc[is_nonzero]
print(df_byspecial.shape)

# Remove rows whose count is missing.
df_byspecial = df_byspecial[df_byspecial['num'].notna()]
print(df_byspecial.shape)

(103824, 6)
(101323, 6)
(69786, 6)


In [44]:
# Stack the three long-format frames into one dataset.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0)
# and builds the result in one pass. sort=True orders the columns
# alphabetically, since the three frames list them in different orders.
df = pd.concat(
    [df_bygrade, df_byracegender, df_byspecial],
    ignore_index=True,
    sort=True,
)
print(df.columns)
print(df.shape)

Index(['School Code', 'School Name', 'grade', 'group_state', 'num', 'year'], dtype='object')
(238368, 6)


In [45]:
# split School Code and School Name into district/school ids and names
def splitSchool(row, type):
    """Extract the district or the school part of ``row['School Name']``.

    The name is split at the first occurrence of ``' - '``: the text before
    it is the district, the text after it is the school.

    Parameters
    ----------
    row : mapping-like
        Must provide a string under the key ``'School Name'``.
    type : str
        ``'district'`` for the part before the separator, ``'school'`` for
        the part after it.

    Returns
    -------
    str or None
        The requested part. If the name contains no ``' - '`` separator the
        full name is returned unchanged for either ``type``. ``None`` is
        returned for any other ``type`` value.
    """
    name = row['School Name']
    district_part, separator, school_part = name.partition(' - ')
    if not separator:
        # Slicing on str.find()'s -1 "not found" result would cut the last
        # character off the district and the first two off the school.
        district_part = school_part = name
    if type == 'school':
        return school_part
    if type == 'district':
        return district_part
    return None

# 'School Code' is split positionally: the first four characters become
# district_id and the remainder becomes school_id. Vectorized .str slicing
# replaces a Python-level call per row. The columns are assigned in the same
# order as before so the exported column order does not change.
school_code = df['School Code']
df['district_id'] = school_code.str[:4]
# 'School Name' is split into its district and school parts by splitSchool.
df['district'] = df.apply(splitSchool, axis=1, args=('district',))
df['school_id'] = school_code.str[4:]
df['school'] = df.apply(splitSchool, axis=1, args=('school',))

# The combined source columns are redundant once they have been split.
df = df.drop(columns=['School Code', 'School Name'])

In [50]:
print(df.dtypes)
# The race/gender counts were reconstructed as percent * Total / 100 and are
# therefore fractional. Round to the nearest whole student before casting:
# a bare astype('int64') truncates toward zero, so e.g. 9.98 would become 9.
df['num'] = df['num'].round().astype('int64')
print(df.dtypes)

grade           object
group_state     object
num            float64
year            object
district_id     object
district        object
school_id       object
school          object
dtype: object
grade          object
group_state    object
num             int64
year           object
district_id    object
district       object
school_id      object
school         object
dtype: object


In [51]:
# Write the combined enrollment table to disk, leaving out the row index.
df.to_csv(path_or_buf='./data/finalized/ma_enrollment.csv', index=False)