In [11]:
# import files
import pandas as pd
import numpy as np

enrollment_path = './data/enrollmentbygrade/raw/'

years = ['2020-21', '2019-20', '2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12', '2010-11', '2009-10']

# read every column as text: district codes keep their leading zeros and counts keep
# their thousands separators until they are parsed further down
bygrade_columns = ['District Name', 'District Code', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'SP', 'Total']
bygrade_dtypes = {column: str for column in bygrade_columns}

# load one workbook per school year (first spreadsheet row skipped), tagging rows with the year
bygrade_frames = []
for year in years:
    temp = pd.read_excel(enrollment_path + 'ma_enrollmentbygrade_' + year + '.xlsx', dtype=bygrade_dtypes, skiprows=[0])
    temp['year'] = year
    bygrade_frames.append(temp)

# combine with a single concat: DataFrame.append was removed in pandas 2.0 and re-copied
# the accumulated frame on every iteration; sort_index keeps the columns alphabetical
# even when every yearly file has an identical layout (concat only sorts unaligned columns)
df_bygrade = pd.concat(bygrade_frames, ignore_index=True, sort=True).sort_index(axis=1)

print(df_bygrade.shape)
print(df_bygrade.head(5))

(4839, 19)
          1        10        11        12         2         3         4  \
0       116        99        98        86       120       118       117   
1       168       160       153       152       161       144       162   
2         0        63        63        58         0         0         0   
3       321       465       450       426       365       372       404   
4        83         0         0         0       117        95        80   

          5         6         7         8         9 District Code  \
0       115       117       114       111        97      04450000   
1       140       169       161       169       146      00010000   
2        53        72        73        76        82      04120000   
3       390       414       410       421       406      06000000   
4       108       104       100       109         0      00030000   

                                       District Name         K        PK  \
0       Abby Kelley Foster Charter Public (Dist

In [12]:
# read every column as text so district codes keep their leading zeros
racegender_columns = ['District Name', 'District Code', 'African American', 'Asian', 'Hispanic', 'White', 'Native American', 'Native Hawaiian, Pacific Islander', 'Multi-Race, Non-Hispanic', 'Males', 'Females']
racegender_dtypes = {column: str for column in racegender_columns}

# load one workbook per school year (first spreadsheet row skipped), tagging rows with the year
racegender_frames = []
for year in years:
    temp = pd.read_excel('./data/enrollmentbyracegender/raw/ma_enrollmentbyracegender_' + year + '.xlsx', dtype=racegender_dtypes, skiprows=[0])
    temp['year'] = year
    racegender_frames.append(temp)

# combine with a single concat: DataFrame.append was removed in pandas 2.0 and re-copied
# the accumulated frame on every iteration; sort_index keeps the columns alphabetical
# even when every yearly file has an identical layout (concat only sorts unaligned columns)
df_byracegender = pd.concat(racegender_frames, ignore_index=True, sort=True).sort_index(axis=1)

print(df_byracegender.shape)
print(df_byracegender.head(5))

(4839, 13)
  African American   Asian District Code  \
0             54.1     3.4      04450000   
1              4.8     2.5      00010000   
2             62.8     0.7      04120000   
3              3.1    33.7      06000000   
4              0.9     0.9      00030000   

                                       District Name Females Hispanic   Males  \
0       Abby Kelley Foster Charter Public (District)    54.3     19.4    45.7   
1                                           Abington    49.6     10.9    50.4   
2  Academy Of the Pacific Rim Charter Public (Dis...    48.5     28.3    51.5   
3                                   Acton-Boxborough    48.0      6.2    51.9   
4                                           Acushnet    46.8      4.4    53.2   

  Multi-Race, Non-Hispanic Native American Native Hawaiian, Pacific Islander  \
0                      5.1             0.6                               0.0   
1                      1.9             0.6                               0.1 

In [13]:
# attach each district's yearly Total (taken from the by-grade table) to the race/gender rows;
# the shape is printed before and after so a change in row count is easy to spot
print(df_byracegender.shape)
totals_by_district_year = df_bygrade[['year', 'District Code', 'Total']]
df_byracegender = pd.merge(
    df_byracegender,
    totals_by_district_year,
    how='inner',
    on=['year', 'District Code'],
)
print(df_byracegender.shape)
print(df_byracegender.head(5))

(4839, 13)
(4839, 14)
  African American   Asian District Code  \
0             54.1     3.4      04450000   
1              4.8     2.5      00010000   
2             62.8     0.7      04120000   
3              3.1    33.7      06000000   
4              0.9     0.9      00030000   

                                       District Name Females Hispanic   Males  \
0       Abby Kelley Foster Charter Public (District)    54.3     19.4    45.7   
1                                           Abington    49.6     10.9    50.4   
2  Academy Of the Pacific Rim Charter Public (Dis...    48.5     28.3    51.5   
3                                   Acton-Boxborough    48.0      6.2    51.9   
4                                           Acushnet    46.8      4.4    53.2   

  Multi-Race, Non-Hispanic Native American Native Hawaiian, Pacific Islander  \
0                      5.1             0.6                               0.0   
1                      1.9             0.6                        

In [14]:
# each selected population is reported as a count column ("<name> #") and a percent column ("<name> %")
special_groups = [
    'First Language Not English',
    'English Language Learner',
    'Students With Disabilities',
    'Low Income',
    'Free Lunch',
    'Reduced Lunch',
    'High Needs',
    'Economically Disadvantaged',
]

# read every column as text; building the mapping from the group list covers both the
# '#' and '%' column of every group (the hand-written dict listed 'High Needs #' twice
# and never 'High Needs %')
byspecial_dtypes = {'District Name': str, 'District Code': str}
for group in special_groups:
    byspecial_dtypes[group + ' #'] = str
    byspecial_dtypes[group + ' %'] = str

# load one workbook per school year (first spreadsheet row skipped), tagging rows with the year
byspecial_frames = []
for year in years:
    temp = pd.read_excel('./data/enrollmentbyselectedpopulation/raw/ma_enrollmentbyselectedpopulation_' + year + '.xlsx', dtype=byspecial_dtypes, skiprows=[0])
    temp['year'] = year
    byspecial_frames.append(temp)

# combine with a single concat (DataFrame.append was removed in pandas 2.0); columns are
# sorted alphabetically before the rename, as they were when frames were appended one by one
df_byspecial = pd.concat(byspecial_frames, ignore_index=True, sort=True).sort_index(axis=1)

# the count columns lose their ' #' suffix; the percent columns keep their ' %' suffix
df_byspecial = df_byspecial.rename(columns={group + ' #': group for group in special_groups})
print(df_byspecial.shape)
print(df_byspecial.head(5))

(4839, 19)
  District Code                                      District Name  \
0      04450000       Abby Kelley Foster Charter Public (District)   
1      00010000                                           Abington   
2      04120000  Academy Of the Pacific Rim Charter Public (Dis...   
3      06000000                                   Acton-Boxborough   
4      00030000                                           Acushnet   

  Economically Disadvantaged Economically Disadvantaged %  \
0                        684                           48   
1                        628                         29.7   
2                        260                         48.1   
3                        422                          8.1   
4                        258                         28.4   

  English Language Learner English Language Learner %  \
0                      195                       13.7   
1                      162                        7.7   
2                       34    

In [15]:
# reshape the by-grade table to long form: one row per (year, district, grade column),
# with the column label stored in 'grade' and its value in 'num'
id_vars = ['year', 'District Code', 'District Name']
value_vars = [
    '1', '10', '11', '12',
    '2', '3', '4', '5', '6', '7', '8', '9',
    'K', 'PK', 'SP', 'Total',
]
df_bygrade = df_bygrade.melt(id_vars=id_vars, value_vars=value_vars, var_name='grade', value_name='num')
# every by-grade row describes the whole student body
df_bygrade = df_bygrade.assign(group_state='All Students')
print(df_bygrade.head(5))

      year District Code                                      District Name  \
0  2020-21      04450000       Abby Kelley Foster Charter Public (District)   
1  2020-21      00010000                                           Abington   
2  2020-21      04120000  Academy Of the Pacific Rim Charter Public (Dis...   
3  2020-21      06000000                                   Acton-Boxborough   
4  2020-21      00030000                                           Acushnet   

  grade       num   group_state  
0     1       116  All Students  
1     1       168  All Students  
2     1         0  All Students  
3     1       321  All Students  
4     1        83  All Students  


In [16]:
# convert num to float, then drop nulls and zeros
def toFloat(row, column):
    """Parse ``row[column]`` into a float, tolerating thousands separators.

    Parameters:
        row: mapping-like record (a DataFrame row or a dict) indexed by column label.
        column: label of the field to parse.

    Returns:
        The parsed value as a float. Text has its commas removed first ("1,425" -> 1425.0);
        values that are already numeric are passed through ``float``. ``np.nan`` is
        returned for anything that cannot be parsed (missing values, None, arbitrary text).

    Raises:
        KeyError: if ``column`` is not present in ``row``; a mistyped column name is
            reported instead of silently producing a column of NaN.
    """
    value = row[column]
    if isinstance(value, str):
        value = value.replace(',', '')
    try:
        return float(value)
    except (TypeError, ValueError):
        # only parse failures are treated as missing; anything else propagates
        return np.nan
# parse the text counts row by row, then report the shape after each filtering step
df_bygrade['num'] = df_bygrade.apply(toFloat, axis=1, args=('num',))
print(df_bygrade.shape)
# discard rows whose count is exactly zero
df_bygrade = df_bygrade[df_bygrade['num'].ne(0)]
print(df_bygrade.shape)
# discard rows whose count could not be parsed
df_bygrade = df_bygrade[df_bygrade['num'].notna()]
print(df_bygrade.shape)

(77424, 6)
(57600, 6)
(57600, 6)


In [17]:
# reshape the race/gender table to long form: one row per (year, district, group),
# carrying the district Total along so counts can be derived from the percentages
id_vars = ['year', 'District Code', 'District Name', 'Total']
value_vars = [
    'African American',
    'Asian',
    'Females',
    'Hispanic',
    'Males',
    'Multi-Race, Non-Hispanic',
    'Native American',
    'Native Hawaiian, Pacific Islander',
    'White',
]
df_byracegender = df_byracegender.melt(id_vars=id_vars, value_vars=value_vars, var_name='group_state', value_name='percent')
# these figures are not broken out by grade
df_byracegender = df_byracegender.assign(grade='All')
print(df_byracegender.head(5))

      year District Code                                      District Name  \
0  2020-21      04450000       Abby Kelley Foster Charter Public (District)   
1  2020-21      00010000                                           Abington   
2  2020-21      04120000  Academy Of the Pacific Rim Charter Public (Dis...   
3  2020-21      06000000                                   Acton-Boxborough   
4  2020-21      00030000                                           Acushnet   

      Total       group_state percent grade  
0     1,425  African American    54.1   All  
1     2,117  African American     4.8   All  
2       540  African American    62.8   All  
3     5,207  African American     3.1   All  
4       910  African American     0.9   All  


In [18]:
# convert num to float, calculate num, then drop nulls and zeros
def toFloat(row, column):
    """Parse ``row[column]`` into a float, tolerating thousands separators.

    Parameters:
        row: mapping-like record (a DataFrame row or a dict) indexed by column label.
        column: label of the field to parse.

    Returns:
        The parsed value as a float. Text has its commas removed first ("1,425" -> 1425.0);
        values that are already numeric are passed through ``float``. ``np.nan`` is
        returned for anything that cannot be parsed (missing values, None, arbitrary text).

    Raises:
        KeyError: if ``column`` is not present in ``row``; a mistyped column name is
            reported instead of silently producing a column of NaN.
    """
    value = row[column]
    if isinstance(value, str):
        value = value.replace(',', '')
    try:
        return float(value)
    except (TypeError, ValueError):
        # only parse failures are treated as missing; anything else propagates
        return np.nan
# parse the percentage and the district total from text, one column at a time
for text_column in ('percent', 'Total'):
    df_byracegender[text_column] = df_byracegender.apply(toFloat, axis=1, args=(text_column,))
# derive a head count from each group's share of the district total
df_byracegender['num'] = df_byracegender['percent'].mul(df_byracegender['Total']).div(100)
# the inputs of the calculation are no longer needed
df_byracegender = df_byracegender.drop(columns=['percent', 'Total'])
print(df_byracegender.shape)
# discard rows whose derived count is exactly zero
df_byracegender = df_byracegender[df_byracegender['num'].ne(0)]
print(df_byracegender.shape)
# discard rows whose count could not be computed
df_byracegender = df_byracegender[df_byracegender['num'].notna()]
print(df_byracegender.shape)

(43551, 6)
(38469, 6)
(38469, 6)


In [19]:
# reshape the selected-population counts to long form: one row per (year, district, group);
# only the count columns are kept, the '%' columns are left behind
id_vars = ['year', 'District Code', 'District Name']
value_vars = [
    'Economically Disadvantaged',
    'English Language Learner',
    'First Language Not English',
    'Free Lunch',
    'High Needs',
    'Low Income',
    'Reduced Lunch',
    'Students With Disabilities',
]
df_byspecial = df_byspecial.melt(id_vars=id_vars, value_vars=value_vars, var_name='group_state', value_name='num')
# these figures are not broken out by grade
df_byspecial = df_byspecial.assign(grade='All')
print(df_byspecial.head(5))

      year District Code                                      District Name  \
0  2020-21      04450000       Abby Kelley Foster Charter Public (District)   
1  2020-21      00010000                                           Abington   
2  2020-21      04120000  Academy Of the Pacific Rim Charter Public (Dis...   
3  2020-21      06000000                                   Acton-Boxborough   
4  2020-21      00030000                                           Acushnet   

                  group_state  num grade  
0  Economically Disadvantaged  684   All  
1  Economically Disadvantaged  628   All  
2  Economically Disadvantaged  260   All  
3  Economically Disadvantaged  422   All  
4  Economically Disadvantaged  258   All  


In [20]:
# convert num to float, then drop nulls and zeros
def toFloat(row, column):
    """Parse ``row[column]`` into a float, tolerating thousands separators.

    Parameters:
        row: mapping-like record (a DataFrame row or a dict) indexed by column label.
        column: label of the field to parse.

    Returns:
        The parsed value as a float. Text has its commas removed first ("1,425" -> 1425.0);
        values that are already numeric are passed through ``float``. ``np.nan`` is
        returned for anything that cannot be parsed (missing values, None, arbitrary text).

    Raises:
        KeyError: if ``column`` is not present in ``row``; a mistyped column name is
            reported instead of silently producing a column of NaN.
    """
    value = row[column]
    if isinstance(value, str):
        value = value.replace(',', '')
    try:
        return float(value)
    except (TypeError, ValueError):
        # only parse failures are treated as missing; anything else propagates
        return np.nan
# parse the text counts row by row, then report the shape after each filtering step
df_byspecial['num'] = df_byspecial.apply(toFloat, axis=1, args=('num',))
print(df_byspecial.shape)
# discard rows whose count is exactly zero
df_byspecial = df_byspecial[df_byspecial['num'].ne(0)]
print(df_byspecial.shape)
# discard rows whose count is missing or could not be parsed
df_byspecial = df_byspecial[df_byspecial['num'].notna()]
print(df_byspecial.shape)

(38712, 6)
(37645, 6)
(25932, 6)


In [21]:
# append all data
# stack the three long tables in one concat (DataFrame.append was removed in pandas 2.0);
# columns are put in alphabetical order before the rename, matching the appended layout
df = pd.concat([df_bygrade, df_byracegender, df_byspecial], ignore_index=True, sort=True).sort_index(axis=1)
df = df.rename(columns={'District Name': 'district'})
print(df.columns)
print(df.shape)
# drop the statewide summary rows, which appear under two spellings
for state_label in ('State Total', 'State Totals'):
    df = df[df['district'].ne(state_label)]
    print(df.shape)
# drop every row whose district name mentions Horace Mann; the match is a literal
# substring, and na=False keeps rows with a missing name instead of failing on '~'
df = df[~df['district'].str.contains('Horace Mann', na=False, regex=False)]
print(df.shape)

Index(['District Code', 'district', 'grade', 'group_state', 'num', 'year'], dtype='object')
(122001, 6)
(121939, 6)
(121634, 6)
(120785, 6)


In [22]:
# add manual data changes
# district codes removed by hand; they are filtered out one at a time so the shape
# printed after each step shows how many rows that code accounted for
excluded_district_codes = [
    '04070000',
    '04110000',
    '04230000',
    '04270000',
    '04520000',
    '04710000',
    '04770000',
    '04800000',
    '35050000',
    '35110000',
]
print(df.shape)
for district_code in excluded_district_codes:
    df = df[df['District Code'].ne(district_code)]
    print(df.shape)

(120785, 6)
(120609, 6)
(120609, 6)
(120609, 6)
(120609, 6)
(120591, 6)
(120505, 6)
(120505, 6)
(120324, 6)
(120130, 6)
(120035, 6)


In [45]:
# keep only the first four characters of the district code as district_id
# (vectorized string slice instead of a Python-level apply over every row)
df = df.assign(district_id=df['District Code'].str[:4]).drop(columns=['District Code'])
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0600
3    0003
4    0005
Name: district_id, dtype: object


In [46]:
# relabel the 'Total' grade as 'All'; every other grade label is left untouched
df['grade'] = df['grade'].replace('Total', 'All')

In [47]:
print(df.dtypes)
# counts derived from percentages are fractional (54.1% of 1,425 is 770.925); round to the
# nearest whole student first, because a bare integer cast truncates toward zero
df['num'] = df['num'].round().astype('int64')
df['district_id'] = df['district_id'].astype('int64')
print(df.dtypes)

district        object
grade           object
group_state     object
num            float64
year            object
district_id     object
dtype: object
district       object
grade          object
group_state    object
num             int64
year           object
district_id     int64
dtype: object


In [None]:
# use most recent district name per district_id
# the descending sort puts each district's latest year first, so drop_duplicates
# keeps the name that goes with that year
df_leas = (
    df[['year', 'district_id', 'district']]
    .sort_values(['year', 'district_id', 'district'], ascending=False)
    .drop_duplicates(subset=['district_id'])
    .drop(columns=['year'])
)

# replace the per-row names with the single name chosen for each district_id
df = df.drop(columns=['district'])
print(df.shape)
df = pd.merge(df, df_leas, on=['district_id'])
print(df.shape)

In [48]:
# build the charter flag lookup: every distinct charter LEA code listed for fy21
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
is_fy21 = df_flags['fy'] == 'fy21'
df_flags = (
    df_flags.loc[is_fy21, ['charter_lea_code']]
    .drop_duplicates()
    .rename(columns={'charter_lea_code': 'district_id'})
    .assign(charter_flag=1)
)

# left join: districts without a match keep their rows and get a missing charter_flag
print(df.shape)
df = pd.merge(df, df_flags, how='left', on=['district_id'])
print(df.shape)
print(df.head(5))

(121634, 6)
(121634, 7)
                                       district grade   group_state  num  \
0  Abby Kelley Foster Charter Public (District)     1  All Students  116   
1                                      Abington     1  All Students  168   
2                              Acton-Boxborough     1  All Students  321   
3                                      Acushnet     1  All Students   83   
4                                        Agawam     1  All Students  255   

      year  district_id  charter_flag  
0  2020-21          445           1.0  
1  2020-21            1           NaN  
2  2020-21          600           NaN  
3  2020-21            3           NaN  
4  2020-21            5           NaN  


In [49]:
# collapse the school-year label to its ending year, e.g. '2020-21' becomes '2021'
print(df['year'].drop_duplicates())
df['year'] = '20' + df['year'].str[-2:]
print(df['year'].drop_duplicates())

0       2020-21
313     2019-20
627     2018-19
943     2017-18
1261    2016-17
1578    2015-16
1895    2014-15
2208    2013-14
2520    2012-13
2829    2011-12
3135    2010-11
3441    2009-10
Name: year, dtype: object
0       2021
313     2020
627     2019
943     2018
1261    2017
1578    2016
1895    2015
2208    2014
2520    2013
2829    2012
3135    2011
3441    2010
Name: year, dtype: object


In [None]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))
# district_id is the LEA ID divided by 10000, kept as an integer; integer division on the
# whole column replaces the row-by-row apply, and assign/drop build a new frame rather
# than adding a column to a column subset of the original
df_reg = (
    df_reg
    .assign(district_id=(df_reg['Charter LEA ID'] // 10000).astype('int64'))
    .drop(columns=['Charter LEA ID', 'Charter LEA Name'])
)

# left join keeps every enrollment row; the shapes are printed to compare row counts
print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

In [50]:
# join charter to district data for view 2
# start from the all-students, all-grades rows and split them by charter flag
all_students_all_grades = df[(df['group_state'] == 'All Students') & (df['grade'] == 'All')]
df_charter = all_students_all_grades[all_students_all_grades['charter_flag'] == 1]
df_district = all_students_all_grades[all_students_all_grades['charter_flag'] != 1]

# year is read as text so it can be matched against the text year in the enrollment data
df_2 = pd.read_csv('./data/finalized/charter_to_district.csv', dtype={'year': str})
print(df_2.shape)
# outer joins keep rows that have no counterpart on either side
df_2 = pd.merge(
    df_2,
    df_charter,
    how='outer',
    left_on=['year', 'charter_lea_code'],
    right_on=['year', 'district_id'],
)
print(df_2.shape)
df_2 = pd.merge(
    df_2,
    df_district,
    how='outer',
    left_on=['year', 'sending_lea_code'],
    right_on=['year', 'district_id'],
    suffixes=('_charter', '_district'),
)
print(df_2.shape)

(3513, 7)
(4006, 13)
(7066, 19)


In [51]:
# create enrollment file with multipliers for views 3 & 4
# NOTE: no multiplier column is computed here; the draft of that calculation was left disabled
df_mult = pd.read_csv('./data/finalized/charter_to_district.csv')
print(df_mult.dtypes)
print(df_mult.shape)

# filter to just fy21, since we're missing many years in the data
df_mult = df_mult[df_mult['fy'] == 'fy21']
print(df_mult.shape)
# drop the file's own year column; the year used from here on comes from the enrollment rows
df_mult = df_mult.drop(columns=['year'])

# split the enrollment rows by charter flag (a missing flag counts as non-charter)
is_charter = df['charter_flag'] == 1
df_charter = df[is_charter]
df_district = df[~is_charter]

# first attach the charter's own enrollment rows, matched on the charter LEA code
df_mult = pd.merge(
    df_mult,
    df_charter,
    how='left',
    left_on=['charter_lea_code'],
    right_on=['district_id'],
)
print(df_mult.shape)
# then attach the sending district's rows for the same year, grade and group;
# the outer join keeps rows without a counterpart on either side
df_mult = pd.merge(
    df_mult,
    df_district,
    how='outer',
    left_on=['sending_lea_code', 'year', 'grade', 'group_state'],
    right_on=['district_id', 'year', 'grade', 'group_state'],
    suffixes=('_charter', '_district'),
)
print(df_mult.shape)
print(df_mult.head(5))

fy                            object
year                           int64
charter_lea_code               int64
sending_lea_code               int64
enrolled_n                     int64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(3513, 7)
(966, 7)
(239645, 13)
(271979, 17)
     fy  charter_lea_code  sending_lea_code  enrolled_n  \
0  fy21             409.0               3.0         2.0   
1  fy21             409.0               3.0         2.0   
2  fy21             409.0               3.0         2.0   
3  fy21             409.0               3.0         2.0   
4  fy21             409.0               3.0         2.0   

  physical_charter_location  chartered_to_serve  \
0                       NaN                 NaN   
1                       NaN                 NaN   
2                       NaN                 NaN   
3                       NaN                 NaN   
4                       NaN                 NaN   

                    

In [52]:
# review grade/group/year breakdown of data
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')

In [53]:
# write the combined long-format enrollment table to disk, without the index column
enrollment_csv = './data/finalized/enrollment.csv'
df.to_csv(enrollment_csv, index=False)

In [54]:
# write the charter-to-district join (view 2) to disk, without the index column
enrollment2_csv = './data/finalized/enrollment2.csv'
df_2.to_csv(enrollment2_csv, index=False)

In [55]:
# write the fy21 charter/sending-district table (views 3 & 4) to disk, without the index column
enrollment_with_multipliers_csv = './data/finalized/enrollment_with_multipliers.csv'
df_mult.to_csv(enrollment_with_multipliers_csv, index=False)