In [1]:
# import files
import pandas as pd
import numpy as np

# Folder holding the raw attendance workbooks; the loop below reads one file
# per (school year, student group) pair.
path = './data/studentattendancereport/raw/'

years = ['2019-20', '2018-19', '2017-18']
groups = ['All Students', 'Female', 'Male', 'Economically Disadvantaged', 'High Needs', 'English learner', 'Low Income', 'Students with disabilities', 'African AmericanBlack', 'American Indian or Alaskan Native', 'Asian', 'Hispanic or Latino', 'Multi-race, non-Hispanic or Latino', 'Native Hawaiian or Pacific Islander', 'White']

# District name/code are read as text (keeps codes such as '04450000' intact);
# every metric column is read as float64.
column_types = {
    'District Name': str,
    'District Code': str,
    'Attendance Rate': np.float64,
    'Average # of Absences': np.float64,
    'Absent 10 or more days': np.float64,
    'Chronically Absent (10% or more)': np.float64,
    'Unexcused > 9 days': np.float64,
}

# Collect the per-file frames and concatenate once. DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copies the whole
# accumulated frame on every iteration.
frames = []
for year in years:
    for group in groups:
        temp = pd.read_excel(
            path + 'ma_studentattendancereport_' + group + '_' + year + '.xlsx',
            dtype=column_types,
            skiprows=[0],  # the first row of each sheet is skipped
        )
        # tag each row with the file it came from
        temp['year'] = year
        temp['group_state'] = group
        frames.append(temp)

# sort_index(axis=1) reproduces the alphabetical column order that
# append(..., sort=True) used to give, regardless of how concat aligns columns.
df = pd.concat(frames, ignore_index=True).sort_index(axis=1)

df = df.rename(columns={'District Name': 'district'})

print(df.shape)
print(df.head(5))

(15874, 9)
   Absent 10 or more days  Attendance Rate  Average # of Absences  \
0                     5.7             97.1                    3.1   
1                    17.9             94.5                    6.1   
2                    20.2             94.6                    5.9   
3                     9.1             96.2                    4.1   
4                    11.5             95.8                    4.8   

   Chronically Absent (10% or more) District Code  \
0                               3.5      04450000   
1                              12.7      00010000   
2                              14.9      04120000   
3                               7.6      06000000   
4                               7.3      00030000   

                                            district  Unexcused > 9 days  \
0       Abby Kelley Foster Charter Public (District)                 5.7   
1                                           Abington                 5.8   
2  Academy Of the Pacific R

In [3]:
# add manual data changes: drop every row belonging to the district codes
# listed below, printing the frame's shape before the first removal and again
# after each one.
# NOTE(review): the recorded output of the following cell starts from the
# pre-removal shape, so these removals may not have been in effect when the
# later cells last ran - confirm with Restart & Run All.
excluded_codes = (
    '04070000',
    '04110000',
    '04230000',
    '04270000',
    '04520000',
    '04710000',
    '04770000',
    '04800000',
    '35050000',
    '35110000',
)

print(df.shape)
for code in excluded_codes:
    df = df.loc[df['District Code'].ne(code)]
    print(df.shape)

(15874, 9)
(15835, 9)
(15794, 9)
(15794, 9)
(15768, 9)
(15729, 9)
(15729, 9)
(15717, 9)
(15677, 9)
(15635, 9)
(15599, 9)


In [12]:
# remove district totals & Horace Mann
print(df.shape)
# rows carrying the all-zero code are the ones dropped as totals
df = df[df['District Code'] != '00000000']
print(df.shape)
# na=False makes a missing district name evaluate to False (row kept) instead
# of NaN, which would break the `~` inversion; regex=False matches the text
# literally rather than as a regular expression.
df = df[~df['district'].str.contains('Horace Mann', na=False, regex=False)]
print(df.shape)

(15874, 9)
(15832, 9)
(15714, 9)


In [13]:
# reshape data: go from one column per metric to one row per
# (year, district, group, metric) with the number in 'value'
id_vars = ['year', 'District Code', 'district', 'group_state']
value_vars = [
    'Absent 10 or more days',
    'Attendance Rate',
    'Average # of Absences',
    'Chronically Absent (10% or more)',
    'Unexcused > 9 days',
]
df = df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='metric',
    value_name='value',
)
print(df.head(5))

      year District Code                                           district  \
0  2019-20      04450000       Abby Kelley Foster Charter Public (District)   
1  2019-20      00010000                                           Abington   
2  2019-20      04120000  Academy Of the Pacific Rim Charter Public (Dis...   
3  2019-20      06000000                                   Acton-Boxborough   
4  2019-20      00030000                                           Acushnet   

    group_state                  metric  value  
0  All Students  Absent 10 or more days    5.7  
1  All Students  Absent 10 or more days   17.9  
2  All Students  Absent 10 or more days   20.2  
3  All Students  Absent 10 or more days    9.1  
4  All Students  Absent 10 or more days   11.5  


In [14]:
# drop rows with null values: keep only rows whose 'value' is present
print(df.shape)
# .copy() hands the next cells an independent frame rather than a filtered slice
df = df.loc[df['value'].notna()].copy()
print(df.shape)

(78570, 6)
(69765, 6)


In [15]:
# keep only the first four characters of the district code
# (e.g. '04450000' -> '0445') as district_id, then drop the full code
# The vectorised .str slice replaces a row-by-row apply(axis=1): same result
# for string codes, without building a Series per row.
df['district_id'] = df['District Code'].str[:4]
df = df.drop(columns=['District Code'])
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0412
3    0600
4    0003
Name: district_id, dtype: object


In [18]:
# convert district_id from text to int64, showing the dtypes before and after
print(df.dtypes)
df = df.astype({'district_id': 'int64'})
print(df.dtypes)

year            object
district        object
group_state     object
metric          object
value          float64
district_id     object
dtype: object
year            object
district        object
group_state     object
metric          object
value          float64
district_id      int64
dtype: object


In [None]:
# use most recent district name per district_id
# Sorting everything in descending order puts each district_id's latest year
# first; drop_duplicates then keeps that first row per district_id.
df_leas = (
    df[['year', 'district_id', 'district']]
    .sort_values(by=['year', 'district_id', 'district'], ascending=False)
    .drop_duplicates(subset=['district_id'], keep='first')
    .drop(columns=['year'])
)

# join back to df: swap the per-row district name for the one chosen above
df = df.drop(columns=['district'])
print(df.shape)
df = pd.merge(df, df_leas, on='district_id')
print(df.shape)

In [19]:
# import file for charter flag
# Every distinct charter_lea_code among the fy21 rows becomes a district_id
# with charter_flag = 1.
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
df_flags = (
    df_flags.loc[df_flags['fy'] == 'fy21', ['charter_lea_code']]
    .drop_duplicates()
    .rename(columns={'charter_lea_code': 'district_id'})
    .assign(charter_flag=1)
)

# left join: district_ids with no match get NaN in charter_flag
print(df.shape)
df = pd.merge(df, df_flags, how='left', on='district_id')
del df_flags
print(df.shape)
print(df.head(5))

(69765, 6)
(69765, 7)
      year                                           district   group_state  \
0  2019-20       Abby Kelley Foster Charter Public (District)  All Students   
1  2019-20                                           Abington  All Students   
2  2019-20  Academy Of the Pacific Rim Charter Public (Dis...  All Students   
3  2019-20                                   Acton-Boxborough  All Students   
4  2019-20                                           Acushnet  All Students   

                   metric  value  district_id  charter_flag  
0  Absent 10 or more days    5.7          445           1.0  
1  Absent 10 or more days   17.9            1           NaN  
2  Absent 10 or more days   20.2          412           1.0  
3  Absent 10 or more days    9.1          600           NaN  
4  Absent 10 or more days   11.5            3           NaN  


In [21]:
# edit year field: '20' + the last two characters, e.g. '2019-20' -> '2020'
# (the result is still a string)
print(df['year'].drop_duplicates())
df['year'] = '20' + df['year'].str[-2:]
print(df['year'].drop_duplicates())

0       2019-20
4637    2018-19
9308    2017-18
Name: year, dtype: object
0       2020
4637    2019
9308    2018
Name: year, dtype: object


In [None]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))

# district_id is the Charter LEA ID divided by 10000 and truncated to an
# integer. It is computed on the whole column at once instead of row by row,
# and built with assign/drop so no column is written onto a column-subset slice.
df_reg = (
    df_reg
    .assign(district_id=(df_reg['Charter LEA ID'] / 10000).astype('int64'))
    .drop(columns=['Charter LEA ID', 'Charter LEA Name'])
)

# left join keeps every row of df; the shapes printed before and after show
# whether the join added rows
print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

In [23]:
# join charter_to_district data for calculating multipliers
# NOTE: no multiplier is computed in this cell; an earlier draft
# (enrolled_n / per-charter total enrolled) was left disabled.
df_mult = pd.read_csv('./data/finalized/charter_to_district.csv')
print(df_mult.dtypes)
print(df_mult.shape)

# filter to just fy21, since we're missing many years in the data
df_mult = df_mult[df_mult['fy'] == 'fy21']
print(df_mult.shape)
# drop this file's own 'year' column; the 'year' used below comes from df
df_mult = df_mult.drop(columns=['year'])

# split the attendance rows into charter rows and all other rows
is_charter = df['charter_flag'] == 1
df_charter = df[is_charter]
df_district = df[~is_charter]

# attach each charter's attendance rows to its charter_to_district rows
df_mult = pd.merge(
    df_mult,
    df_charter,
    how='left',
    left_on='charter_lea_code',
    right_on='district_id',
)
print(df_mult.shape)

# line up the sending district's rows for the same year, group and metric;
# the outer join also keeps rows found on only one side
df_mult = pd.merge(
    df_mult,
    df_district,
    how='outer',
    left_on=['sending_lea_code', 'year', 'group_state', 'metric'],
    right_on=['district_id', 'year', 'group_state', 'metric'],
    suffixes=('_charter', '_district'),
)
print(df_mult.shape)
print(df_mult.head(5))

fy                            object
year                           int64
charter_lea_code               int64
sending_lea_code               int64
enrolled_n                     int64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(3513, 7)
(966, 7)
(162095, 13)
(176945, 17)
     fy  charter_lea_code  sending_lea_code  enrolled_n  \
0  fy21             409.0               3.0         2.0   
1  fy21             496.0               3.0         2.0   
2  fy21             409.0               3.0         2.0   
3  fy21             496.0               3.0         2.0   
4  fy21             409.0               3.0         2.0   

  physical_charter_location  chartered_to_serve  year  \
0                       NaN                 NaN  2020   
1                       NaN                 NaN  2020   
2                       NaN                 NaN  2020   
3                       NaN                 NaN  2020   
4                       NaN               

In [52]:
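# review grade/group/year breakdown of data
# NOTE: left disabled - the frame built in this notebook has no 'grade' or 'num' column,
# so the line below would fail if uncommented
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')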

In [24]:
# export attendance dataset for QA
# index=True is pandas' default, spelled out here: this file also gets the
# row index as an unnamed first column
qa_export_path = './data/finalized/attendance.csv'
df.to_csv(qa_export_path, index=True)

In [25]:
# export final dataset (written without the row index)
final_export_path = './data/finalized/attendance_with_multipliers.csv'
df_mult.to_csv(final_export_path, index=False)