In [1]:
# import files
import pandas as pd
import numpy as np

path = './data/graduation/raw/'

years = ['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010']
gradrates = ['4yr Grad', '4yr Grad Adjusted', '5yr Grad', '5yr Grad Adjusted']
groups = ['All Students', 'Female', 'Male', 'High Needs', 'English Learner', 'Low Income', 'Students with disabilities', 'African AmericanBlack', 'American Indian or Alaskan Native', 'Asian', 'Hispanic or Latino', 'Multi-race, non-Hispanic or Latino', 'Native Hawaiian or Pacific Islander', 'White']

df = pd.DataFrame()

for year in years:
    for gradrate in gradrates:
        for group in groups:
            temp = pd.read_excel(path + 'ma_graduation_' + gradrate + '_' + group + '_'+ year + '.xlsx', dtype={'District Name':str,'District Code':str,'# in Cohort':np.float64,'% Graduated':np.float64,'% Still in School':np.float64,'% Non-Grad Completers':np.float64,'% H.S. Equiv.':np.float64,'% Dropped Out':np.float64,'% Permanently Excluded':np.float64},skiprows=[0], thousands=',')
            temp['year'] = year
            temp['metric'] = gradrate
            temp['group_state'] = group
            df = df.append(temp, ignore_index=True, sort=True)

df = df.rename(columns={'District Name': 'district'})
        
print(df.shape)
print(df.head(5))

(94597, 12)
   # in Cohort  % Dropped Out  % Graduated  % H.S. Equiv.  \
0         84.0            2.4         97.6            0.0   
1        122.0            6.6         89.3            0.0   
2         40.0            0.0         95.0            0.0   
3        503.0            0.8         96.6            0.2   
4         88.0            4.5         89.8            2.3   

   % Non-Grad Completers  % Permanently Excluded  % Still in School  \
0                    0.0                     0.0                0.0   
1                    0.8                     0.0                3.3   
2                    0.0                     0.0                5.0   
3                    0.0                     0.0                2.4   
4                    1.1                     0.0                2.3   

  District Code                                           district  \
0      04450000       Abby Kelley Foster Charter Public (District)   
1      00010000                                       

In [2]:
# add manual data changes
print(df.shape)
df = df[df['District Code'] != '04070000']
print(df.shape)
df = df[df['District Code'] != '04110000']
print(df.shape)
df = df[df['District Code'] != '04230000']
print(df.shape)
df = df[df['District Code'] != '04270000']
print(df.shape)
df = df[df['District Code'] != '04520000']
print(df.shape)
df = df[df['District Code'] != '04710000']
print(df.shape)
df = df[df['District Code'] != '04770000']
print(df.shape)
df = df[df['District Code'] != '04800000']
print(df.shape)
df = df[df['District Code'] != '35050000']
print(df.shape)
df = df[df['District Code'] != '35110000']
print(df.shape)

(94597, 12)
(94597, 12)
(94363, 12)
(94363, 12)
(94363, 12)
(94060, 12)
(93950, 12)
(93950, 12)
(93950, 12)
(93950, 12)
(93950, 12)


In [3]:
# remove district totals & Horace Mann
print(df.shape)
df = df[df['District Code'] != '00000000']
print(df.shape)
df = df[~df['district'].str.contains('Horace Mann')]
print(df.shape)

(93950, 12)
(93686, 12)
(93686, 12)


In [4]:
# drop rows with null values
print(df.shape)
df = df.dropna(subset=['# in Cohort', '% Dropped Out', '% Graduated', '% H.S. Equiv.',
       '% Non-Grad Completers', '% Permanently Excluded', '% Still in School'], how='all')
print(df.shape)

(93686, 12)
(93686, 12)


In [5]:
print(df.columns)

Index(['# in Cohort', '% Dropped Out', '% Graduated', '% H.S. Equiv.',
       '% Non-Grad Completers', '% Permanently Excluded', '% Still in School',
       'District Code', 'district', 'group_state', 'metric', 'year'],
      dtype='object')


In [6]:
# reshape data from long to wide
print(df.shape)
df1 = df[df.metric.eq('4yr Grad')]
del df1['metric']
print(df1.shape)
df2 = df[df.metric.eq('4yr Grad Adjusted')]
del df2['metric']
print(df2.shape)
df3 = df[df.metric.eq('5yr Grad')]
del df3['metric']
print(df3.shape)
df4 = df[df.metric.eq('5yr Grad Adjusted')]
del df4['metric']
print(df4.shape)

df_new = df1.merge(df2, on=['district', 'group_state', 'year', 'District Code'], how = 'outer', suffixes=('', '4yr_adjusted')).merge(df3, on=['district', 'group_state', 'year', 'District Code'], how = 'outer', suffixes=('', '5yr')).merge(df4, on=['district', 'group_state', 'year', 'District Code'], how = 'outer', suffixes=('', '5yr_adjusted'))
print(df_new.shape)
print(df_new.columns)
del df
df = df_new
del df_new

(93686, 12)
(25436, 11)
(23851, 11)
(22907, 11)
(21492, 11)
(25719, 32)
Index(['# in Cohort', '% Dropped Out', '% Graduated', '% H.S. Equiv.',
       '% Non-Grad Completers', '% Permanently Excluded', '% Still in School',
       'District Code', 'district', 'group_state', 'year',
       '# in Cohort4yr_adjusted', '% Dropped Out4yr_adjusted',
       '% Graduated4yr_adjusted', '% H.S. Equiv.4yr_adjusted',
       '% Non-Grad Completers4yr_adjusted',
       '% Permanently Excluded4yr_adjusted', '% Still in School4yr_adjusted',
       '# in Cohort5yr', '% Dropped Out5yr', '% Graduated5yr',
       '% H.S. Equiv.5yr', '% Non-Grad Completers5yr',
       '% Permanently Excluded5yr', '% Still in School5yr',
       '# in Cohort5yr_adjusted', '% Dropped Out5yr_adjusted',
       '% Graduated5yr_adjusted', '% H.S. Equiv.5yr_adjusted',
       '% Non-Grad Completers5yr_adjusted',
       '% Permanently Excluded5yr_adjusted', '% Still in School5yr_adjusted'],
      dtype='object')


In [7]:
# remove last four digits of school code
df['district_id'] = df.apply(lambda x: x['District Code'][:4], axis = 1)
del df['District Code']
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0412
3    0600
4    0603
Name: district_id, dtype: object


In [8]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype('int64')
print(df.dtypes)

# in Cohort                           float64
% Dropped Out                         float64
% Graduated                           float64
% H.S. Equiv.                         float64
% Non-Grad Completers                 float64
% Permanently Excluded                float64
% Still in School                     float64
district                               object
group_state                            object
year                                   object
# in Cohort4yr_adjusted               float64
% Dropped Out4yr_adjusted             float64
% Graduated4yr_adjusted               float64
% H.S. Equiv.4yr_adjusted             float64
% Non-Grad Completers4yr_adjusted     float64
% Permanently Excluded4yr_adjusted    float64
% Still in School4yr_adjusted         float64
# in Cohort5yr                        float64
% Dropped Out5yr                      float64
% Graduated5yr                        float64
% H.S. Equiv.5yr                      float64
% Non-Grad Completers5yr          

In [9]:
# use most recent district name per district_id
df_leas = df[['year', 'district_id', 'district']]
df_leas = df_leas.sort_values(['year', 'district_id', 'district'], ascending = False)
df_leas = df_leas.drop_duplicates(subset=['district_id'])
del df_leas['year']

# join back to df
del df['district']
print(df.shape)
df = df.merge(df_leas, on = ['district_id'])
print(df.shape)

(25719, 31)
(25719, 32)


In [10]:
# import file for charter flag
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
df_flags = df_flags[df_flags.fy.eq('fy21')]

df_flags = df_flags[['charter_lea_code']].drop_duplicates()
df_flags = df_flags.rename(columns={'charter_lea_code': 'district_id'})
df_flags['charter_flag'] = 1

print(df.shape)
df = df.merge(df_flags, on = ['district_id'], how = 'left')
del df_flags
print(df.shape)
print(df.head(5))

(25719, 32)
(25719, 33)
   # in Cohort  % Dropped Out  % Graduated  % H.S. Equiv.  \
0         84.0            2.4         97.6            0.0   
1         35.0            0.0        100.0            0.0   
2         49.0            4.1         95.9            0.0   
3         44.0            2.3         97.7            0.0   
4         40.0            2.5         97.5            0.0   

   % Non-Grad Completers  % Permanently Excluded  % Still in School  \
0                    0.0                     0.0                0.0   
1                    0.0                     0.0                0.0   
2                    0.0                     0.0                0.0   
3                    0.0                     0.0                0.0   
4                    0.0                     0.0                0.0   

    group_state  year  # in Cohort4yr_adjusted  ...  # in Cohort5yr_adjusted  \
0  All Students  2019                     84.0  ...                      NaN   
1        Female  2019 

In [11]:
# edit year field
# print(df['year'].drop_duplicates())
# df['year'] = ['20' + x[-2:] for x in df['year']]
print(df['year'].drop_duplicates())

0     2019
9     2018
19    2017
28    2016
37    2015
47    2014
57    2013
66    2012
75    2011
83    2010
Name: year, dtype: object


In [12]:
print(df.columns)

Index(['# in Cohort', '% Dropped Out', '% Graduated', '% H.S. Equiv.',
       '% Non-Grad Completers', '% Permanently Excluded', '% Still in School',
       'group_state', 'year', '# in Cohort4yr_adjusted',
       '% Dropped Out4yr_adjusted', '% Graduated4yr_adjusted',
       '% H.S. Equiv.4yr_adjusted', '% Non-Grad Completers4yr_adjusted',
       '% Permanently Excluded4yr_adjusted', '% Still in School4yr_adjusted',
       '# in Cohort5yr', '% Dropped Out5yr', '% Graduated5yr',
       '% H.S. Equiv.5yr', '% Non-Grad Completers5yr',
       '% Permanently Excluded5yr', '% Still in School5yr',
       '# in Cohort5yr_adjusted', '% Dropped Out5yr_adjusted',
       '% Graduated5yr_adjusted', '% H.S. Equiv.5yr_adjusted',
       '% Non-Grad Completers5yr_adjusted',
       '% Permanently Excluded5yr_adjusted', '% Still in School5yr_adjusted',
       'district_id', 'district', 'charter_flag'],
      dtype='object')


In [13]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))
df_reg['district_id'] = df_reg.apply(lambda x: int(x['Charter LEA ID'] / 10000), axis = 1)
del df_reg['Charter LEA ID']
del df_reg['Charter LEA Name']

print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

   Charter LEA ID                                   Charter LEA Name  \
0         4070405  Dudley Street Neighborhood Charter School (Dis...   
1         4090205  Alma del Mar Charter School (District) - Alma ...   
2         4100205  Excel Academy Charter (District) - Excel Acade...   
3         4110305  Boston Green Academy Horace Mann Charter Schoo...   
4         4120530  Academy Of the Pacific Rim Charter Public (Dis...   

  Regional Affiliation/s Geographic Location/s  
0                 Boston                Boston  
1                Gateway          Southeastern  
2                 Boston                Boston  
3                 Boston                Boston  
4                 Boston                Boston  
(25719, 33)
(26468, 35)


In [14]:
# join charter_to_district data for calculating multipliers
df_mult = pd.read_csv('./data/finalized/charter_to_district.csv')
print(df_mult.dtypes)
# df_totals = df_mult.groupby(['year', 'charter_lea_code'], as_index = False)['enrolled_n'].sum()
# df_totals = df_totals.rename(columns={'enrolled_n':'total_enrolled'})
# df_mult = df_mult.merge(df_totals, on=['year', 'sending_lea_code'])
print(df_mult.shape)
# df_mult['multiplier'] = df_mult['enrolled_n'] / df_mult['total_enrolled']

# filter to just fy21, since we're missing many years in the data
df_mult = df_mult[df_mult.fy.eq('fy21')]
print(df_mult.shape)
del df_mult['year']

df_charter = df[df['charter_flag'] == 1]
df_district = df[df['charter_flag'] != 1]

df_mult = df_mult.merge(df_charter, left_on=['charter_lea_code'], right_on = ['district_id'], how='left')
print(df_mult.shape)
df_mult = df_mult.merge(df_district, left_on=['sending_lea_code', 'year', 'group_state'], right_on = ['district_id', 'year','group_state'], how='outer', suffixes=['_charter', '_district'])
print(df_mult.shape)
print(df_mult.head(5))

fy                            object
year                           int64
charter_lea_code               int64
sending_lea_code               int64
enrolled_n                     int64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(3513, 7)
(966, 7)
(50732, 41)
(58840, 74)
     fy  charter_lea_code  sending_lea_code  enrolled_n  \
0  fy21             409.0               3.0         2.0   
1  fy21             409.0              72.0         1.0   
2  fy21            3509.0              72.0         2.0   
3  fy21             409.0              94.0         3.0   
4  fy21             409.0             201.0       643.0   

  physical_charter_location  chartered_to_serve  # in Cohort_charter  \
0                       NaN                 NaN                  NaN   
1                       NaN                 NaN                  NaN   
2                       NaN                 NaN                  NaN   
3                       NaN              

In [15]:
# review grade/group/year breakdown of data
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')

In [16]:
# export grad dataset for QA
df.to_csv('./data/finalized/graduation.csv')

In [17]:
# export final dataset
df_mult.to_csv('./data/finalized/graduation_with_multipliers.csv', index=False)