In [1]:
# import files
import pandas as pd
import numpy as np

path = './data/dropout/raw/'

years = ['2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12', '2010-11', '2009-10']
groups = ['All Students', 'Female', 'Male', 'Economically Disadvantaged', 'High Needs', 'English Learner', 'Low Income', 'Students with disabilities', 'African AmericanBlack', 'American Indian or Alaskan Native', 'Asian', 'Hispanic or Latino', 'Multi-race, non-Hispanic or Latino', 'Native Hawaiian or Pacific Islander', 'White']

df = pd.DataFrame()

for year in years:
    for group in groups:
        temp = pd.read_excel(path + 'ma_dropout_' + group + '_' + year + '.xlsx', dtype={'District Name':str,'District Code':str,'# Enrolled Grades 09 through 12':np.float64,'# Dropout All Grades':np.float64,'% Dropout All Grades':np.float64,'% Dropout Grade 09':np.float64,'% Dropout Grade 10':np.float64,'% Dropout Grade 11':np.float64,'% Dropout Grade 12':np.float64},skiprows=[0], thousands=',')
        temp['year'] = year
        temp['group_state'] = group
        df = df.append(temp, ignore_index=True, sort=True)

df = df.rename(columns={'District Name': 'district'})
        
print(df.shape)
print(df.head(5))

(38218, 11)
   # Dropout All Grades  # Enrolled Grades 09 through 12  \
0                   1.0                            375.0   
1                   3.0                            540.0   
2                   1.0                            226.0   
3                   8.0                           1834.0   
4                   5.0                            292.0   

   % Dropout All Grades  % Dropout Grade 09  % Dropout Grade 10  \
0                   0.3                 0.0                 0.0   
1                   0.6                 0.6                 0.0   
2                   0.4                 0.0                 1.4   
3                   0.4                 0.0                 0.5   
4                   1.7                 1.2                 0.0   

   % Dropout Grade 11  % Dropout Grade 12 District Code  \
0                 0.0                 1.2      04450000   
1                 0.8                 0.9      00010000   
2                 0.0                 0.0      

In [2]:
# add manual data changes
print(df.shape)

# remove district totals & Horace Mann
print(df.shape)
df = df[df['District Code'] != '00000000']
print(df.shape)
df = df[~df['district'].str.contains('Horace Mann')]
print(df.shape)

df.district = df.district.str.replace('MATCH', 'Match', regex=False)
print(df[df.district.str.contains('Match')].district.drop_duplicates())

(38218, 11)
(38218, 11)
(38078, 11)
(37858, 11)
148    Match Charter Public School (District)
Name: district, dtype: object


In [3]:
# drop rows with null values
print(df.shape)
df = df.dropna(subset=['# Dropout All Grades','# Enrolled Grades 09 through 12','% Dropout All Grades','% Dropout Grade 09','% Dropout Grade 10','% Dropout Grade 11','% Dropout Grade 12',], how='all')
print(df.shape)

(37858, 11)
(37858, 11)


In [4]:
print(df.columns)

Index(['# Dropout All Grades', '# Enrolled Grades 09 through 12',
       '% Dropout All Grades', '% Dropout Grade 09', '% Dropout Grade 10',
       '% Dropout Grade 11', '% Dropout Grade 12', 'District Code', 'district',
       'group_state', 'year'],
      dtype='object')


In [5]:
# remove last four digits of school code
df['district_id'] = df.apply(lambda x: x['District Code'][:4], axis = 1)
del df['District Code']
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0412
3    0600
4    0603
Name: district_id, dtype: object


In [6]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype('int64')
print(df.dtypes)

# Dropout All Grades               float64
# Enrolled Grades 09 through 12    float64
% Dropout All Grades               float64
% Dropout Grade 09                 float64
% Dropout Grade 10                 float64
% Dropout Grade 11                 float64
% Dropout Grade 12                 float64
district                            object
group_state                         object
year                                object
district_id                         object
dtype: object
# Dropout All Grades               float64
# Enrolled Grades 09 through 12    float64
% Dropout All Grades               float64
% Dropout Grade 09                 float64
% Dropout Grade 10                 float64
% Dropout Grade 11                 float64
% Dropout Grade 12                 float64
district                            object
group_state                         object
year                                object
district_id                          int64
dtype: object


In [7]:
# use most recent district name per district_id
df_leas = df[['year', 'district_id', 'district']]
df_leas = df_leas.sort_values(['year', 'district_id', 'district'], ascending = False)
df_leas = df_leas.drop_duplicates(subset=['district_id'])
del df_leas['year']

# join back to df
del df['district']
print(df.shape)
df = df.merge(df_leas, on = ['district_id'])
print(df.shape)

(37858, 10)
(37858, 11)


In [8]:
# import file for charter flag
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
df_flags = df_flags[df_flags.fy.eq('fy21')]
df_flags = df_flags[['charter_lea_code']].drop_duplicates()
df_flags = df_flags.rename(columns={'charter_lea_code': 'district_id'})
df_flags['charter_flag'] = 1

print(df.shape)
df = df.merge(df_flags, on = ['district_id'], how = 'left')
del df_flags
print(df.shape)
print(df.head(5))

(37858, 11)
(37858, 12)
   # Dropout All Grades  # Enrolled Grades 09 through 12  \
0                   1.0                            375.0   
1                   0.0                            196.0   
2                   1.0                            179.0   
3                   0.0                            110.0   
4                   0.0                            148.0   

   % Dropout All Grades  % Dropout Grade 09  % Dropout Grade 10  \
0                   0.3                 0.0                 0.0   
1                   0.0                 0.0                 0.0   
2                   0.6                 0.0                 0.0   
3                   0.0                 0.0                 0.0   
4                   0.0                 0.0                 0.0   

   % Dropout Grade 11  % Dropout Grade 12                 group_state  \
0                 0.0                 1.2                All Students   
1                 0.0                 0.0                      Fem

In [9]:
# edit year field
print(df['year'].drop_duplicates())
df['year'] = ['20' + x[-2:] for x in df['year']]
print(df['year'].drop_duplicates())

0      2018-19
13     2017-18
27     2016-17
40     2015-16
52     2014-15
63     2013-14
74     2012-13
86     2011-12
97     2010-11
108    2009-10
Name: year, dtype: object
0      2019
13     2018
27     2017
40     2016
52     2015
63     2014
74     2013
86     2012
97     2011
108    2010
Name: year, dtype: object


In [10]:
print(df.columns)

Index(['# Dropout All Grades', '# Enrolled Grades 09 through 12',
       '% Dropout All Grades', '% Dropout Grade 09', '% Dropout Grade 10',
       '% Dropout Grade 11', '% Dropout Grade 12', 'group_state', 'year',
       'district_id', 'district', 'charter_flag'],
      dtype='object')


In [11]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))
df_reg['district_id'] = df_reg.apply(lambda x: int(x['Charter LEA ID'] / 10000), axis = 1)
del df_reg['Charter LEA ID']
del df_reg['Charter LEA Name']

print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

   Charter LEA ID                                   Charter LEA Name  \
0         4070405  Dudley Street Neighborhood Charter School (Dis...   
1         4090205  Alma del Mar Charter School (District) - Alma ...   
2         4100205  Excel Academy Charter (District) - Excel Acade...   
3         4110305  Boston Green Academy Horace Mann Charter Schoo...   
4         4120530  Academy Of the Pacific Rim Charter Public (Dis...   

  Regional Affiliation/s Geographic Location/s  
0                 Boston                Boston  
1                Gateway          Southeastern  
2                 Boston                Boston  
3                 Boston                Boston  
4                 Boston                Boston  
(37858, 12)
(39042, 14)


In [12]:
print(df.columns)

Index(['# Dropout All Grades', '# Enrolled Grades 09 through 12',
       '% Dropout All Grades', '% Dropout Grade 09', '% Dropout Grade 10',
       '% Dropout Grade 11', '% Dropout Grade 12', 'group_state', 'year',
       'district_id', 'district', 'charter_flag', 'Regional Affiliation/s',
       'Geographic Location/s'],
      dtype='object')


In [13]:
# import charter_to_district data for calculating multipliers
df_mult = pd.read_csv('./data/finalized/charter_to_district_edited.csv')
print(df_mult.dtypes)
print(df_mult.shape)

charter_lea_code               int64
sending_lea_code             float64
enrolled_n                   float64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(1053, 5)


In [14]:
# import district_remainders data for calculating multipliers
df_enrollment = pd.read_csv('./data/finalized/district_remainders.csv')
print(df_enrollment.dtypes)
print(df_enrollment.shape)

sending_lea_code      int64
enrolled_n          float64
dtype: object
(253, 2)


In [15]:
# join charter_to_district to data
df_charter = df[df['charter_flag'] == 1]
df_district = df[df['charter_flag'] != 1]

df_mult = df_mult.merge(df_charter, left_on=['charter_lea_code'], right_on = ['district_id'], how='left')
print(df_mult.shape)
df_mult = df_mult.merge(df_district, left_on=['sending_lea_code', 'year', 'group_state'], right_on = ['district_id', 'year','group_state'], how='left', suffixes=['_charter', '_district'])
print(df_mult.shape)
print(df_mult.columns)

(96661, 19)
(96661, 31)
Index(['charter_lea_code', 'sending_lea_code', 'enrolled_n',
       'physical_charter_location', 'chartered_to_serve',
       '# Dropout All Grades_charter',
       '# Enrolled Grades 09 through 12_charter',
       '% Dropout All Grades_charter', '% Dropout Grade 09_charter',
       '% Dropout Grade 10_charter', '% Dropout Grade 11_charter',
       '% Dropout Grade 12_charter', 'group_state', 'year',
       'district_id_charter', 'district_charter', 'charter_flag_charter',
       'Regional Affiliation/s_charter', 'Geographic Location/s_charter',
       '# Dropout All Grades_district',
       '# Enrolled Grades 09 through 12_district',
       '% Dropout All Grades_district', '% Dropout Grade 09_district',
       '% Dropout Grade 10_district', '% Dropout Grade 11_district',
       '% Dropout Grade 12_district', 'district_id_district',
       'district_district', 'charter_flag_district',
       'Regional Affiliation/s_district', 'Geographic Location/s_district'],
 

In [16]:
# join enrollment remainders to district data
df_district.columns = df_district.columns.map(lambda x: str(x) + '_district')
df_district = df_district.rename(columns={'year_district': 'year', 'group_state_district': 'group_state'})
df_district = df_district.merge(df_enrollment, left_on=['district_id_district'], right_on=['sending_lea_code'], how='inner')
print(df_district.columns)

Index(['# Dropout All Grades_district',
       '# Enrolled Grades 09 through 12_district',
       '% Dropout All Grades_district', '% Dropout Grade 09_district',
       '% Dropout Grade 10_district', '% Dropout Grade 11_district',
       '% Dropout Grade 12_district', 'group_state', 'year',
       'district_id_district', 'district_district', 'charter_flag_district',
       'Regional Affiliation/s_district', 'Geographic Location/s_district',
       'sending_lea_code', 'enrolled_n'],
      dtype='object')


In [17]:
# append back to other joined data
print(df_mult.shape)
print(df_district.shape)
df_mult = df_mult.append(df_district, ignore_index=True, sort=True)
print(df_mult.shape)

(96661, 31)
(27595, 16)
(124256, 31)


In [18]:
# review grade/group/year breakdown of data
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')

In [19]:
# export grad dataset for QA
df.to_csv('./data/finalized/dropout.csv')

In [20]:
# export final dataset
print(df_mult.shape)
df_mult = df_mult.dropna(subset=['year'])
print(df_mult.shape)
df_mult.to_csv('./data/finalized/dropout_with_multipliers.csv', index=False)

(124256, 31)
(123964, 31)
