In [1]:
import pandas as pd
import numpy as np

In [2]:
# import attrition files
path = './data/attrition/raw/'

years = ['2019-20', '2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12', '2010-11', '2009-10']
groups = ['All Students', 'Female', 'Male', 'Economically Disadvantaged', 'High Needs', 'English Learner', 'Low Income', 'Students with disabilities', 'African AmericanBlack', 'American Indian or Alaskan Native', 'Asian', 'Hispanic or Latino', 'Multi-race, non-Hispanic or Latino', 'Native Hawaiian or Pacific Islander', 'White']

df_a = pd.DataFrame()

for year in years:
    for group in groups:
        temp = pd.read_excel(path + 'ma_attrition_' + group + '_' + year + '.xlsx', dtype={'District Name':str,'District Code':str,'K':np.float64,'1':np.float64,'2':np.float64,'3':np.float64,'4':np.float64,'5':np.float64,'6':np.float64,'7':np.float64,'8':np.float64,'9':np.float64,'10':np.float64,'11':np.float64,'ALL':np.float64},skiprows=[0], thousands=',')
        temp['year'] = year
        temp['group_state'] = group
        df_a = df_a.append(temp, ignore_index=True, sort=True)

df_a = df_a.rename(columns={'District Name': 'district'})
        
print(df_a.shape)
print(df_a.head(5))

(56234, 17)
     1   10   11    2    3    4     5     6    7     8     9   ALL  \
0  1.8  6.3  2.3  6.7  1.7  3.5   3.4   4.2  6.2  13.6   4.6   4.9   
1  3.5  2.0  5.0  5.0  5.5  3.1   1.2   3.1  3.7  17.9   1.9   4.9   
2  NaN  4.5  4.4  NaN  NaN  NaN  10.0  17.7  8.0  15.1  11.0  10.6   
3  3.5  3.5  0.9  3.9  3.6  3.7   4.9   5.9  1.8   8.0   2.4   3.8   
4  3.7  NaN  NaN  1.3  3.7  4.7   4.0   0.0  5.7   NaN   NaN   3.6   

  District Code                                           district    K  \
0      04450000       Abby Kelley Foster Charter Public (District)  3.4   
1      00010000                                           Abington  6.5   
2      04120000  Academy Of the Pacific Rim Charter Public (Dis...  NaN   
3      06000000                                   Acton-Boxborough  3.3   
4      00030000                                           Acushnet  4.9   

    group_state     year  
0  All Students  2019-20  
1  All Students  2019-20  
2  All Students  2019-20  
3  All S

In [3]:
# edit year field
print(df_a['year'].drop_duplicates())
df_a['year'] = ['20' + x[-2:] for x in df_a['year']]
print(df_a['year'].drop_duplicates())

0        2019-20
5203     2018-19
10381    2017-18
15565    2016-17
20746    2015-16
25938    2014-15
31072    2013-14
36196    2012-13
41262    2011-12
46265    2010-11
51269    2009-10
Name: year, dtype: object
0        2020
5203     2019
10381    2018
15565    2017
20746    2016
25938    2015
31072    2014
36196    2013
41262    2012
46265    2011
51269    2010
Name: year, dtype: object


In [4]:
# import mobility files
path = './data/mobility/raw/'

years = ['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010']
groups = ['All Students', 'Economically Disadvantaged', 'High Needs', 'English Learner', 'Low Income', 'Students with disabilities', 'African AmericanBlack', 'American Indian or Alaskan Native', 'Asian', 'Hispanic or Latino', 'Multi-race, non-Hispanic or Latino', 'Native Hawaiian or Pacific Islander', 'White']

df_m = pd.DataFrame()

for year in years:
    for group in groups:
        temp = pd.read_excel(path + 'ma_mobility_' + group + '_' + year + '.xlsx', dtype={'District Name':str,'District Code':str,'Churn/Intake Enroll':np.float64,'% Churn':np.float64,'% Intake':np.float64,'Stability Enroll':np.float64,'% Stability':np.float64},skiprows=[0], thousands=',')
        temp['year'] = year
        temp['group_state'] = group
        df_m = df_m.append(temp, ignore_index=True, sort=True)

df_m = df_m.rename(columns={'District Name': 'district'})
        
print(df_m.shape)
print(df_m.head(5))

(41238, 9)
   % Churn  % Intake  % Stability  Churn/Intake Enroll District Code  \
0      5.1       3.3         95.8               1451.0      04450000   
1      7.1       4.4         95.7               2229.0      00010000   
2      6.9       5.4         96.4                551.0      04120000   
3      4.0       2.6         98.1               5531.0      06000000   
4      4.8       2.6         96.9               1033.0      00030000   

                                            district  Stability Enroll  \
0       Abby Kelley Foster Charter Public (District)            1438.0   
1                                           Abington            2163.0   
2  Academy Of the Pacific Rim Charter Public (Dis...             532.0   
3                                   Acton-Boxborough            5413.0   
4                                           Acushnet            1016.0   

    group_state  year  
0  All Students  2020  
1  All Students  2020  
2  All Students  2020  
3  All Students

In [5]:
# join attrition & mobility datasets
print(df_a.shape)
print(df_m.shape)
df = df_a.merge(df_m, on=['year', 'District Code', 'group_state'], how='outer')
print(df.shape)
df['district'] = df.district_x.combine_first(df.district_y)
print(df.shape)
print(df.head(5))

(56234, 17)
(41238, 9)
(57003, 23)
(57003, 24)
     1   10   11    2    3    4     5     6    7     8  ...    K  \
0  1.8  6.3  2.3  6.7  1.7  3.5   3.4   4.2  6.2  13.6  ...  3.4   
1  3.5  2.0  5.0  5.0  5.5  3.1   1.2   3.1  3.7  17.9  ...  6.5   
2  NaN  4.5  4.4  NaN  NaN  NaN  10.0  17.7  8.0  15.1  ...  NaN   
3  3.5  3.5  0.9  3.9  3.6  3.7   4.9   5.9  1.8   8.0  ...  3.3   
4  3.7  NaN  NaN  1.3  3.7  4.7   4.0   0.0  5.7   NaN  ...  4.9   

    group_state  year % Churn  % Intake % Stability Churn/Intake Enroll  \
0  All Students  2020     5.1       3.3        95.8              1451.0   
1  All Students  2020     7.1       4.4        95.7              2229.0   
2  All Students  2020     6.9       5.4        96.4               551.0   
3  All Students  2020     4.0       2.6        98.1              5531.0   
4  All Students  2020     4.8       2.6        96.9              1033.0   

                                          district_y  Stability Enroll  \
0       Abby Kelley

In [6]:
del df_a
del df_m
del df['district_x']
del df['district_y']

In [7]:
# add manual data changes
print(df.shape)
df = df[df['District Code'] != '04070000']
print(df.shape)
df = df[df['District Code'] != '04110000']
print(df.shape)
df = df[df['District Code'] != '04230000']
print(df.shape)
df = df[df['District Code'] != '04270000']
print(df.shape)
df = df[df['District Code'] != '04520000']
print(df.shape)
df = df[df['District Code'] != '04710000']
print(df.shape)
df = df[df['District Code'] != '04770000']
print(df.shape)
df = df[df['District Code'] != '04800000']
print(df.shape)
df = df[df['District Code'] != '35050000']
print(df.shape)
df = df[df['District Code'] != '35110000']
print(df.shape)

# remove district totals & Horace Mann
print(df.shape)
df = df[df['District Code'] != '00000000']
print(df.shape)
df = df[~df['district'].str.contains('Horace Mann')]
print(df.shape)

df.district = df.district.str.replace('MATCH', 'Match', regex=False)
print(df[df.district.str.contains('Match')].district.drop_duplicates())

(57003, 22)
(56916, 22)
(56805, 22)
(56767, 22)
(56629, 22)
(56487, 22)
(56440, 22)
(56325, 22)
(56207, 22)
(56113, 22)
(56055, 22)
(56055, 22)
(56055, 22)
(56055, 22)
196                 Match Charter Public School (District)
31275    Match Community Day Charter Public School (Dis...
Name: district, dtype: object


In [8]:
# drop rows with null values
print(df.shape)
df = df.dropna(subset=['K','1','2','3','4','5','6','7','8','9','10','11','ALL','Churn/Intake Enroll','% Churn','% Intake','Stability Enroll','% Stability'], how='all')
print(df.shape)

(56055, 22)
(50457, 22)


In [9]:
print(df.columns)

Index(['1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', 'ALL',
       'District Code', 'K', 'group_state', 'year', '% Churn', '% Intake',
       '% Stability', 'Churn/Intake Enroll', 'Stability Enroll', 'district'],
      dtype='object')


In [10]:
# remove last four digits of school code
df['district_id'] = df.apply(lambda x: x['District Code'][:4], axis = 1)
del df['District Code']
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0412
3    0600
4    0003
Name: district_id, dtype: object


In [11]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype('int64')
print(df.dtypes)

1                      float64
10                     float64
11                     float64
2                      float64
3                      float64
4                      float64
5                      float64
6                      float64
7                      float64
8                      float64
9                      float64
ALL                    float64
K                      float64
group_state             object
year                    object
% Churn                float64
% Intake               float64
% Stability            float64
Churn/Intake Enroll    float64
Stability Enroll       float64
district                object
district_id             object
dtype: object
1                      float64
10                     float64
11                     float64
2                      float64
3                      float64
4                      float64
5                      float64
6                      float64
7                      float64
8                      fl

In [12]:
# use most recent district name per district_id
df_leas = df[['year', 'district_id', 'district']]
df_leas = df_leas.sort_values(['year', 'district_id', 'district'], ascending = False)
df_leas = df_leas.drop_duplicates(subset=['district_id'])
del df_leas['year']

# join back to df
del df['district']
print(df.shape)
df = df.merge(df_leas, on = ['district_id'])
print(df.shape)

(50457, 21)
(50457, 22)


In [13]:
# import file for charter flag
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
df_flags = df_flags[df_flags.fy.eq('fy21')]
df_flags = df_flags[['charter_lea_code']].drop_duplicates()
df_flags = df_flags.rename(columns={'charter_lea_code': 'district_id'})
df_flags['charter_flag'] = 1

print(df.shape)
df = df.merge(df_flags, on = ['district_id'], how = 'left')
del df_flags
print(df.shape)
print(df.head(5))

(50457, 22)
(50457, 23)
     1    10   11     2    3    4    5    6    7     8  ...  \
0  1.8   6.3  2.3   6.7  1.7  3.5  3.4  4.2  6.2  13.6  ...   
1  1.4   3.6  0.0   3.0  3.4  1.9  4.7  6.3  3.5  10.9  ...   
2  2.6  10.3  5.0  11.3  0.0  4.8  1.9  1.8  8.9  15.9  ...   
3  4.1   3.7  0.0   3.6  4.0  2.3  4.1  2.2  5.1  11.1  ...   
4  2.5   2.9  0.0   6.7  2.4  3.1  2.9  3.3  5.5  14.6  ...   

                  group_state  year  % Churn % Intake % Stability  \
0                All Students  2020      5.1      3.3        95.8   
1                      Female  2020      NaN      NaN         NaN   
2                        Male  2020      NaN      NaN         NaN   
3  Economically Disadvantaged  2020      5.4      4.0        95.7   
4                  High Needs  2020      5.7      4.2        95.4   

   Churn/Intake Enroll  Stability Enroll  district_id  \
0               1451.0            1438.0          445   
1                  NaN               NaN          445   
2          

In [14]:
print(df.columns)

Index(['1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', 'ALL', 'K',
       'group_state', 'year', '% Churn', '% Intake', '% Stability',
       'Churn/Intake Enroll', 'Stability Enroll', 'district_id', 'district',
       'charter_flag'],
      dtype='object')


In [15]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))
df_reg['district_id'] = df_reg.apply(lambda x: int(x['Charter LEA ID'] / 10000), axis = 1)
del df_reg['Charter LEA ID']
del df_reg['Charter LEA Name']

print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

   Charter LEA ID                                   Charter LEA Name  \
0         4070405  Dudley Street Neighborhood Charter School (Dis...   
1         4090205  Alma del Mar Charter School (District) - Alma ...   
2         4100205  Excel Academy Charter (District) - Excel Acade...   
3         4110305  Boston Green Academy Horace Mann Charter Schoo...   
4         4120530  Academy Of the Pacific Rim Charter Public (Dis...   

  Regional Affiliation/s Geographic Location/s  
0                 Boston                Boston  
1                Gateway          Southeastern  
2                 Boston                Boston  
3                 Boston                Boston  
4                 Boston                Boston  
(50457, 23)
(52247, 25)


In [16]:
print(df.columns)

Index(['1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', 'ALL', 'K',
       'group_state', 'year', '% Churn', '% Intake', '% Stability',
       'Churn/Intake Enroll', 'Stability Enroll', 'district_id', 'district',
       'charter_flag', 'Regional Affiliation/s', 'Geographic Location/s'],
      dtype='object')


In [17]:
# import charter_to_district data for calculating multipliers
df_mult = pd.read_csv('./data/finalized/charter_to_district_edited.csv')
print(df_mult.dtypes)
print(df_mult.shape)

charter_lea_code               int64
sending_lea_code               int64
enrolled_n                     int64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(1029, 5)


In [18]:
# import district_remainders data for calculating multipliers
df_enrollment = pd.read_csv('./data/finalized/district_remainders.csv')
print(df_enrollment.dtypes)
print(df_enrollment.shape)

sending_lea_code    int64
enrolled_n          int64
dtype: object
(253, 2)


In [19]:
# join charter_to_district to data
df_charter = df[df['charter_flag'] == 1]
df_district = df[df['charter_flag'] != 1]

df_mult = df_mult.merge(df_charter, left_on=['charter_lea_code'], right_on = ['district_id'], how='left')
print(df_mult.shape)
df_mult = df_mult.merge(df_district, left_on=['sending_lea_code', 'year', 'group_state'], right_on = ['district_id', 'year','group_state'], how='left', suffixes=['_charter', '_district'])
print(df_mult.shape)
print(df_mult.columns)

(144166, 30)
(144166, 53)
Index(['charter_lea_code', 'sending_lea_code', 'enrolled_n',
       'physical_charter_location', 'chartered_to_serve', '1_charter',
       '10_charter', '11_charter', '2_charter', '3_charter', '4_charter',
       '5_charter', '6_charter', '7_charter', '8_charter', '9_charter',
       'ALL_charter', 'K_charter', 'group_state', 'year', '% Churn_charter',
       '% Intake_charter', '% Stability_charter',
       'Churn/Intake Enroll_charter', 'Stability Enroll_charter',
       'district_id_charter', 'district_charter', 'charter_flag_charter',
       'Regional Affiliation/s_charter', 'Geographic Location/s_charter',
       '1_district', '10_district', '11_district', '2_district', '3_district',
       '4_district', '5_district', '6_district', '7_district', '8_district',
       '9_district', 'ALL_district', 'K_district', '% Churn_district',
       '% Intake_district', '% Stability_district',
       'Churn/Intake Enroll_district', 'Stability Enroll_district',
       '

In [20]:
# join enrollment remainders to district data
df_district.columns = df_district.columns.map(lambda x: str(x) + '_district')
df_district = df_district.rename(columns={'year_district': 'year', 'group_state_district': 'group_state'})
df_district = df_district.merge(df_enrollment, left_on=['district_id_district'], right_on=['sending_lea_code'], how='inner')
print(df_district.columns)

Index(['1_district', '10_district', '11_district', '2_district', '3_district',
       '4_district', '5_district', '6_district', '7_district', '8_district',
       '9_district', 'ALL_district', 'K_district', 'group_state', 'year',
       '% Churn_district', '% Intake_district', '% Stability_district',
       'Churn/Intake Enroll_district', 'Stability Enroll_district',
       'district_id_district', 'district_district', 'charter_flag_district',
       'Regional Affiliation/s_district', 'Geographic Location/s_district',
       'sending_lea_code', 'enrolled_n'],
      dtype='object')


In [21]:
# append back to other joined data
print(df_mult.shape)
print(df_district.shape)
df_mult = df_mult.append(df_district, ignore_index=True, sort=True)
print(df_mult.shape)

(144166, 53)
(33773, 27)
(177939, 53)


In [22]:
# review grade/group/year breakdown of data
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')

In [23]:
# export grad dataset for QA
df.to_csv('./data/finalized/attrition_mobility.csv')

In [24]:
# export final dataset
print(df_mult.shape)
df_mult = df_mult.dropna(subset=['year'])
print(df_mult.shape)
df_mult.to_csv('./data/finalized/attrition_mobility_with_multipliers.csv', index=False)

(177939, 53)
(177939, 53)
