In [43]:
import pandas as pd

In [44]:
# Two-digit year suffixes processed by this notebook. Each value selects the
# per-year input files (e.g. enrollment17.csv) and is stored in the 'year'
# column of the cleaned outputs. Note that there is no '20' entry.
year = [
    '17',
    '18',
    '19',
    '21',
]

### Enrollment Data: https://www.azed.gov/accountability-research/data

In [46]:
def enrollment_clean(x, year):
    """Clean one year's enrollment export and write it to disk.

    Reads the CSV at ``x``, keeps the 'All Students' row of every school with a
    positive total enrollment, converts the Black / Hispanic / White head
    counts into percentages of total enrollment (rounded to 2 decimals) and
    writes the result to ``enrollment_clean_{year}.csv`` in the working
    directory.

    Parameters
    ----------
    x : str or path-like
        Path of the raw enrollment CSV. It must contain the columns
        'LEA Name', 'LEA Entity ID', 'School Name', 'School Entity ID',
        'Subgroup', 'EthnicBlackAfricanAmerican', 'EthnicHispanicLatino',
        'EthnicWhite' and 'Total'.
    year : str
        Year label stored in the 'year' column and used in the output name.

    Returns
    -------
    None
        The cleaned table is only written to disk.
    """
    id_cols = ['districtcode', 'schoolcode']
    ethnic_cols = ['EthnicBlackAfricanAmerican', 'EthnicHispanicLatino', 'EthnicWhite']
    share_cols = {
        'EthnicBlackAfricanAmerican': 'black',
        'EthnicHispanicLatino': 'hispanic',
        'EthnicWhite': 'white',
    }

    data = (
        pd.read_csv(x)
        .rename(columns={
            'LEA Name': 'districtname',
            'LEA Entity ID': 'districtcode',
            'School Name': 'schoolname',
            'School Entity ID': 'schoolcode',
        })
        .loc[:, ['districtcode', 'districtname', 'schoolcode', 'schoolname',
                 'Subgroup', *ethnic_cols, 'Total']]
        .dropna()
    )

    # Whole-column assignment (not `.loc[:, cols] = ...`) so the columns really
    # take the new dtype; `.loc` sets in place and keeps/upcasts the old dtype
    # depending on the pandas version.
    data[id_cols] = data[id_cols].astype('int').astype('str')

    # Cells containing '*' are treated as 0 (presumably suppressed small
    # counts -- confirm with the data documentation). Replacing with the
    # string '0' keeps the columns homogeneous until the integer cast below.
    data = data[data['Subgroup'] == 'All Students'].replace('*', '0')
    data['Total'] = data['Total'].astype('int')

    # Schools whose total is 0 (or was '*') cannot yield a percentage.
    data = data[data['Total'] > 0].copy()
    data[ethnic_cols] = data[ethnic_cols].astype('int')

    for count_col, share_col in share_cols.items():
        data[share_col] = (data[count_col] / data['Total'] * 100).round(2)

    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year
    data = data.rename(columns={'Total': 'totalenroll'})

    out_cols = ['year', 'mergecode', 'districtcode', 'schoolcode',
                'black', 'hispanic', 'white', 'totalenroll']
    data[out_cols].set_index('year').to_csv(f'enrollment_clean_{year}.csv')

In [47]:
# Clean the raw enrollment file of every configured year; each call writes
# its own enrollment_clean_<year>.csv.
for i in year:
    enrollment_clean(x=f'enrollment{i}.csv', year=i)

In [48]:
# Distinct school merge codes found in each year's cleaned enrollment file.
data21 = set(pd.read_csv('enrollment_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('enrollment_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('enrollment_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('enrollment_clean_17.csv')['mergecode'])

# Codes present in all four years (list order is arbitrary).
mutual_code = list(data21 & data19 & data18 & data17)

In [49]:
# Stack the cleaned enrollment files of all years, keeping only the schools
# whose merge code appears in every year (`mutual_code`). The per-year frames
# are collected first and concatenated once: growing a frame with pd.concat
# inside the loop re-copies all earlier rows on every pass, and seeding it
# with an empty DataFrame triggers pandas' empty-entry warning on newer
# versions. Row labels are kept as read from each file (no ignore_index).
all_data_enroll = pd.concat(
    [
        yearly.loc[yearly['mergecode'].isin(mutual_code)]
        for yearly in (pd.read_csv(f'enrollment_clean_{i}.csv') for i in year)
    ]
)

In [50]:
# Build the list of schools to drop from the enrollment panel: rows of
# school_grade.csv whose grade columns '9', '10', '11' and '12' are all '*'
# and that have a school ID.
# NOTE(review): the meaning of '*' in the grade columns is not visible here --
# confirm against the file's documentation that these are the schools that
# should be excluded.
grade = pd.read_csv('school_grade.csv')
all_high_grades_starred = (grade[['9', '10', '11', '12']] == '*').all(axis=1)
grade = (
    grade.loc[all_high_grades_starred & grade['SchoolEntityID'].notna()]
    .rename(columns={'DistrictofResidenceEntityID': 'districtcode',
                     'SchoolEntityID': 'schoolcode'})
    .loc[:, ['districtcode', 'schoolcode']]
    .copy()  # own the data before adding/replacing columns
)

# Whole-column assignment so the ID columns actually become strings;
# `.loc[:, cols] = ...` sets in place and its dtype handling differs between
# pandas versions.
grade[['districtcode', 'schoolcode']] = (
    grade[['districtcode', 'schoolcode']].astype('int').astype('str')
)
grade['mergecode'] = grade['districtcode'] + '-' + grade['schoolcode']

# Remove the matched schools from the combined enrollment frame.
all_data_enroll = all_data_enroll.loc[
    ~all_data_enroll['mergecode'].isin(grade['mergecode'])
]


### FRPM Data

In [51]:
def frpm_clean(x, year):
    """Clean one year's FRPM file and write it to disk.

    Reads the CSV at ``x``, drops rows with any missing value and rows whose
    'lowincome' cell is the spreadsheet error '#VALUE!', converts 'lowincome'
    from a fraction to a percentage (rounded to 2 decimals) and writes the
    columns year / mergecode / lowincome to ``frpm_clean_{year}.csv`` in the
    working directory.

    Parameters
    ----------
    x : str or path-like
        Path of the raw CSV. It must contain 'SponsorEntityID',
        'SiteEntityID' and 'lowincome'.
    year : str
        Year label stored in the 'year' column and used in the output name.

    Returns
    -------
    None
        The cleaned table is only written to disk.
    """
    id_cols = ['districtcode', 'schoolcode']

    data = (
        pd.read_csv(x)
        .rename(columns={'SponsorEntityID': 'districtcode',
                         'SiteEntityID': 'schoolcode'})
        .dropna()  # any NaN in any column of the file drops the row
    )
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of the original.
    data = data[data['lowincome'] != '#VALUE!'].copy()

    # Whole-column assignment so the IDs really become strings;
    # `.loc[:, cols] = ...` sets in place and its dtype handling differs
    # between pandas versions.
    data[id_cols] = data[id_cols].astype('int').astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']

    data['lowincome'] = (data['lowincome'].astype('float') * 100).round(2)
    data['year'] = year

    data[['year', 'mergecode', 'lowincome']].set_index('year').to_csv(
        f'frpm_clean_{year}.csv')

In [52]:
# Clean the raw FRPM file of every configured year; each call writes its own
# frpm_clean_<year>.csv.
for i in year:
    frpm_clean(x=f'frpm{i}.csv', year=i)

In [53]:
# Distinct school merge codes found in each year's cleaned FRPM file.
data21 = set(pd.read_csv('frpm_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('frpm_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('frpm_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('frpm_clean_17.csv')['mergecode'])

# Codes present in all four years (list order is arbitrary).
mutual_code = list(data21 & data19 & data18 & data17)

In [54]:
# Stack the cleaned FRPM files of all years, keeping only the schools whose
# merge code appears in every year (`mutual_code`). The per-year frames are
# collected first and concatenated once instead of growing a frame with
# pd.concat inside the loop from an empty seed. Row labels are kept as read
# from each file (no ignore_index).
all_data_frpm = pd.concat(
    [
        yearly.loc[yearly['mergecode'].isin(mutual_code)]
        for yearly in (pd.read_csv(f'frpm_clean_{i}.csv') for i in year)
    ]
)

### Dropout Data

In [55]:
def dropout_clean(x, year):
    """Clean one year's dropout file and write it to disk.

    Reads the CSV at ``x``, drops rows with any missing value, builds the
    '<district>-<school>' merge code and writes the columns
    year / mergecode / droprate to ``dropout_clean_{year}.csv`` in the
    working directory. The dropout rate itself is passed through unchanged.

    Parameters
    ----------
    x : str or path-like
        Path of the raw CSV. It must contain 'LEA ID', 'School Entity ID'
        and 'Dropout Rate'.
    year : str
        Year label stored in the 'year' column and used in the output name.

    Returns
    -------
    None
        The cleaned table is only written to disk.
    """
    id_cols = ['districtcode', 'schoolcode']

    data = (
        pd.read_csv(x)
        .rename(columns={'LEA ID': 'districtcode',
                         'School Entity ID': 'schoolcode',
                         'Dropout Rate': 'droprate'})
        .dropna()  # any NaN in any column of the file drops the row
    )

    # Whole-column assignment so the IDs really become strings;
    # `.loc[:, cols] = ...` sets in place and its dtype handling differs
    # between pandas versions.
    data[id_cols] = data[id_cols].astype('int').astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year

    data[['year', 'mergecode', 'droprate']].set_index('year').to_csv(
        f'dropout_clean_{year}.csv')

In [56]:
# Clean the raw dropout file of every configured year; each call writes its
# own dropout_clean_<year>.csv.
for i in year:
    dropout_clean(x=f'drop{i}.csv', year=i)

In [57]:
# Distinct school merge codes found in each year's cleaned dropout file.
data21 = set(pd.read_csv('dropout_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('dropout_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('dropout_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('dropout_clean_17.csv')['mergecode'])

# Codes present in all four years (list order is arbitrary).
mutual_code = list(data21 & data19 & data18 & data17)

In [58]:
# Stack the cleaned dropout files of all years, keeping only the schools
# whose merge code appears in every year (`mutual_code`). The per-year frames
# are collected first and concatenated once instead of growing a frame with
# pd.concat inside the loop from an empty seed. Row labels are kept as read
# from each file (no ignore_index).
all_data_drop = pd.concat(
    [
        yearly.loc[yearly['mergecode'].isin(mutual_code)]
        for yearly in (pd.read_csv(f'dropout_clean_{i}.csv') for i in year)
    ]
)

In [59]:
# Load the learning-mode records and give every row the same
# '<district>-<school>' merge code used by the other tables.
mode = pd.read_csv('learning_mode.csv')

# NOTE(review): the school ID is not passed through int like the district ID;
# if it is ever read as float the code would end in '.0' -- confirm dtype.
mode['mergecode'] = (
    mode['StateAssignedDistrictID'].astype('int').astype('str')
    + '-'
    + mode['StateAssignedSchoolID'].astype('str')
)

# Charter flag per merge code: 'No' -> 0, 'Yes' -> 1 (any other value -> NaN).
charter = (
    mode.loc[:, ['mergecode', 'Charter']]
    .drop_duplicates()
    .rename(columns={'Charter': 'charter'})
)
charter_map = {'No': 0, 'Yes': 1}
charter['charter'] = charter['charter'].map(charter_map)

# Rows recorded as 'Closed' are excluded before the shares are computed.
mode = mode[mode['LearningModel'] != 'Closed']

# Share of each school's remaining rows that fall under each LearningModel.
# The value_counts result is renamed as a Series: its default name is the
# grouped column on pandas < 2.0 but 'proportion' from 2.0 on, so renaming a
# column called 'LearningModel' after to_frame() silently does nothing on
# newer versions and the pivot below then fails.
mode = (
    mode.groupby(['mergecode', 'NCESSchoolID'])['LearningModel']
    .value_counts(normalize=True)
    .rename('normalized')
    .reset_index()
)

# One row per school, one column per learning model; a model with no rows
# for a school gets share 0.
mode = (
    mode.pivot(index=['mergecode', 'NCESSchoolID'],
               columns='LearningModel',
               values='normalized')
    .reset_index()
    .fillna(0)
    .loc[:, ['mergecode', 'NCESSchoolID', 'Hybrid', 'Virtual', 'In-person']]
    .rename(columns={'Hybrid': 'hybridper',
                     'Virtual': 'virtualper',
                     'In-person': 'inpersonper'})
)

# Single index of remote exposure: virtual share counts fully, hybrid half.
mode['schoolmode'] = 1 * mode['virtualper'] + 0.5 * mode['hybridper']

# Round after schoolmode is computed so the index uses unrounded shares.
share_cols = ['hybridper', 'virtualper', 'inpersonper', 'schoolmode']
mode[share_cols] = mode[share_cols].round(2)

mode['NCESSchoolID'] = mode['NCESSchoolID'].astype('str')

In [60]:
# Display the combined multi-year enrollment frame built in the cells above.
all_data_enroll

Unnamed: 0,year,mergecode,districtcode,schoolcode,black,hispanic,white,totalenroll
1,17,79053-79123,79053,79123,10.42,80.00,6.25,240
3,17,91303-91304,91303,91304,10.05,76.72,5.88,408
4,17,91305-91306,91305,91306,9.36,26.81,50.64,235
8,17,92987-635758,92987,635758,0.00,54.84,22.58,62
9,17,89869-89870,89869,89870,13.55,32.26,18.06,155
...,...,...,...,...,...,...,...,...
2086,21,4507-89576,4507,89576,1.01,60.60,33.22,2089
2087,21,4507-6190,4507,6190,0.85,89.26,7.13,2468
2088,21,4507-80409,4507,80409,0.00,99.20,0.73,2617
2089,21,4507-6302,4507,6302,0.00,85.46,7.05,227


In [62]:
# Join the four sources into one school-by-year panel. Enrollment, FRPM and
# dropout match on (year, mergecode); learning mode and charter status have
# no year and match on mergecode alone. All joins are inner joins.
merge_keys = ['year', 'mergecode']
csv = all_data_enroll.merge(all_data_frpm, on = merge_keys)
csv = csv.merge(all_data_drop, on = merge_keys)
csv = csv.merge(mode, on = 'mergecode')
csv = csv.merge(charter, on = 'mergecode')
csv = csv.drop(columns = 'schoolcode')

# Suffix the identifiers with the state name.
csv['state'] = 'arizona'
csv['mergecode'] = csv['mergecode'] + csv['state']
csv['districtcode'] = csv['districtcode'].astype('str') + csv['state']

# The learning-mode shares are applied to year 21 only: every other year is
# set to fully in-person with no hybrid/virtual share.
not_year_21 = csv['year'] != 21
csv.loc[not_year_21, ['hybridper', 'virtualper', 'schoolmode']] = 0
csv.loc[not_year_21, ['inpersonper']] = 1

# Indicator column: 1 for year 21, 0 otherwise.
csv['year_21'] = 1
csv.loc[not_year_21, ['year_21']] = 0

# Displayed only; append .to_csv('arizona_dropout.csv') to export the panel.
csv.set_index('year')

Unnamed: 0_level_0,mergecode,districtcode,black,hispanic,white,totalenroll,lowincome,droprate,NCESSchoolID,hybridper,virtualper,inpersonper,schoolmode,charter,state,year_21
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
17,4289-5454arizona,4289arizona,10.86,64.91,19.50,1667,62.0,1.00,40045000001,0.0,0.0,1.0,0.0,0,arizona,0
18,4289-5454arizona,4289arizona,10.85,64.28,19.19,1761,60.0,1.00,40045000001,0.0,0.0,1.0,0.0,0,arizona,0
19,4289-5454arizona,4289arizona,11.09,64.81,17.98,1830,57.0,2.36,40045000001,0.0,0.0,1.0,0.0,0,arizona,0
21,4289-5454arizona,4289arizona,10.37,67.46,15.71,1687,59.0,3.55,40045000001,0.4,0.6,0.0,0.8,0,arizona,1
17,4289-79799arizona,4289arizona,13.89,50.91,27.38,1764,50.0,1.32,40045002273,0.0,0.0,1.0,0.0,0,arizona,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,4507-6302arizona,4507arizona,0.00,85.46,7.05,227,85.0,21.69,40963000157,0.6,0.4,0.0,0.7,0,arizona,1
17,4507-6189arizona,4507arizona,1.75,87.65,7.53,1142,80.0,1.95,40963000997,0.0,0.0,1.0,0.0,0,arizona,0
18,4507-6189arizona,4507arizona,1.70,87.35,8.43,1115,80.0,2.07,40963000997,0.0,0.0,1.0,0.0,0,arizona,0
19,4507-6189arizona,4507arizona,1.68,86.18,8.76,1187,80.0,0.90,40963000997,0.0,0.0,1.0,0.0,0,arizona,0


In [63]:
# Count the non-null values in every column for each year, to check that all
# years contribute the same number of rows to the merged panel.
csv.groupby('year').count()

Unnamed: 0_level_0,mergecode,districtcode,black,hispanic,white,totalenroll,lowincome,droprate,NCESSchoolID,hybridper,virtualper,inpersonper,schoolmode,charter,state,year_21
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
17,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227
18,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227
19,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227
21,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227,227
