In [1]:
import pandas as pd

# Data: https://www.azed.gov/accountability-research/data

In [2]:
# Two-digit year suffixes to process. Each value is substituted into the input
# and output CSV file names used by the cleaning functions and loops below.
year = [
    '17',
    '18',
    '19',
    '21',
]

In [3]:
def enrollment_clean(x, year):
    """Clean one raw enrollment CSV and write ``enrollment_clean_{year}.csv``.

    Steps performed:

    * keep the district/school identifiers, names, ``Subgroup``, the three
      ethnicity counts and ``Total``; drop rows with a missing value in any
      of those columns;
    * keep only rows whose ``Subgroup`` is ``'All Students'``;
    * replace cells equal to ``'*'`` with 0 (presumably suppressed counts --
      TODO confirm against the data dictionary);
    * drop schools whose ``Total`` is not positive;
    * express each ethnicity count as a percentage of ``Total`` (2 decimals).

    Parameters
    ----------
    x : str or path-like
        Path of the raw enrollment CSV. It must contain the columns
        'LEA Name', 'LEA Entity ID', 'School Name', 'School Entity ID',
        'Subgroup', 'EthnicBlackAfricanAmerican', 'EthnicHispanicLatino',
        'EthnicWhite' and 'Total'.
    year : str
        Value stored in the ``year`` column and inserted in the output
        file name.

    Returns
    -------
    None
        The cleaned table is written to the current working directory.
    """
    id_cols = ['districtcode', 'schoolcode']
    count_cols = ['EthnicBlackAfricanAmerican', 'EthnicHispanicLatino', 'EthnicWhite']

    data = (
        pd.read_csv(x)
        .rename(columns={'LEA Name': 'districtname',
                         'LEA Entity ID': 'districtcode',
                         'School Name': 'schoolname',
                         'School Entity ID': 'schoolcode'})
        .loc[:, ['districtcode', 'districtname', 'schoolcode', 'schoolname', 'Subgroup',
                 'EthnicBlackAfricanAmerican', 'EthnicHispanicLatino', 'EthnicWhite', 'Total']]
        .dropna()
    )

    # Whole-column assignment (not ``.loc[:, cols] = ...``) so the columns really
    # take the new dtype; ``.loc`` writes in place and keeps the old dtype when it can.
    data[id_cols] = data[id_cols].astype('int').astype('str')

    data = data[data['Subgroup'] == 'All Students'].replace('*', 0)
    data['Total'] = data['Total'].astype('int')

    # ``.copy()`` makes the filtered frame independent before new columns are added.
    data = data[data['Total'] > 0].copy()
    data[count_cols] = data[count_cols].astype('int')

    # Share of each group in total enrollment, in percent.
    data['black'] = (data['EthnicBlackAfricanAmerican'] / data['Total'] * 100).round(2)
    data['hispanic'] = (data['EthnicHispanicLatino'] / data['Total'] * 100).round(2)
    data['white'] = (data['EthnicWhite'] / data['Total'] * 100).round(2)

    # District and school codes joined into a single key used for later merges.
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year
    data = data.rename(columns={'Total': 'totalenroll'})

    output_cols = ['year', 'mergecode', 'districtcode', 'schoolcode',
                   'black', 'hispanic', 'white', 'totalenroll']
    data.loc[:, output_cols].set_index('year').to_csv(f'enrollment_clean_{year}.csv')

In [4]:
# Clean the raw enrollment file of every configured year.
for yr in year:
    raw_enrollment_path = f'enrollment{yr}.csv'
    enrollment_clean(raw_enrollment_path, yr)

In [5]:
# Merge codes present in each year's cleaned enrollment file.
data21 = set(pd.read_csv('enrollment_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('enrollment_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('enrollment_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('enrollment_clean_17.csv')['mergecode'])

# Codes that occur in all four years.
mutual_code = list(data21 & data19 & data18 & data17)

In [6]:
# Stack the cleaned enrollment files of all years, keeping only schools whose
# merge code is in `mutual_code`. Frames are collected first and concatenated
# once, instead of repeatedly concatenating onto an initially empty DataFrame.
enroll_frames = []
for yr in year:
    yearly_enroll = pd.read_csv(f'enrollment_clean_{yr}.csv')
    enroll_frames.append(yearly_enroll.loc[yearly_enroll['mergecode'].isin(mutual_code)])

# An empty `year` list still yields an empty DataFrame, as before.
all_data_enroll = pd.concat(enroll_frames) if enroll_frames else pd.DataFrame()

In [7]:
# Remove from the enrollment panel every school that school_grade.csv lists
# with '*' in all of the grade columns '9', '10', '11' and '12'.
# NOTE(review): the meaning of '*' in those columns is not visible here --
# confirm against the source file's documentation.
grade = pd.read_csv('school_grade.csv')
grade_mask = (
    (grade['9'] == '*') & (grade['10'] == '*') & (grade['11'] == '*') & (grade['12'] == '*')
    & grade['SchoolEntityID'].notna()
)
grade = (
    grade.loc[grade_mask]
    .rename(columns={'DistrictofResidenceEntityID': 'districtcode',
                     'SchoolEntityID': 'schoolcode'})
    .loc[:, ['districtcode', 'schoolcode']]
    .copy()  # independent frame, so the column assignments below are unambiguous
)

# Whole-column assignment so the ID columns really become strings;
# `.loc[:, cols] = ...` writes into the existing numeric columns instead.
grade[['districtcode', 'schoolcode']] = (
    grade[['districtcode', 'schoolcode']].astype('int').astype('str')
)
grade['mergecode'] = grade['districtcode'] + '-' + grade['schoolcode']

all_data_enroll = all_data_enroll.loc[
    ~all_data_enroll.mergecode.isin(grade['mergecode'].to_list())]


In [8]:
def frpm_clean(x, year):
    """Clean one raw low-income CSV and write ``frpm_clean_{year}.csv``.

    Rows with any missing value are dropped, as are rows whose ``lowincome``
    cell holds the literal text ``'#VALUE!'``. ``lowincome`` is then converted
    to float, multiplied by 100 and rounded to 2 decimals (presumably a
    fraction turned into a percentage -- TODO confirm).

    Parameters
    ----------
    x : str or path-like
        Path of the raw CSV. It must contain the columns 'SponsorEntityID',
        'SiteEntityID' and 'lowincome'.
    year : str
        Value stored in the ``year`` column and inserted in the output
        file name.

    Returns
    -------
    None
        The cleaned table (``year``, ``mergecode``, ``lowincome``) is written
        to the current working directory.
    """
    id_cols = ['districtcode', 'schoolcode']

    data = (
        pd.read_csv(x)
        .rename(columns={'SponsorEntityID': 'districtcode',
                         'SiteEntityID': 'schoolcode'})
        .dropna()
    )
    # ``.copy()`` makes the filtered frame independent before columns are reassigned.
    data = data[data['lowincome'] != '#VALUE!'].copy()

    # Whole-column assignment so the ID columns really become strings;
    # ``.loc[:, cols] = ...`` writes into the existing numeric columns instead.
    data[id_cols] = data[id_cols].astype('int').astype('str')

    data['lowincome'] = data['lowincome'].astype('float')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['lowincome'] = (data['lowincome'] * 100).round(2)
    data['year'] = year

    data.loc[:, ['year', 'mergecode', 'lowincome']].set_index('year').to_csv(
        f'frpm_clean_{year}.csv')

In [9]:
# Clean the raw low-income file of every configured year.
for yr in year:
    raw_frpm_path = f'frpm{yr}.csv'
    frpm_clean(raw_frpm_path, yr)

In [10]:
# Merge codes present in each year's cleaned low-income file.
data21 = set(pd.read_csv('frpm_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('frpm_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('frpm_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('frpm_clean_17.csv')['mergecode'])

# Codes that occur in all four years.
mutual_code = list(data21 & data19 & data18 & data17)

In [11]:
# Stack the cleaned low-income files of all years, keeping only schools whose
# merge code is in `mutual_code`. Frames are collected first and concatenated
# once, instead of repeatedly concatenating onto an initially empty DataFrame.
frpm_frames = []
for yr in year:
    yearly_frpm = pd.read_csv(f'frpm_clean_{yr}.csv')
    frpm_frames.append(yearly_frpm.loc[yearly_frpm['mergecode'].isin(mutual_code)])

# An empty `year` list still yields an empty DataFrame, as before.
all_data_frpm = pd.concat(frpm_frames) if frpm_frames else pd.DataFrame()

In [12]:
def dropout_clean(x, year):
    """Clean one raw dropout CSV and write ``dropout_clean_{year}.csv``.

    Rows with any missing value are dropped, the district and school
    identifiers are combined into ``mergecode`` and the dropout rate is
    carried over unchanged as ``droprate``.

    Parameters
    ----------
    x : str or path-like
        Path of the raw CSV. It must contain the columns 'LEA ID',
        'School Entity ID' and 'Dropout Rate'.
    year : str
        Value stored in the ``year`` column and inserted in the output
        file name.

    Returns
    -------
    None
        The cleaned table (``year``, ``mergecode``, ``droprate``) is written
        to the current working directory.
    """
    id_cols = ['districtcode', 'schoolcode']

    data = (
        pd.read_csv(x)
        .rename(columns={'LEA ID': 'districtcode',
                         'School Entity ID': 'schoolcode',
                         'Dropout Rate': 'droprate'})
        .dropna()
    )

    # Whole-column assignment so the ID columns really become strings;
    # ``.loc[:, cols] = ...`` writes into the existing numeric columns instead.
    data[id_cols] = data[id_cols].astype('int').astype('str')

    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year

    data.loc[:, ['year', 'mergecode', 'droprate']].set_index('year').to_csv(
        f'dropout_clean_{year}.csv')

In [13]:
# Clean the raw dropout file of every configured year.
for yr in year:
    raw_dropout_path = f'drop{yr}.csv'
    dropout_clean(raw_dropout_path, yr)

In [14]:
# Merge codes present in each year's cleaned dropout file.
data21 = set(pd.read_csv('dropout_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('dropout_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('dropout_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('dropout_clean_17.csv')['mergecode'])

# Codes that occur in all four years.
mutual_code = list(data21 & data19 & data18 & data17)

In [15]:
# Stack the cleaned dropout files of all years, keeping only schools whose
# merge code is in `mutual_code`. Frames are collected first and concatenated
# once, instead of repeatedly concatenating onto an initially empty DataFrame.
drop_frames = []
for yr in year:
    yearly_drop = pd.read_csv(f'dropout_clean_{yr}.csv')
    drop_frames.append(yearly_drop.loc[yearly_drop['mergecode'].isin(mutual_code)])

# An empty `year` list still yields an empty DataFrame, as before.
all_data_drop = pd.concat(drop_frames) if drop_frames else pd.DataFrame()

In [16]:
# Build, from learning_mode.csv, (a) a charter flag per school and (b) the share
# of records each school spent in each learning model.
mode = pd.read_csv('learning_mode.csv')

# NOTE(review): the district ID is cast through int before becoming a string,
# the school ID is not -- confirm StateAssignedSchoolID is integer-typed,
# otherwise the key will not match the 'district-school' codes built elsewhere.
mode['mergecode'] = (
    mode['StateAssignedDistrictID'].astype('int').astype('str')
    + '-'
    + mode['StateAssignedSchoolID'].astype('str')
)

# One row per distinct (mergecode, Charter) pair, with Yes/No encoded as 1/0.
charter = (
    mode.loc[:, ['mergecode', 'Charter']]
    .drop_duplicates()
    .rename(columns={'Charter': 'charter'})
)
charter_map = {'No': 0, 'Yes': 1}
charter['charter'] = charter['charter'].map(charter_map)

# Records with LearningModel 'Closed' do not count towards the shares.
mode = mode[mode['LearningModel'] != 'Closed']

# Relative frequency of each learning model per school. The Series itself is
# renamed because the name value_counts() gives its result differs between
# pandas versions; renaming a column of `.to_frame()` by the old name would
# silently do nothing on newer versions and break the pivot below.
mode = (
    mode.groupby(['mergecode', 'NCESSchoolID'])['LearningModel']
    .value_counts(normalize=True)
    .rename('normalized')
    .reset_index()
)

# One row per school, one column per learning model; absent models get 0.
mode = (
    mode.pivot(columns='LearningModel', values='normalized',
               index=['mergecode', 'NCESSchoolID'])
    .reset_index()
    .fillna(0)
    .loc[:, ['mergecode', 'NCESSchoolID', 'Hybrid', 'Virtual', 'In-person']]
    .rename(columns={'Hybrid': 'hybridper', 'Virtual': 'virtualper',
                     'In-person': 'inpersonper'})
)

# Composite score: virtual counts fully, hybrid counts half, in-person not at all.
mode['schoolmode'] = 1 * mode['virtualper'] + 0.5 * mode['hybridper']

share_cols = ['hybridper', 'virtualper', 'inpersonper', 'schoolmode']
mode[share_cols] = mode[share_cols].round(2)

mode['NCESSchoolID'] = mode['NCESSchoolID'].astype('str')

In [17]:
# Combine enrollment, low-income and dropout panels with the learning-mode
# shares and charter flag, then export the final table.
merge_keys = ['year', 'mergecode']
csv = (
    all_data_enroll
    .merge(all_data_frpm, on=merge_keys)
    .merge(all_data_drop, on=merge_keys)
    .merge(mode, on='mergecode')
    .merge(charter, on='mergecode')
    .drop(columns='schoolcode')
)

# Suffix the identifiers with the state name.
csv['state'] = 'arizona'
csv['mergecode'] = csv['mergecode'] + csv['state']
csv['districtcode'] = csv['districtcode'].astype('str') + csv['state']

# Rows of every year other than 21: learning-mode shares are reset to
# "fully in person" and the year_21 indicator is 0.
not_year_21 = csv['year'] != 21
csv.loc[not_year_21, ['hybridper', 'virtualper', 'schoolmode']] = 0
csv.loc[not_year_21, ['inpersonper']] = 1
csv['year_21'] = 1
csv.loc[not_year_21, ['year_21']] = 0

csv.set_index('year').to_csv('arizona_dropout.csv')