In [9]:
import pandas as pd

# Data: https://www.in.gov/doe/it/data-center-and-reports/

In [10]:
# Two-digit school-year suffixes to process. Each one selects the input files
# (enrollment{yy}.csv, test{yy}.csv) and tags the cleaned outputs.
year = [str(yy) for yy in (17, 18, 19, 21)]

In [11]:
def enrollment_clean(x, year):
    """Clean one year's enrollment file and write ``enrollment_clean_{year}.csv``.

    The input CSV must contain the columns 'Corp ID', 'Schl ID', 'Black',
    'Hispanic', 'White', 'Free/Reduced price meals' and 'Grand Total'.
    Missing values are replaced by 0 before any computation.

    The output has one row per input row with the columns
    year, mergecode ('<districtcode>-<schoolcode>'), districtcode,
    white, black, hispanic, lowincome (percentages of total enrollment,
    rounded to 2 decimals) and totalenroll.

    Parameters
    ----------
    x : str or path-like
        Path of the raw enrollment CSV.
    year : str
        Label stored in the 'year' column and used in the output file name.

    Returns
    -------
    None
        The result is written to disk in the current working directory.
    """
    data = pd.read_csv(x).rename(columns={
        'Corp ID': 'districtcode',
        'Schl ID': 'schoolcode',
        'Grand Total': 'totalenroll',
    }).fillna(0)

    # Whole-column assignment is used for dtype changes: `.loc[:, cols] = ...`
    # writes into the existing columns, so on recent pandas a float->int cast
    # is silently kept as float and an int->str cast is a deprecated upcast.
    # NOTE(review): IDs are stringified exactly as parsed; if an ID column was
    # read as float the code would end in '.0' -- confirm against raw files.
    id_cols = ['districtcode', 'schoolcode']
    data[id_cols] = data[id_cols].astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']

    count_cols = ['Black', 'Hispanic', 'White', 'Free/Reduced price meals',
                  'totalenroll']
    data[count_cols] = data[count_cols].astype('int')

    # fillna(0) can leave totalenroll == 0; dividing by it would write inf.
    # Masking the zero makes those percentages NaN (empty CSV cell) instead.
    denominator = data['totalenroll'].where(data['totalenroll'] != 0)
    percent_sources = {
        'white': 'White',
        'black': 'Black',
        'hispanic': 'Hispanic',
        'lowincome': 'Free/Reduced price meals',
    }
    for out_col, count_col in percent_sources.items():
        data[out_col] = ((data[count_col] / denominator) * 100).round(2)

    data['year'] = year
    data.loc[:, ['year', 'mergecode', 'districtcode', 'white', 'black', 'hispanic',
                 'lowincome', 'totalenroll']].set_index('year').to_csv(f'enrollment_clean_{year}.csv')

In [12]:
# Write enrollment_clean_{yy}.csv for every school year listed in `year`.
for yy in year:
    enrollment_clean(f'enrollment{yy}.csv', yy)

In [13]:
# Gather the school identifiers (mergecode) found in each cleaned enrollment
# file, then keep only the ones that appear in all four years.
data21 = set(pd.read_csv('enrollment_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('enrollment_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('enrollment_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('enrollment_clean_17.csv')['mergecode'])
mutual_code = list(data21 & data19 & data18 & data17)

In [14]:
# Stack the cleaned enrollment files, restricted to schools present in every
# year (mutual_code). Frames are collected first and concatenated once rather
# than re-concatenating a growing DataFrame on every iteration.
enroll_frames = []
for i in year:
    data = pd.read_csv(f'enrollment_clean_{i}.csv')
    enroll_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
# An empty `year` list still yields an empty DataFrame, as before.
all_data_enroll = pd.concat(enroll_frames) if enroll_frames else pd.DataFrame()

In [15]:
def test_clean(x, year):
    data = pd.read_csv(x).rename(columns = {'Corp ID':'districtcode', 'School ID':'schoolcode',
                                           'ELA Percent Pass':'elapass', 'Math Percent Pass':'mathpass'})
    data = data.loc[:, ['districtcode', 'schoolcode', 'Math Test N', 'ELA Test N', 'elapass',
                       'mathpass']].dropna()
    data['districtcode'] = data['districtcode'].astype('int').astype('str')
    data['schoolcode'] = data['schoolcode'].astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year
    math = data.loc[:, ['year', 'mergecode', 'Math Test N', 'mathpass']].rename(columns = {'Math Test N':
                                                                                          'totaltest'})
    math = math[((math['totaltest']!= '***') & (math['mathpass']!='***'))]
    ela = data.loc[:, ['year', 'mergecode', 'ELA Test N', 'elapass']].rename(columns = {'ELA Test N':
                                                                                          'totaltest'})
    ela = ela[((ela['totaltest']!= '***') & (ela['elapass']!='***'))]
    math['totaltest'] = math['totaltest'].astype('int')
    ela['totaltest'] = ela['totaltest'].astype('int')
    math['mathpass'] = round((math['mathpass'].astype('float') * 100), 2)
    ela['elapass'] = round((ela['elapass'].astype('float') * 100), 2)
    math.set_index('year').to_csv(f'math_clean_{year}.csv')
    ela.set_index('year').to_csv(f'ela_clean_{year}.csv')

In [16]:
# Write math_clean_{yy}.csv and ela_clean_{yy}.csv for every year in `year`.
for yy in year:
    test_clean(f'test{yy}.csv', yy)

In [17]:
# School identifiers (mergecode) with an ELA result in each year; only the
# ones present in all four years are kept in mutual_code.
data21 = set(pd.read_csv('ela_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('ela_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('ela_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('ela_clean_17.csv')['mergecode'])
mutual_code = list(data21 & data19 & data18 & data17)

In [18]:
# Stack the cleaned ELA files, restricted to schools present in every year
# (mutual_code). Frames are collected first and concatenated once rather than
# re-concatenating a growing DataFrame on every iteration.
ela_frames = []
for i in year:
    data = pd.read_csv(f'ela_clean_{i}.csv')
    ela_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
# An empty `year` list still yields an empty DataFrame, as before.
all_data_ela = pd.concat(ela_frames) if ela_frames else pd.DataFrame()

In [19]:
# School identifiers (mergecode) with a math result in each year; only the
# ones present in all four years are kept in mutual_code.
data21 = set(pd.read_csv('math_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('math_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('math_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('math_clean_17.csv')['mergecode'])
mutual_code = list(data21 & data19 & data18 & data17)

In [20]:
# Stack the cleaned math files, restricted to schools present in every year
# (mutual_code). Frames are collected first and concatenated once rather than
# re-concatenating a growing DataFrame on every iteration.
math_frames = []
for i in year:
    data = pd.read_csv(f'math_clean_{i}.csv')
    math_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
# An empty `year` list still yields an empty DataFrame, as before.
all_data_math = pd.concat(math_frames) if math_frames else pd.DataFrame()

In [21]:
# Keep only the ELA rows whose (year, mergecode) also has a math result.
# The math keys are de-duplicated first so the inner merge acts as a pure
# filter: a repeated key in all_data_math would otherwise multiply ELA rows,
# and would do so again each time this cell is re-run.
math_keys = all_data_math[['year', 'mergecode']].drop_duplicates()
all_data_ela = all_data_ela.merge(math_keys, on = ['year', 'mergecode'])

In [22]:
# Build two per-school lookup tables from learning_mode.csv:
#   charter -- mergecode -> charter flag (0/1)
#   mode    -- mergecode/NCESSchoolID -> share of records in each learning model
mode = pd.read_csv('learning_mode.csv').rename(columns = {
    'StateAssignedSchoolID':'schoolcode', 'StateAssignedDistrictID':'districtcode'
})

# Normalise both IDs to integer-formatted strings. Whole-column assignment is
# used because `.loc[:, cols] = ...` writes into the existing numeric columns,
# which recent pandas treats as a deprecated upcast instead of a dtype change.
mode[['schoolcode', 'districtcode']] = mode[
    ['schoolcode', 'districtcode']].astype('int').astype('str')

mode['mergecode'] = mode['districtcode'] + '-' + mode['schoolcode']

# Records with LearningModel == 'Closed' are excluded from everything below.
mode = mode[mode['LearningModel']!='Closed']

charter = mode.loc[:, ['mergecode', 'Charter']].drop_duplicates().rename(
    columns = {'Charter':'charter'})
charter_map = {'No':0, 'Yes':1}
# Values other than 'No'/'Yes' become NaN here.
charter['charter'] = charter['charter'].map(charter_map)

# Share of each learning model per school. The resulting Series is named
# explicitly: pandas < 2.0 calls it 'LearningModel' while pandas >= 2.0 calls
# it 'proportion', so renaming a column after to_frame() is version-dependent
# and makes the pivot on 'normalized' fail on newer versions.
mode = mode.groupby(['mergecode', 'NCESSchoolID'])['LearningModel'].value_counts(
    normalize = True).rename('normalized').reset_index()

# One row per school, one column per learning model; models a school never
# used get a share of 0.
mode = mode.pivot(columns = 'LearningModel', values = 'normalized',
index = ['mergecode', 'NCESSchoolID']).reset_index().fillna(0).loc[:, ['mergecode', 'NCESSchoolID',
                                                               'Hybrid', 'Virtual', 'In-person']].rename(
columns = {'Hybrid':'hybridper', 'Virtual':'virtualper', 'In-person':'inpersonper'})

# Single index: a virtual record counts 1, a hybrid record 0.5, in-person 0.
mode['schoolmode'] = 1 * mode['virtualper'] + 0.5 * mode['hybridper']

mode[['hybridper', 'virtualper', 'inpersonper', 'schoolmode']] = mode[
    ['hybridper', 'virtualper', 'inpersonper', 'schoolmode']].round(2)

mode['NCESSchoolID'] = mode['NCESSchoolID'].astype('str')


In [23]:
# Assemble the math panel: test results + learning-mode shares + charter flag
# + enrollment demographics, then write indiana_mathpass.csv.
math = all_data_math.merge(mode, on='mergecode').merge(charter, on='mergecode').merge(
    all_data_enroll, on=['year', 'mergecode'])

# Learning-mode shares only apply to year 21; every other year is set to
# fully in-person. `year` is compared as a number here, so this must run
# before the string conversion below.
math.loc[math['year'] != 21, ['virtualper', 'hybridper', 'schoolmode']] = 0
math.loc[math['year'] != 21, ['inpersonper']] = 1
math['year_21'] = 1
math.loc[math['year'] != 21, ['year_21']] = 0

# Whole-column assignment so the columns really become strings;
# `.loc[:, cols] = ...` writes into the existing numeric columns, which recent
# pandas treats as a deprecated upcast, and the concatenation with `state`
# below requires string values.
math[['year', 'districtcode']] = math[['year', 'districtcode']].astype('str')
math['charter'] = math['charter'].astype('int')

# Suffix the identifiers with the state name.
math['state'] = 'indiana'
math['mergecode'] = math['mergecode'] + math['state']
math['districtcode'] = math['districtcode'] + math['state']
math.set_index('year').to_csv('indiana_mathpass.csv')

In [24]:
# Assemble the ELA panel: test results (restricted to school-years that also
# have a math result) + learning-mode shares + charter flag + enrollment
# demographics, then write indiana_elapass.csv.
# The math keys are de-duplicated so the first merge only filters rows and
# cannot multiply them if all_data_math repeats a (year, mergecode) pair.
ela = all_data_ela.merge(all_data_math.loc[:, ['year', 'mergecode']].drop_duplicates(),
    on = ['year', 'mergecode']).merge(mode, on='mergecode').merge(charter, on='mergecode').merge(
    all_data_enroll, on=['year', 'mergecode'])

# Learning-mode shares only apply to year 21; every other year is set to
# fully in-person. `year` is compared as a number here, so this must run
# before the string conversion below.
ela.loc[ela['year'] != 21, ['virtualper', 'hybridper', 'schoolmode']] = 0
ela.loc[ela['year'] != 21, ['inpersonper']] = 1
ela['year_21'] = 1
ela.loc[ela['year'] != 21, ['year_21']] = 0

# Whole-column assignment so the columns really become strings;
# `.loc[:, cols] = ...` writes into the existing numeric columns, which recent
# pandas treats as a deprecated upcast, and the concatenation with `state`
# below requires string values.
ela[['year', 'districtcode']] = ela[['year', 'districtcode']].astype('str')
ela['charter'] = ela['charter'].astype('int')

# Suffix the identifiers with the state name.
ela['state'] = 'indiana'
ela['mergecode'] = ela['mergecode'] + ela['state']
ela['districtcode'] = ela['districtcode'] + ela['state']
ela.set_index('year').to_csv('indiana_elapass.csv')