In [1]:
import pandas as pd

# Data: https://gosa.georgia.gov/dashboards-data-report-card/downloadable-data

In [2]:
# Two-digit school-year labels to process. Each label selects the matching
# drop{yy}.csv / enrollment{yy}.csv input and names the cleaned output files.
year = [str(yy) for yy in (17, 18, 19, 21)]

In [3]:
def code_change(x):
    """Return ``x`` as a string, left-padded with zeros to at least 4 characters.

    Values whose string form already has 4 or more characters are returned
    unchanged (as a string); nothing is ever truncated.
    """
    return str(x).rjust(4, '0')

def dropout_clean(x, year):
    """Clean one year's raw dropout file and write ``dropout_clean_{year}.csv``.

    Parameters
    ----------
    x : str or path-like
        Location of the raw dropout CSV; passed straight to ``pd.read_csv``.
    year : str
        Label stored in the ``year`` column and used in the output file name.

    Returns
    -------
    None
        The cleaned rows are written to ``dropout_clean_{year}.csv`` with the
        columns ``year`` (index), ``mergecode``, ``droprate`` and
        ``totalenroll``.
    """
    column_names = {
        'LONG_SCHOOL_YEAR': 'year', 'DETAIL_LVL_DESC': 'level',
        'SCHOOL_DSTRCT_CD': 'districtcode', 'SCHOOL_DSTRCT_NM': 'districtname',
        'INSTN_NUMBER': 'schoolcode', 'INSTN_NAME': 'schoolname',
        'LABEL_LVL_1_DESC': 'droplabel', 'PROGRAM_PERCENT': 'droprate',
        'PROGRAM_TOTAL': 'dropnum',
    }
    raw = pd.read_csv(x).rename(columns=column_names)

    # Keep only school-level rows of the all-students 9-12 dropout measure.
    # The explicit copy makes the column assignments below act on an
    # independent frame rather than on a filtered slice of ``raw`` (which is
    # what triggers SettingWithCopyWarning).
    wanted = (raw['level'] == 'School') & (raw['droplabel'] == '9-12 Drop Outs -ALL Students')
    data = raw[wanted].copy()

    # District code + zero-padded school code form the key used to match
    # schools across files.
    data['schoolcode'] = data['schoolcode'].apply(code_change)
    data['districtcode'] = data['districtcode'].apply(str)
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']

    # Discard rows whose rate or count is the 'TFS' placeholder, and rows with
    # a missing count, so both columns can be cast to numbers.
    data = data[data['droprate'] != 'TFS']
    data = data[(data['dropnum'] != 'TFS') & data['dropnum'].notna()].copy()
    data['dropnum'] = data['dropnum'].astype('int')
    data['droprate'] = data['droprate'].astype('float')

    # Back out enrollment from count / (rate as a fraction); the rate is
    # treated as a percentage.
    # NOTE(review): a rate of 0 or NaN produces inf/NaN here, which a later
    # integer cast of totalenroll would reject - confirm the raw data never
    # contains such rows.
    data['totalenroll'] = (data['dropnum'] / (data['droprate'] / 100)).round()
    data['droprate'] = data['droprate'].fillna(0).round(2)
    data['year'] = year

    data.loc[:, ['year', 'mergecode', 'droprate', 'totalenroll']].set_index(
        'year').to_csv(f'dropout_clean_{year}.csv')
    

In [4]:
# Produce one dropout_clean_{yy}.csv file per configured school year.
for yy in year:
    dropout_clean(f'drop{yy}.csv', yy)

In [5]:
# Merge codes of the schools that survive cleaning in every year listed in
# `year`. The per-year sets are derived from `year` itself so this cell cannot
# drift out of sync with the list of years being processed.
codes_by_year = [
    set(pd.read_csv(f'dropout_clean_{yy}.csv')['mergecode']) for yy in year
]

# Only membership is used downstream (via .isin), so list order is irrelevant.
mutual_code = list(set.intersection(*codes_by_year))

In [6]:
# Stack the cleaned dropout files, keeping only schools present in every year.
# The yearly frames are collected and concatenated once: growing a frame from
# an empty pd.DataFrame() re-copies all rows on each pass and relies on
# concat's deprecated handling of empty entries.
drop_frames = []
for i in year:
    data = pd.read_csv(f'dropout_clean_{i}.csv')
    drop_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
all_data_drop = pd.concat(drop_frames)

In [7]:
def enroll_clean(x, year):
    """Clean one year's raw enrollment file and write ``enrollment_clean_{year}.csv``.

    Only school-level rows whose merge code appears in the module-level
    ``mutual_code`` list are kept, so ``mutual_code`` must be defined before
    this function is called.

    Parameters
    ----------
    x : str or path-like
        Location of the raw enrollment CSV; passed straight to ``pd.read_csv``.
    year : str
        Label stored in the ``year`` column and used in the output file name.

    Returns
    -------
    None
        The cleaned rows are written to ``enrollment_clean_{year}.csv`` with
        the columns ``year`` (index), ``mergecode``, ``districtcode``,
        ``black``, ``hispanic``, ``white`` and ``lowincome``; missing values
        are written as 0.
    """
    column_names = {
        'LONG_SCHOOL_YEAR': 'year', 'DETAIL_LVL_DESC': 'level',
        'SCHOOL_DSTRCT_CD': 'districtcode', 'SCHOOL_DSTRCT_NM': 'districtname',
        'INSTN_NUMBER': 'schoolcode', 'INSTN_NAME': 'schoolname',
        'ENROLL_PERCENT_BLACK': 'black', 'ENROLL_PERCENT_HISPANIC': 'hispanic',
        'ENROLL_PERCENT_WHITE': 'white', 'ENROLL_PERCENT_ED': 'lowincome',
    }
    needed = ['year', 'level', 'districtcode', 'schoolcode', 'districtname', 'schoolname',
              'black', 'hispanic', 'white', 'lowincome']
    raw = pd.read_csv(x).rename(columns=column_names).loc[:, needed]

    # Copy after filtering so the assignments below act on an independent
    # frame rather than on a slice of ``raw`` (avoids SettingWithCopyWarning).
    data = raw[raw['level'] == 'School'].copy()

    # District code + zero-padded school code form the key used to match
    # schools across files.
    data['schoolcode'] = data['schoolcode'].apply(code_change)
    data['districtcode'] = data['districtcode'].apply(str)
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']

    # Restrict to the schools in mutual_code and to the output columns.
    output_cols = ['year', 'mergecode', 'districtcode', 'black', 'hispanic', 'white', 'lowincome']
    data = data.loc[data['mergecode'].isin(mutual_code), output_cols].copy()
    data['year'] = year

    data.set_index('year').fillna(0).to_csv(f'enrollment_clean_{year}.csv')
    

In [8]:
# Produce one enrollment_clean_{yy}.csv file per configured school year.
for yy in year:
    enroll_clean(f'enrollment{yy}.csv', yy)

In [9]:
# Stack the cleaned enrollment files, keeping only schools in mutual_code.
# The yearly frames are collected and concatenated once: growing a frame from
# an empty pd.DataFrame() re-copies all rows on each pass and relies on
# concat's deprecated handling of empty entries.
enroll_frames = []
for i in year:
    data = pd.read_csv(f'enrollment_clean_{i}.csv')
    enroll_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
all_data_enroll = pd.concat(enroll_frames)

In [10]:
# Load the learning-mode file and ignore rows where the school was closed.
mode = pd.read_csv('learning_mode.csv')
mode = mode[mode['LearningModel'] != 'Closed']

# Charter flag per school: distinct (school id, Charter) pairs, 'Yes'/'No' -> 1/0.
# Any other Charter value maps to NaN.
charter = mode.loc[:, ['StateAssignedSchoolID', 'Charter']].drop_duplicates().rename(
    columns={'StateAssignedSchoolID': 'mergecode', 'Charter': 'charter'})
charter_map = {'No': 0, 'Yes': 1}
charter['charter'] = charter['charter'].map(charter_map)

# Share of each school's (non-closed) rows that fall under each learning model.
# The Series is renamed *before* reset_index: value_counts names its result
# 'LearningModel' on pandas < 2.0 but 'proportion' on pandas >= 2.0, so
# renaming a 'LearningModel' column afterwards is a silent no-op on newer
# versions and the pivot below would fail with a KeyError on 'normalized'.
mode = (
    mode.groupby(['StateAssignedSchoolID', 'NCESSchoolID'])['LearningModel']
    .value_counts(normalize=True)
    .rename('normalized')
    .reset_index()
    .rename(columns={'StateAssignedSchoolID': 'mergecode'})
)

# One row per school, one column per learning model; a model a school never
# used gets share 0. Selecting with .loc keeps a misspelled or absent model
# label a loud KeyError instead of a silent column of zeros.
mode = (
    mode.pivot(index=['mergecode', 'NCESSchoolID'], columns='LearningModel', values='normalized')
    .reset_index()
    .fillna(0)
    .loc[:, ['mergecode', 'NCESSchoolID', 'Hybrid', 'Virtual', 'In-person']]
    .rename(columns={'Hybrid': 'hybridper', 'Virtual': 'virtualper', 'In-person': 'inpersonper'})
)

# Single remote-learning intensity score: virtual counts fully, hybrid half.
mode['schoolmode'] = 1 * mode['virtualper'] + 0.5 * mode['hybridper']

share_cols = ['hybridper', 'virtualper', 'inpersonper', 'schoolmode']
mode.loc[:, share_cols] = mode.loc[:, share_cols].round(2)

mode['NCESSchoolID'] = mode['NCESSchoolID'].astype('str')

In [11]:
# Join enrollment shares with dropout outcomes per (year, school), then attach
# the school-level charter flag and learning-mode shares. All merges use the
# default inner join, so schools missing from any input are dropped.
csv = (
    all_data_enroll
    .merge(all_data_drop, on=['year', 'mergecode'])
    .merge(charter, on='mergecode')
    .merge(mode, on='mergecode')
)

# Suffix the school and district keys with the state name.
csv['state'] = 'georgia'
csv['mergecode'] = csv['mergecode'] + csv['state']
csv['districtcode'] = csv['districtcode'].astype('str') + csv['state']

# Rows from years other than 21 get the learning-mode shares reset to
# "fully in person", and year_21 flags the rows that belong to year 21.
not_year_21 = csv['year'] != 21
csv.loc[not_year_21, ['hybridper', 'virtualper', 'schoolmode']] = 0
csv.loc[not_year_21, ['inpersonper']] = 1
csv['year_21'] = 1
csv.loc[not_year_21, ['year_21']] = 0

csv['totalenroll'] = csv['totalenroll'].astype('int')
csv.set_index('year').to_csv('georgia_dropout.csv')