In [1]:
import pandas as pd

# Data: https://www.cde.state.co.us/cdereval

In [2]:
# Two-digit year suffixes of the yearly extracts. Each value is interpolated
# into the input file names (drop{yy}.csv, frpm{yy}.csv) and into the names of
# the cleaned per-year outputs, and is stored in their 'year' column.
year = ['17', '18', '19', '21']

In [3]:
def dropout_clean(x, year):
    """Clean one yearly dropout extract and write ``dropout_clean_{year}.csv``.

    Parameters
    ----------
    x : str or path-like
        CSV file to read. It must contain the source headers listed in
        ``column_names`` below.
    year : str
        Label stored in the ``year`` column and used in the output file name.

    Returns
    -------
    None
        The cleaned table is written to ``dropout_clean_{year}.csv`` in the
        current working directory, indexed by ``year``.

    Notes
    -----
    Rows are kept only when both codes are present, enrollment is non-zero,
    the school code is neither ``'DDRP'`` nor 0, and the school name does not
    contain ``DISTRICT`` or ``STATE``. ``droprate`` and the three race columns
    are written as percentages rounded to two decimals; the race columns are
    shares of ``totalenroll``.
    """
    # Source header -> short name. The 'Aftican' spelling is the key the
    # lookup relies on; presumably it mirrors the header in the source file,
    # so verify against the data before "correcting" it.
    column_names = {
        'Org. Code': 'districtcode',
        'Organization Name': 'districtname',
        'School Code': 'schoolcode',
        'School Name / Category (include or exclude alternative schools)': 'schoolname',
        'Total Pupil Count': 'totalenroll',
        'Total Dropout Rate': 'droprate',
        'Black Or Aftican American Pupil Count': 'black',
        'Hispanic or Latino Pupil Count': 'hispanic',
        'White Pupil Count': 'white',
    }
    data = pd.read_csv(x).rename(columns=column_names).loc[:, list(column_names.values())]

    keep = (
        data['districtcode'].notna()
        & data['schoolcode'].notna()
        & (data['totalenroll'] != 0)
        & (data['schoolcode'] != 'DDRP')
    )
    # Explicit copy: the columns below are reassigned, and writing into a
    # filtered slice triggers SettingWithCopyWarning.
    data = data[keep].copy()

    # Replace each column outright. Assigning through ``.loc[:, cols]`` sets
    # values in place on recent pandas and does not reliably change the dtype.
    for col in ('districtcode', 'schoolcode'):
        data[col] = data[col].astype('int').astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']

    data = data[~data['schoolname'].str.contains('DISTRICT|STATE')].copy()
    data['year'] = year
    data['droprate'] = (data['droprate'].astype('float') * 100).round(2)

    # Pupil counts -> percentage of total enrollment.
    for col in ('black', 'hispanic', 'white'):
        data[col] = ((data[col].astype('int') / data['totalenroll']) * 100).round(2)

    data = data[data['schoolcode'] != '0'].copy()
    data['totalenroll'] = data['totalenroll'].astype('int')

    output_columns = ['mergecode', 'schoolcode', 'districtcode', 'year', 'droprate',
                      'black', 'white', 'hispanic', 'totalenroll']
    data.loc[:, output_columns].set_index('year').to_csv(f'dropout_clean_{year}.csv')

In [4]:
# Produce dropout_clean_{yy}.csv for every configured year suffix.
for yr in year:
    dropout_clean(f'drop{yr}.csv', yr)

In [5]:
# Merge codes found in each cleaned dropout file, one set per year.
data21 = set(pd.read_csv('dropout_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('dropout_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('dropout_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('dropout_clean_17.csv')['mergecode'])

# Codes that appear in all four years.
mutual_code = list(data21 & data19 & data18 & data17)

In [6]:
# Stack the cleaned yearly dropout files, restricted to the codes in
# mutual_code. Frames are collected first and concatenated once, rather than
# re-copying a growing frame that starts from an empty DataFrame.
dropout_frames = []
for i in year:
    data = pd.read_csv(f'dropout_clean_{i}.csv')
    dropout_frames.append(data.loc[data['mergecode'].isin(mutual_code)])
# pd.concat rejects an empty list; fall back to an empty frame in that case.
all_data_drop = pd.concat(dropout_frames) if dropout_frames else pd.DataFrame()

In [7]:
# Build a mergecode x Grade table from grade_check.csv and use it to drop,
# from all_data_drop, every school that has no row for grades 9-12.
grade_check = (
    pd.read_csv('grade_check.csv')
    .loc[:, ['Org. Code', 'School Code', 'Grade',
             'School Name / Category (include or exclude alternative schools)']]
    .dropna()
    .rename(columns={
        'Org. Code': 'districtcode',
        'School Code': 'schoolcode',
        'School Name / Category (include or exclude alternative schools)': 'schoolname',
    })
)

# Replace each code column outright: assigning through ``.loc[:, cols]`` sets
# values in place on recent pandas and does not reliably change the dtype.
for code_col in ('districtcode', 'schoolcode'):
    grade_check[code_col] = grade_check[code_col].astype('int').astype('str')

# Copy after filtering so the column added below is not written into a slice.
grade_check = grade_check[~grade_check['schoolname'].str.contains('DISTRICT|STATE')].copy()

# NOTE(review): this cell used to evaluate `grade_check[grade_check['Grade'] != 'ALL']`
# here without assigning the result, so 'ALL' rows were never removed. That dead
# statement is dropped and the rows are still kept, which leaves results unchanged:
# they only add an unused 'ALL' column to the pivot, whereas filtering them out
# would stop schools that have nothing but an 'ALL' row from being listed in `middle`.
grade_check['mergecode'] = grade_check['districtcode'] + '-' + grade_check['schoolcode']
grade_check = grade_check.loc[:, ['mergecode', 'Grade', 'schoolname']].pivot(
    columns='Grade', index='mergecode', values='schoolname')

# Grade labels are used exactly as they appear in the file (note the leading
# space in ' 9').
no_high_grades = grade_check[[' 9', '10', '11', '12']].isna().all(axis=1)
middle = grade_check[no_high_grades].reset_index().mergecode.to_list()
all_data_drop = all_data_drop[~all_data_drop.mergecode.isin(middle)]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
def frpm_clean(x, year):
    """Clean one yearly free/reduced-price-meal extract and write ``frpm_clean_{year}.csv``.

    Parameters
    ----------
    x : str or path-like
        CSV file to read. It must contain the source headers listed in
        ``column_names`` below.
    year : str
        Label stored in the ``year`` column and used in the output file name.

    Returns
    -------
    None
        Writes ``year``, ``mergecode`` and ``lowincome`` to
        ``frpm_clean_{year}.csv`` in the current working directory.

    Notes
    -----
    Rows with any missing value in the selected columns, or with ``'*'`` in
    the rate column, are dropped. ``lowincome`` is written as a percentage
    rounded to two decimals.
    """
    # Source header -> short name. The trailing tab in 'DISTRICT NAME\t' is
    # part of the key being matched; keep it.
    column_names = {
        'DISTRICT CODE': 'districtcode',
        'DISTRICT NAME\t': 'districtname',
        'SCHOOL CODE': 'schoolcode',
        'SCHOOL NAME': 'schoolname',
        '% FREE AND REDUCED': 'lowincome',
    }
    data = (
        pd.read_csv(x)
        .rename(columns=column_names)
        .loc[:, list(column_names.values())]
        .dropna()
    )
    # Explicit copy: the columns below are reassigned, and writing into a
    # filtered slice triggers SettingWithCopyWarning.
    data = data[data['lowincome'] != '*'].copy()

    # Replace each column outright. Assigning through ``.loc[:, cols]`` sets
    # values in place on recent pandas and does not reliably change the dtype.
    for col in ('districtcode', 'schoolcode'):
        data[col] = data[col].astype('int').astype('str')
    data['mergecode'] = data['districtcode'] + '-' + data['schoolcode']
    data['year'] = year
    data['lowincome'] = (data['lowincome'].astype('float') * 100).round(2)
    data.loc[:, ['year', 'mergecode', 'lowincome']].set_index('year').to_csv(f'frpm_clean_{year}.csv')

In [9]:
# Produce frpm_clean_{yy}.csv for every configured year suffix.
for yr in year:
    frpm_clean(f'frpm{yr}.csv', yr)

In [10]:
# Merge codes found in each cleaned FRPM file, one set per year.
data21 = set(pd.read_csv('frpm_clean_21.csv')['mergecode'])
data19 = set(pd.read_csv('frpm_clean_19.csv')['mergecode'])
data18 = set(pd.read_csv('frpm_clean_18.csv')['mergecode'])
data17 = set(pd.read_csv('frpm_clean_17.csv')['mergecode'])

# Codes that appear in all four years.
mutual_code_frpm = list(data21 & data19 & data18 & data17)

In [11]:
# Stack the cleaned yearly FRPM files, keeping only codes present in both
# mutual_code_frpm and mutual_code. Frames are collected first and concatenated
# once, rather than re-copying a growing frame that starts from an empty DataFrame.
frpm_frames = []
for i in year:
    data = pd.read_csv(f'frpm_clean_{i}.csv')
    in_both_panels = data['mergecode'].isin(mutual_code_frpm) & data['mergecode'].isin(mutual_code)
    frpm_frames.append(data.loc[in_both_panels])
# pd.concat rejects an empty list; fall back to an empty frame in that case.
all_data_frpm = pd.concat(frpm_frames) if frpm_frames else pd.DataFrame()

In [12]:
# Join dropout and FRPM rows on (year, mergecode); only pairs present in both
# tables remain (default inner merge). Enrollment is stored as an integer.
all_data = (
    all_data_drop
    .merge(all_data_frpm, on=['year', 'mergecode'])
    .astype({'totalenroll': 'int'})
)

def code_change(x):
    """Return ``str(x)`` left-padded with zeros to at least four characters.

    Values whose string form already has four or more characters are returned
    unchanged (as a string).
    """
    return str(x).rjust(4, '0')
    
# Zero-pad both codes, then rebuild the merge key from the padded values.
for code_col in ['schoolcode', 'districtcode']:
    all_data[code_col] = all_data[code_col].apply(code_change)
all_data['mergecode'] = all_data['districtcode'] + '-' + all_data['schoolcode']

# The school code is now carried inside mergecode, so the column is dropped.
all_data = all_data.drop(columns='schoolcode')

In [13]:
# Learning-mode records; rows whose model is 'Closed' are excluded from the shares.
mode = pd.read_csv('learning_mode.csv')
mode = mode[mode['LearningModel'] != 'Closed']

# One row per (school id, charter value), with the charter flag coded 0/1.
charter = mode.loc[:, ['StateAssignedSchoolID', 'Charter']].drop_duplicates().rename(
    columns={'StateAssignedSchoolID': 'mergecode', 'Charter': 'charter'})
charter_map = {'No': 0, 'Yes': 1}
charter['charter'] = charter['charter'].map(charter_map)

# Share of each learning model among a school's records.
# The Series is named explicitly: value_counts(normalize=True) names its result
# after the column on pandas < 2.0 but 'proportion' on pandas >= 2.0, so
# renaming a 'LearningModel' column afterwards only works on old versions.
mode = (
    mode.groupby(['StateAssignedSchoolID', 'NCESSchoolID'])['LearningModel']
    .value_counts(normalize=True)
    .rename('normalized')
    .reset_index()
    .rename(columns={'StateAssignedSchoolID': 'mergecode'})
)

# One column per learning model; a model a school never used gets share 0.
mode = (
    mode.pivot(columns='LearningModel', values='normalized',
               index=['mergecode', 'NCESSchoolID'])
    .reset_index()
    .fillna(0)
    .loc[:, ['mergecode', 'NCESSchoolID', 'Hybrid', 'Virtual', 'In-person']]
    .rename(columns={'Hybrid': 'hybridper', 'Virtual': 'virtualper', 'In-person': 'inpersonper'})
)

# Composite score: virtual counts fully, hybrid counts half, in-person counts zero.
mode['schoolmode'] = 1 * mode['virtualper'] + 0.5 * mode['hybridper']

share_columns = ['hybridper', 'virtualper', 'inpersonper', 'schoolmode']
mode[share_columns] = mode[share_columns].round(2)

mode['NCESSchoolID'] = mode['NCESSchoolID'].astype('str')

In [14]:
# Attach the charter flag and learning-mode shares to the panel (inner merges
# on mergecode), tag the ids with the state name, and write the final file.
csv = (
    all_data
    .merge(charter, on='mergecode')
    .merge(mode, on='mergecode')
)
csv['state'] = 'colorado'
csv['mergecode'] = csv['mergecode'] + csv['state']
csv['districtcode'] = csv['districtcode'].astype('str') + csv['state']

# Rows from any year other than 21 are set to fully in-person.
not_year_21 = csv['year'] != 21
csv.loc[not_year_21, ['hybridper', 'virtualper', 'schoolmode']] = 0
csv.loc[not_year_21, ['inpersonper']] = 1

# Indicator column: 1 for year 21, 0 otherwise.
csv['year_21'] = 1
csv.loc[not_year_21, ['year_21']] = 0

csv.set_index('year').to_csv('colorado_dropout.csv')