In [45]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/co/proficiency/'

proficiency_files = [
    'co_proficiency_2015_cleaned.csv'
    , 'co_proficiency_2016_ela_cleaned.csv'
    , 'co_proficiency_2016_math_cleaned.csv'
    , 'co_proficiency_2017_cleaned.csv'
    , 'co_proficiency_2018_cleaned.csv'
    , 'co_proficiency_2019_cleaned.csv'
]

# characters 15-18 of each filename are the four-digit year
# ('co_proficiency_' is exactly 15 characters long)
years = [x[15:19] for x in proficiency_files]

# read each file, tag its rows with the year taken from the filename, and
# stack everything in one pd.concat call. DataFrame.append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration.
yearly_frames = []
for filename, year in zip(proficiency_files, years):
    yearly = pd.read_csv(proficiency_path + filename)
    yearly['year'] = year  # scalar broadcast; no row-wise apply needed
    yearly_frames.append(yearly)

# sort=True keeps the alphabetical column order the appends used to produce
df = pd.concat(yearly_frames, ignore_index=True, sort=True)

In [46]:
# drop All Schools records: keep only rows whose school_id is present and non-zero
df = df[df['school_id'].notna() & df['school_id'].ne(0)]

In [47]:
# rename columns: drop the 'Number ' prefix from the performance-level counts
df = df.rename(
    columns={
        'Number Did Not Yet Meet Expectations': 'Did Not Yet Meet Expectations',
        'Number Partially Met Expectations': 'Partially Met Expectations',
        'Number Approached Expectations': 'Approached Expectations',
        'Number Met Expectations': 'Met Expectations',
        'Number Exceeded Expectations': 'Exceeded Expectations',
    }
)

In [48]:
# reshape scores: one row per (school, grade, subject, performance level)
id_vars = [
    'year', 'district_id', 'district', 'school_id', 'school',
    'grade', 'subject', 'num_tested',
]
value_vars = [
    'Did Not Yet Meet Expectations',
    'Approached Expectations',
    'Partially Met Expectations',
    'Met Expectations',
    'Exceeded Expectations',
]
df = df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='performance_level',
    value_name='num_at_level',
)

In [49]:
# clean grades
# remove the subject prefix so only the grade label is left; regex=False makes
# each pattern a literal on every pandas version
for prefix in ('ELA Grade ', 'Math Grade ', 'English Language Arts Grade ', 'Mathematics Grade '):
    df['grade'] = df['grade'].str.replace(prefix, '', regex=False)
# '*' is a regex metacharacter, so it must be matched literally
df['grade'] = df['grade'].str.replace('*', '', regex=False)
# remove zero padding only ('03' -> '3'); deleting every '0' would turn
# grade '10' into '1' and merge it with a different grade
df['grade'] = df['grade'].str.replace(r'\b0+(?=\d)', '', regex=True)

In [50]:
# clean subjects
def clean_subjects(row):
    """Return the normalized subject label for one row.

    'MATH' and 'Mathematics' become 'Math', 'English Language Arts' becomes
    'ELA'; any other value of row['subject'] is returned unchanged.
    """
    subject = row['subject']
    if subject in ('MATH', 'Mathematics'):
        return 'Math'
    return 'ELA' if subject == 'English Language Arts' else subject
    
# normalize the subject label row by row
df['subject'] = df.apply(clean_subjects, axis=1)

In [51]:
# add group_state
# every row loaded so far is the all-students aggregate; assigning the scalar
# broadcasts it to all rows (and still works on an empty frame) without
# calling a Python lambda once per row
df['group_state'] = 'All Groups'

In [52]:
# import subgroup files
subgroup_files_2015 = [
    'co_proficiency_2015_cleaned_math_migrant.csv'
    , 'co_proficiency_2015_cleaned_math_lep.csv'
    , 'co_proficiency_2015_cleaned_math_iep.csv'
    , 'co_proficiency_2015_cleaned_math_gifted.csv'
    , 'co_proficiency_2015_cleaned_math_frl.csv'
    , 'co_proficiency_2015_cleaned_math_ethnicity.csv'
    , 'co_proficiency_2015_cleaned_ela_migrant.csv'
    , 'co_proficiency_2015_cleaned_ela_lep.csv'
    , 'co_proficiency_2015_cleaned_ela_iep.csv'
    , 'co_proficiency_2015_cleaned_ela_gifted.csv'
    , 'co_proficiency_2015_cleaned_ela_frl.csv'
    , 'co_proficiency_2015_cleaned_ela_ethnicity.csv'
]

# character 28 of each filename is the first letter of the subject token
# ('m' for math, 'e' for ela). The codes must come from the 2015 list itself:
# `subgroup_files` is not defined until later in this cell and lists its
# subjects in a different order.
subjects = [x[28:29] for x in subgroup_files_2015]

# read each 2015 file, tag it with its subject code, and stack once
# (DataFrame.append was removed in pandas 2.0)
frames_2015 = []
for filename, subject in zip(subgroup_files_2015, subjects):
    frame_2015 = pd.read_csv(proficiency_path + filename)
    frame_2015['subject'] = subject
    frames_2015.append(frame_2015)

df_sg_2015 = pd.concat(frames_2015, ignore_index=True, sort=True)
df_sg_2015['year'] = '2015'

# subgroup result files for 2016-2019, one per (year, subject, subgroup type)
# NOTE(review): no 2017 ELA 'lep' file is listed although every other
# year/subject pair here has one - confirm whether that is intentional.
subgroup_files = [
    'co_proficiency_2016_cleaned_ela_ethnicity.csv'
    , 'co_proficiency_2016_cleaned_ela_frl.csv'
    , 'co_proficiency_2016_cleaned_ela_gifted.csv'
    , 'co_proficiency_2016_cleaned_ela_iep.csv'
    , 'co_proficiency_2016_cleaned_ela_lep.csv'
    , 'co_proficiency_2016_cleaned_ela_migrant.csv'
    , 'co_proficiency_2016_cleaned_math_ethnicity.csv'
    , 'co_proficiency_2016_cleaned_math_frl.csv'
    , 'co_proficiency_2016_cleaned_math_gifted.csv'
    , 'co_proficiency_2016_cleaned_math_iep.csv'
    , 'co_proficiency_2016_cleaned_math_lep.csv'
    , 'co_proficiency_2016_cleaned_math_migrant.csv'
    , 'co_proficiency_2017_cleaned_ela_ethnicity.csv'
    , 'co_proficiency_2017_cleaned_ela_frl.csv'
    , 'co_proficiency_2017_cleaned_ela_gifted.csv'
    , 'co_proficiency_2017_cleaned_ela_iep.csv'
    , 'co_proficiency_2017_cleaned_ela_migrant.csv'
    , 'co_proficiency_2017_cleaned_math_ethnicity.csv'
    , 'co_proficiency_2017_cleaned_math_frl.csv'
    , 'co_proficiency_2017_cleaned_math_gifted.csv'
    , 'co_proficiency_2017_cleaned_math_iep.csv'
    , 'co_proficiency_2017_cleaned_math_lep.csv'
    , 'co_proficiency_2017_cleaned_math_migrant.csv'
    , 'co_proficiency_2018_cleaned_ela_ethnicity.csv'
    , 'co_proficiency_2018_cleaned_ela_frl.csv'
    , 'co_proficiency_2018_cleaned_ela_gifted.csv'
    , 'co_proficiency_2018_cleaned_ela_iep.csv'
    , 'co_proficiency_2018_cleaned_ela_lep.csv'
    , 'co_proficiency_2018_cleaned_ela_migrant.csv'
    , 'co_proficiency_2018_cleaned_math_ethnicity.csv'
    , 'co_proficiency_2018_cleaned_math_frl.csv'
    , 'co_proficiency_2018_cleaned_math_gifted.csv'
    , 'co_proficiency_2018_cleaned_math_iep.csv'
    , 'co_proficiency_2018_cleaned_math_lep.csv'
    , 'co_proficiency_2018_cleaned_math_migrant.csv'
    , 'co_proficiency_2019_cleaned_ela_ethnicity.csv'
    , 'co_proficiency_2019_cleaned_ela_frl.csv'
    , 'co_proficiency_2019_cleaned_ela_gifted.csv'
    , 'co_proficiency_2019_cleaned_ela_iep.csv'
    , 'co_proficiency_2019_cleaned_ela_lep.csv'
    , 'co_proficiency_2019_cleaned_ela_migrant.csv'
    , 'co_proficiency_2019_cleaned_math_ethnicity.csv'
    , 'co_proficiency_2019_cleaned_math_frl.csv'
    , 'co_proficiency_2019_cleaned_math_gifted.csv'
    , 'co_proficiency_2019_cleaned_math_iep.csv'
    , 'co_proficiency_2019_cleaned_math_lep.csv'
    , 'co_proficiency_2019_cleaned_math_migrant.csv'
]

# fixed-width filename layout: characters 15-18 are the year, character 28 is
# the first letter of the subject token ('e' for ela, 'm' for math)
years = [x[15:19] for x in subgroup_files]
subjects = [x[28:29] for x in subgroup_files]

# read each file, tag its rows with year / subject code / source filename, and
# stack once. Every file - including the first - now carries `filename`, and
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
subgroup_frames = []
for filename, year, subject in zip(subgroup_files, years, subjects):
    subgroup_frame = pd.read_csv(proficiency_path + filename)
    subgroup_frame['year'] = year
    subgroup_frame['subject'] = subject
    subgroup_frame['filename'] = filename
    subgroup_frames.append(subgroup_frame)

df_sg = pd.concat(subgroup_frames, ignore_index=True, sort=True)

In [53]:
# process 2015 data
# discard rows whose school_id is 0
df_sg_2015 = df_sg_2015.loc[df_sg_2015['school_id'].ne(0)]
def create_num(row):
    """Turn a percent met-or-exceeded figure into a student count.

    Computes round(pct * num_tested / 100) from row['% MET OR EXCEEDED
    EXPECTATIONS'] and row['num_tested'] and returns it as an int. Returns ''
    when either value is missing or cannot be converted to a finite number.
    """
    try:
        pct = float(row['% MET OR EXCEEDED EXPECTATIONS'])
        tested = float(row['num_tested'])
        return int(round(pct * tested / 100))
    except (KeyError, TypeError, ValueError, OverflowError):
        # only conversion failures are swallowed (missing key, None, text,
        # NaN, inf); a bare except would also hide KeyboardInterrupt
        return ''
# derive the met-or-exceeded count for every 2015 row
df_sg_2015['Met or Exceeded Expectations'] = df_sg_2015.apply(create_num, axis=1)

In [54]:
# rename columns: map the subgroup files' headers onto the names used for the main frame
df_sg = df_sg.rename(
    columns={
        # identifiers
        'District Number': 'district_id',
        'District Name': 'district',
        'School Number': 'school_id',
        'School Name': 'school',
        'Test': 'grade',
        '# of Valid Scores': 'num_tested',
        # performance-level counts
        '# Did Not Yet Meet Expectations': 'Did Not Yet Meet Expectations',
        '# Partially Met Expectations': 'Partially Met Expectations',
        '# Approached Expectations': 'Approached Expectations',
        '# Met Expectations': 'Met Expectations',
        '# Exceeded Expectations': 'Exceeded Expectations',
    }
)

In [55]:
# append 2015 to other results
# pd.concat is the supported replacement for DataFrame.append (removed in
# pandas 2.0); sort=True keeps the columns in alphabetical order as before
df_sg = pd.concat([df_sg, df_sg_2015], ignore_index=True, sort=True)

In [56]:
# snapshot of the combined subgroup frame, written to the working directory
# with the row index included; no cell shown here reads it back
df_sg.to_csv(path_or_buf='temp.csv', index=True)

In [57]:
# drop records that aren't at the school level (school_id missing or 0)
df_sg = df_sg[df_sg['school_id'].ne(0) & df_sg['school_id'].notna()]

In [58]:
# clean subjects
def clean_subjects(row):
    """Expand the one-letter subject code of a subgroup row.

    'm' becomes 'Math' and 'e' becomes 'ELA'; any other value of
    row['subject'] is returned unchanged.
    """
    code = row['subject']
    if code == 'm':
        return 'Math'
    return 'ELA' if code == 'e' else code

# expand the subject code row by row
df_sg['subject'] = df_sg.apply(clean_subjects, axis=1)

In [59]:
# clean grades
# remove the subject prefix so only the grade label is left; regex=False makes
# each pattern a literal on every pandas version
for prefix in ('ELA Grade ', 'Math Grade ', 'Mathematics Grade '):
    df_sg['grade'] = df_sg['grade'].str.replace(prefix, '', regex=False)
# remove zero padding only ('03' -> '3'); deleting every '0' would turn
# grade '10' into '1' and merge it with a different grade
df_sg['grade'] = df_sg['grade'].str.replace(r'\b0+(?=\d)', '', regex=True)
df_sg['grade'] = df_sg['grade'].str.strip()

In [60]:
# reshape scores: one row per (school, grade, subgroup, subject, performance level)
id_vars = [
    'year', 'district_id', 'district', 'school_id', 'school',
    'grade', 'group_state', 'subject', 'num_tested',
]
value_vars = [
    'Did Not Yet Meet Expectations',
    'Approached Expectations',
    'Partially Met Expectations',
    'Met Expectations',
    'Exceeded Expectations',
    'Met or Exceeded Expectations',
]
df_sg = df_sg.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='performance_level',
    value_name='num_at_level',
)

In [61]:
# append subgroup scores to main
# pd.concat is the supported replacement for DataFrame.append (removed in
# pandas 2.0); sort=True keeps the columns in alphabetical order as before
df = pd.concat([df, df_sg], ignore_index=True, sort=True)

In [62]:
# add proficient_tf: 1 when the performance level is one of the proficient levels, else 0
proficient_levels = ('Met Expectations', 'Exceeded Expectations', 'Met or Exceeded Expectations')
df['proficient_tf'] = [int(level in proficient_levels) for level in df['performance_level']]

In [63]:
# remove rows with no scores (num_tested is missing)
df = df.loc[df['num_tested'].notna()]

In [64]:
# create pct_at_level
def create_pct(row):
    """Return the share of tested students at this row's performance level.

    Divides row['num_at_level'] by row['num_tested'] after converting both to
    float. Returns '*' when either value is missing or non-numeric, or when
    num_tested is zero.
    """
    try:
        return float(row['num_at_level']) / float(row['num_tested'])
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # only conversion/division failures are swallowed; a bare except
        # would also hide KeyboardInterrupt
        return '*'
# compute the share at each level row by row
df['pct_at_level'] = df.apply(create_pct, axis=1)

In [65]:
# export final dataset (row index omitted)
df.to_csv(
    path_or_buf='./data/finalized/co_proficiency.csv',
    index=False,
)