In [80]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/ks/proficiency/'

proficiency_files = [
    'ks_proficiency_2015_cleaned.csv'
    , 'ks_proficiency_2016_cleaned.csv'
    , 'ks_proficiency_2017_cleaned.csv'
    , 'ks_proficiency_2018_cleaned.csv'
    , 'ks_proficiency_2019_cleaned.csv'
]

participation_files = [
    'ks_participation_2016_cleaned.csv'
    , 'ks_participation_2017_cleaned.csv'
    , 'ks_participation_2018_cleaned.csv'
    , 'ks_participation_2019_cleaned.csv'
]

participation_years = [
    2016
    , 2017
    , 2018
    , 2019
]

df = pd.read_csv(proficiency_path + proficiency_files[0])

for i in range(1, len(proficiency_files)):
    df2 = pd.read_csv(proficiency_path + proficiency_files[i])
    df = df.append(df2, ignore_index = True)

df = df.rename(columns={
    "Group": "group_state"
    , "Subject": "subject"
    , "Org No": "district_id"
    , "School Year": "year"
    , "Pct Level One": "Level One"
    , "Pct Level Two": "Level Two"
    , "Pct Level Three": "Level Three"
    , "Pct Level Four": "Level Four"
    , "Pct Not Valid": "Not Valid"})

# import participation files
df_participation = pd.read_csv(proficiency_path + participation_files[0])
df_participation['year'] = df_participation.apply(lambda x: int(participation_years[0]), axis=1)
for i in range(1, len(participation_files)):
    df2 = pd.read_csv(proficiency_path + participation_files[i])
    df2['year'] = df2.apply(lambda x: int(participation_years[i]), axis=1)
    df_participation = df_participation.append(df2, ignore_index = True)

# import district mapping file
df_districts = pd.read_csv('./data/ks/unique_districts.csv')

In [81]:
# parse school_id and school names
parse = df['School/District Name'].str.split(pat=' - ', n=1, expand=True)
df['school_id'] = parse[0]
df['school'] = parse[1]

# drop district level results
df = df[df['school_id'].str.len() == 4]

# make grades string and change 13 to All Grades
df['grade'] = [str(x) if x != 13 else 'All Grades' for x in df['Grade']]

# join district names
df = pd.merge(df, df_districts, on=['year', 'district_id'])

In [82]:
# create list of unique year, school_id, school for enrollment data
df_schools = df[['year', 'school_id', 'school']]
df_schools = df_schools.drop_duplicates()
df_schools.to_csv('./data/ks/unique_schools.csv', index=False)

In [83]:
# reshape scores
df = pd.melt(df, id_vars=['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject']
        , value_vars=['Level One'
            , 'Level Two'
            , 'Level Three'
            , 'Level Four'
            , 'Not Valid'],
        var_name='performance_level'
        , value_name='pct_at_level')

In [84]:
# change pct_at_level to decimal
df['temp'] = df['pct_at_level'] / 100
del df['pct_at_level']
df = df.rename(columns={'temp': 'pct_at_level'})

# create proficient_tf
def proficientTF (row):
    if row['performance_level'] == 'Level Three' or row['performance_level'] == 'Level Four':
        return 1
    return 0
df['proficient_tf'] = df.apply(lambda row: proficientTF(row), axis=1)

   year district_id        district school_id           school grade  \
0  2015       D0101  Erie-Galesburg      0111  Erie Elementary     3   
1  2015       D0101  Erie-Galesburg      0111  Erie Elementary     3   
2  2015       D0101  Erie-Galesburg      0111  Erie Elementary     4   
3  2015       D0101  Erie-Galesburg      0111  Erie Elementary     4   
4  2015       D0101  Erie-Galesburg      0111  Erie Elementary     5   

    group_state subject performance_level  pct_at_level  proficient_tf  
0  All Students    Math         Level One        0.1621              0  
1  All Students     ELA         Level One        0.4054              0  
2  All Students     ELA         Level One        0.1944              0  
3  All Students    Math         Level One        0.1388              0  
4  All Students     ELA         Level One        0.1818              0  


In [85]:
# clean subjects in participation file
df_participation['Subject_cleaned'] = [x if x != 'MATH' else 'Math' for x in df_participation['Subject']]
del df_participation['Subject']

# filter to relevant columns and rename for merge
df_participation = df_participation[['year', 'Building_No', 'Subject_cleaned', 'Total_Part_N', 'GroupName']]
df_participation = df_participation.rename(columns={
    "Building_No": "school_id"
    , "Total_Part_N": "num_tested"
    , 'Subject_cleaned': 'subject'
    , 'GroupName': 'group_state'})

# create grade column
df_participation['grade'] = df_participation.apply(lambda x: 'All Grades', axis=1)

   year  school_id      subject  num_tested   group_state       grade
0  2016        112      Science        32.0  All Students  All Grades
1  2016        112          ELA       113.0  All Students  All Grades
2  2016        112         Math       113.0  All Students  All Grades
3  2016        112  History/Gov        72.0  All Students  All Grades
4  2016  Aggregate          ELA       285.0  All Students  All Grades


In [88]:
# join num_tested to scores
df = pd.merge(df, df_participation, on=['year', 'school_id', 'grade', 'group_state', 'subject'])

   year district_id                      district school_id  \
0  2016       Z0029  Kansas City Catholic Diocese      1444   
1  2016       Z0029  Kansas City Catholic Diocese      1444   
2  2016       Z0029  Kansas City Catholic Diocese      1444   
3  2016       Z0029  Kansas City Catholic Diocese      1444   
4  2016       Z0029  Kansas City Catholic Diocese      1444   

                        school       grade   group_state subject  \
0  Sacred Heart Elem [Emporia]  All Grades  All Students    Math   
1  Sacred Heart Elem [Emporia]  All Grades  All Students    Math   
2  Sacred Heart Elem [Emporia]  All Grades  All Students    Math   
3  Sacred Heart Elem [Emporia]  All Grades  All Students    Math   
4  Sacred Heart Elem [Emporia]  All Grades  All Students    Math   

  performance_level  pct_at_level  proficient_tf  num_tested  
0         Level One        0.1785              0        28.0  
1         Level Two        0.4642              0        28.0  
2       Level Three    

In [90]:
# renames All Students group to All Groups
df['temp'] = [x if x != 'All Students' else 'All Groups' for x in df['group_state']]
del df['group_state']
df = df.rename(columns={'temp': 'group_state'})

In [91]:
# save as cleaned proficiency
df.to_csv('./data/finalized/ks_proficiency.csv', index=False)