In [1]:
import pandas as pd
import numpy as np
import util

# Directory that the cleaned Kansas CSV extracts are read from.
proficiency_path = './data/ks/proficiency/'

# Cleaned proficiency extracts, one file per school year.
proficiency_files = [
    'ks_proficiency_2015_cleaned.csv',
    'ks_proficiency_2016_cleaned.csv',
    'ks_proficiency_2017_cleaned.csv',
    'ks_proficiency_2018_cleaned.csv',
    'ks_proficiency_2019_cleaned.csv',
]

# Cleaned participation extracts; positions line up with participation_years.
participation_files = [
    'ks_participation_2016_cleaned.csv',
    'ks_participation_2017_cleaned.csv',
    'ks_participation_2018_cleaned.csv',
    'ks_participation_2019_cleaned.csv',
]

# School year stamped onto each participation file (same order as above).
participation_years = list(range(2016, 2020))

# Read every yearly proficiency extract and stack them into one frame.
# A single pd.concat replaces the DataFrame.append loop: append was deprecated
# in pandas 1.4 and removed in 2.0, and it re-copied the growing frame on
# every iteration. ignore_index=True renumbers the rows 0..n-1 as before.
df = pd.concat(
    [pd.read_csv(proficiency_path + file_name) for file_name in proficiency_files],
    ignore_index=True,
)

# Map the raw report headers onto the column names used by the rest of the
# notebook; columns not listed here keep their original names.
column_renames = {
    'Group': 'group_state',
    'Subject': 'subject',
    'Org No': 'district_id',
    'School Year': 'year',
    'Pct Level One': 'Level One',
    'Pct Level Two': 'Level Two',
    'Pct Level Three': 'Level Three',
    'Pct Level Four': 'Level Four',
    'Pct Not Valid': 'Not Valid',
}
df = df.rename(columns=column_renames)

# import participation files
# Each file is read, stamped with the school year found at the same position
# in participation_years, and all files are stacked with one pd.concat
# (DataFrame.append was removed in pandas 2.0 and copied the frame per pass).
# The year is a scalar assignment; a row-wise apply is not needed to write a
# constant.
# NOTE(review): the participation files are read from proficiency_path --
# confirm they really live in the proficiency directory.
participation_frames = []
for position, file_name in enumerate(participation_files):
    yearly_participation = pd.read_csv(proficiency_path + file_name)
    yearly_participation['year'] = int(participation_years[position])
    participation_frames.append(yearly_participation)
df_participation = pd.concat(participation_frames, ignore_index=True)

# import district mapping file
# (merged onto the scores by year and district_id in a later cell)
district_mapping_path = './data/ks/unique_districts.csv'
df_districts = pd.read_csv(district_mapping_path)

In [2]:
# parse school_id and school names
# 'School/District Name' is split once on ' - ': the text before the
# separator becomes school_id, everything after it becomes school.
parse = df['School/District Name'].str.split(pat=' - ', n=1, expand=True)
df['school_id'] = parse[0]
df['school'] = parse[1]

# drop district level results
# Only rows whose school_id is exactly 4 characters long are kept. The
# explicit .copy() makes the result an independent frame, so the 'grade'
# assignment below no longer writes to a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['school_id'].str.len() == 4].copy()

# make grades string and change 13 to All Grades
df['grade'] = df['Grade'].map(lambda grade: 'All Grades' if grade == 13 else str(grade))

# join district names
# NOTE(review): pd.merge defaults to an inner join, so score rows without a
# (year, district_id) match in df_districts are dropped -- confirm intended.
df = pd.merge(df, df_districts, on=['year', 'district_id'])

In [3]:
# create list of unique year, school_id, school for enrollment data
# One row per distinct (year, school_id, school) combination, written
# without the index column.
school_key_columns = ['year', 'school_id', 'school']
df_schools = df[school_key_columns].drop_duplicates()
df_schools.to_csv('./data/ks/unique_schools.csv', index=False)

In [4]:
# reshape scores
# Wide -> long: each of the five level columns becomes its own row, with the
# column name stored in performance_level and its value in pct_at_level.
identifier_columns = [
    'year', 'district_id', 'district', 'school_id', 'school',
    'grade', 'group_state', 'subject',
]
level_columns = ['Level One', 'Level Two', 'Level Three', 'Level Four', 'Not Valid']
df = df.melt(
    id_vars=identifier_columns,
    value_vars=level_columns,
    var_name='performance_level',
    value_name='pct_at_level',
)

In [5]:
# change pct_at_level to decimal
# pop() removes the column and the assignment re-adds it as the last column,
# holding the original values divided by 100.
# Running this cell twice divides by 100 twice.
df['pct_at_level'] = df.pop('pct_at_level') / 100

# create proficient_tf
def proficientTF(row):
    """Return 1 if the row's performance level is proficient, otherwise 0.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'performance_level' (for example a
        DataFrame row or a dict).

    Returns
    -------
    int
        1 when row['performance_level'] is 'Level Three' or 'Level Four',
        0 for every other value.
    """
    level = row['performance_level']
    return 1 if level in ('Level Three', 'Level Four') else 0
# Flag rows at 'Level Three' or 'Level Four' with 1 and all others with 0.
# A vectorized membership test replaces the row-by-row DataFrame.apply, which
# calls a Python function once per row of the melted frame; the result is the
# same 0/1 integer column.
df['proficient_tf'] = df['performance_level'].isin(['Level Three', 'Level Four']).astype('int64')

In [6]:
# clean subjects in participation file
# Only the spelling 'MATH' is rewritten (to 'Math'); every other subject
# value, including missing values, passes through unchanged.
df_participation['Subject_cleaned'] = df_participation['Subject'].mask(
    df_participation['Subject'] == 'MATH', 'Math'
)
del df_participation['Subject']

# filter to relevant columns and rename for merge
df_participation = df_participation[
    ['year', 'Building_No', 'Subject_cleaned', 'Total_Part_N', 'GroupName']
].rename(columns={
    'Building_No': 'school_id',
    'Total_Part_N': 'num_tested',
    'Subject_cleaned': 'subject',
    'GroupName': 'group_state',
})

# create grade column
# Every participation row is labelled 'All Grades'. A scalar assignment
# broadcasts the constant directly instead of running a Python lambda once
# per row through DataFrame.apply.
df_participation['grade'] = 'All Grades'

In [7]:
# join num_tested to scores
# Inner join (pandas' default, spelled out here): only score rows with a
# participation row matching on all five keys are kept.
participation_keys = ['year', 'school_id', 'grade', 'group_state', 'subject']
df = df.merge(df_participation, how='inner', on=participation_keys)

In [8]:
# renames All Students group to All Groups
# The column is popped and re-assigned, so group_state ends up as the last
# column of the frame; all other labels are left as they are.
group_labels = df.pop('group_state')
df['group_state'] = group_labels.mask(group_labels == 'All Students', 'All Groups')

In [10]:
# drop all non-proficiency rows
# Keeps the rows whose proficient_tf equals 1 and prints the frame shape
# before and after the filter.
print(df.shape)
proficient_mask = df['proficient_tf'] == 1
df = df.loc[proficient_mask]
print(df.shape)

(894680, 12)
(357872, 12)


In [12]:
# drop null records
# A row is removed when either required column is missing; the shape is
# printed before and after so the number of dropped rows is visible.
print(df.shape)
required_columns = ['num_tested', 'pct_at_level']
df = df.dropna(axis=0, how='any', subset=required_columns)
print(df.shape)

(357872, 12)
(357778, 12)


In [13]:
# unify data types
# year becomes a string column and proficient_tf a boolean column; dtypes are
# printed before and after the conversion.
print(df.dtypes)
target_dtypes = {'year': str, 'proficient_tf': bool}
df = df.astype(target_dtypes)
print(df.dtypes)

year                   int64
district_id           object
district              object
school_id             object
school                object
grade                 object
subject               object
performance_level     object
pct_at_level         float64
proficient_tf          int64
num_tested           float64
group_state           object
dtype: object
year                  object
district_id           object
district              object
school_id             object
school                object
grade                 object
subject               object
performance_level     object
pct_at_level         float64
proficient_tf           bool
num_tested           float64
group_state           object
dtype: object


In [14]:
# create num_at_level
# Count at each level = share at the level (pct_at_level) times num_tested.
# Column-wise multiplication gives the same float result as the previous
# row-by-row DataFrame.apply without calling a Python lambda per row.
df['num_at_level'] = df['pct_at_level'] * df['num_tested']
print(df.dtypes)

year                  object
district_id           object
district              object
school_id             object
school                object
grade                 object
subject               object
performance_level     object
pct_at_level         float64
proficient_tf           bool
num_tested           float64
group_state           object
num_at_level         float64
dtype: object


In [16]:
# roll up performance levels
# num_tested was joined on year/school_id/grade/group_state/subject, so the
# same participation count sits on every performance-level row of a group.
# Summing num_tested straight across the 'Level Three' and 'Level Four' rows
# therefore counts it once per level and halves the pct_at_level recomputed
# from these totals. The roll-up is done in two steps instead:
#   1. total num_tested and num_at_level within each performance level;
#   2. add num_at_level across levels, but take num_tested only once (max).
print(df.shape)
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'proficient_tf']
per_level = (
    df.groupby(grouped_by + ['performance_level'], as_index=False)
    .agg({'num_tested': 'sum', 'num_at_level': 'sum'})
)
df = per_level.groupby(grouped_by, as_index=False).agg({'num_tested': 'max', 'num_at_level': 'sum'})
print(df.shape)

(357778, 13)
(137788, 11)


In [17]:
# recalc pct_at_level
# Share = num_at_level / num_tested, computed column-wise. Unlike the previous
# row-by-row DataFrame.apply, this does not call a Python lambda per row, and
# a zero num_tested produces inf/NaN in that row rather than potentially
# raising a division error that aborts the cell.
df['pct_at_level'] = df['num_at_level'] / df['num_tested']
print(df.dtypes)

year              object
district_id       object
district          object
school_id         object
school            object
subject           object
grade             object
group_state       object
proficient_tf       bool
num_tested       float64
num_at_level     float64
pct_at_level     float64
dtype: object


In [19]:
# save as cleaned proficiency
# Written without the index column.
finalized_path = './data/finalized/ks_proficiency.csv'
df.to_csv(finalized_path, index=False)