In [1]:
import pandas as pd
import numpy as np
import util

# directory that holds the per-year cleaned enrollment extracts
enrollment_path = './data/ks/enrollment/'

# school years covered by this notebook (2014 through 2019 inclusive)
years = list(range(2014, 2020))

# one cleaned enrollment file per year, in the same order as `years`
enrollment_files = ['ks_enrollment_{}_cleaned.csv'.format(year) for year in years]

# CURRENTLY AREN'T USING THESE FILES, BUT THEY DO HAVE NUMBERS FOR ALL GRADES/FRL THAT COULD BE MORE ACCURATE THAN ROLLUPS
# enrollment_frl_files = [
#     'ks_enrollment_frl_2014_cleaned.csv'
#     , 'ks_enrollment_frl_2015_cleaned.csv'
#     , 'ks_enrollment_frl_2016_cleaned.csv'
#     , 'ks_enrollment_frl_2017_cleaned.csv'
#     , 'ks_enrollment_frl_2018_cleaned.csv'
#     , 'ks_enrollment_frl_2019_cleaned.csv'
# ]

# subgroup column prefixes; later cells read '<prefix> FEMALE' and '<prefix> MALE'
# columns for each of these and combine them into a single '<prefix>' column
subgroups = [
    'AMER. INDIAN OR ALASKA NATIVE',
    'ASIAN',
    'BLACK',
    'FREE LUNCH',
    'HISPANIC',
    'MULTI-ETHNIC',
    'REDUCED-PRICE LUNCH',
    'SPECIAL EDUC',
    'WHITE',
]

# `years` and `enrollment_files` are parallel lists; fail loudly if they drift apart
# instead of silently tagging rows with the wrong year.
if len(years) != len(enrollment_files):
    raise ValueError('years and enrollment_files must have the same length')

# Read each year's file and tag every row with its school year.
# Assigning the scalar broadcasts it to all rows (no row-wise apply needed).
yearly_frames = []
for year, file_name in zip(years, enrollment_files):
    df_year = pd.read_csv(enrollment_path + file_name)
    df_year['year'] = year
    yearly_frames.append(df_year)

# Stack all years in one pass. DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. sort=False keeps columns in
# first-seen order when the yearly files do not share identical columns; columns
# missing from a given year are filled with NaN.
df = pd.concat(yearly_frames, ignore_index=True, sort=False)

# import school mapping file
df_schools = pd.read_csv('./data/ks/unique_schools.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [2]:
# drop rows whose district ID is missing, then rows whose ID is only whitespace
df = df.loc[df['ORG. #'].notna()]
df = df.loc[df['ORG. #'].str.strip().ne('')]

# give the ID and total columns the names used by the rest of the notebook
df = df.rename(columns={'ORG. #': 'district_id', 'TOTAL ALL': 'All Groups'})

# the combined name column is split on the first ' - ' only:
# the left part becomes the district, the remainder becomes the school
parse = df['ORG. & BUILDING NAMES'].str.split(pat=' - ', n=1, expand=True)
df = df.assign(district=parse[0], school=parse[1])

# attach the school mapping columns; this is pandas' default (inner) merge,
# so rows without a matching (year, school) pair in df_schools are dropped
df = df.merge(df_schools, on=['year', 'school'])

In [3]:
# write the distinct (year, district_id, district) combinations to disk
# so the proficiency data can reuse them
df_districts = df.loc[:, ['year', 'district_id', 'district']].drop_duplicates()
df_districts.to_csv('./data/ks/unique_districts.csv', index=False)

In [4]:
# create function to combine gender columns
def combineGenders(row, column1, column2):
    """Combine the two gender counts of one subgroup into a single value.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Row to read from; indexed as ``row[column1]`` and ``row[column2]``.
    column1, column2 : str
        Names of the two gender columns to combine.

    Returns
    -------
    ``'<20'`` when both values are the ``'<10*'`` marker; ``'N/A'`` when neither
    value is usable; the other column's value, unchanged, when exactly one value
    is unusable; otherwise the sum of both counts as a string.

    A value is unusable when it is ``'<10*'``, ``'N/A*'``, ``None`` or NaN.
    Missing values are handled explicitly because stacking yearly files whose
    columns differ leaves NaN in the gaps, and ``int(NaN)`` raises.
    """
    unusable_markers = ('<10*', 'N/A*')

    def is_unusable(value):
        # NaN is the only float that is not equal to itself
        if value is None or (isinstance(value, float) and value != value):
            return True
        return value in unusable_markers

    first = row[column1]
    second = row[column2]

    # two suppressed small counts still bound the total
    if first == '<10*' and second == '<10*':
        return '<20'

    first_unusable = is_unusable(first)
    second_unusable = is_unusable(second)

    if first_unusable and second_unusable:
        return 'N/A'
    if first_unusable:
        return second
    if second_unusable:
        return first
    return str(int(first) + int(second))

# build one combined column per subgroup from its FEMALE / MALE pair
for subgroup in subgroups:
    female_column = subgroup + ' FEMALE'
    male_column = subgroup + ' MALE'
    df[subgroup] = df.apply(combineGenders, axis=1, args=(female_column, male_column))

# relabel the 'Total' grade rows as 'All Grades'; every other grade is kept as-is
df['grade'] = df['GRADE'].map(lambda grade: 'All Grades' if grade == 'Total' else grade)

In [5]:
# keep only the identifier columns plus the total and per-subgroup counts
id_columns = ['year', 'district_id', 'district', 'school', 'grade']
count_columns = [
    'All Groups',
    'AMER. INDIAN OR ALASKA NATIVE',
    'ASIAN',
    'BLACK',
    'FREE LUNCH',
    'HISPANIC',
    'MULTI-ETHNIC',
    'REDUCED-PRICE LUNCH',
    'SPECIAL EDUC',
    'WHITE',
]
df = df.loc[:, id_columns + count_columns]

In [6]:
# reshape wide to long: one row per (year, district, school, grade, group),
# with the group label in 'group_state' and its count in 'num'
df = df.melt(
    id_vars=['year', 'district_id', 'district', 'school', 'grade'],
    value_vars=[
        'All Groups',
        'AMER. INDIAN OR ALASKA NATIVE',
        'ASIAN',
        'BLACK',
        'FREE LUNCH',
        'HISPANIC',
        'MULTI-ETHNIC',
        'REDUCED-PRICE LUNCH',
        'SPECIAL EDUC',
        'WHITE',
    ],
    var_name='group_state',
    value_name='num',
)

In [10]:
# drop bad num values
def toInt(row, column):
    """Return ``row[column]`` as an int, or NaN when it cannot be converted.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Row to read from.
    column : str
        Name of the column holding the value to convert.

    Returns
    -------
    int or float
        ``int(row[column])`` on success; ``np.nan`` when the value is not
        convertible (non-numeric text, ``None``, NaN or infinity).

    Raises
    ------
    KeyError
        If ``column`` is not present in ``row``. The lookup sits outside the
        ``try`` so a misspelled column name is not silently turned into NaN.
    """
    value = row[column]
    try:
        return int(value)
    # only conversion failures are swallowed; a bare except would also trap
    # KeyboardInterrupt / SystemExit during a long apply
    except (ValueError, TypeError, OverflowError):
        return np.nan

# convert every 'num' value; anything toInt cannot convert becomes NaN
df['num'] = df.apply(toInt, axis=1, args=('num',))
print(df.shape)
# keep only the rows whose count converted successfully
df = df[df['num'].notna()]
print(df.shape)

(624620, 7)
(404325, 7)


In [11]:
# cast 'year' to string and 'num' to int64 in one pass,
# printing the dtypes before and after for comparison
print(df.dtypes)
df = df.astype({'year': str, 'num': 'int64'})
print(df.dtypes)

year             int64
district_id     object
district        object
school          object
grade           object
group_state     object
num            float64
dtype: object
year           object
district_id    object
district       object
school         object
grade          object
group_state    object
num             int64
dtype: object


In [12]:
# write the cleaned enrollment table to the finalized data folder (no index column)
finalized_path = './data/finalized/ks_enrollment.csv'
df.to_csv(finalized_path, index=False)