In [1]:
import pandas as pd
import numpy as np
import util

# Directory holding the cleaned per-year NJ enrollment CSV files.
enrollment_path = './data/nj/enrollment/'

# years[k] is the label stamped on every row loaded from enrollment_files[k].
years = [
    2017
    , 2018
    , 2019
]

enrollment_files = [
    'nj_enrollment_2017_cleaned.csv'
    , 'nj_enrollment_2018_cleaned.csv'
    , 'nj_enrollment_2019_cleaned.csv'
]

# The two lists are paired by position; a length mismatch would silently
# skip or mislabel a file, so fail loudly instead.
assert len(years) == len(enrollment_files), 'years and enrollment_files must have the same length'

# import main enrollment files
# Each file is read once, tagged with its year via a scalar assignment
# (no row-wise apply needed for a constant), and collected in a list.
yearly_frames = []
for year, filename in zip(years, enrollment_files):
    yearly = pd.read_csv(enrollment_path + filename)
    yearly['year'] = year
    yearly_frames.append(yearly)

# A single concat replaces repeated DataFrame.append calls (removed in
# pandas 2.0) and avoids re-copying the accumulated frame on every iteration.
# sort=True orders the columns when the files' column sets are not aligned.
df = pd.concat(yearly_frames, ignore_index=True, sort=True)

In [2]:
# Upper-case every grade label, then relabel the 'TOTAL' rows as 'All Grades'.
# Missing grades stay missing: they never compare equal to 'TOTAL'.
upper_grades = df['grade'].str.upper()
df['grade'] = ['All Grades' if label == 'TOTAL' else label for label in upper_grades]

In [3]:
# collapse genders into a single subgroup column
# Each code below has a '<code>_M' and a '<code>_F' column in the frame.
# NOTE(review): the codes look like race/ethnicity groups rather than genders;
# confirm against the data dictionary before renaming the list.
gender_subgroups = [
    'AM'
    , 'AS'
    , 'BL'
    , 'HI'
    , 'MU'
    , 'PI'
    , 'WH'
]

# combine genders for every subgroup
# Column-wise addition yields the same totals as a row-wise apply() without a
# Python call per row, and does not rely on apply()'s empty-frame handling.
# A row is NaN whenever either the _M or the _F value is missing.
for subgroup in gender_subgroups:
    df[subgroup] = df[subgroup + '_M'] + df[subgroup + '_F']

In [4]:
# Reshape from wide (one column per subgroup) to long (one row per subgroup).
# Columns listed here become the values of 'group_state'; their cell values
# land in 'num'.
subgroups = [
    'AM', 'AS', 'BL', 'HI', 'MU', 'PI', 'WH',
    'FREE_LUNCH', 'LEP', 'MIGRANT',
    'All Groups',
]

# Identifier columns repeated on every melted row; all other columns are dropped.
id_columns = ['year', 'district_id', 'district', 'school_id', 'school', 'grade']

df = pd.melt(
    df,
    id_vars=id_columns,
    value_vars=subgroups,
    var_name='group_state',
    value_name='num',
)

In [5]:
# Remove district-level rows, keeping only individual schools.
# NOTE(review): school_id 999 is presumably the district-aggregate marker; confirm.
is_district_row = df['school_id'] == 999
df = df[~is_district_row]

In [8]:
# Discard rows whose count is missing or zero, reporting the frame's shape
# before and after so the amount removed is visible.
print(df.shape)
with_counts = df.dropna(subset=['num'])
df = with_counts[with_counts['num'] != 0]
print(df.shape)

(590425, 8)
(294300, 8)


In [10]:
# unify data types
# Print the dtypes before and after so the effect of the cast is visible.
print(df.dtypes)
# Identifier columns become strings and the count becomes an integer, all in
# one astype call. astype returns a new frame, so no columns are assigned
# onto the filtered slice left by the earlier row filters (which is what
# raises SettingWithCopyWarning). Null counts were dropped earlier, so the
# float -> int64 cast cannot encounter NaN.
df = df.astype({
    'year': str
    , 'district_id': str
    , 'school_id': str
    , 'num': 'int64'
})
print(df.dtypes)

year             int64
district_id      int64
district        object
school_id        int64
school          object
grade           object
group_state     object
num            float64
dtype: object
year           object
district_id    object
district       object
school_id      object
school         object
grade          object
group_state    object
num             int64
dtype: object


In [11]:
# Write the finalized long-format table to CSV without the row index.
output_path = './data/finalized/nj_enrollment.csv'
df.to_csv(output_path, index=False)