In [13]:
import pandas as pd
import numpy as np
import util

# directory the yearly enrollment CSVs are read from
enrollment_path = './data/pa/enrollment/'

# one cleaned CSV per year, listed from 2019 back to 2008
enrollment_files = [
    'pa_enrollment_2019_cleaned.csv',
    'pa_enrollment_2018_cleaned.csv',
    'pa_enrollment_2017_cleaned.csv',
    'pa_enrollment_2016_cleaned.csv',
    'pa_enrollment_2015_cleaned.csv',
    'pa_enrollment_2014_cleaned.csv',
    'pa_enrollment_2013_cleaned.csv',
    'pa_enrollment_2012_cleaned.csv',
    'pa_enrollment_2011_cleaned.csv',
    'pa_enrollment_2010_cleaned.csv',
    'pa_enrollment_2009_cleaned.csv',
    'pa_enrollment_2008_cleaned.csv',
]

# the year is the third underscore-separated token of every file name
# ('pa_enrollment_<year>_cleaned.csv'); it is kept as a string
years = [file_name.split('_')[2] for file_name in enrollment_files]

# read every yearly file, tag its rows with the year taken from the file
# name, and stack everything into a single frame
frames = []
for file_name, year in zip(enrollment_files, years):
    frame = pd.read_csv(enrollment_path + file_name)
    # a scalar assignment broadcasts to every row; no row-wise apply needed
    frame['year'] = year
    frames.append(frame)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated frame on every pass.
# A single concat gives the same result: a fresh 0..n-1 index and the union
# of all columns in sorted order (the yearly files do not share an identical
# column set).
df = pd.concat(frames, ignore_index=True, sort=True)

In [14]:
# list the combined column set; columns missing from some yearly files are
# still present here because the frames were stacked on the union of columns
print(df.columns)

Index(['001', '002', '003', '004', '005', '006', '007', '008', '009', '010',
       '011', '012', 'AUN', 'County', 'EUG', 'K4A', 'K4F', 'K4P', 'K5A', 'K5F',
       'K5P', 'LEA Name', 'LEA Type', 'PKA', 'PKF', 'PKP', 'SUG',
       'School Name', 'School Number', 'Total', 'Unnamed: 28', 'Unnamed: 30',
       'year'],
      dtype='object')


In [15]:
# rename fields
df = df.rename(columns={
    'AUN': 'district_id',
    'LEA Name': 'district',
    'School Number': 'school_id',
    'School Name': 'school',
    'Total': 'All Grades',
})

# every row gets the same group label; assigning the scalar broadcasts it to
# the whole column without calling a Python function once per row
df['group_state'] = 'All Groups'

In [16]:
# reshape file: one column per grade (wide) -> one row per grade (long)
# columns that identify a record and are repeated on every melted row
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state']
# columns holding the counts; each becomes a value of the new 'grade' column
value_vars = [
    '001', '002', '003', '004', '005', '006',
    '007', '008', '009', '010', '011', '012',
    'EUG',
    'K4A', 'K4F', 'K4P',
    'K5A', 'K5F', 'K5P',
    'PKA', 'PKF', 'PKP',
    'SUG',
    'All Grades',
]
df = df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='grade',
    value_name='num',
)
print(df.head())

   year  district_id                               district  school_id  \
0  2019   1.2415e+08                  21st Century Cyber CS     7691.0   
1  2019   1.0302e+08              A W Beattie Career Center     5273.0   
2  2019  1.26514e+08                   ACT Academy Cyber CS     8217.0   
3  2019     1.28e+08                             ARIN IU 28        0.0   
4  2019  1.81519e+08  ASPIRA Bilingual Cyber Charter School     8148.0   

                                  school group_state grade num  
0                  21st Century Cyber CS  All Groups   001   0  
1              A W Beattie Career Center  All Groups   001   0  
2                   ACT Academy Cyber CS  All Groups   001   0  
3                             ARIN IU 28  All Groups   001   1  
4  ASPIRA Bilingual Cyber Charter School  All Groups   001   8  


In [17]:
# convert num to int
def toInt(row, column):
    """Return ``row[column]`` converted with ``int()``, or ``np.nan`` on failure.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``column`` (a DataFrame row, a dict, ...).
    column : hashable
        Key of the value to convert.

    Returns
    -------
    int or float
        The converted integer. ``int()`` truncates floats toward zero, so
        ``7.9`` becomes ``7``. ``np.nan`` is returned when the value cannot
        be converted: non-numeric text, ``None``, NaN, or infinity.

    Notes
    -----
    Only conversion failures are turned into ``np.nan``. A missing
    ``column`` (``KeyError``) and interrupts such as ``KeyboardInterrupt``
    propagate instead of being silently recorded as missing data.
    """
    try:
        return int(row[column])
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: bad text or NaN;
        # OverflowError: +/- infinity
        return np.nan
# run toInt on every row; values that cannot be converted come back as NaN
df['num'] = df.apply(toInt, axis=1, args=('num',))

In [18]:
# drop records with no students
# shape is printed before and after so the number of removed rows is visible
print(df.shape)
# NaN compares as "not equal to 0", so missing counts pass this mask and are
# removed by the dropna that follows
has_nonzero_count = df['num'].ne(0)
df = df.loc[has_nonzero_count].dropna(subset=['num'])
print(df.shape)

(951480, 8)
(258185, 8)


In [19]:
# dtypes are printed before and after to confirm the conversions
print(df.dtypes)
df = df.assign(
    # float -> int64 first so the text form has no trailing '.0'
    school_id=df['school_id'].astype('int64').astype(str),
    # num holds only whole numbers once the NaN rows are gone
    num=df['num'].astype('int64'),
)
print(df.dtypes)

year            object
district_id     object
district        object
school_id      float64
school          object
group_state     object
grade           object
num            float64
dtype: object
year           object
district_id    object
district       object
school_id      object
school         object
group_state    object
grade          object
num             int64
dtype: object


In [20]:
# remove rows whose school_id is '0' (school_id is a string by now)
# NOTE(review): presumably these are district/IU-level rows rather than
# individual schools - confirm against the source data
print(df.shape)
is_school_zero = df['school_id'] == '0'
df = df[~is_school_zero]
print(df.shape)

(258185, 8)
(255283, 8)


In [21]:
# export results
# index=False keeps the row index out of the written file
output_path = './data/finalized/pa_enrollment.csv'
df.to_csv(output_path, index=False)