In [1]:
import pandas as pd
import numpy as np
import util

enrollment_path = './data/pa/enrollment/'

enrollment_files = [
    'pa_enrollment_2019_cleaned.csv'
    , 'pa_enrollment_2018_cleaned.csv'
    , 'pa_enrollment_2017_cleaned.csv'
    , 'pa_enrollment_2016_cleaned.csv'
    , 'pa_enrollment_2015_cleaned.csv'
    , 'pa_enrollment_2014_cleaned.csv'
    , 'pa_enrollment_2013_cleaned.csv'
    , 'pa_enrollment_2012_cleaned.csv'
    , 'pa_enrollment_2011_cleaned.csv'
    , 'pa_enrollment_2010_cleaned.csv'
    , 'pa_enrollment_2009_cleaned.csv'
    , 'pa_enrollment_2008_cleaned.csv'
]

years = [x[14:18] for x in enrollment_files]

df = pd.read_csv(enrollment_path + enrollment_files[0])
df['year'] = df.apply(lambda x: years[0], axis=1)

for i in range(1, len(enrollment_files)):
    df2 = pd.read_csv(enrollment_path + enrollment_files[i])
    df2['year'] = df2.apply(lambda x: years[i], axis=1)
    df = df.append(df2, ignore_index = True, sort = True)
print(df.head())

   001   002   003   004   005 006    007  008  009  010  ...  PKA PKF  PKP  \
0  0.0   0.0   0.0   0.0   0.0  73  115.0  161  256  214  ...  0.0   0  0.0   
1  0.0   0.0   0.0   0.0   0.0   0    0.0    0    0  183  ...  0.0   0  0.0   
2  0.0   0.0   0.0   0.0   0.0   0    0.0    0   19   26  ...  0.0   0  0.0   
3  1.0   3.0   0.0   3.0   2.0   4    1.0    7    5    6  ...  0.0   0  0.0   
4  8.0  16.0  11.0  17.0  19.0  13   41.0   27   44   44  ...  0.0   0  0.0   

  SUG                            School Name  School Number  Total  \
0 NaN                  21st Century Cyber CS         7691.0  1,235   
1 NaN              A W Beattie Career Center         5273.0    779   
2 NaN                   ACT Academy Cyber CS         8217.0     76   
3 NaN                             ARIN IU 28            0.0     57   
4 NaN  ASPIRA Bilingual Cyber Charter School         8148.0    365   

   Unnamed: 28 Unnamed: 30  year  
0          NaN         NaN  2019  
1          NaN         NaN  2019  

In [2]:
print(df.columns)

Index(['001', '002', '003', '004', '005', '006', '007', '008', '009', '010',
       '011', '012', '7', 'AUN', 'County', 'EUG', 'K4A', 'K4F', 'K4P', 'K5A',
       'K5F', 'K5P', 'LEA Name', 'LEA Type', 'PKA', 'PKF', 'PKP', 'SUG',
       'School Name', 'School Number', 'Total', 'Unnamed: 28', 'Unnamed: 30',
       'year'],
      dtype='object')


In [3]:
# rename fields
df = df.rename(columns = {
    'AUN': 'district_id'
    , 'LEA Name': 'district'
    , 'School Number': 'school_id'
    , 'School Name': 'school'
    , 'Total': 'All Grades'
})

df['group_state'] = df.apply(lambda x: 'All Groups', axis = 1)

In [5]:
# reshape file
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state']
value_vars = ['001', '002', '003', '004', '005', '006', '007', '008', '009', '010', '011', '012', 'EUG', 'K4A', 'K4F', 'K4P', 'K5A', 'K5F', 'K5P', 'PKA', 'PKF', 'PKP', 'SUG', 'All Grades']
df = pd.melt(df, id_vars = id_vars, value_vars = value_vars, var_name = 'grade', value_name = 'num')
print(df.head())

   year  district_id                               district  school_id  \
0  2019   1.2415e+08                  21st Century Cyber CS     7691.0   
1  2019   1.0302e+08              A W Beattie Career Center     5273.0   
2  2019  1.26514e+08                   ACT Academy Cyber CS     8217.0   
3  2019     1.28e+08                             ARIN IU 28        0.0   
4  2019  1.81519e+08  ASPIRA Bilingual Cyber Charter School     8148.0   

                                  school group_state grade num  
0                  21st Century Cyber CS  All Groups   001   0  
1              A W Beattie Career Center  All Groups   001   0  
2                   ACT Academy Cyber CS  All Groups   001   0  
3                             ARIN IU 28  All Groups   001   1  
4  ASPIRA Bilingual Cyber Charter School  All Groups   001   8  


In [6]:
# drop records with no students
df = df[df['num'] != 0]

(951480, 8)
(475543, 8)


In [7]:
# export results
df.to_csv('./data/finalized/pa_enrollment.csv', index=False)