In [22]:
import pandas as pd
import numpy as np
import util

# Directory that holds the raw report files listed below.
enrollment_path = './data/tx/enrollment/raw/'

# Per-school-year enrollment report files, oldest first.
enrollment_files = [
    'Enrollment Report_Statewide_Campuses_Grade_2012-2013.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2013-2014.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2014-2015.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2015-2016.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2016-2017.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2017-2018.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2018-2019.csv'
]

# Per-school-year ethnicity report files, in the same year order as above.
ethnicity_files = [
    'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2012-2013.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2013-2014.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2014-2015.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2015-2016.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2016-2017.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2017-2018.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2018-2019.csv'
]


def school_year_end(file_name):
    """Return the ending year of the school-year span in a report file name.

    Every report name ends in ``<start>-<end>.csv``, so the last four
    characters before the extension are the ending year, e.g.
    ``'..._2012-2013.csv'`` gives ``'2013'``. Reading from the end of the
    name (instead of fixed character offsets) keeps working when the prefix
    length differs, as it does between the enrollment and ethnicity files.

    Parameters
    ----------
    file_name : str
        Report file name ending in ``-YYYY.<extension>``.

    Returns
    -------
    str
        The four-character ending year.
    """
    stem = file_name.rsplit('.', 1)[0]
    return stem[-4:]


# One label per file, in file order: the ending year of each school year (as a string).
years = [school_year_end(name) for name in enrollment_files]

# The ethnicity files are labelled with `years` by position when they are loaded,
# so both lists must describe the same years in the same order.
ethnicity_years = [school_year_end(name) for name in ethnicity_files]
assert ethnicity_years == years, 'enrollment and ethnicity file lists cover different years'

def load_yearly_reports(file_names, year_labels, directory=enrollment_path, header_rows=4):
    """Read one CSV per school year and stack them into a single DataFrame.

    Each file is read with its first ``header_rows`` lines skipped, tagged
    with a constant ``year`` column, and all frames are concatenated once at
    the end. A single ``pd.concat`` replaces repeated ``DataFrame.append``
    calls: ``append`` was removed in pandas 2.0 and re-copied the growing
    frame on every iteration.

    Parameters
    ----------
    file_names : list of str
        File names relative to ``directory``.
    year_labels : list of str
        Year label for each file, matched to ``file_names`` by position.
    directory : str
        Path prefix that is string-concatenated with each file name.
    header_rows : int
        Number of leading lines to skip in every file
        (presumably a report preamble above the header row -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        All rows from all files with a fresh 0..n-1 index and a ``year`` column.

    Raises
    ------
    ValueError
        If ``file_names`` and ``year_labels`` differ in length.
    """
    if len(file_names) != len(year_labels):
        raise ValueError(
            'expected one year label per file, got {} files and {} labels'.format(
                len(file_names), len(year_labels)
            )
        )

    frames = []
    for file_name, year in zip(file_names, year_labels):
        frame = pd.read_csv(directory + file_name, skiprows=header_rows)
        # Scalar assignment broadcasts to every row (and is safe on an empty frame).
        frame['year'] = year
        frames.append(frame)

    return pd.concat(frames, ignore_index=True, sort=True)


# Main enrollment counts, all years stacked.
df = load_yearly_reports(enrollment_files, years)

# Ethnicity breakdown counts; labelled with the same `years` list by position.
df_eth = load_yearly_reports(ethnicity_files, years)

In [23]:
# Map the report headers onto short snake_case names.
# Both report types share the district/campus/grade headers; only the
# count column (and the ethnicity label) differ between them.
shared_columns = {
    'District Number': 'district_id',
    'District Name': 'district',
    'Campus Number': 'school_id',
    'Campus Name': 'school',
    'Grade Level Name': 'grade',
}

enrollment_columns = dict(shared_columns)
enrollment_columns['Enrollment by Grade Level'] = 'num'

ethnicity_columns = dict(shared_columns)
ethnicity_columns['Ethnicity Name'] = 'group_state'
ethnicity_columns['Ethnicity Count'] = 'num'

df = df.rename(columns=enrollment_columns)
df_eth = df_eth.rename(columns=ethnicity_columns)

In [24]:
# Tag every row of the main enrollment data with the 'All Groups' label.
# A scalar assignment broadcasts to all rows; the previous row-wise apply ran a
# Python lambda per row and returns a DataFrame (not a Series) on an empty frame.
df['group_state'] = 'All Groups'

In [25]:
# Stack the ethnicity rows beneath the main enrollment rows and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([df, df_eth], ignore_index=True, sort=True)

In [26]:
# Replace every count that is not strictly positive with '*'.
# Negative values, zeros, and missing values all fail the `> 0` test,
# so all of them are masked; the column then holds a mix of numbers and '*'.
def mask_non_positive(count):
    """Return the count unchanged when it is greater than zero, else '*'."""
    if count > 0:
        return count
    return '*'


df['num'] = df['num'].map(mask_non_positive)

In [27]:
# Keep only the output columns, in this order; everything else is discarded.
output_columns = [
    'year',
    'district_id',
    'district',
    'school_id',
    'school',
    'grade',
    'group_state',
    'num',
]
df = df.loc[:, output_columns]

In [28]:
# Keep only the rows that carry a school_id; rows where it is missing are removed.
has_school_id = df['school_id'].notna()
df = df[has_school_id]

In [29]:
# Normalize grade labels: collapse any run of two or more spaces to one space.
# A single literal '  ' -> ' ' pass leaves runs of three or more spaces only
# partly collapsed. `regex=True` is spelled out because the default for
# Series.str.replace differs between pandas versions.
df['grade'] = df['grade'].str.replace(r' {2,}', ' ', regex=True)

In [30]:
# Build 'All Grades' rollups: one total per year / district / school / group.
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state']

# Masked counts ('*') cannot be summed, so those rows are left out of the totals.
# NOTE(review): a total therefore excludes any grade whose count was masked.
unmasked_rows = df[df['num'] != '*']
df_grouped = unmasked_rows.groupby(by=grouped_by, as_index=False)['num'].sum()

# Scalar assignment broadcasts the label to every rollup row (and is safe when
# there are no rollup rows), replacing a per-row apply.
df_grouped['grade'] = 'All Grades'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, df_grouped], ignore_index=True, sort=True)

In [31]:
# Write the combined table to CSV; the DataFrame index is not written out.
output_csv = './data/finalized/tx_enrollment.csv'
df.to_csv(output_csv, index=False)