In [4]:
import pandas as pd
import numpy as np
import util

enrollment_path = './data/tx/enrollment/raw/'

enrollment_files = [
    'Enrollment Report_Statewide_Campuses_Grade_2012-2013.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2013-2014.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2014-2015.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2015-2016.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2016-2017.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2017-2018.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_2018-2019.csv'
]

ethnicity_files = [
    'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2012-2013.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2013-2014.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2014-2015.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2015-2016.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2016-2017.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2017-2018.csv'
    , 'Enrollment Report_Statewide_Campuses_Grade_Ethnicity_2018-2019.csv'
]

li_files = [
    'StudPgmStateCampus15state.csv'
    , 'StudPgmStateCampus16state.csv'
    , 'StudPgmStateCampus17state.csv'
    , 'StudPgmStateCampus18state.csv'
    , 'StudPgmStateCampus19state.csv'
]

years = [x[48:52] for x in enrollment_files]
years_li = ['2015', '2016', '2017', '2018', '2019']

df = pd.read_csv(enrollment_path + enrollment_files[0], skiprows = 4)
df['year'] = df.apply(lambda x: years[0], axis=1)

for i in range(1, len(enrollment_files)):
    df2 = pd.read_csv(enrollment_path + enrollment_files[i], skiprows = 4)
    df2['year'] = df2.apply(lambda x: years[i], axis=1)
    df = df.append(df2, ignore_index = True, sort = True)
    
df_eth = pd.read_csv(enrollment_path + ethnicity_files[0], skiprows = 4)
df_eth['year'] = df_eth.apply(lambda x: years[0], axis=1)

for i in range(1, len(ethnicity_files)):
    df2 = pd.read_csv(enrollment_path + ethnicity_files[i], skiprows = 4)
    df2['year'] = df2.apply(lambda x: years[i], axis=1)
    df_eth = df_eth.append(df2, ignore_index = True, sort = True)
    
df_li = pd.DataFrame()

for i in range(0, len(li_files)):
    df2 = pd.read_csv(enrollment_path + li_files[i], skiprows = 6)
    df2['year'] = df2.apply(lambda x: years_li[i], axis=1)
    df_li = df_li.append(df2, ignore_index = True, sort = True)

In [5]:
print(df_li.columns)

Index(['AGG_LEVEL', 'CAMPUS NAME', 'CAMPUS NUMBER', 'COUNTY NAME',
       'COUNTY NUMBER', 'DISTRICT NAME', 'DISTRICT NUMBER', 'GRADE GROUP',
       'REGION', 'SCOPE', 'SUM_HOMELESS', 'TOTAL AT RISK STUDENTS',
       'TOTAL BILINGUAL STUDENTS', 'TOTAL CTE STUDENTS',
       'TOTAL DYSLEXIC STUDENTS', 'TOTAL ECONOMICALLY DISADVANTAGED STUDENTS',
       'TOTAL ENROLLMENT', 'TOTAL ESL STUDENTS', 'TOTAL FOSTER CARE STUDENTS',
       'TOTAL G & T STUDENTS', 'TOTAL LEP STUDENTS',
       'TOTAL MILITARY CONNECTED STUDENTS',
       'TOTAL STUDENTS RECEIVING SPECIAL EDUCATION SERVICES',
       'TOTAL TITLE I STUDENTS', 'YEAR', 'year'],
      dtype='object')


In [6]:
# rename fields in normal and ethnicity and li files
df = df.rename(columns = {
    'District Number': 'district_id'
    , 'District Name': 'district'
    , 'Campus Number': 'school_id'
    , 'Campus Name': 'school'
    , 'Grade Level Name': 'grade'
    , 'Enrollment by Grade Level': 'num'
})

df_eth = df_eth.rename(columns = {
    'District Number': 'district_id'
    , 'District Name': 'district'
    , 'Campus Number': 'school_id'
    , 'Campus Name': 'school'
    , 'Ethnicity Name': 'group_state'
    , 'Grade Level Name': 'grade'
    , 'Ethnicity Count': 'num'
})

df_li = df_li.rename(columns = {
    'DISTRICT NUMBER': 'district_id'
    , 'DISTRICT NAME': 'district'
    , 'CAMPUS NUMBER': 'school_id'
    , 'CAMPUS NAME': 'school'
    , 'TOTAL ECONOMICALLY DISADVANTAGED STUDENTS': 'num'
})

In [7]:
# add All Groups field to main data and 'All Grades' field to df_li
df['group_state'] = df.apply(lambda x: 'All Groups', axis = 1)
df_li['group_state'] = 'Low-Income'
df_li['grade'] = 'All Grades'

In [8]:
# append data
df = df.append(df_eth, ignore_index = True, sort = True)
df = df.append(df_li, ignore_index = True, sort = True)

In [9]:
# change negative numbers to '*'
df['num'] = [x if x > 0 else '*' for x in df['num']]

In [10]:
# drop unneeded columns
df = df[['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'num']]

In [11]:
# drop rows with a null school_id
df = df[~df['school_id'].isna()]

In [12]:
# clean up grades
df['grade'] = df['grade'].str.replace('  ', ' ')

In [13]:
# create All Grades rollups
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state']
df_grouped = df[df['num'] != '*'].groupby(by = grouped_by, as_index = False)['num'].sum()
df_grouped['grade'] = df_grouped.apply(lambda x: 'All Grades', axis = 1)
df = df.append(df_grouped, ignore_index = True, sort = True)

In [14]:
def toInt(row, column):
    try:
        return int(row[column])
    except:
        return np.nan
df['num'] = df.apply(lambda x: toInt(x, 'num'), axis = 1)

In [15]:
# clean 0s and nulls
print(df.shape)
df = df.dropna(subset=['num'])
df = df[df['num'] != 0]
print(df.shape)

(1995948, 8)
(1477200, 8)


In [16]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype(int)
df['district_id'] = df['district_id'].astype(str)
df['school_id'] = df['school_id'].astype(int)
df['school_id'] = df['school_id'].astype(str)
df['num'] = df['num'].astype('int64')
print(df.dtypes)

district        object
district_id    float64
grade           object
group_state     object
num            float64
school          object
school_id      float64
year            object
dtype: object
district       object
district_id    object
grade          object
group_state    object
num             int64
school         object
school_id      object
year           object
dtype: object


In [17]:
print(df[['year', 'grade', 'group_state']].drop_duplicates())

         year             grade                                group_state
0        2013          Grade 12                                 All Groups
1        2013           Grade 6                                 All Groups
2        2013           Grade 9                                 All Groups
3        2013          Grade 10                                 All Groups
4        2013          Grade 11                                 All Groups
6        2013           Grade 7                                 All Groups
7        2013           Grade 8                                 All Groups
9        2013  Pre-kindergarten                                 All Groups
10       2013      Kindergarten                                 All Groups
11       2013           Grade 1                                 All Groups
12       2013           Grade 2                                 All Groups
13       2013           Grade 3                                 All Groups
14       2013           G

In [20]:
# export results
df.to_csv('./data/finalized/tx_enrollment.csv', index=False)