# Importing and cleaning data
This notebook will clean data from the California Department of Education for the analysis. Our goal is to merge all the data into a single cohesive geodataframe, which will happen in a subsequent notebook.  

 - lcff data
 - grades
 - demographics/shapefiles

In [1]:
import pandas as pd

## LCFF data
importing from [California Department of Education](https://ias.cde.ca.gov/lcffsnapshot/lcff.aspx) and cleaning.

In [2]:
# Load the raw LCFF funding snapshot downloaded from the CDE.
lcff = pd.read_csv(filepath_or_buffer='raw_data/lcffsnapshot18an.csv')

In [3]:
# Trim leading/trailing whitespace from every column header.
lcff.columns = [header.strip() for header in lcff.columns]

In [4]:
# Remove the final row of the file (the statewide totals line) so that only
# individual entries remain.
# NOTE: re-running this cell without reloading the data drops one more row.
lcff = lcff.drop(index=lcff.index[-1:])

In [5]:
# Normalise the CDS components to digit strings. Every column is cast through
# int first so that a float-typed column cannot leak a '.0' suffix into the
# text. County and district codes are then zero-padded to 2 and 5 characters,
# the widths used inside the 14-character cds code (2 county + 5 district +
# 7 school), so the concatenated code keeps its leading zeros.
lcff['County Code'] = lcff['County Code'].astype(int).astype(str).str.zfill(2)
lcff['District Code'] = lcff['District Code'].astype(int).astype(str).str.zfill(5)
# School Code is only converted here; it is padded to 7 characters separately.
lcff['School Code'] = lcff['School Code'].astype(int).astype(str)

In [6]:
# Left-pad every school code with zeros to a fixed width of 7 characters.
lcff['School Code'] = lcff['School Code'].map('{0:0>7}'.format)

In [7]:
# Keep only district-level rows: district entries carry an all-zero school
# code. The explicit copy makes the result an independent frame, so the column
# assignment and in-place drops applied to it afterwards do not trigger
# SettingWithCopyWarning.
lcff = lcff.loc[lcff['School Code'] == '0000000'].copy()

In [8]:
# Assemble the full CDS identifier from its county, district and school parts.
cds_code = lcff['County Code'] + lcff['District Code'] + lcff['School Code']
# The county and school components are no longer needed on their own
# ('District Code' is left in place).
lcff = lcff.drop(columns=['County Code', 'School Code'])
# Place the combined code in the first column.
lcff.insert(0, 'cds', cds_code)

In [9]:
# Map the verbose CDE headers to short snake_case names.
# The original mapping listed 'Unduplicated Pupil Percentage\nTarget, D-1'
# twice; in a dict literal the last entry wins, so the column has always been
# emitted as 'unduplicated_pupil_count'. The shadowed entry is removed here and
# the effective name is kept so the saved file does not change.
# NOTE(review): the source header says "Percentage" while the short name says
# "count" - confirm which is intended before renaming, since files written by
# this notebook already use 'unduplicated_pupil_count'.
lcff = lcff.rename(columns={
    'Charter Number': 'charter_number',
    'Unduplicated Pupil Percentage\nTarget, D-1': 'unduplicated_pupil_count',
    'Base Grant Funding\nTarget, C-5': 'base_grant',
    'Supplemental Grant Funding\nTarget, D-7': 'supplemental_grant',
    'Concentration Grant Funding\nTarget, E-8': 'concentration_grant',
    'Total LCFF Target Entitlement\nTarget, G-1/F-1': 'total_grants',
    'Necessary Small Schools Allowance\nTarget, F-1': 'nec_small_schools',
    'Local Educational Agency': 'districtname',
})

# Keep only the columns used downstream, as an independent frame.
lcff = lcff[[
    'districtname',
    'unduplicated_pupil_count',
    'base_grant',
    'supplemental_grant',
    'concentration_grant',
    'total_grants',
    'cds',
]].copy()

In [10]:
# Persist the cleaned LCFF table (the row index is written as well).
lcff.to_csv(path_or_buf="clean_data/lcff.csv")

## Grade data 
importing from [California Department of Education](https://www.cde.ca.gov/ta/ac/cm/datafilesfall18.asp) and cleaning

Math first

In [11]:
# Load the raw math results. The cds identifier is read as text rather than
# left to type inference, so it is never coerced to a number and any leading
# zeros present in the file are preserved.
math = pd.read_csv('raw_data/mathdownload2018.csv', dtype={'cds': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# Lookup table translating student group codes into readable labels.
# The outer key names the column the inner mapping applies to.
replacements = {
    'studentgroup': {
        'ALL': 'all_students',
        'AA': 'african_american',
        'AI': 'american_indian',
        'AS': 'asian',
        'FI': 'filipino',
        'HI': 'hispanic',
        'PI': 'pacific_islander',
        'WH': 'white',
        'MR': 'multiple_races',
        'EL': 'english_learners',
        'ELO': 'english_learners_only',
        'RFP': 'rfep_only',
        'EO': 'english_only',
        'SED': 'ses_disad',
        'SWD': 'disabilities',
        'FOS': 'foster_youth',
        'HOM': 'homeless_youth',
    },
}

In [13]:
# Swap the student group codes for readable labels. The nested mapping limits
# the replacement to the 'studentgroup' column.
# (The previous bare `math['studentgroup'].astype(str)` discarded its result
# and had no effect, so it has been removed.)
math = math.replace(replacements)

Fixing the CDS codes so that they match across datasets and can be used as merge keys.

In [14]:
# Work with the cds code as text. The result is assigned back: a bare
# `.astype(str)` returns a new Series and leaves the column untouched.
math['cds'] = math['cds'].astype(str)
# Drop county offices of education and keep district-level rows only. The copy
# gives an independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
math = math[(math['coe_flag'] != 'Y') & (math['rtype'] == 'D')].copy()
# Restore the leading zero on 13-digit codes so every cds has 14 characters.
math['cds'] = math['cds'].str.zfill(14)
# Characters 2-6 of the cds code (after the 2-character county prefix) form
# the district-only code.
math['District Code'] = math['cds'].str[2:7]

In [15]:
# Keep only the columns needed downstream, in this order. Plain column
# selection replaces the pop-and-concat construction, which passed `axis`
# positionally to pd.concat (no longer accepted by current pandas) and mutated
# the source frame while building the result.
math = math[[
    'District Code',
    'districtname',
    'studentgroup',
    'currstatus',
    'priorstatus',
    'change',
    'countyname',
]].copy()

In [16]:
# Persist the cleaned math table (the row index is written as well).
math.to_csv(path_or_buf="clean_data/math.csv")

English grades now

In [17]:
# Load the raw ELA results. The cds identifier is read as text rather than
# left to type inference, so it is never coerced to a number and any leading
# zeros present in the file are preserved.
ela = pd.read_csv('raw_data/eladownload2018.csv', dtype={'cds': str})

In [18]:
# Swap the student group codes for readable labels. The nested mapping limits
# the replacement to the 'studentgroup' column.
# (The previous bare `ela['studentgroup'].astype(str)` discarded its result
# and had no effect, so it has been removed.)
ela = ela.replace(replacements)

Fixing the CDS codes so that they match across datasets and can be used as merge keys.

In [19]:
# Work with the cds code as text. The result is assigned back: a bare
# `.astype(str)` returns a new Series and leaves the column untouched.
ela['cds'] = ela['cds'].astype(str)
# Drop county offices of education and keep district-level rows only. The copy
# gives an independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
ela = ela[(ela['coe_flag'] != 'Y') & (ela['rtype'] == 'D')].copy()
# Restore the leading zero on 13-digit codes so every cds has 14 characters.
ela['cds'] = ela['cds'].str.zfill(14)
# Characters 2-6 of the cds code (after the 2-character county prefix) form
# the district-only code.
ela['District Code'] = ela['cds'].str[2:7]

In [20]:
# Build the district-level ELA performance table (a DataFrame) from the
# columns needed downstream, in this order. Plain column selection replaces
# the pop-and-concat construction, which passed `axis` positionally to
# pd.concat (no longer accepted by current pandas) and mutated the source
# frame while building the result.
ela = ela[[
    'District Code',
    'districtname',
    'studentgroup',
    'currstatus',
    'priorstatus',
    'change',
    'countyname',
]].copy()

In [21]:
# Persist the cleaned ELA table (the row index is written as well).
ela.to_csv(path_or_buf="clean_data/ela.csv")

## Student attribute and shapefile data

This requires geopandas.

In [22]:
import geopandas as gpd

In [23]:
# Read the 2018-19 district areas shapefile into a GeoDataFrame.
t = gpd.read_file(filename="raw_data/DistrictAreas1819.shp")

In [24]:
# Write the district shapes and their attributes to the clean data folder.
t.to_file(filename="clean_data/attributes.shp")