# Importing and cleaning data
This notebook will clean data from the California Department of Education for the analysis. Our goal is to merge all the data into a single cohesive geodataframe, which will happen in a subsequent notebook.  

 - lcff data
 - grades
 - demographics/shapefiles

In [1]:
import pandas as pd

## LCFF data
Imported from the [California Department of Education](https://ias.cde.ca.gov/lcffsnapshot/lcff.aspx) and cleaned below.

In [2]:
# Load the raw LCFF snapshot (comma-separated) from the raw_data folder.
lcff = pd.read_csv('raw_data/lcffsnapshot18an.csv', sep=',')

In [3]:
# Trim leading/trailing whitespace from every column label so that later
# lookups by name match exactly.
lcff.columns = [label.strip() for label in lcff.columns]

In [4]:
# The last row of the file holds the state totals; drop it from the frame.
lcff = lcff.drop(index=lcff.index[-1:])

In [5]:
# Convert the three code columns to text so they can be concatenated into a
# single CDS code later. District and school codes are cast to int first and
# then to str; the county code is cast to str directly.
lcff['County Code'] = lcff['County Code'].astype(str)
for code_column in ('District Code', 'School Code'):
    lcff[code_column] = lcff[code_column].astype(int).astype(str)

In [6]:
# Left-pad every school code with zeros to a fixed width of 7 characters.
lcff['School Code'] = lcff['School Code'].str.rjust(7, '0')

In [7]:
# Keep only district-level rows: district entries have no school-level CDS
# code, so their zero-padded school code is all zeros.
# .copy() makes the result an independent frame, so the column assignments
# performed afterwards do not trigger pandas' SettingWithCopyWarning.
lcff = lcff.loc[lcff['School Code'] == '0000000'].copy()

In [8]:
# Concatenate county + district + school codes into the full CDS code.
cds_codes = lcff['County Code'] + lcff['District Code'] + lcff['School Code']
# The county and school parts are no longer needed on their own
# (the district code column is kept).
lcff = lcff.drop(columns=['County Code', 'School Code'])
# Put the combined code in the first column.
lcff.insert(0, 'cds', cds_codes)

In [9]:
# Rename the verbose source headers to short snake_case names.
#
# NOTE: the 'Unduplicated Pupil Percentage' header used to appear twice in
# this mapping. Python keeps only the last duplicate key in a dict literal,
# so the effective name has always been 'unduplicated_pupil_count'; the dead
# first entry is removed. The column holds a percentage, but the name is
# kept because the later column selection refers to it.
#
# NOTE(review): the 'Necessary Small Schools Allowance' key uses '\n' while
# every other multi-line key uses '\r\n'. rename() silently ignores keys
# that match no column, so confirm this key matches the raw header.
lcff = lcff.rename(columns={
    'Charter Number': 'charter_number',
    'Base Grant Funding\r\nTarget, C-5': 'base_grant',
    'Supplemental Grant Funding\r\nTarget, D-7': 'supplemental_grant',
    'Concentration Grant Funding\r\nTarget, E-8': 'concentration_grant',
    'Total LCFF Target Entitlement\r\nTarget, G-1/F-1': 'total_grants',
    'Unduplicated Pupil Percentage\r\nTarget, D-1': 'unduplicated_pupil_count',
    'Necessary Small Schools Allowance\nTarget, F-1': 'nec_small_schools',
    'Local Educational Agency': 'districtname',
})

In [10]:
# Display the district-level frame to check the renamed columns.
lcff

Unnamed: 0,cds,District Code,districtname,charter_number,"TK/K-3 ADA\r\nTarget, B-5/B-1","4 - 6 ADA\r\nTarget, B-6/B-2","7 - 8 ADA\r\nTarget, B-7/B-3","9 - 12 ADA\r\nTarget, B-8/B-4",unduplicated_pupil_count,base_grant,...,"Add-On (Based on 2012-13 Small School District Bus Replacement Program)\r\nTarget, F-4",total_grants,"Total LCFF Floor Entitlement\r\nTransition, B-13","Current Year Gap Funding (100%)\r\nTransition, C-3","Economic Recovery Target\r\nTransition, D-1","Miscellaneous Adjustments\r\nTransition, E-1","Total Local Revenue or In-Lieu of Property Taxes\r\nTransition, F-2/F-7","Education Protection Account Entitlement\r\nTransition, F-5/F-9","Net State Aid\r\nTransition, F-6/F-10","Additional SA for MSA Guarantee\r\nTransition, H-1/H-2"
10,01611190000000,61119,Alameda Unified,,2870.43,1917.91,1219.66,3050.17,0.3486,75938983,...,-,82158116,77176229,4981887,-,-,29277891,14772333,38107892,-
16,01611270000000,61127,Albany City Unified,,1060.09,806.37,554.25,1131.64,0.2833,29644972,...,-,31352540,29460891,1891649,-,-,11005115,5777202,14570223,-
17,01611430000000,61143,Berkeley Unified,,2828.54,2101.08,1397.55,3070.55,0.3519,78556532,...,-,89369621,84726528,4643093,820253,-,43425650,8875387,37888837,-
19,01611500000000,61150,Castro Valley Unified,,2617.44,2032.50,1400.40,2918.48,0.2904,74911585,...,-,79554607,74489641,5064966,-,-,26572525,14660230,38321852,-
20,01611680000000,61168,Emery Unified,,237.65,151.54,104.99,191.97,0.7953,5702229,...,-,7326875,6698947,627928,108723,-,5582328,137230,1716040,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233,58727280000000,72728,Camptonville Elementary,,21.70,19.04,12.55,-,0.6988,420692,...,-,537244,574074,-,-,-,26503,94238,416503,36830
2235,58727360000000,72736,Marysville Joint Unified,,3309.27,2265.78,1308.64,2328.80,0.8168,76193862,...,-,100261913,93267520,6994393,-,-,17303554,14953622,68004737,-
2238,58727440000000,72744,Plumas Lake Elementary,,615.24,400.21,277.70,-,0.3868,10261440,...,-,11108901,10433605,675296,355998,-,1029589,2348862,8086448,-
2239,58727510000000,72751,Wheatland,,580.88,389.53,284.05,-,0.4783,9947133,...,-,11234349,10546226,688123,309994,-,1007306,1944794,8592243,-


In [11]:
# Copy only the relevant columns into a new, independent dataframe.
selected_columns = [
    'districtname',
    'unduplicated_pupil_count',
    'base_grant',
    'supplemental_grant',
    'concentration_grant',
    'total_grants',
    'cds',
]
lcff = lcff.loc[:, selected_columns].copy()

In [12]:
# Save the cleaned LCFF table to disk; the row index is written as well.
lcff.to_csv("clean_data/lcff.csv", index=True)

## Grade data 
Imported from the [California Department of Education](https://www.cde.ca.gov/ta/ac/cm/datafilesfall18.asp) and cleaned below.

Math first

In [13]:
# Load the raw math file. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids
# mixed-type columns and the DtypeWarning that goes with them.
# NOTE(review): the saved output of this cell looks like the tail of that
# warning -- confirm it disappears on a fresh run.
math = pd.read_csv('raw_data/mathdownload2018.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Lookup table for DataFrame.replace(): the outer key names the column to
# touch, the inner dict maps each student-group code to a readable label.
replacements = {
    'studentgroup': {
        'ALL': 'all_students',
        'AA': 'african_american',
        'AI': 'american_indian',
        'AS': 'asian',
        'FI': 'filipino',
        'HI': 'hispanic',
        'PI': 'pacific_islander',
        'WH': 'white',
        'MR': 'multiple_races',
        'EL': 'english_learners',
        'ELO': 'english_learners_only',
        'RFP': 'rfep_only',
        'EO': 'english_only',
        'SED': 'ses_disad',
        'SWD': 'disabilities',
        'FOS': 'foster_youth',
        'HOM': 'homeless_youth',
    },
}

In [15]:
# Replace the student-group codes with readable labels; `replacements` is
# keyed by column name, so only 'studentgroup' is affected.
# The former bare `math['studentgroup'].astype(str)` statement was removed:
# its result was never assigned, so it had no effect on the frame.
math = math.replace(replacements)

Fix the CDS codes so they match across datasets, which enables the merges.

In [16]:
# Keep district-level rows only: drop county offices of education
# (coe_flag == 'Y') and keep record type 'D'.
# .copy() detaches the filtered rows so the column assignments below do not
# raise SettingWithCopyWarning.
district_rows = (math['coe_flag'] != 'Y') & (math['rtype'] == 'D')
math = math.loc[district_rows].copy()
# Store the CDS code as text, left-padded with zeros to 14 characters
# (e.g. restoring a leading zero dropped from a 13-digit number).
# The earlier bare `math['cds'].astype(str)` discarded its result; the
# conversion is now actually applied before padding.
math['cds'] = math['cds'].astype(str).str.rjust(14, '0')
# District-only code: characters 2-6 of the CDS code (slice [2:7]).
math['District Code'] = math['cds'].str[2:7]

In [17]:
# Keep only the listed columns, in this order.
# Plain label selection replaces the former pd.concat([...pop()...], 1):
# passing `axis` positionally is rejected by pandas 2.x, and selecting the
# columns directly produces the same frame.
math = math[[
    'District Code',
    'districtname',
    'studentgroup',
    'currstatus',
    'priorstatus',
    'change',
    'countyname',
]].copy()

In [18]:
# Save the cleaned math table to disk; the row index is written as well.
math.to_csv("clean_data/math.csv", index=True)

English grades now

In [19]:
# Load the raw ELA (English) file, comma-separated, from the raw_data folder.
ela = pd.read_csv('raw_data/eladownload2018.csv', sep=',')

In [20]:
# Replace the student-group codes with readable labels; `replacements` is
# keyed by column name, so only 'studentgroup' is affected.
# The former bare `ela['studentgroup'].astype(str)` statement was removed:
# its result was never assigned, so it had no effect on the frame.
ela = ela.replace(replacements)

Fix the CDS codes so they match across datasets, which enables the merges.

In [21]:
# Keep district-level rows only: drop county offices of education
# (coe_flag == 'Y') and keep record type 'D'.
# .copy() detaches the filtered rows so the column assignments below do not
# raise SettingWithCopyWarning.
district_rows = (ela['coe_flag'] != 'Y') & (ela['rtype'] == 'D')
ela = ela.loc[district_rows].copy()
# Store the CDS code as text, left-padded with zeros to 14 characters
# (e.g. restoring a leading zero dropped from a 13-digit number).
# The earlier bare `ela['cds'].astype(str)` discarded its result; the
# conversion is now actually applied before padding.
ela['cds'] = ela['cds'].astype(str).str.rjust(14, '0')
# District-only code: characters 2-6 of the CDS code (slice [2:7]).
ela['District Code'] = ela['cds'].str[2:7]

In [22]:
# Build the district-level ELA performance dataframe from the listed
# columns, in this order.
# Plain label selection replaces the former pd.concat([...pop()...], 1):
# passing `axis` positionally is rejected by pandas 2.x, and selecting the
# columns directly produces the same frame.
ela = ela[[
    'District Code',
    'districtname',
    'studentgroup',
    'currstatus',
    'priorstatus',
    'change',
    'countyname',
]].copy()

In [23]:
# Save the cleaned ELA table to disk; the row index is written as well.
ela.to_csv("clean_data/ela.csv", index=True)

## Student attribute and shapefile data

This requires geopandas.

In [24]:
import geopandas as gpd

In [25]:
# Load the shapefile with geopandas.
# NOTE(review): presumably district boundary areas, judging by the file
# name -- confirm against the data source.
t = gpd.read_file("raw_data/DistrictAreas1819.shp")

In [26]:
# Write the loaded geodata, with no changes applied in this notebook,
# to the clean_data folder.
t.to_file("clean_data/attributes.shp")
# save to disk