# Title: Calculate_school_district_density

Author(s): <br>
Project Manager: Jaren Haber, PhD Candidate <br>
Contact: jhaber@berkeley.edu

Institution: University of California, Berkeley <br>
Program: Undergraduate Research Apprentice Program (URAP) <br>

Date created: 11|21|2018
Last modified: 11|21|2018



## Initialize

### Import packages

In [61]:
import pandas as pd # For working with DataFrames
import gc # For speeding up loading pickle files ('gc' = 'garbage collector')
import ast # For working with strings
import numpy as np # For numerical things
import re # For cleaning webtext
import _pickle as cPickle # Optimized version of pickle



### Import files

In [62]:
# Load the public-school table from its pickled backup and preview it.
pubframe = pd.read_pickle(
    "../../nowdata/backups/pubschools_full_2015_v2a_geoleaid.pkl"
)
pubframe.head()

Unnamed: 0,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,TOTFRL,CHARTER_TEXT,...,PCT_SE_T139_002,PCT_SE_T139_034,PCT_SE_T139_067,PCT_SE_T139_085,PCT_SE_T139_090,SE_T145_001,PCT_SE_T145_002,PCT_SE_T145_003,PCT_SE_T145_004,PCT_SE_T145_005
0,32.521681,-86.530132,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,15.22,32.31,0.0,7.21,45.26,54387.0,8.92,91.08,31.71,72.73
1,32.374847,-86.082332,,,,,,,,No,...,9.4,47.9,6.84,0.0,35.86,213902.0,12.5,87.5,37.64,63.07
2,33.583385,-86.710058,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,7.27,25.08,11.45,0.33,55.88,209221.0,14.9,85.1,41.87,54.08
3,31.938444,-87.750529,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,0.0,100.0,0.0,0.0,0.0,5377.0,6.73,93.27,46.22,63.55
4,33.673661,-86.628755,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,8.34,28.37,7.88,0.12,55.3,227223.0,9.97,90.03,34.06,69.21


In [63]:
# Read the ACS school-district CSV. The first two rows of the file are both
# used as header rows, so the resulting columns form a two-level MultiIndex.
acs = pd.read_csv(
    "../../data_management/data/ACS_2016_sd-merged_FULL.csv",
    header=[0, 1],
)

## Find how many schools are within each district

In [64]:
# Count schools per district: group the rows of pubframe by district ID
# (GEO_LEAID) and count the non-null school IDs (NCESSCH) in each group.
# The result has one row per district, with the count in 'All_school_counts'.
# NOTE(review): count() tallies non-null values, not distinct ones -- confirm
# NCESSCH is unique per row if a count of distinct schools is intended.
new_frame = (
    pubframe
    .groupby('GEO_LEAID')['NCESSCH']
    .count()
    .reset_index(name='All_school_counts')
)
new_frame.head()

Unnamed: 0,GEO_LEAID,All_school_counts
0,100001.0,2
1,100003.0,1
2,100005.0,6
3,100006.0,61
4,100007.0,18


In [65]:
# Keep only the two ACS columns needed below: the district FIPS code (the
# counterpart of GEO_LEAID in pubframe) and the district's land area.
area_frame = acs.loc[:, [('FIPS', 'Geo_FIPS'), ("Area (Land)", "Geo_AREALAND")]]
area_frame.head()

Unnamed: 0_level_0,FIPS,Area (Land)
Unnamed: 0_level_1,Geo_FIPS,Geo_AREALAND
0,100001.0,233067400.0
1,100003.0,8476777.0
2,100005.0,68780660.0
3,100006.0,1267794000.0
4,100007.0,121365400.0


In [66]:
# Attach each district's land area to its school count by matching
# GEO_LEAID (school data) against ('FIPS', 'Geo_FIPS') (ACS data).
#
# area_frame has two-level (MultiIndex) columns while new_frame has plain
# ones. pandas deprecated merging frames with different numbers of column
# levels and newer versions reject it with a MergeError, so flatten the ACS
# column labels to plain tuples first. The merged frame keeps the same
# tuple-named columns as before, e.g. ('Area (Land)', 'Geo_AREALAND').
area_flat = area_frame.copy()
area_flat.columns = pd.Index(list(area_frame.columns), tupleize_cols=False)

# Left join: keep exactly the districts that have a school count. An outer
# join would also add ACS-only districts as rows whose GEO_LEAID is NaN; once
# the FIPS column is dropped those rows can no longer be identified, and they
# would be carried into every later merge on GEO_LEAID.
merge_frame = pd.merge(
    new_frame,
    area_flat,
    how='left',
    left_on=['GEO_LEAID'],
    right_on=[('FIPS', 'Geo_FIPS')],
)
merge_frame.head()



Unnamed: 0,GEO_LEAID,All_school_counts,"(FIPS, Geo_FIPS)","(Area (Land), Geo_AREALAND)"
0,100001.0,2.0,100001.0,233067400.0
1,100003.0,1.0,100003.0,8476777.0
2,100005.0,6.0,100005.0,68780660.0
3,100006.0,61.0,100006.0,1267794000.0
4,100007.0,18.0,100007.0,121365400.0


## Add column 'density', by dividing counts by area.

In [68]:
# Density = number of schools in the district divided by its land area.
# NOTE(review): the unit of Geo_AREALAND is not stated in this notebook --
# presumably square metres; confirm before interpreting the values.
merge_frame['Density'] = merge_frame['All_school_counts'].div(
    merge_frame[("Area (Land)", "Geo_AREALAND")]
)
merge_frame.head()

Unnamed: 0,GEO_LEAID,All_school_counts,"(FIPS, Geo_FIPS)","(Area (Land), Geo_AREALAND)",Density
0,100001.0,2.0,100001.0,233067400.0,8.581209e-09
1,100003.0,1.0,100003.0,8476777.0,1.179694e-07
2,100005.0,6.0,100005.0,68780660.0,8.723382e-08
3,100006.0,61.0,100006.0,1267794000.0,4.811506e-08
4,100007.0,18.0,100007.0,121365400.0,1.483125e-07


## Drop the duplicate column "FIPS, Geo_FIPS", and we have the final output we want

In [69]:
# The FIPS column was only needed as the join key, so remove it; the tuple is
# a single column label, not two separate labels.
data = merge_frame.drop(labels=('FIPS', 'Geo_FIPS'), axis='columns')
data.head()

Unnamed: 0,GEO_LEAID,All_school_counts,"(Area (Land), Geo_AREALAND)",Density
0,100001.0,2.0,233067400.0,8.581209e-09
1,100003.0,1.0,8476777.0,1.179694e-07
2,100005.0,6.0,68780660.0,8.723382e-08
3,100006.0,61.0,1267794000.0,4.811506e-08
4,100007.0,18.0,121365400.0,1.483125e-07


## Merge the Density column into the original frame

In [70]:
# Attach the district-level columns (school count, land area, density) to
# every row of pubframe via the shared GEO_LEAID key.
#
# Rows of `data` without a GEO_LEAID cannot be attributed to any district, and
# pandas matches NaN keys with each other, so drop them before joining.
# A left join then keeps exactly the rows of pubframe in their original order;
# an outer join would also append rows for districts that have no school in
# pubframe, leaving every school-level column empty on those rows.
final_frame = pd.merge(
    pubframe,
    data.dropna(subset=['GEO_LEAID']),
    how='left',
    on=['GEO_LEAID'],
)

In [71]:
# Preview the first five rows of the merged frame.
final_frame.head(n=5)

Unnamed: 0,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,TOTFRL,CHARTER_TEXT,...,PCT_SE_T139_085,PCT_SE_T139_090,SE_T145_001,PCT_SE_T145_002,PCT_SE_T145_003,PCT_SE_T145_004,PCT_SE_T145_005,All_school_counts,"(Area (Land), Geo_AREALAND)",Density
0,32.521681,-86.530132,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,7.21,45.26,54387.0,8.92,91.08,31.71,72.73,17.0,1539582000.0,1.104196e-08
1,32.43965,-86.472581,,,,,,,,No,...,7.21,45.26,54387.0,8.92,91.08,31.71,72.73,17.0,1539582000.0,1.104196e-08
2,32.457364,-86.466194,1.0,11.0,93.0,8.0,1.0,25.0,209.0,No,...,7.21,45.26,54387.0,8.92,91.08,31.71,72.73,17.0,1539582000.0,1.104196e-08
3,32.462388,-86.46838,2.0,8.0,177.0,15.0,0.0,28.0,374.0,No,...,7.21,45.26,54387.0,8.92,91.08,31.71,72.73,17.0,1539582000.0,1.104196e-08
4,32.456134,-86.45417,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,7.21,45.26,54387.0,8.92,91.08,31.71,72.73,17.0,1539582000.0,1.104196e-08


## Count public (non-charter) schools and charter schools per district from pubframe

In [72]:
# Keep only the rows whose TRUE_CHARTER flag equals 0 (treated in this
# notebook as the regular, non-charter public schools).
pub_filter = pubframe.loc[pubframe['TRUE_CHARTER'] == 0]



In [75]:
# Same per-district count as for all schools, restricted to pub_filter:
# the number of non-null NCESSCH values per GEO_LEAID.
# NOTE(review): the column is named 'PUBLIC_DENSITY' but holds a plain count
# (it is not divided by land area) -- confirm this is intended.
pub_count = (
    pub_filter
    .groupby('GEO_LEAID')['NCESSCH']
    .count()
    .reset_index(name='PUBLIC_DENSITY')
)
pub_count.head()

Unnamed: 0,GEO_LEAID,PUBLIC_DENSITY
0,100001.0,2
1,100003.0,1
2,100005.0,6
3,100006.0,61
4,100007.0,18


12184

In [76]:
# Charter schools: keep the rows whose TRUE_CHARTER flag equals 1, then count
# the non-null NCESSCH values per GEO_LEAID.
# NOTE(review): the column is named 'CHARTER_DENSITY' but holds a plain count
# (it is not divided by land area) -- confirm this is intended.
charter_filter = pubframe[pubframe['TRUE_CHARTER'] == 1]
charter_count = (
    charter_filter
    .groupby('GEO_LEAID')['NCESSCH']
    .count()
    .reset_index(name='CHARTER_DENSITY')
)
charter_count.head()

Unnamed: 0,GEO_LEAID,CHARTER_DENSITY
0,100360.0,1
1,100390.0,1
2,100540.0,1
3,100630.0,1
4,100720.0,1


In [77]:
# Add the per-district PUBLIC_DENSITY column to the school-level frame,
# matching on GEO_LEAID; districts absent from pub_count get NaN.
Merged_frame = final_frame.merge(pub_count, how='outer', on='GEO_LEAID')



In [78]:
# Add the per-district CHARTER_DENSITY column the same way; districts absent
# from charter_count get NaN.
Merged_frame = Merged_frame.merge(charter_count, how='outer', on='GEO_LEAID')


In [79]:
# Preview the first 100 rows of the fully merged frame.
Merged_frame.head(n=100)

Unnamed: 0,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,TOTFRL,CHARTER_TEXT,...,SE_T145_001,PCT_SE_T145_002,PCT_SE_T145_003,PCT_SE_T145_004,PCT_SE_T145_005,All_school_counts,"(Area (Land), Geo_AREALAND)",Density,PUBLIC_DENSITY,CHARTER_DENSITY
0,32.521681,-86.530132,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
1,32.439650,-86.472581,,,,,,,,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
2,32.457364,-86.466194,1.0,11.0,93.0,8.0,1.0,25.0,209.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
3,32.462388,-86.468380,2.0,8.0,177.0,15.0,0.0,28.0,374.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
4,32.456134,-86.454170,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
5,32.433718,-86.659643,0.0,0.0,319.0,1.0,0.0,5.0,313.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
6,32.496773,-86.473458,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
7,32.700696,-86.474550,1.0,2.0,50.0,5.0,0.0,4.0,227.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
8,32.589386,-86.463004,1.0,4.0,109.0,27.0,2.0,24.0,508.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,
9,32.454222,-86.451616,2.0,11.0,165.0,18.0,1.0,11.0,338.0,No,...,54387.0,8.92,91.08,31.71,72.73,17.0,1.539582e+09,1.104196e-08,17.0,


## Save output / display results

In [81]:
# Save data to disk.
# The target file is named .pkl, so write an actual pickle (readable with
# pd.read_pickle, like the input file) instead of CSV text under a .pkl name.
Merged_frame.to_pickle("../../nowdata/backups/pubschools_full_2015_v2a_density.pkl")


## Challenge! 

Make your own copy of this notebook in the training/111418_meeting3/ folder. Include your last name in the beginning of the file name, for instance 'haber_template_notebook'. 

In your copy, print your name to screen using the combine_strings() function defined above. 

Then push the notebook to git! By default this will use git-LFS. Get used to it!

In [5]:
# NOTE(review): combine_strings is not defined in the visible part of this
# notebook -- presumably it comes from the template notebook; confirm it is
# defined before this cell is run.
combine_strings("Yitong ", "Chen")

'Yitong Chen'