In [952]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

import geopandas as gpd
import shapely
from shapely.geometry import Point

import re
import difflib
from collections import Counter
from collections import defaultdict

import dill
import folium


# Collect Crime data from New York Dept of Criminal Justice

In [None]:
# download all the crime data in NYS

r = request.get(...)

# get all the links
soup = BeautifulSoup(r.text,'lxml')
links = [ str(tag.get('href')) for tag in soup.table.find_all('a') if re.search(r'xls',str(tag))  ]

urlbase = 'http://www.criminaljustice.ny.gov/crimnet/ojsa/indexcrimes/'
s = requests.Session()
for tag in links:
    print tag
    rx = s.get(urlbase+str(tag))
    with open (tag,'w') as outfile:
        outfile.write(rx.content)


### Compute county averages for school districts with no mappable police districts

In [905]:
# input crime counts
county_read=pd.read_excel('./data_CrimeNYS/County-totals.xls',skiprows=7)
df_county_crime=county_read[county_read['Year']==2014][['County','Total','Total.1','Total.2']]
df_county_crime[:3]

Unnamed: 0,County,Total,Total.1,Total.2
4,Albany,9168,998,8170
9,Allegany,614,68,546
14,Bronx,36030,12713,23317


In [1034]:
# population data
countyshape_read= gpd.read_file('./Shapefiles/NYS_GIS_civil/Counties.shp').to_crs(epsg='4326')
df_countypop = countyshape_read[['NAME','POP2010']]
df_countypop[:3]

Unnamed: 0,NAME,POP2010
0,Albany,304204
1,Allegany,48946
2,Bronx,1385108


In [1036]:
df_county_crimerate = pd.merge(df_county_crime,df_countypop,left_on='County',right_on='NAME')
df_county_crimerate['crimerate_total']=df_county_crimerate['Total']/df_county_crimerate['POP2010']*100
df_county_crimerate['crimerate_violent']=df_county_crimerate['Total.1']/df_county_crimerate['POP2010']*100
df_county_crimerate['crimerate_property']=df_county_crimerate['Total.2']/df_county_crimerate['POP2010']*100
print len(df_county_crimerate)
df_county_crimerate[:3]

62


Unnamed: 0,County,Total,Total.1,Total.2,NAME,POP2010,crimerate_total,crimerate_violent,crimerate_property
0,Albany,9168,998,8170,Albany,304204,3.013767,0.328069,2.685698
1,Allegany,614,68,546,Allegany,48946,1.254444,0.138929,1.115515
2,Bronx,36030,12713,23317,Bronx,1385108,2.601241,0.917835,1.683407


### Map police districts to civil districts (which has population info)

In [169]:
filelist = ["Albany.xls","Allegany.xls","Bronx.xls","Broome.xls","Cattaraugus.xls","Cayuga.xls","Chautauqua.xls","Chemung.xls",
            "Chenango.xls","Clinton.xls","Columbia.xls","Cortland.xls","Delaware.xls","Dutchess.xls",
            "Erie.xls","Essex.xls","Franklin.xls","Fulton.xls","Genesee.xls","Greene.xls","Hamilton.xls","Herkimer.xls",
            "Jefferson.xls","Kings.xls","Lewis.xls","Livingston.xls","Madison.xls","Monroe.xls","Montgomery.xls","Nassau.xls",
            "NewYork.xls","Niagara.xls","Oneida.xls","Onondaga.xls","Ontario.xls","Orange.xls","Orleans.xls","Oswego.xls",
            "Otsego.xls","Putnam.xls","Queens.xls","Rensselaer.xls","Richmond.xls","Rockland.xls","Saratoga.xls",
            "Schenectady.xls","Schoharie.xls","Schuyler.xls","Seneca.xls","StLawrence.xls","Steuben.xls","Suffolk.xls",
            "Sullivan.xls","Tioga.xls","Tompkins.xls","Ulster.xls","Warren.xls","Washington.xls","Wayne.xls",
            "Westchester.xls","Wyoming.xls","Yates.xls"]

In [469]:
def getLatestRow(df):
    return df.sort_values('Year',ascending=False)[0:1]

def filterCrimeStats(df):

    # => remove rows with NaN data in total column
    df0 = df[~pd.isnull(df['Total'])]
    
    # => keep only city/town/village police (remove county,university police)    
    df1 = df0[df0['PD'].str.contains(r' City| Town| Vg')]
    #return df[df['PD'].str.contains('County|SUNY|College|Tech|Westchester|NYC|University|Investigat')]
    
    # => add a 'locale' column stripped of police district suffice and civil unit type
    df1['locale'] = df1['PD'].replace('( City| Vg| Town| PD)','',regex=True)
    
    # => for locales with multiple pds, take the larger one
    c = Counter(df1['locale'].tolist())
    df_out = pd.DataFrame()
    
    for locale, count in c.items():            
        
        df_sub = df1[df1['locale'].str.contains('^%s$' % locale,regex=True)]

        if count > 1:
            pd2use = sorted(df_sub['PD'].tolist(),key = lambda x:x[-2])[0]
            print pd2use
            df_locale = df_sub[df_sub['PD']==pd2use]
        else:
            df_locale = df_sub
        #print df_out
        df_out = df_out.append(df_locale)
    return(df_out)

#print df_latestrow[:3]
#filterCrimeStats(df_latestrow)

In [995]:
# concatenate all counties (latest year) into a dataframe
crime_allcounties = pd.DataFrame()
for fname in filelist:
    c_read=pd.read_excel('./data_CrimeNYS/%s' % fname,skiprows=7)
    
    df_latestrows = c_read.groupby('PD').apply(lambda df: getLatestRow(df))
    df_filtered = filterCrimeStats(df_latestrows)
    crime_allcounties = crime_allcounties.append(df_filtered)

# drop the 'PD' index that comes from groupby
crime_allcounties.index= pd.MultiIndex.droplevel(crime_allcounties.index)
crime_allcounties.index = range(len(crime_allcounties))

print len(crime_allcounties)
crime_allcounties[:5]

Elmira City PD
Fishkill Town PD
Poughkeepsie City PD
Tonawanda City PD
Hamburg Town PD
Frankfort Town PD
Montgomery Town PD
Chester Town PD
Goshen Town PD
Newburgh City PD
East Hampton Town PD
Southampton Town PD
Mamaroneck Town PD
391


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,County,PD,Year,Rptd,Total,Total.1,Murder,Rape,Robbery,Assault,Total.2,Burglary,Larceny,Theft,locale
0,Albany,Albany City PD,2014.0,,4650.0,770.0,8.0,27.0,236.0,499.0,3880.0,681.0,3077.0,122.0,Albany
1,Albany,Green Island Vg PD,2014.0,,49.0,3.0,0.0,0.0,0.0,3.0,46.0,6.0,37.0,3.0,Green Island
2,Albany,Cohoes City PD,2014.0,,263.0,39.0,0.0,1.0,12.0,26.0,224.0,50.0,153.0,21.0,Cohoes
3,Albany,Guilderland Town PD,2014.0,,680.0,19.0,0.0,3.0,4.0,12.0,661.0,58.0,598.0,5.0,Guilderland
4,Albany,Coeymans Town PD,2014.0,,87.0,24.0,1.0,4.0,3.0,16.0,63.0,14.0,44.0,5.0,Coeymans


In [657]:
# (for matching pd_names to civil_names)
# store the index and (cleaned) locale names as values in the county dict

replacelist_pd=[' pd',' police district', ' vg', ' town', ' city'] # cleaning
def clean_pdnames(slist):
    s_out = []
    for s in slist:
        s_out.append(re.sub('-',' ',re.sub(r'|'.join(replacelist_pd),'',s.lower())) )
    return s_out
    
pd_dict = {}

for county in crime_allcounties['County'].unique():
    
    df_sub = crime_allcounties[crime_allcounties['County'] == county]
    pd_it = zip(df_sub.index, clean_pdnames(df_sub['locale'].tolist()))

    pd_dict[county.lower()] = pd_it



# Population from census (not used currently)

In [702]:
# minor civil divisions = 62 cities and 932 towns
pop_read1 = pd.read_csv('./Census_population/MinorCivilDivisions/PEP_2014_PEPANNRES.csv',skiprows=1)
pop_read1[:3]

Unnamed: 0,Id,Id2,Geography,"April 1, 2010 - Census","April 1, 2010 - Estimates Base",Population Estimate (as of July 1) - 2010,Population Estimate (as of July 1) - 2011,Population Estimate (as of July 1) - 2012,Population Estimate (as of July 1) - 2013,Population Estimate (as of July 1) - 2014
0,0610000US3600101000,3600101000,"Albany city, Albany County, New York",97856,97856,97748,98012,98474,98628,98566
1,0610000US3600106211,3600106211,"Berne town, Albany County, New York",2794,2794,2792,2800,2813,2818,2826
2,0610000US3600106354,3600106354,"Bethlehem town, Albany County, New York",33656,33658,33650,33812,34150,34471,34685


In [703]:
# incorporated = 62 cities and 550 villages; towns are assumed to be in MCU list 
# and not reflected here, but many towns and villages share the same name

pop_read2 = pd.read_csv('./Census_population/Incorporated/PEP_2014_PEPANNRES.csv',skiprows=1)
pop_read2[:3]


Unnamed: 0,Id,Id2,Geography,"April 1, 2010 - Census","April 1, 2010 - Estimates Base",Population Estimate (as of July 1) - 2010,Population Estimate (as of July 1) - 2011,Population Estimate (as of July 1) - 2012,Population Estimate (as of July 1) - 2013,Population Estimate (as of July 1) - 2014
0,1620000US3600199,3600199,"Adams village, New York",1775.0,1782,1788,1819,1860,1839,1839
1,1620000US3600276,3600276,"Addison village, New York",1763.0,1763,1760,1763,1749,1742,1730
2,1620000US3600342,3600342,"Afton village, New York",822.0,832,830,828,824,819,818


In [704]:
pop_read=pop_read1.append(pop_read2)


In [706]:
pop_read[pop_read['Geography'].str.contains('Scar')]

Unnamed: 0,Id,Id2,Geography,"April 1, 2010 - Census","April 1, 2010 - Estimates Base",Population Estimate (as of July 1) - 2010,Population Estimate (as of July 1) - 2011,Population Estimate (as of July 1) - 2012,Population Estimate (as of July 1) - 2013,Population Estimate (as of July 1) - 2014
979,0610000US3611965442,3611965442,"Scarsdale town, Westchester County, New York",17166.0,17166,17194,17345,17466,17625,17729
495,1620000US3665431,3665431,"Scarsdale village, New York",17166.0,17166,17194,17345,17466,17625,17729


In [996]:
df_pop = pop_read[['Id2','Geography','Population Estimate (as of July 1) - 2014']]
df_pop.rename(columns={'Id2':'geoid2','Geography':'locale','Population Estimate (as of July 1) - 2014':'pop_2014'},inplace=True)
df_pop['county'] = df_pop['locale'].str.replace('(County.*$|^.*city, |^.*town, )','')
df_pop['locale'] = df_pop['locale'].str.replace('( city.*$| town.*$)','')
df_pop[:3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,geoid2,locale,pop_2014,county
0,3600101000,Albany,98566,Albany
1,3600106211,Berne,2826,Albany
2,3600106354,Bethlehem,34685,Albany


# New York GIS Civil Boundaries

In [710]:
civshape_read1 = gpd.read_file('./Shapefiles/NYS_GIS_civil/Cities_Towns.shp').to_crs(epsg='4326')
df_civshape1 = civshape_read1[['CALC_SqMi','COUNTY','NAME','POP2010','SWIS','geometry']]


In [709]:
civshape_read2 = gpd.read_file('./Shapefiles/NYS_GIS_civil/Villages.shp').to_crs(epsg='4326')
civshape_read2[:3]
df_civshape2 = civshape_read2[['CALC_SqMi','COUNTY','NAME','POP2010','SWIS','geometry']]


In [732]:
df_civshape = df_civshape1.append(df_civshape2)
df_civshape = df_civshape.groupby('NAME').first()
df_civshape.reset_index(level = 0, inplace = True)


In [733]:
print len(civshape_read1)
print len(civshape_read2)
print len(df_civshape)


994
545
1287


In [739]:
df_civshape[:3]

Unnamed: 0,NAME,CALC_SqMi,COUNTY,POP2010,SWIS,geometry
0,Adams,42.382699,Jefferson,5143,222000,POLYGON ((-75.97855203259267 43.80364563132086...
1,Addison,25.678418,Steuben,2595,462000,POLYGON ((-77.22875495979802 42.09075298909016...
2,Afton,46.683722,Chenango,2851,82000,POLYGON ((-75.44110294422454 42.19511895955584...


In [740]:
# store a county dict: (index, cleaned-name) 
def clean_civnames(slist):
    s_out = []
    for s in slist:
        s_out.append(re.sub('-',' ',s.lower()))
    return s_out

civ_dict = {}
for county in df_civshape['COUNTY'].unique():
    
    df_sub = df_civshape[df_civshape['COUNTY'] == county]
    civ_it = zip(df_sub.index, clean_civnames(df_sub['NAME'].tolist()))

    civ_dict[county.lower()] = civ_it


In [213]:
# naive assignment:
# within each county, assign pd to civ

def getSimilarity(s1,s2):
    return SequenceMatcher(None,s1,s2).ratio() 

def getBestMatchID(targetlist,candidatelist,cutoff):
    bestid_target = []

    for tname in targetlist:
        max_id=-1
        max_ratio = 0
        for j,cname in enumerate(candidatelist):
            r = getSimilarity(tname,cname)
            if r>=cutoff and r > max_ratio:
                max_ratio = r
                max_id = j
            best_cand = candidatelist[max_id] if max_id > -1 else 'NOT_FOUND'
        bestid_target.append((max_id,best_cand,max_ratio))
    return bestid_target


In [678]:
# reciprocal assignment analogous to ortholog pairing
# inputs are target and candidate (index in df, locale_name)

def reciprocal_assignment (ZT, ZC):

    cutoff=0.7

    bestid_T2C = getBestMatchID(ZT[1],ZC[1],cutoff)
    bestid_C2T = getBestMatchID(ZC[1],ZT[1],cutoff)

    T2C_assignment = [(-1,-1)]*len(ZT[1])

    for i,tname in enumerate(ZT[1]):
        j = bestid_T2C[i][0]
        if j == -1:
            T2C_assignment[i] = (ZT[0][i],-1)

        if bestid_C2T[j][0] == i or bestid_C2T[j][2] == bestid_T2C[i][2]:
            T2C_assignment[i] = (ZT[0][i],ZC[0][j])
            # civ_bAssigned[j] = 1
            # print pd_names_cleaned[i]+' '+str(bestid_pd[i])+' YES'
            # nAssigned += 1
        #else:
            # print pd_names_cleaned[i]+' '+str(bestid_pd[i])+str(bestid_sd[j])+' NO'
            # nNotAssigned += 1

    return T2C_assignment


In [741]:
pd2civ_assignment = []

for county in pd_dict.keys():
#for county in ['westchester','albany']:

    zp = zip(*pd_dict[county])
    zc = zip(*civ_dict[county])

    #pd2civ_assignment.extend( zip (zp[0],assignments) 
    pd2civ_assignment.extend( reciprocal_assignment (zp, zc) )



In [742]:
len(pd2civ_assignment)


391

In [1037]:
#t=crime_allcounties[crime_allcounties['County']=='Westchester']
t=crime_allcounties
df_assign = pd.DataFrame(pd2civ_assignment,columns=['oldind','targetind'])
df_assign
t1=pd.merge(t,df_assign,left_index=True,right_on='oldind')
t2=pd.merge(t1,df_civshape,left_on='targetind',right_index = True,how='inner')

t2[['County','PD','Total','Total.1','Total.2','COUNTY','NAME','POP2010','geometry']]

df_crimerate_shape = t2[['County','PD','Total','Total.1','Total.2','COUNTY','NAME','POP2010','geometry']]
df_crimerate_shape['crimerate_total'] = df_crimerate_shape['Total']/df_crimerate_shape['POP2010']*100
df_crimerate_shape['crimerate_violent'] = df_crimerate_shape['Total.1']/df_crimerate_shape['POP2010']*100
df_crimerate_shape['crimerate_property'] = df_crimerate_shape['Total.2']/df_crimerate_shape['POP2010']*100
print len(crime_allcounties)
print len(df_assign)
print len(t1)
print len(t2)
print len(df_crimerate_refshape)
df_crimerate_shape[:3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


391
391
389
385
672


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,County,PD,Total,Total.1,Total.2,COUNTY,NAME,POP2010,geometry,crimerate_total,crimerate_violent,crimerate_property
110,Albany,Albany City PD,4650.0,770.0,3880.0,Albany,Albany,97856,POLYGON ((-73.81123351640107 42.69007664226881...,4.75188,0.786871,3.96501
111,Albany,Green Island Vg PD,49.0,3.0,46.0,Albany,Green Island,2620,POLYGON ((-73.68429398585005 42.76076701652282...,1.870229,0.114504,1.755725
112,Albany,Cohoes City PD,263.0,39.0,224.0,Albany,Cohoes,16168,POLYGON ((-73.69874375781777 42.75128521139592...,1.62667,0.241217,1.385453


In [744]:
len(t2[t2['targetind']!=-1])

385

# refshape : school district

In [1038]:
df_refshape = dill.load(open('../OUTDATA/refshape_sch2015.dill','rb'))
refSHP=df_refshape['geometry'].tolist()


In [1039]:
# assign each school district to a crime data region

def findEncompassingShape(point,shapelist):
    for refshape_ind,refshape in shapelist:
        if refshape.contains(point):
            return refshape_ind
    return -1

ref_i_centroids = zip(df_refshape.index,df_refshape['centroid'])
crimerate_shapes = zip(df_crimerate_shape.index,df_crimerate_shape['geometry'])
index_map = []

for refindex,centroid in ref_i_centroids:
    p = Point(centroid[0],centroid[1])
    crime_index = findEncompassingShape(p,crimerate_shapes)
    index_map.append((refindex,crime_index))

#index_map

In [1040]:
# join crime data to school data
df_assign=pd.DataFrame(index_map,columns=['oldind','targetind'])
t1=pd.merge(df_refshape,df_assign,left_index=True,right_on='oldind',how='left')
t2=pd.merge(t1,df_crimerate_shape[['PD','crimerate_total','crimerate_violent','crimerate_property']],left_on='targetind',right_index = True,how='left')
#t2[:3]
df_crimerate_refshape = t2[['name','county','geometry','PD','crimerate_total','crimerate_violent','crimerate_property']]
print len(df_crimerate_refshape)
df_crimerate_refshape[:3]


672


Unnamed: 0,name,county,geometry,PD,crimerate_total,crimerate_violent,crimerate_property
0,North Shore Central School District,Nassau,"POLYGON ((-73.67107299999999 40.859299, -73.65...",,,,
1,Seaford Union Free School District,Nassau,"POLYGON ((-73.50683099999999 40.666806, -73.50...",Hempstead Vg PD,0.190324,0.06331,0.127014
2,Uniondale Union Free School District,Nassau,"POLYGON ((-73.619737 40.68251799999999, -73.61...",Hempstead Vg PD,0.190324,0.06331,0.127014


In [1041]:
# separate school districts with (a) pd data and (b)those that do not
df_crimerate_refshape_pd = df_crimerate_refshape[~ pd.isnull(df_crimerate_refshape['crimerate_total']) ]
df_crimerate_refshape_pd['refshpindex']=df_crimerate_refshape_pd.index.values
print len(df_crimerate_refshape_pd)
df_crimerate_refshape_pd[:3]

286


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,name,county,geometry,PD,crimerate_total,crimerate_violent,crimerate_property,refshpindex
1,Seaford Union Free School District,Nassau,"POLYGON ((-73.50683099999999 40.666806, -73.50...",Hempstead Vg PD,0.190324,0.06331,0.127014,1
2,Uniondale Union Free School District,Nassau,"POLYGON ((-73.619737 40.68251799999999, -73.61...",Hempstead Vg PD,0.190324,0.06331,0.127014,2
3,Wantagh Union Free School District,,"POLYGON ((-73.52614799999999 40.691816, -73.52...",Hempstead Vg PD,0.190324,0.06331,0.127014,3


In [1042]:
df_crimerate_refshape_nopd = df_crimerate_refshape[pd.isnull(df_crimerate_refshape['crimerate_total'])][['name','county','geometry','PD']]
df_crimerate_refshape_nopd['refshpindex']=df_crimerate_refshape_nopd.index.values
print len(df_crimerate_refshape_nopd)
df_crimerate_refshape_nopd[:3]

386


Unnamed: 0,name,county,geometry,PD,refshpindex
0,North Shore Central School District,Nassau,"POLYGON ((-73.67107299999999 40.859299, -73.65...",,0
4,Bradford Central School District,Schuyler,"POLYGON ((-77.188081 42.376986, -77.182283 42....",,4
5,Hammondsport Central School District,Steuben,"POLYGON ((-77.285324 42.395859, -77.283485 42....",,5


In [1045]:
# for those with no police districts assigned, use the county average
# left join so all refshapes are listed even if no merge val found
t2 = pd.merge(df_crimerate_refshape_nopd,df_county_crimerate[['County','crimerate_total','crimerate_violent','crimerate_property']], left_on='county',right_on='County',how='left')
del t2['County']
print len(t2)
t2[:3]

386


Unnamed: 0,name,county,geometry,PD,refshpindex,crimerate_total,crimerate_violent,crimerate_property
0,North Shore Central School District,Nassau,"POLYGON ((-73.67107299999999 40.859299, -73.65...",,0,1.319789,0.152516,1.167273
1,Bradford Central School District,Schuyler,"POLYGON ((-77.188081 42.376986, -77.182283 42....",,4,0.899526,0.054517,0.845009
2,Hammondsport Central School District,Steuben,"POLYGON ((-77.285324 42.395859, -77.283485 42....",,5,1.49106,0.112133,1.378927


In [1046]:
df_crimerate_refshape_join = pd.concat([df_crimerate_refshape_pd,t2])
df_crimerate_refshape_join.index=df_crimerate_refshape_join['refshpindex']
print len(df_crimerate_refshape_join)
df_crimerate_refshape_join[:3]


672


Unnamed: 0_level_0,PD,county,crimerate_property,crimerate_total,crimerate_violent,geometry,name,refshpindex
refshpindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Hempstead Vg PD,Nassau,0.127014,0.190324,0.06331,"POLYGON ((-73.50683099999999 40.666806, -73.50...",Seaford Union Free School District,1
2,Hempstead Vg PD,Nassau,0.127014,0.190324,0.06331,"POLYGON ((-73.619737 40.68251799999999, -73.61...",Uniondale Union Free School District,2
3,Hempstead Vg PD,,0.127014,0.190324,0.06331,"POLYGON ((-73.52614799999999 40.691816, -73.52...",Wantagh Union Free School District,3


In [1026]:
df_crimerate_refshape_join[df_crimerate_refshape_join.index.values==1]


Unnamed: 0_level_0,PD,county,crimerate_property,crimerate_total,crimerate_violent,geometry,name,refshpindex
refshpindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Hempstead Vg PD,Nassau,0.127014,0.190324,0.06331,"POLYGON ((-73.50683099999999 40.666806, -73.50...",Seaford Union Free School District,1


In [1050]:
# save the data
dill.dump(df_crimerate_refshape_join,open('../OUTDATA/refshape_crime2014.dill','wb'))


# Plotting with Folium

In [1047]:
crs = crs = {'init': 'epsg:4326', 'no_defs': True}
geo_df = gpd.GeoDataFrame(df_crimerate_refshape_join, crs=crs)
geo_str = geo_df.to_json()

In [1048]:
print geo_df['crimerate_total'].describe()

count    657.000000
mean       1.704280
std        2.075190
min        0.000000
25%        0.911491
50%        1.523083
75%        2.112267
max       47.045730
Name: crimerate_total, dtype: float64


In [1031]:
# function to provide color (from colorbrewer)
def getColor(d):
    if d > 5: 
        return '#8e0152'
#    elif d > 45:
#        return '#c51b7d'
    elif d > 2.5:
        return '#de77ae'
    elif d > 2:
        return '#f1b6da'
#    elif d > 2.5:
#        return '#fde0ef'
    elif d > 1.5:
        return '#f7f7f7' 
    elif d > 1:
        return '#e6f5d0'
    elif d > 0.5:
        return '#b8e186'
#    elif d > .5:    
#        return '#7fbc41'
    elif d > 0: 
        return '#4d9221'
    return '#d9d9d9'

In [1051]:
#
m = folium.Map([40.6,-73.938], zoom_start=8,tiles='cartodbpositron')

folium.GeoJson(geo_df,    style_function=lambda feature: {
        'fillColor': getColor(feature['properties']['crimerate_total']),
        'color' : 'black',
        'weight' : 1,
        'fillOpacity':1, 'opacity':0.2
        }
              ).add_to(m)
path='crimemap.html'
m.save(path)


In [997]:
#testing
t = gpd.GeoDataFrame({'state':['a','b','c','d'],'score':range(4),'geometry':[Point(1,1),Point(3,3),Point(4,5),Point(2,2)]})
jstring = t.to_json()
print jstring

{"type": "FeatureCollection", "features": [{"geometry": {"type": "Point", "coordinates": [1.0, 1.0]}, "type": "Feature", "id": "0", "properties": {"state": "a", "score": 0}}, {"geometry": {"type": "Point", "coordinates": [3.0, 3.0]}, "type": "Feature", "id": "1", "properties": {"state": "b", "score": 1}}, {"geometry": {"type": "Point", "coordinates": [4.0, 5.0]}, "type": "Feature", "id": "2", "properties": {"state": "c", "score": 2}}, {"geometry": {"type": "Point", "coordinates": [2.0, 2.0]}, "type": "Feature", "id": "3", "properties": {"state": "d", "score": 3}}]}
