In [1]:
__author__ = "me"
__date__ = "2015_10_13"

%pylab inline
import pandas as pd
import geopandas as gp
import numpy as np
import random

import pylab as plt
import os

from geopandas.tools import sjoin
from shapely.geometry import Point

from fuzzywuzzy import process

import requests
s = requests.get("https://raw.githubusercontent.com/Casyfill/CUSP_templates/master/Py/fbMatplotlibrc.json").json()
plt.rcParams.update(s)

numpy.random.seed(2015)

PARQA = os.getenv('PARQA')

Populating the interactive namespace from numpy and matplotlib




## Split calls to named and geolocated

In [2]:
calls = pd.read_csv(PARQA + 'data/311/311DPR.csv', encoding='utf8', na_values='Unspecified')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
myCalls = calls[['Park Facility Name','Descriptor','Created Date','Closed Date','Longitude','Latitude','Location Type', 'Complaint Type']]
myCalls['Park Facility Name'] = myCalls['Park Facility Name'].str.lower()
myCalls['Park Facility Name'].head()


namedCalls = myCalls[pd.notnull(myCalls['Park Facility Name'])]
geoCalls = myCalls[(pd.isnull(myCalls['Park Facility Name'])) & (pd.notnull(myCalls.Latitude))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [4]:
geoCalls.head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,Complaint Type
0,,Snow or Ice,12/31/2010 09:04:48 PM,01/03/2011 12:03:59 PM,-73.93112,40.668798,Park,Maintenance or Facility
3,,Snow or Ice,12/31/2010 03:36:37 PM,01/03/2011 09:41:24 AM,-73.962835,40.688556,Park,Maintenance or Facility
4,,Snow or Ice,12/31/2010 03:03:16 PM,01/03/2011 12:15:38 PM,-73.999809,40.636935,Park,Maintenance or Facility
6,,Snow or Ice,12/31/2010 12:59:59 PM,01/03/2011 12:23:04 PM,-73.999456,40.609951,Park,Maintenance or Facility
7,,Snow or Ice,12/31/2010 12:12:02 PM,01/03/2011 12:19:51 PM,-73.977616,40.633153,Park,Maintenance or Facility


## GeoCalls: spatial join with parks to get parkName

In [5]:
parks = gp.read_file(PARQA + 'data/SHP/DPR_ParksProperties_001/DPR_ParksProperties_001.shp')[['geometry','SIGNNAME']]

In [6]:
parks.columns

Index([u'geometry', u'SIGNNAME'], dtype='object')

In [7]:
def toGeoDataFrame(df, lat='Latitude',lon='Longitude'):
    '''dataframe to geodataframe'''
    df['geometry'] = df.apply(lambda z: Point(z[lon], z[lat]), axis=1)
    df = gp.GeoDataFrame(df)
    df.crs = {'init': 'epsg:4326', 'no_defs': True}
    return df 

In [8]:
geoCalls = toGeoDataFrame(geoCalls).to_crs(parks.crs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [9]:
geoCalls = sjoin(geoCalls, parks, how="left").to_crs(epsg=4326)

In [10]:
geoCalls = geoCalls[pd.notnull(geoCalls.SIGNNAME)]
geoCalls['Park Facility Name'] = geoCalls['SIGNNAME']
geoCalls = geoCalls[['Park Facility Name','Descriptor','Created Date','Closed Date','Longitude','Latitude','Location Type', 'Complaint Type']]

In [11]:
geoCalls.head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,Complaint Type
398,Grand Army Plaza,Rodent Sighting,12/02/2010 10:45:42 AM,12/06/2010 09:34:00 AM,-73.96873,40.673928,Park,Maintenance or Facility
744,Grand Army Plaza,Rodent Sighting,11/11/2010 03:08:10 PM,11/15/2010 09:52:24 AM,-73.969,40.674713,Park,Maintenance or Facility
1543,Grand Army Plaza,Rodent Sighting,10/03/2010 07:15:35 PM,10/05/2010 09:06:58 AM,-73.970749,40.672638,Park,Maintenance or Facility
2235,Grand Army Plaza,Rodent Sighting,09/11/2010 10:14:20 PM,09/15/2010 10:35:05 AM,-73.96873,40.673928,Park,Maintenance or Facility
2705,Grand Army Plaza,Rodent Sighting,08/31/2010 10:18:33 AM,09/01/2010 04:18:46 PM,-73.968925,40.672937,Park,Maintenance or Facility


In [12]:
calls2 = pd.concat([namedCalls, geoCalls])

In [13]:
calls2.head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,Complaint Type
1,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,Maintenance or Facility
2,brookville park,Snow or Ice,12/31/2010 04:17:22 PM,01/06/2011 08:58:30 AM,,,Park,Maintenance or Facility
5,highland park,Snow or Ice,12/31/2010 02:57:34 PM,01/03/2011 11:31:26 AM,,,Park,Maintenance or Facility
10,prospect park - east parade grounds,Dead Animal,12/31/2010 11:26:34 AM,01/03/2011 11:12:37 AM,,,Park,Animal in a Park
11,central park - east 96th street playground,Snow or Ice,12/31/2010 11:18:31 AM,01/04/2011 12:11:02 PM,,,Park,Maintenance or Facility


In [14]:
calls2.shape

(55095, 8)

## Now match those names

In [15]:
calls2['Park Facility Name'] = calls2['Park Facility Name'].str.lower()
calls2['Park Facility Name'].value_counts()

central park                                  2554
riverside park                                1208
prospect park                                  961
rockaway beach boardwalk                       761
flushing meadows corona park                   755
beach - brighton                               593
hudson river park                              485
morningside park                               439
tompkins square park                           417
marine park                                    410
forest park                                    409
van cortlandt park                             375
washington square park                         373
mccarren park                                  352
astoria park                                   350
east river park                                325
sunset park                                    324
inwood hill park                               300
inwood hill park - nature center               300
fort tryon park                

In [38]:
onto = pd.DataFrame(calls2['Park Facility Name'].unique())

## ParkID

In [39]:
prop = pd.read_excel(PARQA + 'data/Input/Parks_Data/CUSP_Adjusted_Spatial_Data.xlsx')[['ParkID','Name']]
prop = prop.dropna()
prop.Name = prop.Name.str.lower()
prop['type']='pid'

In [40]:
def trySplit(x, spl='-'):
    '''get rid of addons'''
    if spl in x:
        return x.split(spl)[0].strip()
    else:
        return x

In [42]:
### empirical dictionary

d = [
    {'ParkID':'B073','type':'abstr','Name': 'prospect park'},
    {'ParkID':'M010','type':'abstr','Name': 'central park'},
    {'ParkID':'Q004','type':'abstr','Name': 'astoria park'},
    {'ParkID':'X010','type':'abstr','Name': 'crotona park'},
    {'ParkID':'Q162','type':'abstr', 'Name': 'rockaway beach boardwalk'},
    {'ParkID':'M014','type':'abstr', 'Name': 'jackie robinson park'},
    {'ParkID':'M028','type':'abstr', 'Name': 'fort washington park'},
    {'ParkID':'M037','type':'abstr', 'Name': 'highbridge park'},
    {'ParkID':'M098','type':'abstr', 'Name': 'washington square park'},
    {'ParkID':'M105','type':'abstr', 'Name': 'sara d. roosevelt park'},
    {'ParkID':'M107','type':'abstr', 'Name': "hell's kitchen park"},
    {'ParkID':'M283','type':'abstr', 'Name': 'battery park city'},
    {'ParkID':'Q001','type':'abstr', 'Name': 'alley pond park'},
    {'ParkID':'Q005','type':'abstr', 'Name': 'baisley pond park'},
    {'ParkID':'Q009','type':'abstr', 'Name': 'macneil park'},
    {'ParkID':'Q012','type':'abstr', 'Name': 'crocheron park'},
    {'ParkID':'Q020','type':'abstr', 'Name': 'highland park'},
    {'ParkID':'Q021','type':'abstr', 'Name': 'cunningham park'},
    {'ParkID':'Q024','type':'abstr', 'Name': 'kissena park'},
    {'ParkID':'Q102','type':'abstr', 'Name': 'juniper valley park'},
    {'ParkID':'R129','type':'abstr', 'Name': 'greenbelt native plant center'},
    {'ParkID':'B058','type':'abstr', 'Name': 'mccarren park'},
    {'ParkID':'M071','type':'abstr', 'Name': 'riverside park'},
    {'ParkID':'M360','type':'abstr', 'Name': 'the high line'},
    {'ParkID':'X001','type':'abstr', 'Name': 'aqueduct walk'},
    {'ParkID':'X092','type':'abstr', 'Name': 'van cortlandt park'},
    {'ParkID':'Q099','type':'abstr', 'Name': 'flushing meadows corona park'},
    {'ParkID':'X039','type':'abstr', 'Name': 'pelham bay park'},
    {'ParkID':'Q015','type':'abstr', 'Name': 'forest park'},
    {'ParkID':'M042','type':'abstr', 'Name': 'inwood hill park'},
    {'ParkID':'B057','type':'abstr', 'Name': 'marine park'},
    {'ParkID':'B126','type':'abstr', 'Name': 'red hook park'},
    {'ParkID':'Q300','type':'abstr', 'Name': 'kissena corridor park'},
    {'ParkID':'X045','type':'abstr', 'Name': "st mary's playground"},
    {'ParkID':'X002','type':'abstr', 'Name': "bronx park"},
    {'ParkID':'M058','type':'abstr', 'Name': "marcus garvey park"},
    {'ParkID':'B371','type':'abstr', 'Name': "spring creek park"},
    {'ParkID':'B371','type':'abstr', 'Name': "spring creek park"},
    {'ParkID':'X039','type':'abstr', 'Name': "orchard beach and promenade"}
    
] 

abstr = pd.DataFrame(d)

In [44]:
# prop[prop.type=='abstr']

def getIDList(ID):
    '''get list of pID for this park'''
    return prop.ParkID[prop.ParkID.str.startswith(ID)].tolist()

In [45]:
abstr.ParkID = abstr.ParkID.apply(getIDList)

In [46]:
abstr.head()

Unnamed: 0,Name,ParkID,type
0,prospect park,"[B073-02D, B073-02, B073-10, B073-20, B073-09,...",abstr
1,central park,"[M010-151, M010-089, M010-090, M010-143, M010-...",abstr
2,astoria park,"[Q004-01, Q004-ZN01, Q004-ZN02, Q004-ZN03, Q004A]",abstr
3,crotona park,"[X010-10, X010-05, X010-12, X010-08, X010-03, ...",abstr
4,rockaway beach boardwalk,"[Q162-ZN01A, Q162-ZN01B, Q162-ZN02A, Q162-ZN02...",abstr


In [47]:
# prop.groupby('ParkID').agg(lambda x: x.tolist())
prop.ParkID = prop.ParkID.apply(lambda x: [x])

In [48]:
prop.head()

Unnamed: 0,ParkID,Name,type
0,[M058-07],marcus garvey memorial park,pid
1,[M058-06],marcus garvey memorial park,pid
2,[M058-01],mt. morris east,pid
3,[M047-03],thomas jefferson park,pid
4,[M273-01],othmar ammann playground,pid


In [49]:
prop2 = pd.concat((prop, abstr))

## Check how ontology works

In [61]:
ontoMask = pd.read_csv(PARQA + 'parqa/311/ONTOLOGY/Ontology_districts.csv', index_col=0)[['cleanName','NAME']]
ontoMask.rename(columns={'NAME':'newName','cleanName':'Name'}, inplace=1)
ontoMask.head()

Unnamed: 0_level_0,Name,newName
Park Facility Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Geo Soilan Park - Battery Park City,geo soilan park - battery park city,battery park city
Brookville Park,brookville park,brookville park
Highland Park,highland park,highland park
Highland Park,highland park,highland park
Prospect Park - East Parade Grounds,prospect park - east parade grounds,prospect park


In [62]:
onto2 = prop2.merge(ontoMask, on='Name', how='left')
onto2.newName[pd.isnull(onto2.newName)] = onto2.Name[pd.isnull(onto2.newName)]

In [63]:
### edit Onto
onto2.newName[onto2.Name.str.contains('mccarren park')] = 'mccarren park'
onto2.newName[onto2.newName.str.contains('hunt')] = 'hunts point riverside park'
onto2.newName[onto2.newName.str.contains('waring')] = 'waring plgd'
onto2.newName[onto2.newName.str.contains('red hook')] = 'red hook park'
onto2.newName[onto2.newName.str.contains('rockaway beach and boardwalk')] = 'rockaway beach boardwalk'


In [64]:
##### CHecking quality of the recognition

# print len(onto1)
# x = onto2.merge(prop2[['Name','type','ParkID']], how='left', left_on='newName', right_on='Name')

# print len(x[pd.isnull(x['type'])])
# x[pd.isnull(x['type'])].newName.value_counts()

In [65]:
onto2.rename(inplace=1, columns={'Name':'Park Facility Name','newName':'Name'})

## get Pid for Calls

In [66]:
onto2.head()

Unnamed: 0,Park Facility Name,ParkID,type,Name
0,marcus garvey memorial park,[M058-07],pid,marcus garvey memorial park
1,marcus garvey memorial park,[M058-06],pid,marcus garvey memorial park
2,mt. morris east,[M058-01],pid,mt. morris east
3,thomas jefferson park,[M047-03],pid,thomas jefferson park
4,othmar ammann playground,[M273-01],pid,othmar ammann playground


In [60]:
onto.head()

Unnamed: 0_level_0,Name,newName
Park Facility Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Geo Soilan Park - Battery Park City,geo soilan park - battery park city,battery park city
Brookville Park,brookville park,brookville park
Highland Park,highland park,highland park
Highland Park,highland park,highland park
Prospect Park - East Parade Grounds,prospect park - east parade grounds,prospect park


In [67]:
pids = onto.merge(onto2[['Name','type','ParkID']], how='left', on='Name')

In [68]:
pids.head(2)

Unnamed: 0,Name,newName,type,ParkID
0,geo soilan park - battery park city,battery park city,,
1,brookville park,brookville park,pid,[Q008-02]


In [34]:
calls3 = calls2.merge(pids[['Park Facility Name','ParkID']], on='Park Facility Name', how='left')
calls3 = calls3[pd.notnull(calls3.ParkID)]

KeyError: "['ParkID'] not in index"

In [516]:
### randomly chosen ParkID in the list

calls3['rParkID'] = calls3['ParkID'].apply(lambda x: random.choice(x))

In [517]:
calls3.head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,Complaint Type,ParkID,rParkID
0,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,Maintenance or Facility,[M283A],M283A
1,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,Maintenance or Facility,"[M283-03, M283-02, M283-01, M283-ZN01, M283A]",M283-ZN01
2,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,Maintenance or Facility,[M283A],M283A
3,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,Maintenance or Facility,"[M283-03, M283-02, M283-01, M283-ZN01, M283A]",M283-02
4,brookville park,Snow or Ice,12/31/2010 04:17:22 PM,01/06/2011 08:58:30 AM,,,Park,Maintenance or Facility,[Q008-02],Q008-02


In [1]:
# calls3.Descriptor.value_counts()
calls3[['Created Date','rParkID']][calls3.Descriptor == 'Garbage or Litter'].to_csv(PARQA + 'data/311/311_rPID_litter.csv')
calls3[['Created Date','rParkID']].to_csv(PARQA + 'data/311/311_rPID_all.csv')

NameError: name 'calls3' is not defined