# Creating Ontology of 311 Park Names -> official park names

This notebook creates a match dictionary between park names in 311 data and in NYC Parks official dataset. 
Using this dictionary we can match each park-related call to specific park or park District

Here I am using [fuzzywuzzy module](https://github.com/seatgeek/fuzzywuzzy)

In [305]:
__author__ = "Philipp Kats"
__date__ = "2015_10_15"

import pandas as pd
import geopandas as gp
import numpy as np
import os
from fuzzywuzzy import process  ## fuzzy string match in python

%pylab inline
PARQA = os.getenv('PARQA') #basic path

Populating the interactive namespace from numpy and matplotlib


## Getting 311 Data

In [306]:
calls = pd.read_csv(PARQA + '/data/311DPR.csv',encoding='utf8', na_values='Unspecified')
calls= calls[pd.notnull(calls['Park Facility Name'])]

In [307]:
callParks = calls[['Park Facility Name','Borough']].drop_duplicates()
calls[pd.notnull(calls['Park Facility Name'])]
callParks['cleanName'] = callParks['Park Facility Name'].str.lower()
print '311_call_names total:', len(callParks['Park Facility Name'])

311_call_names total: 1666


In [308]:
callParks.head()

Unnamed: 0,Park Facility Name,Borough,cleanName
1,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city
2,Brookville Park,QUEENS,brookville park
5,Highland Park,BROOKLYN,highland park
10,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds
11,Central Park - East 96th Street Playground,MANHATTAN,central park - east 96th street playground


## Getting Park names

In [309]:
## using geojson produced of OPEN DATA 
parkNames = gp.read_file(PARQA + 'data/parks_computed.geojson')[['SIGNNAME','BOROUGH']].drop_duplicates()
parkNames['SIGNNAME'] = parkNames['SIGNNAME'].str.lower()
len(parkNames['SIGNNAME'].drop_duplicates())

1787

In [310]:
#create a dictionary of parks for each borough 
#that will add accuracy and boost spead of search ( I hope)

ofParks = {k:[unicode(x) for x in list(v)] for k,v in parkNames.groupby("BOROUGH")["SIGNNAME"]}

tmp = []
for v in ofParks.values():
    tmp.extend(v)
ofParks['?'] = tmp   #['???'] ### to mark parks with unspecified Borough

print ofParks.keys()

[u'B', u'M', u'Q', u'R', u'X', '?']


In [311]:
# ofParks['B']

## I. Direct matching

In [312]:
cParksMatched = callParks.merge(parkNames, how='left', left_on = 'cleanName', right_on='SIGNNAME', copy=1)
cParksMatched['Type'] = np.nan

In [313]:
len(cParksMatched[pd.notnull(cParksMatched.SIGNNAME)])

714

In [314]:
cParksMatched.ix[(pd.notnull(cParksMatched.SIGNNAME)&pd.isnull(cParksMatched.Type)), 'Type'] = 'park_direct'

In [315]:
cParksMatched.head()

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type
0,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city,,,
1,Brookville Park,QUEENS,brookville park,brookville park,Q,park_direct
2,Highland Park,BROOKLYN,highland park,highland park,B,park_direct
3,Highland Park,BROOKLYN,highland park,highland park,Q,park_direct
4,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds,,,


## II. Empiric decoding

In [316]:
#some manual improvisation, based on emirical experience

def empiric(x):
    'recognition, based on emirical experience'
    general = {'flushing meadows':'flushing meadows and corona park',
               'central park':'central park', 
               'prospect park':'prospect park',
               'greenbelt': 'greenbelt native plant center',
               'red hook park':'red hook recreation area',
               'crotona park':'crotona park',
               'marine park': 'marine park', 
               'van cortlandt':'van cortlandt park',
               'seravalli': 'corporal john a. seravalli playground',
               'inwood hill': 'inwood hill park',
               'forest park':'forest park',
               'pelham bay park':'pelham bay park',
               'kissena park': 'kissena park',
               'asser levy': 'asser levy park',
               'highland park':'highland park'
                }
    
    
    for k in general.keys():
        if k in x.cleanName:
            return general[k]
    return np.nan
    

In [317]:
cParksMatched.ix[pd.isnull(cParksMatched.SIGNNAME), 'SIGNNAME'] = cParksMatched.ix[pd.isnull(cParksMatched.SIGNNAME)].apply(empiric,1)

In [318]:
cParksMatched.ix[(pd.notnull(cParksMatched.SIGNNAME)&pd.isnull(cParksMatched.Type)), 'Type'] = 'empiric'

In [319]:
cParksMatched.head(10)

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type
0,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city,,,
1,Brookville Park,QUEENS,brookville park,brookville park,Q,park_direct
2,Highland Park,BROOKLYN,highland park,highland park,B,park_direct
3,Highland Park,BROOKLYN,highland park,highland park,Q,park_direct
4,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds,prospect park,,empiric
5,Central Park - East 96th Street Playground,MANHATTAN,central park - east 96th street playground,central park,,empiric
6,Washington Hall Park,BROOKLYN,washington hall park,washington hall park,B,park_direct
7,Callahan-Kelly Playground,BROOKLYN,callahan-kelly playground,callahan-kelly playground,B,park_direct
8,William H Seward Park,MANHATTAN,william h seward park,,,
9,Col David Marcus Playground,BROOKLYN,col david marcus playground,,,


In [320]:
print 'Recognized:'
print len(cParksMatched[pd.notnull(cParksMatched.SIGNNAME)]), '/', len(cParksMatched)


Recognized:
923 / 1678


## III. Playgrounds

In [321]:
playgrounds = [x.decode('utf8') for x in pd.read_csv(PARQA + 'data/DPR_property/playgrounds.csv')['Name'].tolist()]

In [330]:
# cParksMatched[pd.isnull(cParksMatched.SIGNNAME)][cParksMatched.cleanName.str.contains('play')]

In [323]:
cParksMatched['match'] = np.nan
cParksMatched['ratio'] = np.nan

In [324]:
cParksMatched.ix[pd.isnull(cParksMatched.SIGNNAME) & cParksMatched.cleanName.str.contains('play'), 'Type'] = 'pgs'
cParksMatched.ix[cParksMatched.Type=='pgs', 'match'] = \
cParksMatched.ix[cParksMatched.Type=='pgs', 'cleanName'].apply(lambda x: process.extractOne(x, playgrounds))

In [326]:
cParksMatched.ix[cParksMatched.Type=='pgs', 'SIGNNAME'] = \
cParksMatched.ix[cParksMatched.Type=='pgs', 'match'].apply(lambda x: x[0])

cParksMatched.ix[cParksMatched.Type=='pgs', 'ratio'] = \
cParksMatched.ix[cParksMatched.Type=='pgs', 'match'].apply(lambda x: x[1])

In [327]:
cParksMatched[cParksMatched.Type=='pgs']

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type,match,ratio
9,Col David Marcus Playground,BROOKLYN,col david marcus playground,Col David Marcus Mem Playground,,pgs,"(Col David Marcus Mem Playground, 95)",95
13,Mccarren Park - Vincent V Abate Playground,BROOKLYN,mccarren park - vincent v abate playground,Vincent V Abate Playground,,pgs,"(Vincent V Abate Playground, 90)",90
24,Harold L Ickes Playground,BROOKLYN,harold l ickes playground,Harold Ickes Playground,,pgs,"(Harold Ickes Playground, 96)",96
27,Hoover Manton Playground,QUEENS,hoover manton playground,Manton Playground,,pgs,"(Manton Playground, 95)",95
33,Emerald Playground - PS 200,QUEENS,emerald playground - ps 200,Playground,,pgs,"(Playground, 90)",90
36,Doctor Gertrude B Kelly Playground,MANHATTAN,doctor gertrude b kelly playground,Dr Gertrude B Kelly Playground,,pgs,"(Dr Gertrude B Kelly Playground, 94)",94
46,May Matthews Playground,MANHATTAN,may matthews playground,May Matthews Playground,,pgs,"(May Matthews Playground, 100)",100
47,Cuyler Gore Playground,BROOKLYN,cuyler gore playground,Cuyler Gore,,pgs,"(Cuyler Gore, 90)",90
52,Terrapin Playground - JHS 51,BROOKLYN,terrapin playground - jhs 51,Terrapin Playground,,pgs,"(Terrapin Playground, 95)",95
53,Jamaica Playground - PS 40,QUEENS,jamaica playground - ps 40,Jamaica Playground (ps 40),,pgs,"(Jamaica Playground (ps 40), 98)",98


In [328]:
print 'Recognized:'
print len(cParksMatched[pd.notnull(cParksMatched.SIGNNAME)]), '/', len(cParksMatched)


Recognized:
1302 / 1678


## IV. Pools

In [331]:
pools = [x.decode('utf8').lower() for x in pd.read_csv(PARQA + 'data/DPR_property/pools.csv')['Name'].tolist()]

In [345]:
cParksMatched.ix[pd.isnull(cParksMatched.SIGNNAME) & cParksMatched.cleanName.str.contains('pool'), 'Type'] = 'pool'
# cParksMatched[cParksMatched.Type=='pool']


In [334]:
cParksMatched.ix[cParksMatched.Type=='pool', 'match'] = \
cParksMatched.ix[cParksMatched.Type=='pool', 'cleanName'].apply(lambda x: process.extractOne(x, pools))

In [338]:
cParksMatched.ix[cParksMatched.Type=='pool', 'SIGNNAME'] = \
cParksMatched.ix[cParksMatched.Type=='pool', 'match'].apply(lambda x: x[0])

cParksMatched.ix[cParksMatched.Type=='pool', 'ratio'] = \
cParksMatched.ix[cParksMatched.Type=='pool', 'match'].apply(lambda x: x[1])

In [343]:
cParksMatched[cParksMatched.Type=='pool']

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type,match,ratio
140,Pool - Metropolitan Avenue,BROOKLYN,pool - metropolitan avenue,claremont pool,,pool,"(claremont pool, 85)",85
156,Pool - St. John's Recreation Center,BROOKLYN,pool - st. john's recreation center,recreation center 54 pool,,pool,"(recreation center 54 pool, 89)",89
179,Pool - Brownsville Recreation Center,BROOKLYN,pool - brownsville recreation center,recreation center 54 pool,,pool,"(recreation center 54 pool, 89)",89
218,Recreation Center - Metropolitan Pool and Fitn...,BROOKLYN,recreation center - metropolitan pool and fitn...,metropolitan pool,,pool,"(metropolitan pool, 90)",90
309,Pool - Hansborough,MANHATTAN,pool - hansborough,hansborough pool,,pool,"(hansborough pool, 95)",95
352,Pool - JHS 57 and HS 26,BROOKLYN,pool - jhs 57 and hs 26,jhs 57/hs 26 pool,,pool,"(jhs 57/hs 26 pool, 95)",95
416,John Jay Park and Pool,MANHATTAN,john jay park and pool,claremont pool,,pool,"(claremont pool, 85)",85
482,Williamsbridge Oval - P G Mini Pool and Courts,BRONX,williamsbridge oval - p g mini pool and courts,claremont pool,,pool,"(claremont pool, 85)",85
602,Pool - Mullaly,BRONX,pool - mullaly,mullaly pool,,pool,"(mullaly pool, 95)",95
614,Pool - Hamilton Fish,MANHATTAN,pool - hamilton fish,hamilton fish pool,,pool,"(hamilton fish pool, 95)",95


In [344]:
print 'Recognized:'
print len(cParksMatched[pd.notnull(cParksMatched.SIGNNAME)]), '/', len(cParksMatched)


Recognized:
1360 / 1678


## V. Parks

In [346]:
cParksMatched[pd.isnull(cParksMatched.SIGNNAME)]

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type,match,ratio
0,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city,,,,,
8,William H Seward Park,MANHATTAN,william h seward park,,,,,
18,St Catherine's Park,MANHATTAN,st catherine's park,,,,,
35,Louis J Valentino Jr Park and Pier,BROOKLYN,louis j valentino jr park and pier,,,,,
43,Hells Kitchen Park,MANHATTAN,hells kitchen park,,,,,
45,Brooklyn Civic Center Parks,BROOKLYN,brooklyn civic center parks,,,,,
51,J J Byrne Memorial Park,BROOKLYN,j j byrne memorial park,,,,,
57,Sherman Creek Park,MANHATTAN,sherman creek park,,,,,
58,Kolbert Park,BROOKLYN,kolbert park,,,,,
77,Alice Kornegay,MANHATTAN,alice kornegay,,,,,
