# Creating Ontology of 311 Park Names -> official park names

This notebook builds a lookup dictionary that maps the park names used in the 311 data to the names in the official NYC Parks dataset.
Using this dictionary, we can match each park-related call to a specific park or park district.

Here I am using [fuzzywuzzy module](https://github.com/seatgeek/fuzzywuzzy)

In [180]:
__author__ = "Philipp Kats"
__date__ = "2015_10_15"

import pandas as pd
import geopandas as gp
import numpy as np
import os
from fuzzywuzzy import process  ## fuzzy string match in python

%pylab inline
PARQA = os.getenv('PARQA') #basic path

Populating the interactive namespace from numpy and matplotlib


## Getting 311 Data

In [181]:
# Load the 311 calls file; the literal 'Unspecified' is read as missing (NaN).
# os.path.join keeps the path valid whether or not PARQA ends with a separator.
calls = pd.read_csv(os.path.join(PARQA, 'data', '311DPR.csv'), encoding='utf8', na_values='Unspecified')
# Keep only the calls that name a park facility.
calls = calls[pd.notnull(calls['Park Facility Name'])]

In [182]:
callParks = calls[['Park Facility Name','Borough']].drop_duplicates()
calls[pd.notnull(calls['Park Facility Name'])]
callParks['cleanName'] = callParks['Park Facility Name'].str.lower()
print '311_call_names total:', len(callParks['Park Facility Name'])

311_call_names total: 1666


In [183]:
# Preview the first rows of the unique 311 facility names.
callParks.head()

Unnamed: 0,Park Facility Name,Borough,cleanName
1,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city
2,Brookville Park,QUEENS,brookville park
5,Highland Park,BROOKLYN,highland park
10,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds
11,Central Park - East 96th Street Playground,MANHATTAN,central park - east 96th street playground


## Getting Park names

In [184]:
## using geojson produced from OPEN DATA
# Build the path with os.path.join so it is valid whether or not PARQA
# ends with a path separator (plain concatenation dropped the separator).
parkNames = gp.read_file(os.path.join(PARQA, 'data', 'parks_computed.geojson'))[['SIGNNAME','BOROUGH']].drop_duplicates()
# Lower-case the official sign names so they can be compared case-insensitively.
parkNames['SIGNNAME'] = parkNames['SIGNNAME'].str.lower()
# Number of distinct official park names.
len(parkNames['SIGNNAME'].drop_duplicates())

1787

In [185]:
#create a dictionary of parks for each borough 
#that will add accuracy and boost spead of search ( I hope)

ofParks = {k:[unicode(x) for x in list(v)] for k,v in parkNames.groupby("BOROUGH")["SIGNNAME"]}

tmp = []
for v in ofParks.values():
    tmp.extend(v)
ofParks['?'] = tmp   #['???'] ### to mark parks with unspecified Borough

print ofParks.keys()

[u'B', u'M', u'Q', u'R', u'X', '?']


In [186]:
# ofParks['B']

## I. Direct matching

In [198]:
# Left-join the 311 names to the official names on the exact lower-cased name.
# NOTE: the join key is the name only, not the borough, so a name that exists
# in several boroughs produces one output row per borough.
cParksMatched = pd.merge(callParks, parkNames, how='left',
                         left_on='cleanName', right_on='SIGNNAME', copy=True)
# 'Type' starts empty; it is filled in as rows get matched.
cParksMatched['Type'] = np.nan

In [199]:
# Number of rows that found an exact match among the official names.
len(cParksMatched.dropna(subset=['SIGNNAME']))

714

In [201]:
# Label rows that have an official name but no match type yet.
# .loc replaces the deprecated .ix indexer; for a boolean mask plus a column
# label both select the same cells.
directMatch = pd.notnull(cParksMatched.SIGNNAME) & pd.isnull(cParksMatched.Type)
cParksMatched.loc[directMatch, 'Type'] = 'park_direct'

In [202]:
# Preview the merged frame after the direct-match labelling.
cParksMatched.head()

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type
0,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city,,,
1,Brookville Park,QUEENS,brookville park,brookville park,Q,park_direct
2,Highland Park,BROOKLYN,highland park,highland park,B,park_direct
3,Highland Park,BROOKLYN,highland park,highland park,Q,park_direct
4,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds,,,


## II. Empiric decoding

In [203]:
# Manual substring rules, based on empirical experience with the 311 names.
# Stored as an ordered tuple of (substring, official name) pairs so that a
# name containing several substrings always resolves to the first rule listed
# here; plain dict iteration order is not guaranteed on older Python versions.
_EMPIRIC_RULES = (
    ('flushing meadows', 'flushing meadows and corona park'),
    ('central park', 'central park'),
    ('prospect park', 'prospect park'),
    ('greenbelt', 'greenbelt native plant center'),
    ('red hook park', 'red hook recreation area'),
    ('crotona park', 'crotona park'),
    ('marine park', 'marine park'),
    ('van cortlandt', 'van cortlandt park'),
    ('seravalli', 'corporal john a. seravalli playground'),
    ('inwood hill', 'inwood hill park'),
    ('forest park', 'forest park'),
    ('pelham bay park', 'pelham bay park'),
    ('kissena park', 'kissena park'),
    ('asser levy', 'asser levy park'),
    ('highland park', 'highland park'),
)


def empiric(x):
    """Recognize a park from a known substring of its 311 name.

    Parameters
    ----------
    x : row-like object exposing a ``cleanName`` attribute
        The name to inspect (a DataFrame row when used with ``apply(..., axis=1)``).

    Returns
    -------
    The official name of the first rule in ``_EMPIRIC_RULES`` whose substring
    occurs in ``x.cleanName``; ``np.nan`` when no rule applies or when
    ``cleanName`` is missing.
    """
    name = x.cleanName
    # A missing name cannot be searched with ``in``; treat it as "no match".
    if pd.isnull(name):
        return np.nan
    for key, official in _EMPIRIC_RULES:
        if key in name:
            return official
    return np.nan
    

In [205]:
# Try the manual rules only on rows that still have no official name.
# .loc replaces the deprecated .ix indexer; the mask is computed once and reused.
unmatched = pd.isnull(cParksMatched.SIGNNAME)
if unmatched.any():  # skip so that apply() never runs on an empty selection
    cParksMatched.loc[unmatched, 'SIGNNAME'] = cParksMatched.loc[unmatched].apply(empiric, axis=1)

In [207]:
# Rows that now have an official name but still no match type are labelled 'empiric'.
# .loc replaces the deprecated .ix indexer; for a boolean mask plus a column
# label both select the same cells.
empiricMatch = pd.notnull(cParksMatched.SIGNNAME) & pd.isnull(cParksMatched.Type)
cParksMatched.loc[empiricMatch, 'Type'] = 'empiric'

In [208]:
# Preview the first ten rows after the empiric labelling.
cParksMatched.head(10)

Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type
0,Geo Soilan Park - Battery Park City,MANHATTAN,geo soilan park - battery park city,,,
1,Brookville Park,QUEENS,brookville park,brookville park,Q,park_direct
2,Highland Park,BROOKLYN,highland park,highland park,B,park_direct
3,Highland Park,BROOKLYN,highland park,highland park,Q,park_direct
4,Prospect Park - East Parade Grounds,BROOKLYN,prospect park - east parade grounds,prospect park,,empiric
5,Central Park - East 96th Street Playground,MANHATTAN,central park - east 96th street playground,central park,,empiric
6,Washington Hall Park,BROOKLYN,washington hall park,washington hall park,B,park_direct
7,Callahan-Kelly Playground,BROOKLYN,callahan-kelly playground,callahan-kelly playground,B,park_direct
8,William H Seward Park,MANHATTAN,william h seward park,,,
9,Col David Marcus Playground,BROOKLYN,col david marcus playground,,,


In [212]:
print 'Recognized:'
print len(cParksMatched[pd.notnull(cParksMatched.SIGNNAME)]), '/', len(cParksMatched)


Recognized:
923 / 1678


## III. Playgrounds

In [213]:
# Rows without an official name whose 311 name contains 'play'.
# Both conditions are combined into a single mask: chaining two boolean
# selections would apply a mask built on the full frame to an already
# filtered frame, which pandas has to re-align (and warns about).
stillUnmatched = pd.isnull(cParksMatched.SIGNNAME)
mentionsPlay = cParksMatched.cleanName.str.contains('play')
cParksMatched[stillUnmatched & mentionsPlay]



Unnamed: 0,Park Facility Name,Borough,cleanName,SIGNNAME,BOROUGH,Type
9,Col David Marcus Playground,BROOKLYN,col david marcus playground,,,
13,Mccarren Park - Vincent V Abate Playground,BROOKLYN,mccarren park - vincent v abate playground,,,
24,Harold L Ickes Playground,BROOKLYN,harold l ickes playground,,,
27,Hoover Manton Playground,QUEENS,hoover manton playground,,,
33,Emerald Playground - PS 200,QUEENS,emerald playground - ps 200,,,
36,Doctor Gertrude B Kelly Playground,MANHATTAN,doctor gertrude b kelly playground,,,
46,May Matthews Playground,MANHATTAN,may matthews playground,,,
47,Cuyler Gore Playground,BROOKLYN,cuyler gore playground,,,
52,Terrapin Playground - JHS 51,BROOKLYN,terrapin playground - jhs 51,,,
53,Jamaica Playground - PS 40,QUEENS,jamaica playground - ps 40,,,
