In [24]:
%matplotlib inline
import pandas as pd
import geopandas as gp
import numpy as np
import os

PARQA = os.getenv('PARQA')

In [25]:
ontology = pd.read_csv(PARQA + 'parqa/311/ONTOLOGY/onto_data/Ontology_verified2.csv')[['cleanName',
                                                                                       'NAME',
                                                                                       'Type',
                                                                                       'valid']]

In [26]:
ontology.head(2)

Unnamed: 0,cleanName,NAME,Type,valid
0,geo soilan park - battery park city,battery park city,other,av
1,brookville park,brookville park,park_direct,av


In [27]:
# SHOW AND SAVE FAILED TO RECOGNIZE, THEN REMOVE
# ontology[ontology.Type=='?'].drop('NAME', axis=1).to_csv(PARQA + 'parqa/311/ONTOLOGY/onto_data/failed_to_match.csv' )
ontology[ontology.valid=='?']
ontology = ontology[~ (ontology.valid=='?')]

## I. Ontology to DPR property data

In [28]:
# ontology['pDistrict'] = np.nan

In [29]:
ontology.Type.value_counts()

park_direct    719
pgs            368
other          284
empiric        209
pool            60
beach           12
golf            12
school           5
recr             4
Name: Type, dtype: int64

In [30]:
## data for all types of keys in ontology

dfs = {'park_direct': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'parks_pd_ll_3.geojson' ),
       'pgs': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'playground_pd_ll.geojson' ),
       'other': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'parks_pd_ll_3.geojson' ),
       'empiric': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'parks_pd_ll_3.geojson' ),
       'pool': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'pools_pd_ll.geojson' ),
       'beach': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'beaches_pd_ll_2.geojson' ),
       'golf': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'golf_courses_pd_ll.geojson' ),
       'school':gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'ps_pd_ll.geojson' ),
       'recr': gp.read_file(PARQA + 'data/DPR_property/csv_ll_pD/' + 'recr_c_pd_ll.geojson' )}

In [31]:
# how many datasets have unrecognised parkDistrict (most of them are just happend to be on piers)

for key in dfs:
    print key, len(dfs[key][pd.isnull(dfs[key].parkDistrict)])

school 0
other 22
empiric 22
pgs 0
recr 0
golf 0
beach 0
pool 1
park_direct 22


In [32]:
# rename all naming columns to NAME

dfs['school'] = dfs['school'].rename(columns={'PSID':'NAME'})
dfs['other'] = dfs['other'].rename(columns={'SIGNNAME':'NAME'})
dfs['empiric'] = dfs['empiric'].rename(columns={'SIGNNAME':'NAME'})
dfs['pool'] = dfs['pool'].rename(columns={'Name':'NAME'})
dfs['park_direct'] = dfs['park_direct'].rename(columns={'SIGNNAME':'NAME'})

for key in dfs:
    if 'NAME'not in dfs[key].columns:
        print key,dfs[key].columns

In [33]:
# lover all keys

for item in dfs.values():
    item['NAME'] = item.NAME.str.lower()

In [34]:
### checks how well each specific dataframe fits ontology

# def mergePartial(df,dfs, key):
#     x = ontology.ix[ontology.Type==key, :].merge( dfs[key][['NAME','parkDistrict']], how='left', on='NAME')
#     print k, x.columns
#     return  x


# # check all failed merges
# for k in dfs.keys():
#     x = mergePartial(ontology, dfs, k)
#     if len(x[pd.isnull(x.parkDistrict)])>0:
#         print k, len(x[pd.isnull(x.parkDistrict)])
#         print x[pd.isnull(x.parkDistrict)].NAME

In [35]:
for key in dfs.keys():
    dfs[key]['Type'] = key
    
superduperDataFrame = pd.concat([dfs[key][['NAME','parkDistrict','Type','geometry']] for key in dfs.keys()])

In [36]:
# superduperDataFrame.head(2)

In [37]:
ontoMatched = ontology.merge(superduperDataFrame, how='left',on=['Type','NAME']) 
len(ontoMatched[pd.isnull(ontoMatched.geometry)]) # those unrecognized guys

0

In [38]:
# ontoMatched[pd.isnull(ontoMatched.geometry)]

In [39]:
ontoMatched.head(2)

Unnamed: 0,cleanName,NAME,Type,valid,parkDistrict,geometry
0,geo soilan park - battery park city,battery park city,other,av,M-01,POINT (-74.01689299366825 40.71188154925514)
1,geo soilan park - battery park city,battery park city,other,av,M-01,POINT (-74.01689047726609 40.71271929265585)


In [40]:
def gdfToCsv(p):    
    p['lat'] = p.geometry.apply(lambda x: x.coords[0][0])
    p['lon'] = p.geometry.apply(lambda x: x.coords[0][1])
    return p.drop('geometry',1)

In [41]:
ontoMatched = gdfToCsv(gp.GeoDataFrame(ontoMatched))
ontoMatched.head(2)

Unnamed: 0,cleanName,NAME,Type,valid,parkDistrict,lat,lon
0,geo soilan park - battery park city,battery park city,other,av,M-01,-74.016893,40.711882
1,geo soilan park - battery park city,battery park city,other,av,M-01,-74.01689,40.712719


In [43]:
ontoMatched.to_csv(PARQA + 'parqa/311/ONTOLOGY/onto_data/Ontology_matched.csv')