In [1]:
# __author__ = "Philipp Kats"
# __date__ = "2015_11_05"
%pylab inline
import pandas as pd
import geopandas as gp
import numpy as np

from geopandas.tools import sjoin
from shapely.geometry import Point

import os


PARQA = os.getenv('PARQA') #basic path

Populating the interactive namespace from numpy and matplotlib


In [2]:
calls = pd.read_csv(PARQA + 'data/311/311DPR.csv', encoding='utf8', na_values='Unspecified')
# calls = calls.rename(columns={'Longitude':'lon','Latitude':'lat'})

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
calls.shape

(82590, 54)

In [4]:
calls.columns

Index([                    u'Unnamed: 0',                     u'Unique Key',
                         u'Created Date',                    u'Closed Date',
                               u'Agency',                    u'Agency Name',
                       u'Complaint Type',                     u'Descriptor',
                        u'Location Type',                   u'Incident Zip',
                     u'Incident Address',                    u'Street Name',
                       u'Cross Street 1',                 u'Cross Street 2',
                u'Intersection Street 1',          u'Intersection Street 2',
                         u'Address Type',                           u'City',
                             u'Landmark',                  u'Facility Type',
                               u'Status',                       u'Due Date',
               u'Resolution Description', u'Resolution Action Updated Date',
                      u'Community Board',                        u'Borough',

In [5]:
myCalls = calls[['Park Facility Name','Descriptor','Created Date','Closed Date','Longitude','Latitude','Location Type']]
myCalls['Park Facility Name'] = myCalls['Park Facility Name'].str.lower()
myCalls['Park Facility Name'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


0                                    NaN
1    geo soilan park - battery park city
2                        brookville park
3                                    NaN
4                                    NaN
Name: Park Facility Name, dtype: object

In [6]:
myCalls[pd.isnull(myCalls['Park Facility Name'])].head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type
0,,Snow or Ice,12/31/2010 09:04:48 PM,01/03/2011 12:03:59 PM,-73.93112,40.668798,Park
3,,Snow or Ice,12/31/2010 03:36:37 PM,01/03/2011 09:41:24 AM,-73.962835,40.688556,Park
4,,Snow or Ice,12/31/2010 03:03:16 PM,01/03/2011 12:15:38 PM,-73.999809,40.636935,Park
6,,Snow or Ice,12/31/2010 12:59:59 PM,01/03/2011 12:23:04 PM,-73.999456,40.609951,Park
7,,Snow or Ice,12/31/2010 12:12:02 PM,01/03/2011 12:19:51 PM,-73.977616,40.633153,Park


In [7]:
## -------  Descriptor for complains without Park Facility Name
myCalls[pd.isnull(myCalls['Park Facility Name'])].Descriptor.value_counts().head(10)

Structure - Outdoors              6437
Garbage or Litter                 3160
Rodent Sighting                   1554
New Tree Complaint                1368
Grass/Weeds                       1085
Structure - Indoors                975
Tree Alive - in Poor Condition     962
Unsecured Facility                 903
Snow or Ice                        897
Hours of Operation                 764
Name: Descriptor, dtype: int64

In [8]:
## -------- Descriptor for complains with Park Facility Name 
myCalls[pd.notnull(myCalls['Park Facility Name'])].Descriptor.value_counts().head(10)

Structure - Outdoors      15303
Garbage or Litter          6140
Structure - Indoors        3227
Obstructing Public Use     2806
Dog Off Leash              2366
Rodent Sighting            2220
Grass/Weeds                1922
Aided/Injury               1893
Unsecured Facility         1866
Unlicensed Vendors         1660
Name: Descriptor, dtype: int64

In [9]:
## we might need to check the closest park and if distance is small, attach to it. 
##for now - using spatial joint to match to Disricts
print 100.0*len(myCalls[pd.isnull(myCalls['Park Facility Name'])])/len(myCalls), '%'

33.9060418937 %


In [10]:
notNamedCalls = myCalls[pd.isnull(myCalls['Park Facility Name'])]
namedCalls = myCalls[pd.notnull(myCalls['Park Facility Name'])]
print notNamedCalls.shape
print namedCalls.shape

(28003, 7)
(54587, 7)


## I. Ontology

In [11]:
onto = pd.read_csv(PARQA + 'parqa/311/ONTOLOGY/onto_data/Ontology_matched.csv', index_col=0)

In [12]:
onto.head(2)

Unnamed: 0,cleanName,NAME,Type,valid,parkDistrict,lat,lon
0,geo soilan park - battery park city,battery park city,other,av,M-01,-74.016893,40.711882
1,geo soilan park - battery park city,battery park city,other,av,M-01,-74.01689,40.712719


In [13]:
ncMatched = namedCalls.merge(onto, how='left',left_on='Park Facility Name', right_on='cleanName')
ncMatched.head(1)

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,cleanName,NAME,Type,valid,parkDistrict,lat,lon
0,geo soilan park - battery park city,Graffiti or Vandalism,12/31/2010 04:31:52 PM,12/31/2010 05:36:58 PM,,,Park,geo soilan park - battery park city,battery park city,other,av,M-01,-74.016893,40.711882


In [14]:
# ncMatched[pd.isnull(ncMatched.NAME)] # those i coldn't recognize

## II. Geolocated

In [15]:
pDistricts = gp.read_file(PARQA + 'data/SHP/Park_Districts/ParkDistrict.shp')
pD = pDistricts[['SYSTEM','geometry']]

In [16]:
def toGeoDataFrame(df, lat='Latitude',lon='Longitude'):
    '''dataframe to geodataframe'''
    df['geometry'] = df.apply(lambda z: Point(z[lon], z[lat]), axis=1)
    df = gp.GeoDataFrame(df)
    df.crs = {'init': 'epsg:4326', 'no_defs': True}
    return df 

In [17]:
nn = toGeoDataFrame(notNamedCalls[pd.notnull(notNamedCalls.Longitude)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [18]:
# nn.plot()

In [19]:
nn = nn.to_crs(pD.crs)
nn = sjoin(nn, pD, how="left").rename(columns={'SYSTEM':'parkDistrict'}).to_crs(epsg=4326)
# nn.head(2)

In [20]:
nn = nn[pd.notnull(nn['parkDistrict'])]

In [21]:
nn.head()

Unnamed: 0,Park Facility Name,Descriptor,Created Date,Closed Date,Longitude,Latitude,Location Type,geometry,index_right,parkDistrict
0,,Snow or Ice,12/31/2010 09:04:48 PM,01/03/2011 12:03:59 PM,-73.93112,40.668798,Park,POINT (-73.9311201453 40.66879788259987),29,B-09
3,,Snow or Ice,12/31/2010 03:36:37 PM,01/03/2011 09:41:24 AM,-73.962835,40.688556,Park,POINT (-73.96283467790001 40.68855565819985),22,B-02
4,,Snow or Ice,12/31/2010 03:03:16 PM,01/03/2011 12:15:38 PM,-73.999809,40.636935,Park,POINT (-73.99980903869999 40.63693511549985),32,B-12
6,,Snow or Ice,12/31/2010 12:59:59 PM,01/03/2011 12:23:04 PM,-73.999456,40.609951,Park,POINT (-73.9994561603 40.60995107579983),31,B-11
7,,Snow or Ice,12/31/2010 12:12:02 PM,01/03/2011 12:19:51 PM,-73.977616,40.633153,Park,POINT (-73.97761555779999 40.63315336439984),34,B-14


## NOW SAVING DATA

In [22]:
def gdfToCsv(p):    
    p['lat'] = p.geometry.apply(lambda x: x.coords[0][0])
    p['lon'] = p.geometry.apply(lambda x: x.coords[0][1])
    return p.drop('geometry',1)

In [23]:
nn = gdfToCsv(nn)

In [24]:
# nn.drop(['index_right'], axis=1, inplace=1)
# nn.drop(['Latitude','Longitude'], axis=1, inplace=1)
nn.columns

Index([u'Park Facility Name',         u'Descriptor',       u'Created Date',
              u'Closed Date',          u'Longitude',           u'Latitude',
            u'Location Type',        u'index_right',       u'parkDistrict',
                      u'lat',                u'lon'],
      dtype='object')

In [25]:
ncMatched.columns

Index([u'Park Facility Name',         u'Descriptor',       u'Created Date',
              u'Closed Date',          u'Longitude',           u'Latitude',
            u'Location Type',          u'cleanName',               u'NAME',
                     u'Type',              u'valid',       u'parkDistrict',
                      u'lat',                u'lon'],
      dtype='object')

In [26]:
MatchedCalls = pd.concat([nn,ncMatched])

In [27]:
MatchedCalls.shape

(88974, 15)

In [28]:
MatchedCalls.head(2)

Unnamed: 0,Closed Date,Created Date,Descriptor,Latitude,Location Type,Longitude,NAME,Park Facility Name,Type,cleanName,index_right,lat,lon,parkDistrict,valid
0,01/03/2011 12:03:59 PM,12/31/2010 09:04:48 PM,Snow or Ice,40.668798,Park,-73.93112,,,,,29,-73.93112,40.668798,B-09,
3,01/03/2011 09:41:24 AM,12/31/2010 03:36:37 PM,Snow or Ice,40.688556,Park,-73.962835,,,,,22,-73.962835,40.688556,B-02,


In [32]:
len(MatchedCalls[pd.isnull(MatchedCalls.parkDistrict)])

24

In [33]:
MatchedCalls[['Closed Date',
              'Created Date',
              'Location Type',
              'NAME',
              'lat',
              'lon',
              'parkDistrict']].to_csv(PARQA + 'data/311/MatchedCalls.cav')