In [1]:
import pandas as pd

In [2]:
# Load the parcel table; later cells read its `lon` and `lat` columns.
df = pd.read_csv(filepath_or_buffer="./streetview_parcel_basic.csv")

In [5]:
# collect all lat lon points from dataframe
from shapely.geometry import Polygon, Point, shape

# One shapely Point per dataframe row, in row order, built as (x=lon, y=lat).
n = len(df.index)
locations = [Point(df.lon[row], df.lat[row]) for row in range(n)]

In [8]:
from xml.etree import ElementTree
import keytree

# Read the KML text inside a context manager so the file handle is closed
# as soon as the read finishes instead of being left to the garbage collector.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
with open("/Users/damoncrockett/Desktop/FISP/SD_CPA.kml") as kml_file:
    doc = kml_file.read()
tree = ElementTree.fromstring(doc)
# kml namespace: ElementTree spells a namespaced tag as "{uri}local", so keep
# the text between the leading "{" and the first "}"
kmlns = tree.tag.split('}')[0][1:]
# find all placemarks
placemks = tree.findall(".//{%s}Placemark" % kmlns)
# filter out those without polygon elements (findall returns an empty,
# falsy list when a placemark has no Polygon descendant)
placemks_with_polygons = [
    p for p in placemks if p.findall(".//{%s}Polygon" % kmlns)
]

In [9]:
# func: extract KML polygon coordinates, convert them to shapely Polygons, and build a dict mapping each Polygon to its CPA name

def coords_names(placemks):
    """Build a dict mapping a shapely Polygon to the name of its KML placemark.

    For each placemark, the text of its first child element is used as the
    name, and the text of the first ``coordinates`` element found beneath it
    is parsed as whitespace-separated, comma-delimited tuples. Only the first
    two values of each tuple are kept, as floats, in the order they appear.

    Args:
        placemks: iterable of ElementTree elements, each expected to contain
            at least one child and a ``coordinates`` descendant in the
            module-level ``kmlns`` namespace.

    Returns:
        dict whose keys are ``Polygon`` objects and whose values are the name
        text of the corresponding placemark.

    Raises:
        IndexError: if a placemark has no child elements.
        AttributeError: if a placemark has no ``coordinates`` descendant.

    NOTE(review): only the first ``coordinates`` element is read, so any
    further rings or polygons inside the same placemark are ignored -- confirm
    that this is acceptable for the input file.
    """
    coords_names_dict = {}
    coord_path = ".//{%s}coordinates" % kmlns
    for placemk in placemks:
        # Index the element directly: Element.getchildren() is deprecated and
        # was removed in Python 3.9, while placemk[0] returns the same child.
        name = placemk[0].text
        coord_text = placemk.findtext(coord_path)
        coords = [
            (float(parts[0]), float(parts[1]))
            for parts in (elem.split(",") for elem in coord_text.split())
        ]
        coords_names_dict[Polygon(coords)] = name
    return coords_names_dict

In [10]:
# make dataframe from function

# Materialize the (Polygon, name) pairs as a list before handing them to the
# DataFrame constructor: on Python 3 ``dict.items()`` is a view rather than a
# list, and a concrete list is accepted on every Python/pandas combination.
cpa_polygons = pd.DataFrame(list(coords_names(placemks_with_polygons).items()),
            columns=['Polygon','CPA'])

In [11]:
# crucial step: build spatial index

from rtree import index
# Register every polygon's bounding box in the R-tree, using the polygon's
# 0-based position in cpa_polygons as its id.
idx = index.Index()
for count, polygon in enumerate(cpa_polygons.Polygon):
    idx.insert(count, polygon.bounds)

In [12]:
# assign a cpa to each point

# For each point, ask the spatial index for candidate polygons (bounding-box
# hits), then confirm with an exact point-in-polygon test and keep the first
# match. Points that fall in no polygon keep the string placeholder 'nan'.
m = len(locations)
hoods = []
# progress checkpoints; a set gives constant-time membership tests
stops = {10000, 20000, 30000, 40000, 50000, 60000, 70000,
         80000, 90000, 100000, 125000, 150000, 175000,
         200000, 300000}
# The rtree ids were assigned as 0-based positions, so look polygons and names
# up by position (plain lists) rather than by dataframe index label. Pulling
# the columns out once also avoids a Series lookup on every iteration.
polygons = list(cpa_polygons.Polygon)
names = list(cpa_polygons.CPA)
for i in range(m):
    if i in stops:
        # function-call form prints the same text on Python 2 and Python 3
        print(i)
    point = locations[i]
    tmp = 'nan'
    # a 2-value coordinate queries the index with the point itself
    for j in idx.intersection((point.x, point.y)):
        if point.within(polygons[j]):
            tmp = names[j]
            break
    hoods.append(tmp)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
125000
150000
175000
200000
300000


In [13]:
# Attach the per-row CPA labels; rows that matched no polygon carry the string 'nan'.
df['CPA'] = hoods

In [15]:
# Summarize column dtypes and non-null counts. The unmatched-row placeholder
# is the string 'nan', so it is counted as non-null in the CPA column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382756 entries, 0 to 382755
Data columns (total 13 columns):
filename      382756 non-null object
apn           382756 non-null int64
asr_land      382756 non-null int64
asr_impr      382756 non-null int64
asr_total     382756 non-null int64
shape_area    382756 non-null float64
unitqty       382756 non-null int64
nucleus_zo    382756 non-null int64
nucleus_us    382756 non-null int64
address       382756 non-null object
lon           382756 non-null float64
lat           382756 non-null float64
CPA           382756 non-null object
dtypes: float64(3), int64(7), object(3)
memory usage: 40.9+ MB


In [14]:
# Number of rows assigned to each CPA label.
df.CPA.value_counts()

CLAIREMONT MESA                        26198
MIRA MESA                              22959
NAVAJO                                 19192
SKYLINE-PARADISE HILLS                 16407
DOWNTOWN                               15976
RANCHO BERNARDO                        15719
UNIVERSITY                             15574
LA JOLLA                               15164
OTAY MESA-NESTOR                       14054
RANCHO PENASQUITOS                     13558
PENINSULA                              13315
UPTOWN                                 13202
PACIFIC BEACH                          12954
CARMEL VALLEY                          12856
GREATER NORTH PARK                     12719
MID-CITY:CITY HEIGHTS                  11111
ENCANTO NEIGHBORHOODS,SOUTHEASTERN     10759
MID-CITY:EASTERN AREA                  10498
SOUTHEASTERN SAN DIEGO,SOUTHEASTERN     9727
LINDA VISTA                             7392
MISSION VALLEY                          7296
SCRIPPS MIRAMAR RANCH                   7285
TIERRASANT

In [17]:
# Write the labelled table to disk, omitting the dataframe index column.
df.to_csv(path_or_buf='/Users/damoncrockett/Desktop/FISP/streetview_parcel_CPA.csv',
          index=False)