In [117]:
import pandas as pd

In [118]:
# read and clean crime data
crime = pd.read_csv('data/detroit-crime.csv')
crime = crime[['ADDRESS', 'LAT', 'LON']]
crime.columns = ['address', 'lat', 'long']
crime['origin'] = 'crime'
# remove lats outside detroit
crime = crime[crime['lat'] > 41.0]
crime = crime[crime['lat'] < 45.0]
crime.head()

Unnamed: 0,address,lat,long,origin
0,09100 PETOSKEY,42.3678,-83.1221,crime
1,00 PLYMOUTH AND MANSFIELD,42.3724,-83.2035,crime
2,00 E 7 MILE VAN DYKE,42.4338,-83.0241,crime
3,06600 BARTON,42.3496,-83.1381,crime
4,00900 W WILLIS,42.3481,-83.0692,crime


In [119]:
# read and clean 311 data
_311 = pd.read_csv('data/detroit-311.csv')
_311 = _311[['address', 'lat', 'lng']]
_311.columns = ['address', 'lat', 'long']
_311['origin'] = '311'
_311.head()

Unnamed: 0,address,lat,long,origin
0,"13120-13130 Ilene Street Detroit, MI 48238, USA",42.383998,-83.161039,311
1,"1485 E. Outer Drive Detroit, Michigan",42.440471,-83.080919,311
2,"15460 Eastburn Detroit, Michigan",42.445244,-82.962038,311
3,"17541 Mendota St Detroit, Michigan",42.421043,-83.166194,311
4,"Griggs Detroit, Michigan",42.402033,-83.162874,311


In [120]:
# read and clean demo data
demo = pd.read_csv('data/detroit-demolition-permits.tsv', sep='\t')
demo = demo[['SITE_ADDRESS', 'site_location']]
demo['lat'] = demo['site_location'].str.split('\n', expand=True)[2].str.split(', ', expand=True)[0].str.replace('(', '')
demo['long'] = demo['site_location'].str.split('\n', expand=True)[2].str.split(', ', expand=True)[1].str.replace(')', '')
demo = demo[['SITE_ADDRESS', 'lat', 'long']]
demo.columns = ['address', 'lat', 'long']
demo['origin'] = 'demo'
demo = demo[pd.notnull(demo['lat'])]
demo = demo[pd.notnull(demo['long'])]
demo['lat'] = demo['lat'].astype(float)
demo['long'] = demo['long'].astype(float)
# remove lats/longs outside detroit
demo = demo[demo['lat'] > 41.0]
demo = demo[demo['lat'] < 45.0]
demo.head()

Unnamed: 0,address,lat,long,origin
0,4331 BARHAM,42.394106,-82.9474,demo
1,9707 BESSEMORE,42.395122,-83.005077,demo
2,5315 BERKSHIRE,42.40322,-82.946476,demo
3,16670 BRINGARD DR,42.44664,-82.947207,demo
4,1454 BEATRICE,42.276956,-83.147536,demo


In [121]:
# read and clean blight data
blight = pd.read_csv('data/detroit-blight-violations.csv')
blight['address'] = blight['ViolationAddress'].str.split('\n', expand=True)[0]
blight['lat'] = blight['ViolationAddress'].str.split('\n', expand=True)[2].str.split(', ', expand=True)[0].str.replace('(', '')
blight['long'] = blight['ViolationAddress'].str.split('\n', expand=True)[2].str.split(', ', expand=True)[1].str.replace(')', '')
blight = blight[['address', 'lat', 'long']]
blight['origin'] = 'blight'
blight = blight[pd.notnull(blight['lat'])]
blight = blight[pd.notnull(blight['long'])]
blight['lat'] = blight['lat'].astype(float)
blight['long'] = blight['long'].astype(float)
blight.head()

Unnamed: 0,address,lat,long,origin
0,2566 GRAND BLVD,42.363182,-83.091677,blight
1,19014 ASHTON,42.429391,-83.220394,blight
2,18735 STAHELIN,42.428707,-83.227548,blight
3,20125 MONICA,42.441698,-83.145018,blight
4,17397 PRAIRIE,42.420318,-83.145328,blight


In [122]:
d = crime.append([_311, demo, blight])
d.to_csv('d.csv', index=False)
d = d.reset_index(drop=True)
print d.shape
d.head()

(453343, 4)


Unnamed: 0,address,lat,long,origin
0,09100 PETOSKEY,42.3678,-83.1221,crime
1,00 PLYMOUTH AND MANSFIELD,42.3724,-83.2035,crime
2,00 E 7 MILE VAN DYKE,42.4338,-83.0241,crime
3,06600 BARTON,42.3496,-83.1381,crime
4,00900 W WILLIS,42.3481,-83.0692,crime


In [123]:
import gmplot
gmap = gmplot.GoogleMapPlotter(42.4, -83.1, 12)
gmap.heatmap(d['lat'], d['long'])
gmap.draw("detroit.html")

In [124]:
# format addresses to do fuzzy matching
# 1) normalize
# 2) featurize
# 3) classify
# 4) scoring

# taken largely from http://blog.yhat.com/posts/fuzzy-matching-with-yhat.html

In [131]:
# 1) normalize

# remove punctuation and capitalize everything
import string
import re
exclude = set(string.punctuation)

def clean_string(s):
    # capitalize
    s = s.upper()
    # remove punctuation
    s = ''.join(ch for ch in s if ch not in exclude)
    s = s.replace('DETROIT MICHIGAN', '')
    s = s.replace('DETROIT, MICHIGAN', '')
    s = s.replace('DETROIT, MI', '')
    s = s.replace('DETROIT MI', '')
    s = s.replace('STREET', 'ST')
    s = s.replace('ROAD', 'RD')
    s = s.replace(' R ', 'RD ')
    s = s.replace('BOULEVARD', 'BLVD')
    s = s.replace('AVENUE', 'AVE')
    s = s.replace('DRIVE',  'DR')
    s = s.replace(' W ', ' WEST ')
    s = s.replace(' E ', ' EAST ')
    s = s.replace(' S ', ' SOUTH ')
    s = s.replace(' N ', ' NORTH ')
    return s

d['address_clean'] = d['address'].apply(clean_string)


In [142]:
# test again
# find most common words
addresses = d['address_clean'].values
split_addresses = [x.split(' ') for x in addresses]
unique_words = [item for sublist in split_addresses for item in sublist if len(item) > 0]
from collections import Counter
Counter(unique_words).most_common()

[('00', 24467),
 ('ST', 14694),
 ('WEST', 10010),
 ('GRAND', 8128),
 ('AND', 7686),
 ('EAST', 7653),
 ('RD', 7293),
 ('DR', 6450),
 ('BLVD', 5320),
 ('MCNICHOLS', 5223),
 ('MILE', 4945),
 ('SEVEN', 4808),
 ('RIVER', 4394),
 ('WARREN', 4324),
 ('LIVERNOIS', 4225),
 ('PARK', 4120),
 ('GRATIOT', 3941),
 ('AVE', 3323),
 ('OUTER', 3038),
 ('WOODWARD', 3034),
 ('GREENFIELD', 2963),
 ('USA', 2935),
 ('EVERGREEN', 2808),
 ('JEFFERSON', 2724),
 ('JOY', 2668),
 ('APT', 2650),
 ('FENKELL', 2641),
 ('MICHIGAN', 2519),
 ('ASBURY', 2429),
 ('CHICAGO', 2256),
 ('SCHAEFER', 2231),
 ('00000', 2210),
 ('WYOMING', 2147),
 ('VAN', 2135),
 ('7', 2042),
 ('ASHTON', 2008),
 ('ARCHDALE', 1998),
 ('PLYMOUTH', 1943),
 ('HARPER', 1891),
 ('EIGHT', 1886),
 ('MEYERS', 1884),
 ('APPOLINE', 1867),
 ('SANTA', 1784),
 ('FORRER', 1681),
 ('VERNOR', 1680),
 ('FORT', 1649),
 ('COYLE', 1610),
 ('NORTHLAWN', 1589),
 ('KENTUCKY', 1572),
 ('MANSFIELD', 1559),
 ('DAVISON', 1547),
 ('AUBURN', 1535),
 ('ARDMORE', 1528),
 ('1930

In [143]:
# find what to add/subtract to lat/long to box addresses
import geopy
import geopy.distance

lat_diff = .00035
long_diff = .0005

lat = 42.3678
long = -83.1221

pt1 = geopy.Point(lat, long)
pt2 = geopy.Point(lat + lat_diff, long)
# distance.distance() is the  VincentyDistance by default.
dist = geopy.distance.distance(pt1, pt2).mi
print dist

pt1 = geopy.Point(lat, long)
pt2 = geopy.Point(lat, long + long_diff)
# distance.distance() is the  VincentyDistance by default.
dist = geopy.distance.distance(pt1, pt2).mi
print dist

0.024157763583
0.0255917777667


In [144]:
# group by address
import numpy as np
g = d.groupby(['address_clean']).agg(
    {'lat': [np.size, np.mean, np.min, np.max],
     'long': [np.mean, np.min, np.max]}).reset_index()
g.columns = g.columns.droplevel(0)
g.columns = ['address_clean', 'N', 'lat_avg', 'lat_min', 'lat_max', 'long_avg', 'long_min', 'long_max']

In [145]:
# create .05 mile x .05 mile square around average lat/long of address
g['lat_min_bound'] = g['lat_avg'] - lat_diff
g['lat_max_bound'] = g['lat_avg'] + lat_diff
g['long_min_bound'] = g['long_avg'] - long_diff
g['long_max_bound'] = g['long_avg'] + long_diff

In [146]:
g.head()

Unnamed: 0,address_clean,N,lat_avg,lat_min,lat_max,long_avg,long_min,long_max,lat_min_bound,lat_max_bound,long_min_bound,long_max_bound
0,,1.0,42.331427,42.331427,42.331427,-83.045754,-83.045754,-83.045754,42.331077,42.331777,-83.046254,-83.045254
1,5949 CASMERE ST,1.0,42.408738,42.408738,42.408738,-83.041618,-83.041618,-83.041618,42.408388,42.409088,-83.042118,-83.041118
2,8641 LITTLEFIELD,1.0,42.358054,42.358054,42.358054,-83.175329,-83.175329,-83.175329,42.357704,42.358404,-83.175829,-83.174829
3,WAGNER,1.0,42.338734,42.338734,42.338734,-83.137666,-83.137666,-83.137666,42.338384,42.339084,-83.138166,-83.137166
4,0 10TH,70.0,42.369786,42.369786,42.369786,-83.216326,-83.216326,-83.216326,42.369436,42.370136,-83.216826,-83.215826


In [147]:
# function that returns list of potential addresses given lat/long
def find_address(lat, long):
    sub = g[g['lat_min_bound'] < lat]
    sub = sub[sub['lat_max_bound'] > lat]
    sub = sub[sub['long_min_bound'] < long]
    sub = sub[sub['long_max_bound'] > long]
    
    # calculate distance
    pt1 = geopy.Point(lat, long)
    sub['dist'] = sub.apply(lambda row: geopy.distance.distance(pt1, geopy.Point(row['lat_avg'], row['long_avg'])).mi, axis=1)
    geopy.distance.distance(pt1, pt2).mi
    
    # sort by closest
    sub = sub.sort_values('dist', ascending=True)
    address_list = sub['address_clean'].values
    
    # return
    return address_list

In [148]:
# test it
# google says this location is 2982 Vicksburg St.
lat = 42.364313
long = -83.107868
find_address(lat, long)

array(['2974 VICKSBURG', '2988 VICKSBURG ST ', '2982 VICKSBURG',
       '2968 VICKSBURG', '2981 VICKSBURG ST ', '2960 VICKSBURG',
       '2961 VICKSBURG', '2954 VICKSBURG', '03000 VIRGINA PARK'], dtype=object)

In [139]:
# remaining to do
# DONE 1) remove variants of DETROIT MICHIGAN
# DONE 2) normalize street, St, St. etc. or ave, avenue, etc.
# 3) do something about when there is no st i.e. 2974 Vicksburg vs. 2974 Vicksburg St.