In [19]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from geopy.point import Point
from geopy.geocoders import *
from geopy.distance import *

# Geocoded input files; later cells read the LAT/LON columns of each frame.
DEMOLITION_JSON = 'detroit_demolition_permits_with_lat_lon.json'
BLIGHT_JSON = 'detroit_blight_violations_with_lat_lon.json'
CALLS_311_JSON = 'detroit_311_with_lat_lon.json'
CRIME_JSON = 'detroit_crime_with_lat_lon.json'

# Load each dataset into its own DataFrame.
detroit_demolition_permits = pd.read_json(DEMOLITION_JSON)
detroit_blight_violations = pd.read_json(BLIGHT_JSON)
detroit_311 = pd.read_json(CALLS_311_JSON)
detroit_crime = pd.read_json(CRIME_JSON)

In [3]:
def get_lat_lon(data):
    """Return the latitude/longitude columns of ``data`` as a 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``LAT`` and ``LON`` columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 2)`` with columns ordered ``[LAT, LON]``
        and rows in the frame's positional order.

    Raises
    ------
    KeyError
        If either column is missing from ``data``.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same array and is available in old and new pandas alike.
    return data[['LAT', 'LON']].values

In [4]:
# Pull the [LAT, LON] coordinate array out of every dataset in one pass.
points_demolition, points_blight, points_crime, points_311 = (
    get_lat_lon(frame)
    for frame in (
        detroit_demolition_permits,
        detroit_blight_violations,
        detroit_crime,
        detroit_311,
    )
)

In [5]:
# (rows, columns) of the demolition-permit coordinate array.
points_demolition.shape

(7133L, 2L)

In [6]:
# (rows, columns) of the blight-violation coordinate array.
points_blight.shape

(307804L, 2L)

In [7]:
# (rows, columns) of the crime coordinate array.
points_crime.shape

(119901L, 2L)

In [8]:
# (rows, columns) of the 311-call coordinate array.
points_311.shape

(19680L, 2L)

In [9]:
# Stack the four coordinate arrays row-wise into a single array. The order
# (demolition, blight, crime, 311) matters: later cells slice the cluster
# labels back out per dataset in exactly this order.
coordinate_blocks = [points_demolition, points_blight, points_crime, points_311]
coords = np.concatenate(coordinate_blocks, axis=0)

In [10]:
# Kilometres per radian of arc, used to express a ground distance as an angle.
kms_per_radian = 6371.0088
# Neighbourhood radius of 0.03 km, converted to radians because the haversine
# metric works on angular distances.
epsilon = 0.03 / kms_per_radian

# coords holds [LAT, LON] in degrees; the haversine metric needs radians.
coords_radians = np.radians(coords)

# min_samples=1 makes every point a core point, so each point lands in a cluster.
clusterer = DBSCAN(
    eps=epsilon,
    min_samples=1,
    algorithm='ball_tree',
    metric='haversine',
    n_jobs=-1,
)
# fit() returns the estimator itself, so ``db`` is the fitted model.
db = clusterer.fit(coords_radians)

In [20]:
# One cluster id per row of ``coords``, in the same row order.
cluster_labels = db.labels_

# The demolition rows were concatenated first, so they occupy the leading
# entries of the label array. Derive their count from the data instead of
# hard-coding it, so the summary stays correct if the input file changes.
n_demolition = len(points_demolition)

num_clusters = len(set(cluster_labels))
# Clusters that contain at least one demolition permit.
num_blighted = len(set(cluster_labels[:n_demolition]))
print('Number of clusters: {}'.format(num_clusters))
print('Number of blighted: {}'.format(num_blighted))

Number of clusters: 40821
Number of blighted: 3739


In [21]:
# The demolition rows sit at the front of ``cluster_labels``; take as many
# labels as the frame has rows rather than relying on a hard-coded count.
n_demolition = len(detroit_demolition_permits)
# Assign a plain array (positional) instead of a pd.Series: a Series would be
# aligned on the frame's index and could mis-assign or NaN-fill the labels
# whenever that index is not exactly 0..n-1 in row order.
detroit_demolition_permits['label'] = np.asarray(cluster_labels[:n_demolition], dtype='int64')
# Consume the labels just used so the next cell starts at its own rows.
cluster_labels = cluster_labels[n_demolition:]
# Keyword ``axis`` works across pandas versions; the positional form does not.
detroit_demolition_permits = detroit_demolition_permits.drop(['LAT', 'LON'], axis=1)
detroit_demolition_permits.to_json('detroit_demolition_permits.json')

In [22]:
# The blight rows are now at the front of the remaining ``cluster_labels``;
# take as many labels as the frame has rows rather than a hard-coded count.
n_blight = len(detroit_blight_violations)
# Assign a plain array (positional) instead of a pd.Series: a Series would be
# aligned on the frame's index and could mis-assign or NaN-fill the labels
# whenever that index is not exactly 0..n-1 in row order.
detroit_blight_violations['label'] = np.asarray(cluster_labels[:n_blight], dtype='int64')
# Consume the labels just used so the next cell starts at its own rows.
cluster_labels = cluster_labels[n_blight:]
# Keyword ``axis`` works across pandas versions; the positional form does not.
detroit_blight_violations = detroit_blight_violations.drop(['LAT', 'LON'], axis=1)
detroit_blight_violations.to_json('detroit_blight_violations.json')

In [23]:
# The crime rows are now at the front of the remaining ``cluster_labels``;
# take as many labels as the frame has rows rather than a hard-coded count.
n_crime = len(detroit_crime)
# Assign a plain array (positional) instead of a pd.Series: a Series would be
# aligned on the frame's index and could mis-assign or NaN-fill the labels
# whenever that index is not exactly 0..n-1 in row order.
detroit_crime['label'] = np.asarray(cluster_labels[:n_crime], dtype='int64')
# Consume the labels just used so the next cell starts at its own rows.
cluster_labels = cluster_labels[n_crime:]
# Keyword ``axis`` works across pandas versions; the positional form does not.
detroit_crime = detroit_crime.drop(['LAT', 'LON'], axis=1)
detroit_crime.to_json('detroit_crime.json')

In [24]:
# The 311 rows are the last block left in ``cluster_labels``; take as many
# labels as the frame has rows rather than relying on a hard-coded count.
n_311 = len(detroit_311)
# Assign a plain array (positional) instead of a pd.Series: a Series would be
# aligned on the frame's index and could mis-assign or NaN-fill the labels
# whenever that index is not exactly 0..n-1 in row order.
detroit_311['label'] = np.asarray(cluster_labels[:n_311], dtype='int64')
# Consume the labels just used; nothing should remain after this cell.
cluster_labels = cluster_labels[n_311:]
# Keyword ``axis`` works across pandas versions; the positional form does not.
detroit_311 = detroit_311.drop(['LAT', 'LON'], axis=1)
detroit_311.to_json('detroit_311.json')

In [25]:
# Sanity check: every label should have been handed to one of the four
# datasets above, so the remaining array is expected to be empty.
cluster_labels.shape

(0L,)