In [128]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy import distance

In [87]:
# Locate the Yelp dataset folder under the current working directory and read
# the business records (JSON Lines: one JSON object per line) into a DataFrame.
data_dir = os.getcwd() + '/yelp_dataset'
business_json_path = os.path.join(data_dir, 'business.json')
print('Loading business data ...')
df_business = pd.read_json(business_json_path, lines=True)

Loading business data ...


In [9]:
# Coordinate lookup table: latitude/longitude indexed by business_id.
location_columns = ['business_id', 'latitude', 'longitude']
df_location = df_business.loc[:, location_columns].set_index('business_id')
df_location.head()

Unnamed: 0_level_0,latitude,longitude
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1SWheh84yJXfytovILXOAQ,33.522143,-112.018481
QXAEGFB4oINsVuTFxEYKFQ,43.605499,-79.652289
gnKjwL_1w79qoiV3IC_xQQ,35.092564,-80.859132
xvX2CttrVhyG2z1dFg_0xw,33.455613,-112.395596
HhyxOkGAM07SRYtlQ4wMFQ,35.190012,-80.887223


In [120]:
# DBSCAN - Density-Based Spatial Clustering of Applications with Noise
# Cluster businesses by great-circle proximity. The haversine metric expects
# [latitude, longitude] pairs in radians, so the coordinates go through
# np.radians and the neighbourhood radius is converted from km to radians.
coords = df_business[['latitude', 'longitude']].values
kms_per_radian = 6371.0088
epsilon = .1 / kms_per_radian  # 0.1 km expressed as an angle in radians
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_

# DBSCAN labels noise points -1. With min_samples=1 every point is a core
# point so no noise is produced, but excluding -1 explicitly keeps the count
# and the grouping correct if min_samples is ever raised.
is_clustered = cluster_labels >= 0
num_clusters = np.unique(cluster_labels[is_clustered]).size

# Group the coordinates by label with one stable sort + split instead of one
# boolean mask over all points per cluster (which costs O(points * clusters)).
# The stable sort keeps the original row order inside each cluster, so every
# group matches what `coords[cluster_labels == n]` would return.
order = np.argsort(cluster_labels, kind='stable')
order = order[is_clustered[order]]
sorted_labels = cluster_labels[order]
split_points = np.flatnonzero(np.diff(sorted_labels)) + 1
cluster_groups = np.split(coords[order], split_points) if num_clusters else []
clusters = pd.Series(cluster_groups, dtype=object)
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 37720


In [121]:
# Attach the DBSCAN cluster label of every business as a new column.
# Note: despite its name, 'cluster_coords' holds the integer cluster label,
# not coordinates.
df_business.loc[:, 'cluster_coords'] = cluster_labels
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,cluster_coords
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ,0
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,1
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,2
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ,3
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC,4


In [122]:
# Number of points in each cluster, in the same order as `clusters`.
counts_cluster = list(map(len, clusters))
counts_cluster

[1,
 19,
 15,
 2,
 4,
 15,
 4,
 28,
 18,
 1,
 1707,
 302,
 16,
 10,
 52,
 10,
 298,
 51,
 29,
 7715,
 1355,
 7,
 7,
 16,
 4,
 4,
 43,
 7,
 216,
 17,
 187,
 8,
 85,
 16,
 44,
 1,
 8,
 4,
 3,
 6,
 1,
 2,
 184,
 39,
 40,
 19,
 3,
 37,
 19,
 3,
 19,
 1,
 13,
 1,
 9,
 39,
 6,
 6,
 1,
 60,
 289,
 17,
 4,
 2546,
 14,
 1,
 72,
 246,
 667,
 20,
 163,
 1,
 1,
 72,
 21,
 2,
 3,
 735,
 1,
 17,
 25,
 4,
 1,
 5,
 6,
 45,
 1,
 1,
 3,
 303,
 4,
 118,
 4,
 13,
 85,
 17,
 1,
 60,
 153,
 35,
 59,
 64,
 3,
 98,
 48,
 6,
 65,
 23,
 191,
 13,
 2,
 4,
 2,
 15,
 3,
 60,
 2,
 1,
 121,
 1,
 591,
 50,
 66,
 2,
 116,
 48,
 1,
 17,
 111,
 83,
 91,
 24,
 10,
 2,
 3,
 8,
 34,
 219,
 158,
 61,
 37,
 1,
 2,
 14,
 30,
 132,
 14,
 2,
 25,
 3,
 1,
 4,
 5,
 24,
 2,
 80,
 52,
 34,
 7,
 258,
 14,
 12,
 2,
 12,
 17,
 5,
 150,
 68,
 20,
 1767,
 8,
 1,
 92,
 5,
 1,
 15,
 39,
 75,
 12,
 50,
 3,
 3,
 2,
 1,
 12,
 113,
 55,
 10,
 21,
 12,
 2,
 39,
 127,
 1,
 27,
 1,
 20,
 79,
 41,
 2,
 23,
 6,
 1,
 1,
 51,
 2,
 2,
 39,
 4,
 9,
 1

In [130]:
# Sanity check on cluster 7: measure how far apart two of its points are.
# The filtered frame is bound to a name so the lookup is not silently
# discarded (only the last expression of a cell is displayed).
cluster_7 = df_business[df_business['cluster_coords'] == 7]
# Two hand-picked (latitude, longitude) points -- presumably copied from
# cluster 7 above; verify against `cluster_7`.
coords_1 = (36.099998, -115.077408)
coords_2 = (36.100548, -115.074212)
# get distance in km
# The result may exceed the 0.1 km eps passed to DBSCAN: cluster membership is
# transitive through chains of neighbours, so eps does not bound a cluster's
# diameter.
distance.geodesic(coords_1, coords_2).km

0.29419740100547215

In [None]:
# Reference for the DBSCAN + haversine clustering approach: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

In [23]:
# TODO: use GeoPandas to visualize one result
# Per-state count of non-null values in every column.
df_business.groupby('state').count()

Unnamed: 0_level_0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AB,8012,6405,8012,7998,8012,5882,8012,8012,8012,8012,8012,8012,8012
AK,2,1,2,2,2,1,2,2,2,2,2,2,2
AL,3,1,3,3,3,2,3,3,3,3,3,3,3
AR,1,1,1,1,1,1,1,1,1,1,1,1,1
AZ,56686,48190,56686,56520,56686,45274,56686,56686,56686,56686,56686,56686,56686
BAS,1,1,1,1,1,0,1,1,1,1,1,1,1
BC,1,1,1,1,1,1,1,1,1,1,1,1,1
CA,19,16,19,19,19,11,19,19,19,19,19,19,19
CON,1,1,1,1,1,0,1,1,1,1,1,1,1
CT,3,3,3,3,3,2,3,3,3,3,3,3,3
