In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy import distance
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from tqdm import tqdm

In [3]:
# Locate the Yelp dataset folder relative to the current working directory.
# os.path.join is used instead of string concatenation so the separator is
# correct on every platform (and consistent with the join used just below).
data_dir = os.path.join(os.getcwd(), 'yelp_dataset')
print('Loading business data ...')
# lines=True: the file is read as one JSON object per line.
df_business = pd.read_json(os.path.join(data_dir, 'business.json'), lines=True)

Loading business data ...


In [34]:
def business_clustering(df, eps_km=0.05, chain_threshold=20):
    """Cluster businesses by geographic proximity using DBSCAN.

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is
    run with the haversine metric on the ``latitude`` / ``longitude`` columns
    (converted to radians). ``min_samples=1`` makes every business a core
    point, so no row receives the noise label ``-1`` and an isolated business
    simply forms a cluster of size one.

    Note: ``df`` is modified in place; three columns are added to it:

    * ``coords_cluster_label`` -- integer cluster id of the row,
    * ``neighbors`` -- number of businesses in the row's cluster
      (the row itself is included in the count),
    * ``is_chain`` -- ``True`` when ``neighbors >= chain_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns (degrees).
    eps_km : float, default 0.05
        Maximum distance, in kilometres, between two businesses for them to
        be considered neighbours by DBSCAN (default: 50 m).
    chain_threshold : int, default 20
        Minimum cluster size for a business to be flagged in ``is_chain``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The same ``df`` object with the added columns, and a Series whose
        entry ``n`` is the array of (latitude, longitude) rows belonging to
        cluster ``n``.
    """
    print("Clustering business according to Geo-coordinates ...")
    coords = df[['latitude', 'longitude']].values
    # radius of the earth in km; dividing a distance in km by it gives the
    # equivalent angle in radians, which is the unit the haversine metric uses
    kms_per_radian = 6371
    epsilon = eps_km / kms_per_radian
    # perform clustering
    db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    # number of businesses in each cluster; labels are 0..k-1 with no -1
    # (noise) because min_samples=1, so bincount indexes directly by label
    counts_cluster = np.bincount(cluster_labels)
    num_clusters = len(counts_cluster)
    # Group the coordinates by label with one stable sort + split instead of
    # building a full boolean mask per cluster (which is O(points * clusters)).
    # The stable sort keeps the original row order inside each cluster.
    order = np.argsort(cluster_labels, kind='stable')
    split_points = np.cumsum(counts_cluster)[:-1]
    clusters = pd.Series(np.split(coords[order], split_points))
    print('Number of clusters: {}'.format(num_clusters))
    df['coords_cluster_label'] = cluster_labels
    df['neighbors'] = counts_cluster[cluster_labels]
    # flag businesses whose cluster holds at least `chain_threshold` businesses
    df['is_chain'] = df['neighbors'] >= chain_threshold

    return df, clusters

# Cluster every business: df_business gains the cluster columns and
# `clusters` holds the coordinate array of each cluster.
df_business, clusters = business_clustering(df_business)

Clustering business according to Geo-coordinates ...
Number of clusters: 71379


In [69]:
def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to its centroid.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        In this notebook each pair is (latitude, longitude), as produced by
        ``business_clustering``.

    Returns
    -------
    The element of ``cluster`` with the smallest great-circle distance
    (in metres) to the centroid of all points.
    """
    # Build the geometry and its centroid once instead of once per coordinate.
    centre = MultiPoint(cluster).centroid
    centroid = (centre.x, centre.y)
    return min(cluster, key=lambda point: great_circle(point, centroid).m)
# Reduce each cluster to one representative point: the member nearest its centroid.
centermost_points = clusters.map(get_centermost_point)
# df_business[df_business['postal_code']=='15203']

In [130]:
# Look up the businesses assigned to cluster 7. The label column created by
# business_clustering is 'coords_cluster_label'; the previous name
# 'cluster_coords' does not exist and raised KeyError on a fresh kernel.
# (Only the last expression of a cell is rendered, so this lookup is not shown.)
df_business[df_business['coords_cluster_label'] == 7]
# NOTE(review): these look like two members of cluster 7 -- confirm.
coords_1 = (36.099998, -115.077408)
coords_2 = (36.100548, -115.074212)
# get distance in km
distance.geodesic(coords_1, coords_2).km

0.29419740100547215

In [75]:
# Summary table with one row per cluster: the coordinates of its centermost
# business and the number of businesses in the cluster.
df_center = centermost_points.to_frame()
# Each centermost point is a (latitude, longitude) pair.
lat = [point[0] for point in centermost_points]
lon = [point[1] for point in centermost_points]
# Cluster size per label. 'neighbors' is constant within a cluster, so the
# first value of each group is enough. This replaces a pivot_table that
# averaged every other column only to recover the (label, size) index pairs.
cluster_sizes = df_business.groupby('coords_cluster_label')['neighbors'].first()
# Align explicitly on the cluster label, which is also the index of centermost_points.
neighbors = cluster_sizes.reindex(centermost_points.index).values
df_center_neighbor = pd.DataFrame({
    'latitude': lat,
    'longitude': lon,
    'neighbor': neighbors,
})

In [76]:
# Preview the first rows only; the full table has one row per cluster.
df_center_neighbor.head(10)

Unnamed: 0,latitude,longitude,neighbor
0,33.522143,-112.018481,1
1,43.605628,-79.652810,10
2,35.092564,-80.859132,6
3,33.455613,-112.395596,2
4,35.190026,-80.887197,2
5,43.599475,-79.711584,1
6,50.943646,-114.001828,4
7,36.099854,-115.074767,5
8,33.654836,-112.188118,14
9,41.440825,-81.854097,1


In [None]:
# https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

In [90]:
# TODO: use GeoPandas to visualize one clustering result

In [1]:
# mapbox = os.environ['MAPBOX_ACCESS_TOKEN']  # access token redacted: load it from the environment, never commit it to the notebook
# https://plot.ly/python/scattermapbox/#multiple-markers