In [1]:
import pandas as pd

In [11]:
# Load the venue records from the CSV file in the working directory.
df = pd.read_csv('venues_all.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,name,categories,lat,lng
0,Sayaji,Hotel,18.599535,73.754995
1,Natural Ice Cream,Ice Cream Shop,18.591192,73.75244
2,Barbeque Nation,BBQ Joint,18.59939,73.75509
3,Courtyard by Marriott,Hotel,18.591527,73.746831
4,Little Italy,Italian Restaurant,18.591513,73.743668


In [12]:
# (rows, columns) of the loaded venue table.
df.shape

(487, 4)

In [13]:
from sklearn.cluster import DBSCAN

In [14]:
# Keep only the two coordinate columns; they are the clustering features.
Clus_dataSet = df.loc[:, ['lat', 'lng']]
Clus_dataSet.head(5)

Unnamed: 0,lat,lng
0,18.599535,73.754995
1,18.591192,73.75244
2,18.59939,73.75509
3,18.591527,73.746831
4,18.591513,73.743668


In [15]:
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [16]:
# Standardise latitude/longitude (zero mean, unit variance per column) so the
# DBSCAN `eps` radius is applied on the same scale along both axes.
# The features are read from `df` directly instead of from `Clus_dataSet`, so
# re-running this cell never feeds already-scaled values back into the scaler
# and the result does not depend on what `Clus_dataSet` currently holds.
Clus_dataSet = StandardScaler().fit_transform(df[['lat', 'lng']])
Clus_dataSet[0:5]

array([[ 0.98668867, -1.31098468],
       [ 0.78919645, -1.3546089 ],
       [ 0.98323731, -1.30935999],
       [ 0.79713373, -1.45034577],
       [ 0.79679557, -1.5043403 ]])

In [17]:
# Density-based clustering of the scaled coordinates with a neighbourhood
# radius of 0.35 and a minimum of 9 samples per dense neighbourhood.
db = DBSCAN(min_samples=9, eps=0.35)
db.fit(Clus_dataSet)
db

DBSCAN(eps=0.35, min_samples=9)

In [18]:
# Cluster index assigned to each row by the fitted model; -1 marks noise points.
labels = db.labels_
# Show only the first 250 labels to keep the output short.
labels[:250]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0, -1, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  1,  0,  0,  0,  1,  0,  1,  1,  0,  1,  1,  1,  0,  1,  0,
        1,  0,  1,  0,  1,  1,  0,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  0,  0,  1

In [24]:
# Number of clusters found: distinct labels, not counting the noise label -1.
n_clusters = len(set(labels) - {-1})
n_clusters

3

In [25]:
# Silhouette score of the clustering, computed on the scaled coordinates.
# NOTE(review): `labels` contains -1 (noise) entries and is passed unchanged, so
# noise points are presumably scored as if they formed one more cluster —
# confirm whether they should be filtered out before scoring.
print(metrics.silhouette_score(Clus_dataSet, labels))


0.4349055248872309


In [26]:
import folium
from folium import plugins

In [27]:
# Palette indexed by cluster label when drawing the map markers.
# NOTE(review): these look like folium Icon colour names; 'darkpurple' and
# 'lightred' are not CSS named colours, so they may not render as a
# CircleMarker stroke colour — confirm before relying on those entries.
color_options = [
    'black',
    'blue',
    'cadetblue',
    'darkblue',
    'darkgreen',
    'darkpurple',
    'darkred',
    'gray',
    'green',
    'lightblue',
    'lightgreen',
    'lightred',
    'orange',
    'pink',
    'purple',
    'red',
]

In [28]:
# Centre the map on the first venue in the table. `.iloc[0]` is positional, so
# it works even if the frame's index does not contain the label 0.
map_ = folium.Map(location=[df['lat'].iloc[0], df['lng'].iloc[0]], zoom_start=11)

# Noise points (DBSCAN label -1) get a dedicated colour. Indexing the palette
# with -1 would silently pick its last entry and make noise indistinguishable
# from whichever cluster maps to that same entry.
noise_color = 'lightgray'

# add markers to map
for lat, lng, Categories, name, lab in zip(df['lat'], df['lng'], df['categories'], df['name'], labels):
    # Include the cluster label in the popup text; previously it was passed to
    # format() without a matching placeholder and therefore dropped.
    label = '{}, {}, cluster {}'.format(name, Categories, lab)
    label = folium.Popup(label, parse_html=True)
    if lab == -1:
        marker_color = noise_color
    else:
        # Wrap around the palette so a clustering with more clusters than
        # colours cannot raise IndexError.
        marker_color = color_options[lab % len(color_options)]
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color=marker_color,
        fill=True,
        fill_opacity=1,
        parse_html=False).add_to(map_)

# Last expression of the cell: render the map.
map_

In [29]:
# Grid-search the DBSCAN hyperparameters and keep the (eps, min_samples) pair
# with the highest silhouette score on the scaled coordinates.
param_eps = [0.15,0.2,0.225,0.25,0.275,0.3,0.325,0.35]
param_min_samples = [3,4,5,6,7,8,9]

# The running best is initialised ONCE, before the loops. Resetting it inside
# the loop made every positive score look like a new maximum, so the search
# simply reported the last combination tried.
max_score = float('-inf')
best_eps = None
best_min_samples = None

for eps in param_eps:
    for min_samples in param_min_samples:
        # Trial results use their own names so the `db`, `labels` and
        # `n_clusters` produced by the earlier cells are not overwritten.
        trial_labels = DBSCAN(eps=eps, min_samples=min_samples).fit(Clus_dataSet).labels_
        n_distinct = len(set(trial_labels))
        # silhouette_score raises ValueError unless the number of distinct
        # labels lies between 2 and n_samples - 1, so skip such trials.
        if n_distinct < 2 or n_distinct > len(trial_labels) - 1:
            continue
        score = metrics.silhouette_score(Clus_dataSet, trial_labels)
        if score > max_score:
            max_score = score
            best_eps = eps
            best_min_samples = min_samples

if best_eps is None:
    # Every combination was skipped above, so there is nothing to report.
    print('No parameter combination produced a clustering that can be scored')
else:
    print('The best value for eps = ',best_eps)
    print('The best value for min_samples is = ',best_min_samples)
    print('The highest Silhouette score is = ',max_score)

The best value for eps =  0.35
The best value for min_samples is =  9
The highest Silhouette score is =  0.4349055248872309
