# Training: Cluster Centers Identification

In [14]:
import pandas as pd
import numpy as np
from sklearn.cluster import MeanShift #the algorithm we use

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and parser warnings raised by
# pandas / scikit-learn in the cells below - consider narrowing the filter.
warnings.simplefilter('ignore')

### Play with model parameters
    bandwidth: the radius of the clusters, in degrees of latitude/longitude;
               0.07 ~= 3-8 km in the middle latitudes.
    min_sight_freq: we search for clusters of radius <bandwidth>, seeded only from grid bins that contain at least <min_sight_freq> venues.

In [15]:
# Kernel radius handed to MeanShift; expressed in the same units as the
# coordinates being clustered.
bandwidth = 0.07
# Handed to MeanShift as min_bin_freq (used together with bin_seeding=True).
min_sight_freq = 3

### Loading dataset
    The dataset I've used is known as the UMN/Sarwat Foursquare Dataset.
    It contains 2153471 users, 1143092 venues, 1021970 check-ins, 27098490 social connections, and 2809581 ratings that users assigned to venues.
    More at https://archive.org/details/201309_foursquare_dataset_umn

In [16]:
# Load the venue coordinates; the first CSV column becomes the row index.
# (A comma separator and a first-row header are read_csv's defaults.)
data = pd.read_csv('../data/venues.csv', index_col=0)
data.head()

Unnamed: 0,latitude,longitude
0,38.951705,-92.334072
1,38.576702,-92.173516
2,38.943004,-92.36162
3,38.496657,-92.343726
4,38.199197,-92.833524


#### So, we deal with 11798 venues around the world:

In [17]:
# (number of rows, number of columns) of the loaded venues table
data.shape

(11798, 2)

#### Let's create and fit a Mean Shift model with the parameters set above.

In [18]:
# Cluster the venue coordinates; fit() hands back the fitted estimator itself.
ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=True,
    min_bin_freq=min_sight_freq,
).fit(data.values)

# Every distinct label corresponds to one discovered cluster.
labels_unique = np.unique(ms.labels_)
n_clusters_ = labels_unique.size
print('Total number of clusters: {}'.format(n_clusters_))

# Keep the centers in a frame with named coordinate columns.
cluster_centers = pd.DataFrame(data=ms.cluster_centers_, columns=['latitude', 'longitude'])

Total number of clusters: 325


#### Loading coordinates of STA Travel offices

In [19]:
# Office coordinates as a NumPy array, one row per line of the file.
# The fields are separated by ", " (comma + space). A multi-character
# separator is interpreted as a regular expression, which only the Python
# parsing engine supports, so request that engine explicitly instead of
# relying on pandas' silent fallback (which emits a ParserWarning).
STA_coordinates = pd.read_csv('coordinates.txt', sep=', ', header=0, engine='python').values

#### Calculating the distance from each cluster center to the nearest office

In [20]:
def func(x, offices=None):
    """Return the distance from point ``x`` to the nearest office.

    The distance is the plain Euclidean distance between the first two
    coordinates of ``x`` and of each office, so it is expressed in the units
    of the inputs (degrees when they are latitude/longitude), not kilometres.

    Parameters
    ----------
    x : sequence, numpy array or pandas Series
        The point; only its first two entries are used.
    offices : array-like of shape (n_offices, >= 2), optional
        Office coordinates in the same column order as ``x``. Defaults to the
        notebook-level ``STA_coordinates`` array.

    Returns
    -------
    float
        Smallest distance between ``x`` and any row of ``offices``.

    Raises
    ------
    ValueError
        If ``offices`` contains no rows.
    """
    if offices is None:
        offices = STA_coordinates
    # Convert to plain arrays so the coordinates are always read by position:
    # integer keys such as x[0] on a label-indexed pandas Series are deprecated.
    point = np.asarray(x)[:2].astype(float)
    targets = np.asarray(offices)
    if targets.size == 0:
        raise ValueError('offices must contain at least one coordinate pair')
    targets = targets[:, :2].astype(float)
    deltas = targets - point
    # One vectorised pass over all offices instead of a Python-level map/lambda.
    return np.sqrt((deltas ** 2).sum(axis=1)).min()

In [21]:
# Distance from every cluster center to its nearest office.
# Only the two coordinate columns are passed, so re-running this cell after
# 'less_dist' exists feeds func the same input. raw=True hands each row to
# func as a NumPy array: func reads the coordinates by position, and positional
# integer keys on a label-indexed pandas Series are deprecated.
cluster_centers['less_dist'] = cluster_centers[['latitude', 'longitude']].apply(func, axis=1, raw=True)
cluster_centers.head()

Unnamed: 0,latitude,longitude,less_dist
0,40.740829,-73.984755,0.013081
1,13.738213,100.521672,0.316691
2,38.902082,-77.035651,3.549891
3,40.719992,-73.852317,0.141003
4,40.83207,-74.022553,0.1057


#### Output the best cluster centers (the 20 closest to an STA Travel office)

In [22]:
# Rank the centers by distance to the nearest office (closest first), then
# keep the first two columns (the coordinates) of the top 20 as a plain array.
cluster_centers = cluster_centers.sort_values('less_dist')
best = cluster_centers.head(20).iloc[:, :2].values

In [23]:
# Emit one "latitude , longitude" line per selected center.
for lat, lon in best:
    print(lat, ',', lon)

51.51336727857142 , -0.14118561428571427
40.74082860641552 , -73.98475529218959
-33.88264309583334 , 151.196539
1.3132710500000002 , 103.84982767999998
40.83207020363457 , -74.02255291759762
32.730421633333336 , -117.1990883888889
13.903950455555554 , 100.60707361111112
40.7199924650692 , -73.85231737179492
51.509116516666666 , -0.2869840666666667
40.61846416666666 , -74.11737955555556
40.811854700000005 , -74.14721413043476
51.433522283333325 , -0.2966435166666667
40.89374621666666 , -73.85312388333334
32.549860800000005 , -117.03512873333334
40.95888477368422 , -73.99458288947369
40.92862703125 , -74.1190614875
40.81513530714285 , -74.25957897142857
40.734656217857136 , -73.70680561071428
41.03679218235294 , -73.93240693529413
40.6059341625 , -74.2825373


### Visualization
    1. Copy coordinates above.
    2. Go to [Map Customizer](https://www.mapcustomizer.com/)
    3. Choose 'Bulk entry' in the top right corner.
    4. Paste coordinates and press 'Add locations'.