In [1]:
file = open('checkins.dat')
data = file.read()

In [2]:
data[:1000]

'   id    | user_id | venue_id |     latitude      |     longitude     |     created_at      \n---------+---------+----------+-------------------+-------------------+---------------------\n  984301 | 2041916 |     5222 |                   |                   | 2012-04-21 17:39:01\n  984222 |   15824 |     5222 |        38.8951118 |       -77.0363658 | 2012-04-21 17:43:47\n  984315 | 1764391 |     5222 |                   |                   | 2012-04-21 17:37:18\n  984234 |   44652 |     5222 |         33.800745 |         -84.41052 | 2012-04-21 17:43:43\n  984249 | 2146840 |     5222 |                   |                   | 2012-04-21 17:42:58\n  984268 | 2146843 |     5222 |                   |                   | 2012-04-21 17:42:38\n  984281 | 2146846 |     5222 |                   |                   | 2012-04-21 17:39:40\n  984291 |  105054 |     5222 |        45.5234515 |      -122.6762071 | 2012-04-21 17:39:22\n    6651 | 1338710 |   219703 |                   |                

In [3]:
import pandas as pd
df = pd.read_csv('checkins.dat', sep='|', skiprows=[1], header=0, skipinitialspace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df[:5]

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916.0,5222.0,,,2012-04-21 17:39:01
1,984222,15824.0,5222.0,38.895112,-77.036366,2012-04-21 17:43:47
2,984315,1764391.0,5222.0,,,2012-04-21 17:37:18
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840.0,5222.0,,,2012-04-21 17:42:58


We need to drop rows with missing data

In [5]:
df.dropna(inplace=True, axis='rows')
df[:5]

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824.0,5222.0,38.895112,-77.036366,2012-04-21 17:43:47
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
7,984291,105054.0,5222.0,45.523452,-122.676207,2012-04-21 17:39:22
9,984318,2146539.0,5222.0,40.764462,-111.904565,2012-04-21 17:35:46
10,984232,93870.0,380645.0,33.448377,-112.074037,2012-04-21 17:38:18


Now the total length should be 396.634

In [6]:
len(df)

396634

For faster clustering we can use only first 100k

In [7]:
df = df[:100000]
len(df)

100000

In [8]:
coords = df.iloc[:,-3:-1]
coords[:5]

Unnamed: 0,latitude,longitude
1,38.895112,-77.036366
3,33.800745,-84.41052
7,45.523452,-122.676207
9,40.764462,-111.904565
10,33.448377,-112.074037


In [9]:
from sklearn.cluster import MeanShift

clustering = MeanShift(bandwidth=0.1)
clustering = clustering.fit(coords)

In [10]:
clustering.cluster_centers_

array([[  40.7177164 ,  -73.99183542],
       [  33.44943805, -112.00213969],
       [  33.44638027, -111.90188756],
       ...,
       [ -37.8229826 ,  145.1811902 ],
       [ -41.2924945 ,  174.7732353 ],
       [ -45.0311622 ,  168.6626435 ]])

In [11]:
clustering.labels_, len(clustering.labels_), len(clustering.cluster_centers_)

(array([ 5,  7, 30, ..., 25, 19,  4]), 100000, 3231)

In [12]:
import numpy as np
val, counts = np.unique(clustering.labels_, return_counts=True)

In [13]:
val

array([   0,    1,    2, ..., 3228, 3229, 3230])

In [14]:
counts

array([12506,  4692,  3994, ...,     1,     1,     1])

In [15]:
valid_labels = []
for i, v in enumerate(val):
    if counts[i] > 15:
        valid_labels.append(v)

In [16]:
len(valid_labels)

592

In [17]:
cities = [[33.751277, -118.188740],
[25.867736, -80.324116],
[51.503016, -0.075479],
[52.378894, 4.885084],
[39.366487, 117.036146],
[-33.868457, 151.205134]]

In [18]:
pred = clustering.predict(cities)
pred

array([  51,  419,   58,  370, 1980,  420])

In [19]:
pred = [p for p in pred if p in valid_labels]
pred

[51, 419, 58, 370, 420]

In [20]:
indices = []
for p in pred:
    indices.append(list(clustering.labels_).index(p))
indices

[28, 2396, 321, 1498, 458]

In [21]:
centroids = []
for i in indices:
    centroids.append(clustering.cluster_centers_[i])
    print(str(clustering.cluster_centers_[i][0]) + ', ' + str(clustering.cluster_centers_[i][1]))

37.390292432025845, -122.0872864361108
46.7323875, -117.00016509999999
47.80529201081078, -122.34446337297295
43.23785189999999, -73.4915018
45.525410799999996, -122.97612223333336


In [22]:
valid_centroids = []
for label in valid_labels:
    valid_centroids.append(clustering.cluster_centers_[label])

finding closest valid centroid for our cities

In [23]:
import math

def calc_dist(x1, y1, x2, y2):
    return math.sqrt((x1-x2)**2 + (y1-y2)**2) 
mins = {}
for city in cities:
    min_dist = 10000000000
    centroid = [1,1]
    for center in valid_centroids:
        dist = calc_dist(city[0], city[1], center[0], center[1])
        if dist < min_dist:
            min_dist = dist
            centroid = center
    mins[min_dist] = centroid

In [24]:
mins

{0.007834758163107856: array([-33.86063043, 151.20477593]),
 0.009353316185992226: array([52.37296399,  4.89231722]),
 0.022674066158385495: array([ 25.84567226, -80.3188906 ]),
 0.05005829482278787: array([51.50299126, -0.12553729]),
 0.07084773242719973: array([  33.80987796, -118.14892381]),
 9.267575010767361: array([ 31.230393, 121.473704])}