In [65]:
# Zero-based positions of the latitude and longitude columns in a check-in record.
lat_index, lon_index = 3, 4

# Office locations keyed by city; each value is a [latitude, longitude] pair.
offices = {
    "Los Angeles": [33.751277, -118.188740],
    "Miami": [25.867736, -80.324116],
    "London": [51.503016, -0.075479],
    "Amsterdam": [52.378894, 4.885084],
    "Beijing": [39.366487, 117.036146],
    "Sydney": [-33.868457, 151.205134],
}

def write_answer(filename, answer):
    """Write the string form of ``answer`` to ``filename``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(filename, "w") as out_file:
        text = str(answer)
        out_file.write(text)

In [9]:
import csv

# Convert the pipe-delimited "checkins.dat" dump into "checkins.csv".
# Relies on lat_index / lon_index being defined before this cell is run.
fields_num = 6   # a well-formed record has exactly this many '|'-separated fields
csv_counter = 0  # number of rows copied to the CSV file

# NOTE(review): on Python 3 the csv docs recommend opening the output file with
# newline=""; it is not passed here because Python 2's open() rejects it.
with open("checkins.dat") as dat_file:
    with open("checkins.csv", "w") as csv_file:
        csv_writer = csv.writer(csv_file)
        for dat_line in dat_file:
            # Build a real list: on Python 3 map() returns a lazy iterator that
            # supports neither len() nor indexing, so the check below would raise.
            new_line = [field.strip() for field in dat_line.split('|')]
            # Keep only complete records whose latitude and longitude are non-empty.
            if len(new_line) == fields_num and new_line[lat_index] and new_line[lon_index]:
                csv_writer.writerow(new_line)
                csv_counter += 1

print("Done. Total rows written: {:,}".format(csv_counter))

Done. Total rows written: 396,635


In [12]:
import pandas as pd
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from functools import reduce

In [24]:
# Load only the first 100,000 rows of the converted file and preview the top five.
checkins = pd.read_csv("checkins.csv", nrows=100000)
print(checkins.head())

       id  user_id  venue_id   latitude   longitude           created_at
0  984222    15824      5222  38.895112  -77.036366  2012-04-21 17:43:47
1  984234    44652      5222  33.800745  -84.410520  2012-04-21 17:43:43
2  984291   105054      5222  45.523452 -122.676207  2012-04-21 17:39:22
3  984318  2146539      5222  40.764462 -111.904565  2012-04-21 17:35:46
4  984232    93870    380645  33.448377 -112.074037  2012-04-21 17:38:18


In [27]:
# Select the coordinate columns before converting to an array. Calling .values
# on the whole frame mixes numeric and string columns and yields dtype=object;
# slicing the two coordinate columns first gives a numeric (n_samples, 2) array,
# assuming both columns parse as floats (as they do in the preview above).
X = checkins.iloc[:, [lat_index, lon_index]].values

# Cluster the check-in coordinates with a fixed bandwidth of 0.1.
ms = MeanShift(bandwidth=0.1)
ms.fit(X)

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [74]:
labels = ms.labels_
cluster_centers = ms.cluster_centers_
clusters_num = len(cluster_centers)
print("Number of clusters: %d" % clusters_num)

# Dictionary of clusters: the points of X grouped by their cluster label.
cluster_dict = {}
for point, label in zip(X, labels):
    cluster_dict.setdefault(label, []).append(point)

# Squared distances from each sufficiently large cluster center to each office,
# stored as (cluster label, city, squared distance) tuples.
distances = []
for label, members in cluster_dict.items():
    # Only clusters with at least 16 points are considered.
    if len(members) >= 16:
        cluster_center = cluster_centers[label]
        for city, office_center in offices.items():
            diff = office_center - cluster_center
            distances.append((label, city, np.dot(diff, diff)))

# Order the pairs by increasing distance and show the five closest.
distances = sorted(distances, key=lambda entry: entry[2])
print(distances[:5])

# The answer is the center of the cluster that lies closest to any office.
closest_cluster_label = distances[0][0]
closest_cluster_center = cluster_centers[closest_cluster_label]
answer = "{} {}".format(*closest_cluster_center[:2])
write_answer("a1.txt", answer)

Number of clusters: 3230
[(413, 'Sydney', 6.1383435474385181e-05), (373, 'Amsterdam', 8.7484523675144147e-05), (405, 'Miami', 0.00051411327615484234), (58, 'London', 0.0025058328805651507), (51, 'Los Angeles', 0.0050194011900760883)]
