In [94]:
import zipfile
import csv
import pandas as pd
from sklearn.utils import resample
from sklearn.cluster import MeanShift
import numpy as np
from scipy.spatial.distance import cdist

In [3]:
with zipfile.ZipFile('fsq.zip') as z:
    z.printdir()
    file_path = z.extract('umn_foursquare_datasets/checkins.dat')   

File Name                                             Modified             Size
umn_foursquare_datasets/                       2013-09-30 13:05:42            0
umn_foursquare_datasets/.DS_Store              2013-09-30 13:05:52         6148
__MACOSX/                                      2013-09-30 13:06:22            0
__MACOSX/umn_foursquare_datasets/              2013-09-30 13:06:22            0
__MACOSX/umn_foursquare_datasets/._.DS_Store   2013-09-30 13:05:52           82
umn_foursquare_datasets/checkins.dat           2012-07-15 22:56:50     94021074
umn_foursquare_datasets/ratings.dat            2012-10-06 02:25:54     81477895
__MACOSX/umn_foursquare_datasets/._ratings.dat 2012-10-06 02:25:54          167
umn_foursquare_datasets/socialgraph.dat        2012-07-15 22:21:30    867151370
umn_foursquare_datasets/users.dat              2012-07-10 01:06:06    111980510
umn_foursquare_datasets/venues.dat             2012-07-10 15:05:40     65156262


In [4]:
with open(file_path) as file:
    f = [i.strip().split('|') for i in file.readlines()]

In [7]:
f.pop(1)

['---------+---------+----------+-------------------+-------------------+---------------------']

In [9]:
file_cleaned = [[value.strip() for value in row] for row in f]

In [21]:
file_cleaned = file_cleaned[:-2]

In [22]:
with open('data.csv', 'w') as data:
    writer = csv.writer(data)
    writer.writerows(file_cleaned)

In [23]:
df = pd.read_csv('data.csv')

In [26]:
df.dropna(axis=0, subset=['latitude', 'longitude'], inplace=True)

In [27]:
df.shape

(396634, 6)

In [29]:
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
3,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
7,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
9,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
10,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [37]:
df_subsample = resample(df, replace=False, n_samples=100000, random_state=42)

In [41]:
df_subsample = df_subsample[['latitude', 'longitude']]

In [43]:
mean_shift = MeanShift(bandwidth=0.1, cluster_all=False, n_jobs=-1)
mean_shift.fit(df_subsample)

MeanShift(bandwidth=0.1, cluster_all=False, n_jobs=-1)

In [80]:
clusters_more15 = pd.value_counts(pd.Series(mean_shift.labels_))[pd.value_counts(pd.Series(mean_shift.labels_)) > 15].index.to_list()

In [81]:
centers = mean_shift.cluster_centers_

In [86]:
centers_more15 = centers[clusters_more15]

In [87]:
centers_str = [[str(i), str(j)] for i, j in centers_more15]

In [89]:
with open('centers.txt', 'w') as center_file: 
    for string in centers_str: 
        center_file.write(' '.join(string))
        center_file.write('\n')

In [93]:
office_coords = np.array([[33.751277, -118.188740],
[25.867736, -80.324116],
[51.503016, -0.075479],
[52.378894, 4.885084],
[39.366487, 117.036146],
[-33.868457, 151.205134]])

In [100]:
distances = cdist(centers_more15, office_coords)
distances.shape

(585, 6)

In [111]:
row, col = [distances.argmin() // distances.shape[1], distances.argmin() % distances.shape[1]]

In [113]:
distances[row, col]

0.0042154949796996384

In [114]:
centers_more15[row]

array([-33.86449533, 151.20369331])