In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Metadata

NODES = 196591
EDGES = 950327
FULL_EDGES = 2097245
CHECKINS = 6442892

In [113]:
## Load Check-ins and permute them

ci_dataset = np.load('../Dataset/ci_ids.npy') # [usr, place]
p = np.random.permutation(CHECKINS)
ci_dataset = ci_dataset[p, :].astype(np.int32)

(6442892, 2)
(6442892, 2)


# Calculate accuracy with no clusterization (baseline)

In [34]:
## Count users

u_usr = np.unique(ci_dataset[:, 0]).astype(np.int32)
n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [96]:
## Divide into train and test data

test_size = n_usr // 10 # test dataset is 5 times smaller than the full dataset
print('Number of train users: %i' % (n_usr - test_size))
print('Number of test users: %i' % test_size)

# permute users
p = np.random.permutation(n_usr)
u_usr_rand = u_usr[p]

# divide users
test_usrs = u_usr_rand[:test_size]
train_usrs = u_usr_rand[test_size:]

# mark arguments of test and train checkins in ci_dataset
test_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], test_usrs)).flatten()
train_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], train_usrs)).flatten()
assert test_checkins_args.size + train_checkins_args.size == CHECKINS
print('Number of train check-ins: %i' % train_checkins_args.size)
print('Number of test check-ins: %i' % test_checkins_args.size)

# get checkins from ci_dataset
test_checkins = ci_dataset[test_checkins_args, 1].flatten()
train_checkins = ci_dataset[train_checkins_args, 1].flatten()

Number of train users: 96383
Number of test users: 10709
Number of train check-ins: 5772068
Number of test check-ins: 670824


In [97]:
## Count checkins

uniq_train, cnt_train = np.unique(train_checkins, return_counts=True)
uniq_test, cnt_test = np.unique(test_checkins, return_counts=True)

# concatenate unique elements and their counts
train_data = np.concatenate((uniq_train.reshape(-1, 1),
                             cnt_train.reshape(-1, 1)),
                             axis=1).astype(np.int32)
test_data = np.concatenate((uniq_test.reshape(-1, 1),
                            cnt_test.reshape(-1, 1)),
                            axis=1).astype(np.int32)

In [98]:
## Sort with respect to counts and then flip to make ratings

train_data_srt = train_data[train_data[:, 1].argsort()]
train_data_srt = np.flip(train_data_srt, axis=0)

# visualization
train_dataframe = pd.DataFrame(data=train_data_srt[:10],
                               index=np.arange(1, 11),
                               columns=['Place ID', 'Count'])
print('Train Ratings')
print(train_dataframe)

Train Ratings
    Place ID  Count
1      55033   5256
2      19542   5070
3       9410   4220
4      10259   3636
5      58725   3156
6      23256   3106
7      14470   3088
8       9246   3027
9      10190   3021
10      9241   2994


In [102]:
## Calculate the accuracy metric: hits / (k * test_size) where k = 10

# get top 10 locations from train data
train_top10_loc = train_data_srt[:10, 0]

# find counts in the test dataset that correspond to those locations
coinc = np.argwhere(np.in1d(test_data[:, 0].flatten(), train_top10_loc)).flatten()
sum_test_counts = np.sum(test_data[coinc][:, 1])
acc = sum_test_counts / (10 * test_size)
print('Baseline accuracy: %.3f' % acc)

Baseline accuracy: 0.039


# Calculate accuracy with clusterization

In [89]:
## Load array of exemplars

exemp = np.load('./exemplars.npy')
exemp_unique = np.unique(exemp) # unique exemplars
print(exemp_unique.shape)

(23799,)


In [90]:
## Make a dictionary of clusters

clust_dict = dict()
for exemplar in exemp_unique:
    clust_dict[exemplar] = np.argwhere(exemp == exemplar).flatten()

for exemplar in exemp_unique:
    print(clust_dict[exemplar].shape)

(49322,)
(47,)
(96,)
(218,)
(10,)
(32,)
(4,)
(8,)
(8,)
(18,)
(3,)
(3,)
(18,)
(33,)
(9,)
(6,)
(21,)
(7,)
(18,)
(27,)
(3,)
(11,)
(13,)
(5,)
(2,)
(3,)
(16,)
(11,)
(11,)
(13,)
(16,)
(34,)
(8,)
(3,)
(3,)
(3,)
(5,)
(5,)
(34,)
(14,)
(20,)
(7,)
(18,)
(9,)
(4,)
(4,)
(9,)
(6,)
(5,)
(22,)
(8,)
(11,)
(16,)
(6,)
(3,)
(16,)
(5,)
(17,)
(19,)
(12,)
(3,)
(5,)
(22,)
(7,)
(25,)
(5,)
(13,)
(8,)
(8,)
(8,)
(9,)
(4,)
(3,)
(4,)
(7,)
(4,)
(46,)
(5,)
(9,)
(28,)
(9,)
(14,)
(7,)
(6,)
(9,)
(6,)
(3,)
(17,)
(83,)
(4,)
(13,)
(32,)
(5,)
(5,)
(6,)
(13,)
(4,)
(15,)
(16,)
(225,)
(23,)
(11,)
(4,)
(10,)
(3,)
(4,)
(19,)
(9,)
(4571,)
(5,)
(9,)
(6,)
(6,)
(6,)
(9,)
(14,)
(3,)
(9,)
(7,)
(13,)
(3,)
(4,)
(2,)
(2,)
(6,)
(29,)
(5,)
(4,)
(10,)
(11,)
(6,)
(27,)
(9,)
(5,)
(35,)
(37,)
(5,)
(7,)
(8,)
(15,)
(1451,)
(4,)
(18,)
(4,)
(7,)
(4,)
(10,)
(11,)
(6,)
(25,)
(6,)
(11,)
(19,)
(4,)
(20,)
(7,)
(10,)
(5,)
(6,)
(11,)
(14,)
(9,)
(8,)
(10,)
(3,)
(4,)
(4,)
(14,)
(24,)
(9,)
(15,)
(4,)
(7,)
(7,)
(5,)
(6,)
(8,)
(16,)
(21,)
(7,)
(13,)
(13,)
(15

(8,)
(4,)
(4,)
(8,)
(3,)
(3,)
(4,)
(15,)
(13,)
(4,)
(7,)
(4,)
(10,)
(8,)
(8,)
(7,)
(5,)
(5,)
(9,)
(7,)
(4,)
(13,)
(9,)
(3,)
(6,)
(6,)
(8,)
(5,)
(9,)
(5,)
(3,)
(4,)
(17,)
(6,)
(11,)
(15,)
(4,)
(7,)
(4,)
(3,)
(3,)
(4,)
(7,)
(3,)
(7,)
(7,)
(2,)
(3,)
(3,)
(8,)
(3,)
(6,)
(4,)
(7,)
(11,)
(4,)
(3,)
(3,)
(15,)
(3,)
(3,)
(6,)
(36,)
(16,)
(4,)
(4,)
(5,)
(13,)
(18,)
(6,)
(3,)
(3,)
(7,)
(16,)
(5,)
(3,)
(4,)
(3,)
(5,)
(3,)
(4,)
(10,)
(8,)
(7,)
(6,)
(4,)
(7,)
(4,)
(3,)
(18,)
(5,)
(3,)
(6,)
(9,)
(22,)
(3,)
(5,)
(10,)
(3,)
(4,)
(5,)
(6,)
(4,)
(14,)
(14,)
(7,)
(4,)
(5,)
(8,)
(9,)
(4,)
(6,)
(3,)
(7,)
(3,)
(3,)
(3,)
(3,)
(6,)
(3,)
(13,)
(11,)
(9,)
(22,)
(3,)
(15,)
(9,)
(5,)
(3,)
(3,)
(6,)
(5,)
(5,)
(3,)
(5,)
(2,)
(10,)
(4,)
(5,)
(5,)
(22,)
(8,)
(3,)
(7,)
(3,)
(19,)
(5,)
(3,)
(4,)
(4,)
(4,)
(6,)
(26,)
(3,)
(5,)
(4,)
(6,)
(8,)
(14,)
(5,)
(5,)
(6,)
(5,)
(6,)
(11,)
(12,)
(11,)
(5,)
(6,)
(3,)
(3,)
(3,)
(5,)
(4,)
(4,)
(8,)
(12,)
(11,)
(3,)
(6,)
(8,)
(4,)
(5,)
(4,)
(5,)
(5,)
(4,)
(3,)
(8,)
(7,)
(10,)
(6,)
(12,)

(5,)
(5,)
(10,)
(4,)
(3,)
(3,)
(3,)
(5,)
(3,)
(3,)
(4,)
(34,)
(8,)
(5,)
(5,)
(6,)
(3,)
(3,)
(3,)
(4,)
(5,)
(7,)
(3,)
(5,)
(5,)
(5,)
(4,)
(9,)
(5,)
(9,)
(7,)
(12,)
(5,)
(6,)
(4,)
(4,)
(3,)
(4,)
(6,)
(4,)
(6,)
(4,)
(5,)
(8,)
(5,)
(12,)
(7,)
(6,)
(5,)
(4,)
(7,)
(3,)
(12,)
(3,)
(3,)
(6,)
(6,)
(4,)
(4,)
(3,)
(6,)
(7,)
(7,)
(5,)
(4,)
(4,)
(4,)
(3,)
(6,)
(5,)
(6,)
(4,)
(8,)
(6,)
(8,)
(5,)
(4,)
(3,)
(3,)
(4,)
(5,)
(6,)
(4,)
(9,)
(3,)
(10,)
(7,)
(4,)
(3,)
(5,)
(5,)
(13,)
(5,)
(8,)
(4,)
(11,)
(4,)
(5,)
(4,)
(5,)
(6,)
(32,)
(6,)
(10,)
(8,)
(4,)
(9,)
(15,)
(8,)
(15,)
(16,)
(3,)
(3,)
(8,)
(4,)
(16,)
(18,)
(75,)
(10,)
(12,)
(4,)
(12,)
(11,)
(6,)
(11,)
(21,)
(3,)
(3,)
(14,)
(5,)
(11,)
(5,)
(3,)
(4,)
(3,)
(10,)
(11,)
(3,)
(6,)
(14,)
(5,)
(5,)
(3,)
(7,)
(6,)
(4,)
(5,)
(3,)
(4,)
(8,)
(4,)
(6,)
(3,)
(4,)
(3,)
(23,)
(8,)
(5,)
(8,)
(12,)
(3,)
(3,)
(3,)
(4,)
(15,)
(3,)
(5,)
(10,)
(4,)
(6,)
(4,)
(4,)
(9,)
(6,)
(4,)
(7,)
(14,)
(8,)
(4,)
(6,)
(3,)
(4,)
(16,)
(8,)
(4,)
(11,)
(9,)
(7,)
(10,)
(10,)
(3,)
(16,)
(11

(8,)
(8,)
(6,)
(15,)
(6,)
(6,)
(9,)
(10,)
(24,)
(7,)
(2,)
(6,)
(3,)
(4,)
(4,)
(11,)
(4,)
(10,)
(7,)
(4,)
(3,)
(8,)
(8,)
(4,)
(7,)
(3,)
(8,)
(6,)
(4,)
(3,)
(5,)
(4,)
(4,)
(7,)
(7,)
(3,)
(6,)
(6,)
(4,)
(3,)
(4,)
(3,)
(14,)
(8,)
(3,)
(3,)
(3,)
(5,)
(3,)
(3,)
(4,)
(9,)
(3,)
(5,)
(3,)
(5,)
(5,)
(6,)
(6,)
(3,)
(4,)
(8,)
(4,)
(3,)
(9,)
(3,)
(5,)
(2,)
(5,)
(7,)
(5,)
(8,)
(3,)
(4,)
(3,)
(7,)
(3,)
(5,)
(3,)
(8,)
(3,)
(3,)
(7,)
(11,)
(5,)
(4,)
(4,)
(5,)
(5,)
(6,)
(2,)
(3,)
(3,)
(5,)
(7,)
(5,)
(2,)
(6,)
(4,)
(4,)
(4,)
(4,)
(17,)
(10,)
(3,)
(4,)
(6,)
(3,)
(4,)
(3,)
(6,)
(6,)
(4,)
(9,)
(3,)
(7,)
(4,)
(3,)
(3,)
(3,)
(4,)
(3,)
(5,)
(4,)
(3,)
(3,)
(3,)
(3,)
(5,)
(3,)
(4,)
(9,)
(4,)
(6,)
(3,)
(2,)
(3,)
(2,)
(2,)
(5,)
(3,)
(8,)
(6,)
(9,)
(5,)
(10,)
(6,)
(3,)
(4,)
(3,)
(3,)
(4,)
(4,)
(3,)
(5,)
(3,)
(4,)
(3,)
(7,)
(6,)
(3,)
(5,)
(8,)
(23,)
(4,)
(4,)
(4,)
(3,)
(4,)
(6,)
(10,)
(4,)
(5,)
(8,)
(4,)
(3,)
(3,)
(3,)
(3,)
(7,)
(4,)
(7,)
(4,)
(5,)
(4,)
(6,)
(3,)
(4,)
(6,)
(4,)
(13,)
(4,)
(8,)
(9,)
(6,)
(3,)
(4,)
(4

(14,)
(5,)
(9,)
(27,)
(5,)
(3,)
(3,)
(16,)
(3,)
(4,)
(4,)
(3,)
(3,)
(6,)
(10,)
(4,)
(8,)
(4,)
(4,)
(7,)
(4,)
(9,)
(12,)
(9,)
(5,)
(12,)
(2,)
(4,)
(3,)
(4,)
(6,)
(9,)
(6,)
(5,)
(10,)
(21,)
(4,)
(3,)
(3,)
(5,)
(8,)
(4,)
(6,)
(10,)
(3,)
(10,)
(6,)
(5,)
(3,)
(3,)
(2,)
(4,)
(10,)
(8,)
(11,)
(7,)
(7,)
(8,)
(10,)
(8,)
(5,)
(29,)
(6,)
(3,)
(6,)
(2,)
(18,)
(7,)
(3,)
(4,)
(29,)
(8,)
(6,)
(26,)
(6,)
(8,)
(6,)
(5,)
(7,)
(3,)
(3,)
(4,)
(12,)
(6,)
(5,)
(3,)
(4,)
(4,)
(5,)
(5,)
(6,)
(4,)
(13,)
(14,)
(4,)
(3,)
(7,)
(14,)
(2,)
(4,)
(9,)
(13,)
(5,)
(4,)
(6,)
(12,)
(5,)
(6,)
(7,)
(8,)
(3,)
(5,)
(5,)
(3,)
(3,)
(5,)
(3,)
(7,)
(8,)
(4,)
(4,)
(4,)
(4,)
(13,)
(3,)
(14,)
(9,)
(14,)
(6,)
(9,)
(6,)
(3,)
(8,)
(5,)
(3,)
(4,)
(4,)
(3,)
(3,)
(4,)
(4,)
(3,)
(5,)
(3,)
(6,)
(5,)
(2,)
(8,)
(5,)
(3,)
(2,)
(2,)
(4,)
(5,)
(2,)
(3,)
(4,)
(4,)
(2,)
(6,)
(4,)
(8,)
(5,)
(6,)
(5,)
(5,)
(3,)
(6,)
(6,)
(2,)
(2,)
(2,)
(2,)
(4,)
(3,)
(2,)
(3,)
(3,)
(2,)
(3,)
(2,)
(4,)
(1,)
(2,)
(4,)
(5,)
(5,)
(4,)
(17,)
(2,)
(3,)
(6,)
(3,)
(4,)
(4,

(5,)
(2,)
(2,)
(3,)
(7,)
(4,)
(4,)
(3,)
(3,)
(3,)
(3,)
(3,)
(2,)
(2,)
(3,)
(4,)
(3,)
(3,)
(6,)
(3,)
(3,)
(2,)
(5,)
(3,)
(7,)
(4,)
(4,)
(5,)
(4,)
(4,)
(4,)
(3,)
(9,)
(3,)
(4,)
(2,)
(4,)
(7,)
(2,)
(11,)
(2,)
(3,)
(5,)
(4,)
(2,)
(2,)
(5,)
(3,)
(2,)
(5,)
(3,)
(11,)
(5,)
(3,)
(2,)
(3,)
(2,)
(2,)
(2,)
(4,)
(5,)
(2,)
(4,)
(2,)
(2,)
(3,)
(3,)
(4,)
(4,)
(7,)
(4,)
(3,)
(4,)
(2,)
(4,)
(3,)
(3,)
(2,)
(7,)
(4,)
(3,)
(5,)
(4,)
(4,)
(4,)
(3,)
(9,)
(3,)
(6,)
(4,)
(1,)
(2,)
(3,)
(3,)
(4,)
(3,)
(5,)
(3,)
(3,)
(3,)
(13,)
(5,)
(2,)
(3,)
(5,)
(4,)
(2,)
(5,)
(3,)
(3,)
(5,)
(6,)
(3,)
(2,)
(4,)
(3,)
(4,)
(4,)
(9,)
(3,)
(6,)
(3,)
(2,)
(5,)
(4,)
(4,)
(5,)
(2,)
(3,)
(2,)
(3,)
(2,)
(11,)
(4,)
(3,)
(4,)
(3,)
(11,)
(3,)
(2,)
(2,)
(5,)
(3,)
(4,)
(4,)
(3,)
(3,)
(3,)
(2,)
(3,)
(2,)
(3,)
(3,)
(3,)
(7,)
(3,)
(7,)
(2,)
(3,)
(5,)
(2,)
(3,)
(2,)
(4,)
(6,)
(4,)
(5,)
(4,)
(3,)
(4,)
(1,)
(3,)
(4,)
(4,)
(5,)
(4,)
(5,)
(3,)
(3,)
(2,)
(5,)
(6,)
(8,)
(5,)
(4,)
(4,)
(3,)
(3,)
(3,)
(3,)
(2,)
(5,)
(3,)
(7,)
(6,)
(2,)
(3,)
(2,)
(1,)


(3,)
(4,)
(2,)
(3,)
(6,)
(5,)
(4,)
(5,)
(2,)
(5,)
(3,)
(3,)
(5,)
(4,)
(7,)
(3,)
(5,)
(4,)
(2,)
(3,)
(7,)
(5,)
(3,)
(2,)
(3,)
(2,)
(7,)
(2,)
(2,)
(1,)
(5,)
(7,)
(3,)
(6,)
(3,)
(4,)
(11,)
(2,)
(6,)
(9,)
(3,)
(5,)
(6,)
(5,)
(5,)
(4,)
(4,)
(6,)
(9,)
(3,)
(3,)
(4,)
(2,)
(2,)
(5,)
(3,)
(2,)
(6,)
(6,)
(5,)
(3,)
(3,)
(3,)
(4,)
(3,)
(2,)
(2,)
(8,)
(3,)
(5,)
(4,)
(3,)
(4,)
(2,)
(2,)
(2,)
(5,)
(2,)
(3,)
(1,)
(5,)
(3,)
(5,)
(3,)
(2,)
(5,)
(3,)
(3,)
(2,)
(8,)
(4,)
(4,)
(4,)
(5,)
(4,)
(6,)
(3,)
(4,)
(1,)
(4,)
(2,)
(6,)
(5,)
(5,)
(4,)
(4,)
(9,)
(3,)
(6,)
(12,)
(3,)
(3,)
(3,)
(2,)
(10,)
(3,)
(4,)
(4,)
(5,)
(2,)
(3,)
(6,)
(2,)
(2,)
(8,)
(2,)
(4,)
(2,)
(3,)
(5,)
(7,)
(4,)
(3,)
(3,)
(5,)
(8,)
(10,)
(7,)
(2,)
(3,)
(4,)
(2,)
(6,)
(7,)
(6,)
(6,)
(1,)
(3,)
(7,)
(2,)
(3,)
(3,)
(3,)
(4,)
(4,)
(8,)
(5,)
(3,)
(7,)
(3,)
(2,)
(3,)
(4,)
(3,)
(4,)
(2,)
(9,)
(2,)
(6,)
(3,)
(4,)
(2,)
(9,)
(5,)
(8,)
(2,)
(3,)
(2,)
(4,)
(3,)
(3,)
(2,)
(3,)
(7,)
(4,)
(5,)
(6,)
(3,)
(2,)
(4,)
(3,)
(1,)
(8,)
(2,)
(2,)
(2,)
(5,)
(5,)
(10,)


In [87]:
## Make top 10 for each cluster (among the users in the train selection)

clust_top10 = dict()
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the train selection
    train_clust_args = np.argwhere(np.in1d(train_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    train_clust_checkins = ci_dataset[train_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_train, cnt_clust_train = np.unique(train_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    train_clust_data = np.concatenate((uniq_clust_train.reshape(-1, 1),
                                       cnt_clust_train.reshape(-1, 1)),
                                       axis=1).astype(np.int32)
    
    # sort & flip
    train_clust_data_srt = train_clust_data[train_clust_data[:, 1].argsort()]
    train_clust_data_srt = np.flip(train_clust_data_srt, axis=0)
    
    # store the top 10 locations
    clust_top10[exemplar] = train_clust_data_srt[:10, 0]

In [88]:
## Calculate the accuracy metric for the clusters

total_sum = 0
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the test selection
    test_clust_args = np.argwhere(np.in1d(test_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    test_clust_checkins = ci_dataset[test_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_test, cnt_clust_test = np.unique(test_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    test_clust_data = np.concatenate((uniq_clust_test.reshape(-1, 1),
                                      cnt_clust_test.reshape(-1, 1)),
                                      axis=1).astype(np.int32)
    
    # get coincidences with the top 10 locations for the cluster (arguments of locations)
    clust_coinc = np.argwhere(np.in1d(test_clust_data[:, 0].flatten(), clust_top10[exemplar])).flatten()
    
    # count the coincidences 
    clust_sum_test_counts = np.sum(test_clust_data[clust_coinc][:, 1])
    total_sum += clust_sum_test_counts
    
acc_clust = total_sum / (10 * test_size)
print('Clusterized accuracy: %f' % acc_clust)

Clusterized accuracy: 0.000182
