In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Metadata

NODES = 196591
EDGES = 950327
FULL_EDGES = 2097245
CHECKINS = 6442892

In [3]:
## Load Check-ins and permute them

ci_dataset = np.load('../Dataset/ci_ids.npy') # [usr, place]
p = np.random.permutation(CHECKINS)
ci_dataset = ci_dataset[p, :].astype(np.int32)

# Calculate accuracy with no clusterization (baseline)

In [4]:
## Count users

u_usr = np.unique(ci_dataset[:, 0]).astype(np.int32)
n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [5]:
## Divide into train and test data

test_size = n_usr // 10 # test dataset is 5 times smaller than the full dataset
print('Number of train users: %i' % (n_usr - test_size))
print('Number of test users: %i' % test_size)

# permute users
p = np.random.permutation(n_usr)
u_usr_rand = u_usr[p]

# divide users
test_usrs = u_usr_rand[:test_size]
train_usrs = u_usr_rand[test_size:]

# mark arguments of test and train checkins in ci_dataset
test_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], test_usrs)).flatten()
train_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], train_usrs)).flatten()
assert test_checkins_args.size + train_checkins_args.size == CHECKINS
print('Number of train check-ins: %i' % train_checkins_args.size)
print('Number of test check-ins: %i' % test_checkins_args.size)

# get checkins from ci_dataset
test_checkins = ci_dataset[test_checkins_args, 1].flatten()
train_checkins = ci_dataset[train_checkins_args, 1].flatten()

Number of train users: 96383
Number of test users: 10709
Number of train check-ins: 5813321
Number of test check-ins: 629571


In [6]:
## Count checkins

uniq_train, cnt_train = np.unique(train_checkins, return_counts=True)
uniq_test, cnt_test = np.unique(test_checkins, return_counts=True)

# concatenate unique elements and their counts
train_data = np.concatenate((uniq_train.reshape(-1, 1),
                             cnt_train.reshape(-1, 1)),
                             axis=1).astype(np.int32)
test_data = np.concatenate((uniq_test.reshape(-1, 1),
                            cnt_test.reshape(-1, 1)),
                            axis=1).astype(np.int32)

In [7]:
## Sort with respect to counts and then flip to make ratings

train_data_srt = train_data[train_data[:, 1].argsort()]
train_data_srt = np.flip(train_data_srt, axis=0)

# visualization
train_dataframe = pd.DataFrame(data=train_data_srt[:10],
                               index=np.arange(1, 11),
                               columns=['Place ID', 'Count'])
print('Train Ratings')
print(train_dataframe)

Train Ratings
    Place ID  Count
1      55033   5297
2      19542   5072
3       9410   4175
4      10259   3672
5      58725   3118
6      23256   3090
7       9246   3088
8      14470   3048
9      10190   3041
10      9241   2916


In [8]:
## Calculate the accuracy metric: hits / (k * test_size) where k = 10

# get top 10 locations from train data
train_top10_loc = train_data_srt[:10, 0]

# find counts in the test dataset that correspond to those locations
coinc = np.argwhere(np.in1d(test_data[:, 0].flatten(), train_top10_loc)).flatten()
sum_test_counts = np.sum(test_data[coinc][:, 1])
acc = sum_test_counts / (10 * test_size)
print('Baseline accuracy: %.3f' % acc)

Baseline accuracy: 0.039


# Calculate accuracy with clusterization

In [9]:
## Load array of exemplars

exemp = np.load('./exemplars.npy')
exemp_unique = np.unique(exemp) # unique exemplars
print(exemp_unique.shape)

(9296,)


In [10]:
## Make a dictionary of clusters

clust_dict = dict()
for exemplar in exemp_unique:
    clust_dict[exemplar] = np.argwhere(exemp == exemplar).flatten()

for exemplar in exemp_unique:
    print(clust_dict[exemplar].shape)

(133709,)
(39,)
(58,)
(184,)
(6,)
(13,)
(11,)
(3,)
(11,)
(22,)
(5,)
(4,)
(11,)
(14,)
(15,)
(7,)
(3,)
(9,)
(7,)
(3,)
(8,)
(22,)
(5,)
(3,)
(21,)
(8,)
(8,)
(11,)
(7,)
(6,)
(19,)
(14,)
(6,)
(15,)
(11,)
(11,)
(10,)
(4,)
(12,)
(9,)
(6,)
(7,)
(6,)
(4,)
(3,)
(19,)
(23,)
(6,)
(8,)
(4,)
(8,)
(3,)
(44,)
(8,)
(25,)
(5,)
(5,)
(7,)
(9,)
(174,)
(10,)
(8,)
(9,)
(13,)
(3633,)
(5,)
(7,)
(10,)
(5,)
(7,)
(10,)
(15,)
(5,)
(7,)
(20,)
(30,)
(20,)
(15,)
(1625,)
(11,)
(6,)
(5,)
(11,)
(11,)
(5,)
(8,)
(9,)
(6,)
(3,)
(8,)
(15,)
(3,)
(4,)
(8,)
(12,)
(11,)
(9,)
(10,)
(8,)
(6,)
(7,)
(12,)
(14,)
(26,)
(7,)
(8,)
(4,)
(5,)
(21,)
(6,)
(1024,)
(15,)
(47,)
(29,)
(13,)
(9,)
(37,)
(186,)
(13,)
(4,)
(11,)
(8,)
(3,)
(9,)
(7,)
(9,)
(10,)
(809,)
(4,)
(6,)
(26,)
(4,)
(8,)
(5,)
(4,)
(9,)
(11,)
(232,)
(4,)
(7,)
(8,)
(4,)
(5,)
(16,)
(4,)
(4,)
(10,)
(14,)
(8,)
(22,)
(8,)
(20,)
(13,)
(28,)
(40,)
(20,)
(6,)
(5,)
(22,)
(19,)
(14,)
(10,)
(79,)
(33,)
(7,)
(144,)
(8,)
(50,)
(19,)
(6,)
(9,)
(10,)
(7,)
(6,)
(5,)
(8,)
(5,)
(12,)
(4,)
(9,)
(6

(5,)
(3,)
(4,)
(5,)
(6,)
(5,)
(5,)
(7,)
(3,)
(3,)
(4,)
(4,)
(6,)
(9,)
(4,)
(4,)
(7,)
(3,)
(3,)
(6,)
(7,)
(3,)
(8,)
(4,)
(4,)
(4,)
(9,)
(6,)
(3,)
(5,)
(13,)
(4,)
(3,)
(3,)
(8,)
(12,)
(4,)
(3,)
(10,)
(6,)
(45,)
(4,)
(3,)
(8,)
(3,)
(6,)
(3,)
(4,)
(4,)
(5,)
(5,)
(7,)
(6,)
(4,)
(3,)
(3,)
(6,)
(4,)
(3,)
(4,)
(7,)
(3,)
(9,)
(4,)
(11,)
(4,)
(8,)
(7,)
(12,)
(3,)
(3,)
(3,)
(7,)
(7,)
(3,)
(7,)
(3,)
(4,)
(11,)
(3,)
(7,)
(3,)
(6,)
(7,)
(6,)
(4,)
(5,)
(5,)
(4,)
(8,)
(7,)
(3,)
(4,)
(5,)
(3,)
(4,)
(10,)
(3,)
(6,)
(3,)
(3,)
(3,)
(3,)
(4,)
(5,)
(5,)
(3,)
(16,)
(3,)
(5,)
(7,)
(3,)
(3,)
(3,)
(3,)
(10,)
(6,)
(3,)
(3,)
(7,)
(3,)
(4,)
(5,)
(5,)
(9,)
(11,)
(7,)
(4,)
(6,)
(7,)
(9,)
(6,)
(9,)
(4,)
(4,)
(5,)
(3,)
(3,)
(3,)
(4,)
(3,)
(3,)
(5,)
(5,)
(4,)
(7,)
(9,)
(6,)
(5,)
(5,)
(8,)
(3,)
(30,)
(5,)
(6,)
(6,)
(6,)
(8,)
(3,)
(6,)
(12,)
(8,)
(41,)
(7,)
(8,)
(4,)
(3,)
(7,)
(7,)
(3,)
(8,)
(9,)
(4,)
(3,)
(4,)
(8,)
(11,)
(4,)
(3,)
(4,)
(5,)
(4,)
(6,)
(4,)
(19,)
(3,)
(10,)
(3,)
(6,)
(5,)
(3,)
(3,)
(5,)
(4,)
(8,)
(3,)
(12

(3,)
(3,)
(6,)
(3,)
(3,)
(3,)
(4,)
(3,)
(4,)
(4,)
(5,)
(7,)
(2,)
(2,)
(5,)
(3,)
(5,)
(10,)
(4,)
(2,)
(5,)
(6,)
(2,)
(6,)
(12,)
(13,)
(3,)
(6,)
(7,)
(5,)
(3,)
(4,)
(3,)
(9,)
(4,)
(5,)
(3,)
(4,)
(5,)
(3,)
(4,)
(5,)
(2,)
(7,)
(2,)
(5,)
(11,)
(6,)
(8,)
(9,)
(4,)
(5,)
(9,)
(9,)
(4,)
(2,)
(7,)
(3,)
(4,)
(3,)
(13,)
(4,)
(3,)
(7,)
(5,)
(12,)
(3,)
(8,)
(7,)
(3,)
(4,)
(3,)
(4,)
(5,)
(9,)
(10,)
(6,)
(9,)
(11,)
(7,)
(6,)
(4,)
(3,)
(3,)
(7,)
(2,)
(4,)
(3,)
(4,)
(5,)
(4,)
(4,)
(2,)
(4,)
(3,)
(4,)
(3,)
(3,)
(3,)
(6,)
(8,)
(3,)
(6,)
(5,)
(3,)
(4,)
(2,)
(5,)
(15,)
(3,)
(2,)
(3,)
(6,)
(4,)
(5,)
(4,)
(4,)
(5,)
(4,)
(3,)
(5,)
(5,)
(4,)
(10,)
(3,)
(7,)
(3,)
(4,)
(7,)
(4,)
(6,)
(3,)
(2,)
(2,)
(5,)
(8,)
(9,)
(8,)
(4,)
(2,)
(7,)
(4,)
(6,)
(3,)
(11,)
(10,)
(3,)
(4,)
(8,)
(3,)
(6,)
(4,)
(5,)
(7,)
(11,)
(3,)
(3,)
(4,)
(3,)
(7,)
(6,)
(6,)
(4,)
(3,)
(3,)
(4,)
(4,)
(3,)
(5,)
(6,)
(5,)
(17,)
(21,)
(3,)
(3,)
(3,)
(2,)
(4,)
(9,)
(3,)
(10,)
(3,)
(3,)
(22,)
(20,)
(5,)
(3,)
(7,)
(3,)
(4,)
(3,)
(3,)
(5,)
(10,)
(4,)
(3,)
(

(3,)
(4,)
(4,)
(3,)
(13,)
(4,)
(4,)
(4,)
(4,)
(7,)
(2,)
(2,)
(3,)
(6,)
(2,)
(4,)
(2,)
(2,)
(3,)
(9,)
(5,)
(6,)
(6,)
(5,)
(4,)
(2,)
(4,)
(4,)
(4,)
(5,)
(3,)
(3,)
(3,)
(2,)
(2,)
(3,)
(7,)
(7,)
(2,)
(5,)
(5,)
(3,)
(6,)
(4,)
(5,)
(5,)
(12,)
(5,)
(2,)
(8,)
(5,)
(4,)
(3,)
(2,)
(9,)
(3,)
(2,)
(2,)
(4,)
(8,)
(5,)
(4,)
(3,)
(4,)
(4,)
(9,)
(7,)
(5,)
(2,)
(6,)
(3,)
(2,)
(5,)
(5,)
(3,)
(6,)
(3,)
(3,)
(4,)
(2,)
(2,)
(2,)
(5,)
(5,)
(4,)
(3,)
(2,)
(2,)
(3,)
(6,)
(4,)
(4,)
(4,)
(3,)
(3,)
(7,)
(6,)
(4,)
(2,)
(3,)
(2,)
(2,)
(6,)
(5,)
(5,)
(2,)
(8,)
(10,)
(3,)
(3,)
(4,)
(10,)
(3,)
(4,)
(2,)
(4,)
(6,)
(2,)
(4,)
(5,)
(4,)
(2,)
(4,)
(2,)
(2,)
(2,)
(3,)
(3,)
(10,)
(4,)
(3,)
(5,)
(8,)
(3,)
(6,)
(6,)
(7,)
(4,)
(4,)
(10,)
(4,)
(5,)
(5,)
(2,)
(4,)
(6,)
(4,)
(3,)
(4,)
(5,)
(4,)
(3,)
(9,)
(4,)
(4,)
(5,)
(3,)
(3,)
(2,)
(6,)
(4,)
(3,)
(2,)
(4,)
(4,)
(4,)
(3,)
(5,)
(4,)
(2,)
(7,)
(3,)
(4,)
(3,)
(2,)
(3,)
(2,)
(4,)
(5,)
(3,)
(3,)
(6,)
(2,)
(4,)
(4,)
(4,)
(4,)
(2,)
(5,)
(2,)
(6,)
(5,)
(4,)
(5,)
(3,)
(2,)
(3,)
(4,)
(3,)

In [87]:
## Make top 10 for each cluster (among the users in the train selection)

clust_top10 = dict()
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the train selection
    train_clust_args = np.argwhere(np.in1d(train_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    train_clust_checkins = ci_dataset[train_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_train, cnt_clust_train = np.unique(train_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    train_clust_data = np.concatenate((uniq_clust_train.reshape(-1, 1),
                                       cnt_clust_train.reshape(-1, 1)),
                                       axis=1).astype(np.int32)
    
    # sort & flip
    train_clust_data_srt = train_clust_data[train_clust_data[:, 1].argsort()]
    train_clust_data_srt = np.flip(train_clust_data_srt, axis=0)
    
    # store the top 10 locations
    clust_top10[exemplar] = train_clust_data_srt[:10, 0]

In [88]:
## Calculate the accuracy metric for the clusters

total_sum = 0
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the test selection
    test_clust_args = np.argwhere(np.in1d(test_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    test_clust_checkins = ci_dataset[test_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_test, cnt_clust_test = np.unique(test_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    test_clust_data = np.concatenate((uniq_clust_test.reshape(-1, 1),
                                      cnt_clust_test.reshape(-1, 1)),
                                      axis=1).astype(np.int32)
    
    # get coincidences with the top 10 locations for the cluster (arguments of locations)
    clust_coinc = np.argwhere(np.in1d(test_clust_data[:, 0].flatten(), clust_top10[exemplar])).flatten()
    
    # count the coincidences 
    clust_sum_test_counts = np.sum(test_clust_data[clust_coinc][:, 1])
    total_sum += clust_sum_test_counts
    
acc_clust = total_sum / (10 * test_size)
print('Clusterized accuracy: %f' % acc_clust)

Clusterized accuracy: 0.000182
