In [2]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
## Metadata

NODES = 196591
EDGES = 950327
FULL_EDGES = 2097245
CHECKINS = 6442892

In [4]:
## Load Check-ins and permute them

ci_dataset = np.load('../Dataset/ci_ids.npy') # [usr, place]
p = np.random.permutation(CHECKINS)
ci_dataset = ci_dataset[p, :].astype(np.int32)

# Calculate accuracy with no clusterization (baseline)

In [5]:
## Count users

u_usr = np.unique(ci_dataset[:, 0]).astype(np.int32)
n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [6]:
## Divide into train and test data

test_size = n_usr // 10 # test dataset is 5 times smaller than the full dataset
print('Number of train users: %i' % (n_usr - test_size))
print('Number of test users: %i' % test_size)

# permute users
p = np.random.permutation(n_usr)
u_usr_rand = u_usr[p]

# divide users
test_usrs = u_usr_rand[:test_size]
train_usrs = u_usr_rand[test_size:]

# mark arguments of test and train checkins in ci_dataset
test_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], test_usrs)).flatten()
train_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], train_usrs)).flatten()
assert test_checkins_args.size + train_checkins_args.size == CHECKINS
print('Number of train check-ins: %i' % train_checkins_args.size)
print('Number of test check-ins: %i' % test_checkins_args.size)

# get checkins from ci_dataset
test_checkins = ci_dataset[test_checkins_args, 1].flatten()
train_checkins = ci_dataset[train_checkins_args, 1].flatten()

Number of train users: 96383
Number of test users: 10709
Number of train check-ins: 5800524
Number of test check-ins: 642368


In [7]:
## Count checkins

uniq_train, cnt_train = np.unique(train_checkins, return_counts=True)
uniq_test, cnt_test = np.unique(test_checkins, return_counts=True)

# concatenate unique elements and their counts
train_data = np.concatenate((uniq_train.reshape(-1, 1),
                             cnt_train.reshape(-1, 1)),
                             axis=1).astype(np.int32)
test_data = np.concatenate((uniq_test.reshape(-1, 1),
                            cnt_test.reshape(-1, 1)),
                            axis=1).astype(np.int32)

In [8]:
## Sort with respect to counts and then flip to make ratings

train_data_srt = train_data[train_data[:, 1].argsort()]
train_data_srt = np.flip(train_data_srt, axis=0)

# visualization
train_dataframe = pd.DataFrame(data=train_data_srt[:10],
                               index=np.arange(1, 11),
                               columns=['Place ID', 'Count'])
print('Train Ratings')
print(train_dataframe)

Train Ratings
    Place ID  Count
1      55033   5207
2      19542   5058
3       9410   4268
4      10259   3671
5      58725   3152
6      23256   3099
7       9241   3085
8      14470   3054
9       9246   3051
10     10190   3033


In [9]:
## Calculate the accuracy metric: hits / (k * test_size) where k = 10

# get top 10 locations from train data
train_top10_loc = train_data_srt[:10, 0]

# find counts in the test dataset that correspond to those locations
coinc = np.argwhere(np.in1d(test_data[:, 0].flatten(), train_top10_loc)).flatten()
sum_test_counts = np.sum(test_data[coinc][:, 1])
acc = sum_test_counts / (10 * test_size)
print('Baseline accuracy: %.3f' % acc)

Baseline accuracy: 0.038


# Calculate accuracy with clusterization

In [14]:
## Load array of exemplars

exemp = np.load('./exemplars.npy')
exemp_unique = np.unique(exemp) # unique exemplars
print(exemp_unique.shape)

(5337,)


In [15]:
## Make a dictionary of clusters

clust_dict = dict()
for exemplar in exemp_unique:
    clust_dict[exemplar] = np.argwhere(exemp == exemplar).flatten()

for exemplar in exemp_unique:
    print(clust_dict[exemplar].shape)

(84338,)
(882,)
(357,)
(636,)
(6,)
(145,)
(28,)
(46,)
(46,)
(18,)
(93,)
(32,)
(25,)
(42,)
(151,)
(51,)
(2,)
(322,)
(35,)
(14,)
(36,)
(41,)
(8,)
(31,)
(99,)
(6,)
(143,)
(14,)
(59,)
(17,)
(16,)
(30,)
(42,)
(3,)
(8,)
(29,)
(62,)
(24,)
(4,)
(212,)
(37,)
(63,)
(5,)
(9,)
(755,)
(99,)
(10,)
(51,)
(6088,)
(6,)
(9,)
(85,)
(33,)
(23,)
(175,)
(33,)
(31,)
(37,)
(138,)
(37,)
(49,)
(5940,)
(22,)
(17,)
(185,)
(77,)
(23,)
(79,)
(34,)
(28,)
(26,)
(4,)
(42,)
(30,)
(31,)
(116,)
(31,)
(59,)
(19,)
(32,)
(62,)
(153,)
(22,)
(79,)
(1861,)
(42,)
(162,)
(61,)
(18,)
(24,)
(118,)
(1043,)
(61,)
(60,)
(19,)
(156,)
(12,)
(2409,)
(15,)
(161,)
(179,)
(19,)
(60,)
(878,)
(35,)
(96,)
(42,)
(158,)
(264,)
(36,)
(22,)
(12,)
(205,)
(44,)
(146,)
(245,)
(13,)
(73,)
(43,)
(62,)
(90,)
(8,)
(314,)
(45,)
(51,)
(592,)
(8,)
(111,)
(118,)
(22,)
(9,)
(102,)
(23,)
(9,)
(9,)
(138,)
(14,)
(83,)
(40,)
(5,)
(11,)
(20,)
(15,)
(8,)
(61,)
(7,)
(33,)
(10,)
(24,)
(169,)
(29,)
(35,)
(26,)
(4,)
(30,)
(11,)
(89,)
(12,)
(52,)
(62,)
(9,)
(8,)
(69,)


(2,)
(8,)
(13,)
(3,)
(12,)
(8,)
(4,)
(4,)
(17,)
(7,)
(8,)
(3,)
(9,)
(6,)
(26,)
(11,)
(65,)
(9,)
(7,)
(5,)
(5,)
(7,)
(6,)
(3,)
(5,)
(6,)
(2,)
(2,)
(3,)
(6,)
(2,)
(19,)
(7,)
(6,)
(37,)
(53,)
(7,)
(3,)
(6,)
(4,)
(10,)
(25,)
(6,)
(10,)
(12,)
(7,)
(10,)
(13,)
(35,)
(9,)
(12,)
(2,)
(9,)
(21,)
(4,)
(20,)
(20,)
(8,)
(28,)
(4,)
(4,)
(4,)
(9,)
(7,)
(20,)
(4,)
(23,)
(8,)
(6,)
(5,)
(17,)
(33,)
(9,)
(7,)
(3,)
(4,)
(23,)
(10,)
(7,)
(10,)
(4,)
(2,)
(5,)
(2,)
(7,)
(5,)
(5,)
(4,)
(16,)
(6,)
(4,)
(4,)
(19,)
(27,)
(3,)
(4,)
(10,)
(9,)
(7,)
(9,)
(7,)
(3,)
(8,)
(4,)
(29,)
(4,)
(7,)
(7,)
(3,)
(9,)
(6,)
(7,)
(5,)
(15,)
(3,)
(3,)
(4,)
(2,)
(4,)
(10,)
(12,)
(6,)
(6,)
(5,)
(3,)
(6,)
(8,)
(21,)
(13,)
(7,)
(11,)
(4,)
(6,)
(6,)
(8,)
(5,)
(28,)
(4,)
(16,)
(5,)
(7,)
(2,)
(21,)
(4,)
(9,)
(16,)
(22,)
(2,)
(9,)
(27,)
(5,)
(2,)
(10,)
(11,)
(9,)
(12,)
(8,)
(7,)
(8,)
(20,)
(14,)
(4,)
(2,)
(11,)
(9,)
(9,)
(6,)
(4,)
(28,)
(92,)
(3,)
(3,)
(3,)
(10,)
(7,)
(6,)
(2,)
(2,)
(9,)
(4,)
(3,)
(12,)
(6,)
(4,)
(33,)
(8,)
(14,)
(13,)
(3

In [87]:
## Make top 10 for each cluster (among the users in the train selection)

clust_top10 = dict()
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the train selection
    train_clust_args = np.argwhere(np.in1d(train_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    train_clust_checkins = ci_dataset[train_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_train, cnt_clust_train = np.unique(train_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    train_clust_data = np.concatenate((uniq_clust_train.reshape(-1, 1),
                                       cnt_clust_train.reshape(-1, 1)),
                                       axis=1).astype(np.int32)
    
    # sort & flip
    train_clust_data_srt = train_clust_data[train_clust_data[:, 1].argsort()]
    train_clust_data_srt = np.flip(train_clust_data_srt, axis=0)
    
    # store the top 10 locations
    clust_top10[exemplar] = train_clust_data_srt[:10, 0]

In [88]:
## Calculate the accuracy metric for the clusters

total_sum = 0
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the test selection
    test_clust_args = np.argwhere(np.in1d(test_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    test_clust_checkins = ci_dataset[test_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_test, cnt_clust_test = np.unique(test_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    test_clust_data = np.concatenate((uniq_clust_test.reshape(-1, 1),
                                      cnt_clust_test.reshape(-1, 1)),
                                      axis=1).astype(np.int32)
    
    # get coincidences with the top 10 locations for the cluster (arguments of locations)
    clust_coinc = np.argwhere(np.in1d(test_clust_data[:, 0].flatten(), clust_top10[exemplar])).flatten()
    
    # count the coincidences 
    clust_sum_test_counts = np.sum(test_clust_data[clust_coinc][:, 1])
    total_sum += clust_sum_test_counts
    
acc_clust = total_sum / (10 * test_size)
print('Clusterized accuracy: %f' % acc_clust)

Clusterized accuracy: 0.000182
