In [11]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
## Metadata

NODES = 196591
EDGES = 950327
FULL_EDGES = 2097245
CHECKINS = 6442892

In [13]:
## Load Check-ins and permute them

ci_dataset = np.load('../Dataset/ci_ids.npy') # [usr, place]
p = np.random.permutation(CHECKINS)
ci_dataset = ci_dataset[p, :].astype(np.int32)

# Calculate accuracy with no clusterization (baseline)

In [14]:
## Count users

u_usr = np.unique(ci_dataset[:, 0]).astype(np.int32)
n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [15]:
## Divide into train and test data

test_size = n_usr // 10 # test dataset is 5 times smaller than the full dataset
print('Number of train users: %i' % (n_usr - test_size))
print('Number of test users: %i' % test_size)

# permute users
p = np.random.permutation(n_usr)
u_usr_rand = u_usr[p]

# divide users
test_usrs = u_usr_rand[:test_size]
train_usrs = u_usr_rand[test_size:]

# mark arguments of test and train checkins in ci_dataset
test_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], test_usrs)).flatten()
train_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], train_usrs)).flatten()
assert test_checkins_args.size + train_checkins_args.size == CHECKINS
print('Number of train check-ins: %i' % train_checkins_args.size)
print('Number of test check-ins: %i' % test_checkins_args.size)

# get checkins from ci_dataset
test_checkins = ci_dataset[test_checkins_args, 1].flatten()
train_checkins = ci_dataset[train_checkins_args, 1].flatten()

Number of train users: 96383
Number of test users: 10709
Number of train check-ins: 5765488
Number of test check-ins: 677404


In [16]:
## Count checkins

uniq_train, cnt_train = np.unique(train_checkins, return_counts=True)
uniq_test, cnt_test = np.unique(test_checkins, return_counts=True)

# concatenate unique elements and their counts
train_data = np.concatenate((uniq_train.reshape(-1, 1),
                             cnt_train.reshape(-1, 1)),
                             axis=1).astype(np.int32)
test_data = np.concatenate((uniq_test.reshape(-1, 1),
                            cnt_test.reshape(-1, 1)),
                            axis=1).astype(np.int32)

In [17]:
## Sort with respect to counts and then flip to make ratings

train_data_srt = train_data[train_data[:, 1].argsort()]
train_data_srt = np.flip(train_data_srt, axis=0)

# visualization
train_dataframe = pd.DataFrame(data=train_data_srt[:10],
                               index=np.arange(1, 11),
                               columns=['Place ID', 'Count'])
print('Train Ratings')
print(train_dataframe)

Train Ratings
    Place ID  Count
1      55033   5147
2      19542   5131
3       9410   4243
4      10259   3623
5      58725   3192
6      23256   3110
7      14470   3085
8      10190   3039
9       9246   3032
10      9241   2917


In [18]:
## Calculate the accuracy metric: hits / (k * test_size) where k = 10

# get top 10 locations from train data
train_top10_loc = train_data_srt[:10, 0]

# find counts in the test dataset that correspond to those locations
coinc = np.argwhere(np.in1d(test_data[:, 0].flatten(), train_top10_loc)).flatten()
sum_test_counts = np.sum(test_data[coinc][:, 1])
acc = sum_test_counts / (10 * test_size)
print('Baseline accuracy: %.3f' % acc)

Baseline accuracy: 0.039


# Calculate accuracy with clusterization

In [21]:
## Load array of exemplars

exemp = np.load('./exemplars.npy')
exemp_unique = np.unique(exemp) # unique exemplars

In [22]:
## Make a dictionary of clusters

clust_dict = dict()
for exemplar in exemp_unique:
    clust_dict[exemplar] = np.argwhere(exemp == exemplar).flatten()

In [19]:
## Make top 10 for each cluster (among the users in the train selection)

clust_top10 = dict()
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the train selection
    train_clust_args = np.argwhere(np.in1d(train_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    train_clust_checkins = ci_dataset[train_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_train, cnt_clust_train = np.unique(train_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    train_clust_data = np.concatenate((uniq_clust_train.reshape(-1, 1),
                                       cnt_clust_train.reshape(-1, 1)),
                                       axis=1).astype(np.int32)
    
    # sort & flip
    train_clust_data_srt = train_clust_data[train_clust_data[:, 1].argsort()]
    train_clust_data_srt = np.flip(train_clust_data_srt, axis=0)
    
    # store the top 10 locations
    clust_top10[exemplar] = train_clust_data_srt[:10, 0]

In [25]:
print(clust_top10[2])

[5593696  162742  152811  155420  155497  159042  159211  160040  162659
  163480]


In [26]:
print(np.argwhere(ci_dataset[:, 1] == 5593696))

[[  27600]
 [2347203]
 [6174887]]


In [20]:
## Calculate the accuracy metric for the clusters

total_sum = 0
for exemplar in exemp_unique:
    
    # mark users in a cluster that are in the test selection
    test_clust_args = np.argwhere(np.in1d(test_usrs, clust_dict[exemplar]))
    
    # get their corresponding checkins
    test_clust_checkins = ci_dataset[test_clust_args, 1].flatten()
    
    # count checkins
    uniq_clust_test, cnt_clust_test = np.unique(test_clust_checkins, return_counts=True)
    
    # make a mini-dataset [location, count]
    test_clust_data = np.concatenate((uniq_clust_test.reshape(-1, 1),
                                      cnt_clust_test.reshape(-1, 1)),
                                      axis=1).astype(np.int32)
    
    # get coincidences with the top 10 locations for the cluster (arguments of locations)
    clust_coinc = np.argwhere(np.in1d(test_clust_data[:, 0].flatten(), clust_top10[exemplar])).flatten()
    
    # count the coincidences 
    clust_sum_test_counts = np.sum(test_clust_data[clust_coinc][:, 1])
    total_sum += clust_sum_test_counts
    
acc_clust = total_sum / (10 * test_size)
print('Clusterized accuracy: %f' % acc_clust)

Clusterized accuracy: 0.000075
