In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Metadata

# Hard-coded dataset sizes. Among the cells shown here only CHECKINS is read
# (as the length of the check-in permutation); the meaning of the other three
# is inferred from their names -- TODO confirm against the dataset description.
NODES = 196591        # presumably the number of nodes (users) in the graph
EDGES = 950327        # presumably the number of edges in the graph
FULL_EDGES = 2097245  # NOTE(review): unclear which edge set this counts
CHECKINS = 6442892    # number of rows expected in the check-in array

In [3]:
## Load Check-ins and permute them

# Each row of the loaded array is one check-in: [usr, place].
ci_dataset = np.load('../Dataset/ci_ids.npy')

# Size the permutation from the array that was actually loaded rather than
# from the hard-coded CHECKINS constant: if the two ever disagree, a too-small
# constant silently drops rows and a too-large one raises IndexError.
# A fixed seed keeps the row order identical across Restart & Run All.
SHUFFLE_SEED = 0
p = np.random.RandomState(SHUFFLE_SEED).permutation(ci_dataset.shape[0])
ci_dataset = ci_dataset[p, :]

# Calculate accuracy without clustering (baseline)

In [4]:
## Count users

# Column 0 of every check-in row holds the user id; keep each id once
# (np.unique also sorts them) and store the ids as int32.
u_usr = np.unique(ci_dataset[:, 0])
u_usr = u_usr.astype(np.int32)

n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [5]:
## Divide into train and test data

# The split is made over users, not over check-ins: one fifth of the users
# (rounded down) is held out for testing.
test_size = n_usr // 5
print('Number of train users: %i' % (n_usr - test_size))
print('Number of test users: %i' % test_size)

# permute users -- with a fixed seed, because an unseeded permutation makes
# the split (and every number computed from it) change on each run
SPLIT_SEED = 0
p = np.random.RandomState(SPLIT_SEED).permutation(n_usr)
u_usr_rand = u_usr[p]

# divide users
test_usrs = u_usr_rand[:test_size]
train_usrs = u_usr_rand[test_size:]

# mark arguments (row indices) of test and train checkins in ci_dataset:
# a row belongs to a split when its user id (column 0) is in that split
test_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], test_usrs)).flatten()
train_checkins_args = np.argwhere(np.in1d(ci_dataset[:, 0], train_usrs)).flatten()
print('Number of train check-ins: %i' % train_checkins_args.size)
print('Number of test check-ins: %i' % test_checkins_args.size)

# get checkins: the place id (column 1) of every selected row; indexing with
# an index array already yields a fresh 1-D array, so no flatten is needed
test_checkins = ci_dataset[test_checkins_args, 1]
train_checkins = ci_dataset[train_checkins_args, 1]

Number of train users: 85674
Number of test users: 21418
Number of train check-ins: 5161157
Number of test check-ins: 1281735


In [6]:
## Count checkins

# For each split, pair every distinct place id with the number of check-ins
# it received, giving a two-column int32 array: [place id, count].

uniq_train, cnt_train = np.unique(train_checkins, return_counts=True)
train_data = np.concatenate(
    (uniq_train[:, np.newaxis], cnt_train[:, np.newaxis]),
    axis=1,
).astype(np.int32)

uniq_test, cnt_test = np.unique(test_checkins, return_counts=True)
test_data = np.concatenate(
    (uniq_test[:, np.newaxis], cnt_test[:, np.newaxis]),
    axis=1,
).astype(np.int32)

In [7]:
## Sort with respect to counts and then flip to make ratings

# Order the rows by ascending count (column 1) and read them backwards, so
# the most visited place comes first.
train_data_srt = train_data[train_data[:, 1].argsort()][::-1]

# visualization: the ten best-ranked places, numbered from 1
train_dataframe = pd.DataFrame(
    train_data_srt[:10],
    columns=['Place ID', 'Count'],
    index=np.arange(1, 11),
)
print('Train Ratings')
print(train_dataframe)

Train Ratings
    Place ID  Count
1      55033   4822
2      19542   4617
3       9410   3733
4      10259   3345
5      14470   2809
6      58725   2787
7       9246   2763
8      23256   2721
9      10190   2718
10      9241   2550


In [8]:
## Calculate the accuracy metric: hits / (k * test_size) where k = 10

# The ten places with the most train check-ins act as the recommendation
# list for everybody.
train_top10_loc = train_data_srt[:10, 0]

# Rows of test_data whose place id is one of those ten ...
coinc = np.argwhere(np.in1d(test_data[:, 0], train_top10_loc))[:, 0]
# ... and the total number of test check-ins made at them (the "hits").
sum_test_counts = test_data[coinc, 1].sum()

# test_size is the number of test users, so the denominator is k * users.
acc = sum_test_counts / (10 * test_size)
print('Baseline accuracy: %.3f' % acc)

Baseline accuracy: 0.037


# Calculate accuracy with clustering

In [19]:
## Load array of exemplars

# Presumably exemp[i] is the exemplar (cluster representative) assigned to
# node i -- the later cells treat positions in this array as user ids.
exemp = np.load('./exemplars.npy')

# every distinct exemplar once, in ascending order
exemp_unique = np.unique(exemp)
print(exemp_unique.shape)

(11483,)


In [20]:
## Make a dictionary of clusters

# Maps every exemplar to the positions in `exemp` that carry it, i.e. the
# members of that exemplar's cluster.
clust_dict = {
    exemplar: np.argwhere(exemp == exemplar).flatten()
    for exemplar in exemp_unique
}

# size of the first cluster, as a quick sanity check
print(clust_dict[exemp_unique[0]].shape)

(96542,)


In [14]:
## Make top 10 for each cluster (among the users in the train selection)

# Select check-in rows by their user column. Positions inside `train_usrs`
# are NOT row numbers of `ci_dataset`, so the rows have to be found through
# ci_dataset[:, 0] itself.
train_rows = ci_dataset[np.in1d(ci_dataset[:, 0], train_usrs)]
row_users = train_rows[:, 0].astype(np.int64)
row_places = train_rows[:, 1]

# A cluster is the set of positions i with exemp[i] == exemplar, so exemp[user]
# is the cluster of a user (assumes `exemp` is 1-D and indexed by user id).
# Users that fall outside `exemp` belong to no cluster and are dropped.
known = (row_users >= 0) & (row_users < exemp.shape[0])
row_exemplars = exemp[row_users[known]]
row_places = row_places[known]

# Sort once by cluster so that every cluster's check-ins form one contiguous
# slice; this avoids re-scanning all check-ins for each of the clusters.
order = np.argsort(row_exemplars, kind='mergesort')
row_exemplars = row_exemplars[order]
row_places = row_places[order]
found_exemplars, starts = np.unique(row_exemplars, return_index=True)

# Clusters without a single train check-in keep an empty top list.
clust_top10 = {exemplar: np.array([], dtype=np.int32) for exemplar in exemp_unique}
for exemplar, clust_places in zip(found_exemplars, np.split(row_places, starts[1:])):
    uniq_clust_train, cnt_clust_train = np.unique(clust_places, return_counts=True)
    # ascending argsort read backwards -> most visited places first
    ranking = cnt_clust_train.argsort()[::-1][:10]
    clust_top10[exemplar] = uniq_clust_train[ranking].astype(np.int32)

In [12]:
## Calculate the accuracy metric for the clusters

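# TODO: not implemented yet. The draft below is kept for reference only; it
# referenced an undefined name `test` (hence the NameError in the output),
# most likely meaning `test_usrs`. Note that the result would hold positions
# inside `test_usrs`, not row numbers of `ci_dataset`.
#for exemplar in exemp_unique:
#    test_clust_args = np.argwhere(np.in1d(test_usrs, clust_dict[exemplar]))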

NameError: name 'test' is not defined