# Make recommendations and calculate accuracy

In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Metadata and parameters

# K truncates the global popularity ranking (locs_ranked[:K]) and scales the
# denominator of the accuracy metric computed at the end of the notebook.
K = 10 # top k locations

In [3]:
## Load check-ins

# Each row of the stored array is one check-in: [usr, place].
ci_path = '../Dataset/ci_ids.npy'
ci_dataset = np.load(ci_path).astype(np.int32)
print('Number of chekins: %i' % len(ci_dataset))

Number of chekins: 6442892


In [4]:
## Get unique users

# Column 0 of the check-in array holds the user ID; np.unique returns the
# distinct IDs in sorted order.
usr_column = ci_dataset[:, 0]
u_usr = np.unique(usr_column)
n_usr = u_usr.size
print('Unique users: %i' % n_usr)

Unique users: 107092


In [5]:
## Load array of exemplars

# The array is indexed by user ID further down (exemp[usr]); each entry is
# used as that user's cluster ID, i.e. the ID of the cluster's exemplar.
# NOTE(review): the path is relative to the kernel's working directory.
exemp = np.load('./exemplars.npy')

In [6]:
## Make a dataframe of the exemplars

# reset_index() turns the positional row index into a regular column, so the
# position of each entry in `exemp` becomes the user ID.
exemp_df = pd.DataFrame(exemp, dtype=np.int32).reset_index()
exemp_df.columns = ['UID', 'CID'] # user ID, cluster ID (ID of an exemplar)

In [7]:
## Make a dataframe of the ci_dataset

# One row per check-in: user ID, location ID.
ci_df = pd.DataFrame(ci_dataset, columns=['UID', 'locID'], dtype=np.int32)

In [8]:
## Merge the dataframes with respect to the user ID column

# Default (inner) join: every check-in row gains the cluster ID of its user.
all_df = ci_df.merge(exemp_df, on='UID')
all_df

Unnamed: 0,UID,locID,CID
0,0,22847,0
1,0,420315,0
2,0,316637,0
3,0,16516,0
4,0,5535878,0
...,...,...,...
6442887,196578,906885,196504
6442888,196578,965121,196504
6442889,196578,1174322,196504
6442890,196585,471724,196539


In [9]:
## Divide into train and test data

divisor = 10
# hold out one 'divisor'-th of the users (rounded down) for testing
test_size = n_usr // divisor
n_train_usrs = n_usr - test_size
print('Number of train users: %i' % n_train_usrs)
print('Number of test users: %i' % test_size)

Number of train users: 96383
Number of test users: 10709


In [10]:
## Permute users

# Use a locally seeded generator so the train/test split, and therefore every
# ranking and accuracy figure derived from it, is identical on each
# Restart & Run All. Change SEED to draw a different split.
SEED = 0
p = np.random.RandomState(SEED).permutation(n_usr)
u_usr_rand = u_usr[p]

In [11]:
## Divide users into train and test selections

# The first test_size shuffled users form the test set, the rest the train set.
test_usrs, train_usrs = np.split(u_usr_rand, [test_size])

In [12]:
## Get parts of the all_df corresponding to the train and test datasets

# Boolean row masks: which check-ins belong to a test user / a train user.
is_test_row = all_df['UID'].isin(test_usrs)
is_train_row = all_df['UID'].isin(train_usrs)

test_df = all_df[is_test_row]
train_df = all_df[is_train_row]

In [13]:
## Get top K locations among all users

# One row per location; after the drop/rename the single remaining column,
# 'counts', is the number of training check-ins at that location.
locs = train_df.groupby('locID')
locs_count = locs.count().drop('CID', axis=1).rename(columns={'UID': 'counts'})

# sort with the most visited first and keep the top K locations
locs_ranked = locs_count.sort_values('counts', ascending=False).reset_index()
locs_top_k_df = locs_ranked.head(K)
locs_top_k = locs_top_k_df['locID'].values
locs_top_k_uniq = np.unique(locs_top_k)
locs_top_k_df

Unnamed: 0,locID,counts
0,55033,5215
1,19542,5185
2,9410,4246
3,10259,3704
4,9246,3125
5,23256,3109
6,9241,3090
7,58725,3080
8,10190,3055
9,14470,3053


In [14]:
## Get top locations among the clusters

# One row per (cluster, location) pair seen in the training data; 'count' is
# the number of training check-ins for that pair.
clust_locs = train_df.groupby(['CID', 'locID'])
clust_locs_count = (
    clust_locs.count()
    .rename(columns={'UID': 'count'})
    .reset_index()
)

# Descending by cluster ID, then by count: within each cluster the most
# visited locations come first.
clust_locs_ranked = clust_locs_count.sort_values(by=['CID', 'count'], ascending=False)
clust_locs_ranked

Unnamed: 0,CID,locID,count
3432765,196539,103606,11
3432771,196539,271694,5
3432766,196539,124683,1
3432767,196539,128331,1
3432768,196539,130147,1
...,...,...,...
1982,0,4942030,1
1983,0,5109265,1
1984,0,5130189,1
1985,0,5305317,1


In [15]:
## Calculate the accuracy metric: hits / (k * test_size)

# A "hit" is a recommended location that the test user actually checked in at.
# Each recommender proposes at most K locations per user, hence the
# K * test_size denominator.

# Group once up front instead of re-filtering the full frames for every user.
# Unique locations visited by each test user, indexed by UID.
usr_locs = test_df.groupby('UID')['locID'].unique()

# First K rows of every cluster, then the location IDs per cluster (indexed by
# CID). This relies on clust_locs_ranked being ordered by descending count
# within each CID. K is used here rather than a literal 10 so that both
# recommenders and the denominator stay consistent when K changes.
clust_top_k_rows = clust_locs_ranked.groupby('CID', sort=False).head(K)
clust_top_k_locs = clust_top_k_rows.groupby('CID')['locID'].unique()

# fallback for a user or cluster with no rows: nothing to intersect
no_locs = np.empty(0, dtype=np.int32)

base_hits = 0
clust_hits = 0
for usr in test_usrs:

    # locations this user actually visited
    locs_usr_uniq = usr_locs.get(usr, no_locs)

    # count for baseline: the same global top-K list for everybody
    base_hits += np.intersect1d(locs_top_k_uniq, locs_usr_uniq).size

    # count for clusters: top-K training locations of the user's own cluster
    # (assumes exemp is 1-D, so exemp[usr] is a scalar cluster ID)
    exemplar = exemp[usr]
    clust_locs_top_k_uniq = clust_top_k_locs.get(exemplar, no_locs)
    clust_hits += np.intersect1d(clust_locs_top_k_uniq, locs_usr_uniq).size

# calculate accuracy
base_acc = base_hits / (K * test_size)
clust_acc = clust_hits / (K * test_size)

In [16]:
## Output
# Report both accuracies, one per line, rounded to three decimals.
print(
    'Baseline accuracy: %.3f' % base_acc,
    'Cluster accuracy: %.3f' % clust_acc,
    sep='\n',
)

Baseline accuracy: 0.018
Cluster accuracy: 0.053
