In [1]:
import pandas as pd
from scipy import sparse as spsp
import numpy as np

In [24]:
# Directory that holds every data file read by this notebook.
data_dir = 'Marriott'

# Header-less, whitespace-separated file; its integer-labelled columns 0, 1, 2 are read by
# the following cell as (row, col, value) triplets.
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which is
# deprecated in recent pandas releases.
i2i = pd.read_csv(data_dir + '/i2inbrs11.ijv', sep=r'\s+', header=None)

In [25]:
# Split the triplet table into its three columns (row index, column index, value)
# and assemble them into a COO sparse matrix; the shape is inferred from the
# largest row/column index.
row, col, val = i2i[0], i2i[1], i2i[2]
coordinates = (row, col)
i2i_spm = spsp.coo_matrix((val, coordinates))

In [26]:
def load_txt_csr(file_name):
    """Load a text file with one matrix row per line into a binary COO matrix.

    Each line is split on whitespace. The tokens at even positions (0, 2, 4, ...)
    are parsed as column indices; the tokens in between (presumably the stored
    values) are ignored and every listed entry is set to 1.0.

    Args:
        file_name: path of the text file to read.

    Returns:
        A ``scipy.sparse.coo_matrix`` of float ones. It has one row per line of
        the file (blank lines included, also trailing ones) and
        ``max column index + 1`` columns, or 0 columns if no entry was read.

    Raises:
        ValueError: if a token at an even position is not an integer.
    """
    row = []
    col = []
    n_rows = 0
    with open(file_name, 'r') as infile:
        for row_id, line in enumerate(infile):
            col_idx = [int(tok) for tok in line.split()[0::2]]
            row.extend([row_id] * len(col_idx))
            col.extend(col_idx)
            n_rows = row_id + 1
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    # Pass the shape explicitly: letting coo_matrix infer it from the indices
    # drops trailing empty rows and fails outright when there are no entries.
    n_cols = int(col.max()) + 1 if len(col) else 0
    return spsp.coo_matrix((np.ones(len(row)), (row, col)), shape=(n_rows, n_cols))
# Load the training data; load_txt_csr turns every line of the file into one matrix row.
spm = load_txt_csr(data_dir + '/train.csr')

In [27]:
# Show the (n_rows, n_cols) shape of the loaded training matrix.
spm.shape

(2743661, 7649)

In [28]:
def coverage(ctx, targets):
    """Fraction of the columns active in ``ctx`` that are also active in ``targets``.

    A column counts as active in a matrix when its column sum is positive.
    In this notebook the rows are items and the columns are presumably users
    (TODO confirm against the data layout).

    Args:
        ctx: 2-D matrix (dense or scipy sparse) holding the context rows.
        targets: 2-D matrix with the same number of columns holding the
            recommended rows.

    Returns:
        Number of columns active in both matrices divided by the number of
        columns active in ``ctx``. When no column is active in ``ctx`` the
        division is 0/0 under numpy semantics (``nan``).
    """
    def active_columns(rows):
        # Column sums -> boolean mask, flattened to 1-D whatever type np.sum returns.
        return np.squeeze(np.array(np.sum(rows, axis=0) > 0))

    ctx_mask = active_columns(ctx)
    target_mask = active_columns(targets)
    shared = np.sum(np.logical_and(ctx_mask, target_mask))
    return shared / np.sum(ctx_mask)

from sklearn import metrics
def similarity(targets):
    """Mean pairwise cosine similarity between the rows of ``targets``.

    The average is taken over the full similarity matrix, so each row's
    similarity with itself is part of the mean.

    Args:
        targets: 2-D matrix (dense or scipy sparse), one row per item.

    Returns:
        The mean of all entries of the row-by-row cosine similarity matrix.
    """
    return np.mean(metrics.pairwise.cosine_similarity(targets))

In [29]:
def load_topk_pred(file_name, ctx_size, k):
    """Parse a recommendation dump into de-duplicated (context, prediction) pairs.

    Lines whose first token starts with ``row`` carry a context: every token
    from the third one onwards is a context item id. Lines whose first token
    starts with ``recommend`` carry the predicted item ids (all tokens after
    the first). The i-th context line is paired with the i-th recommend line.
    Blank lines and lines with any other prefix are ignored.

    Two counts are printed: the number of context lines read and the number of
    distinct contexts kept.

    Args:
        file_name: path of the text file to parse.
        ctx_size: number of item ids every context line must carry.
        k: not used by the parser; kept so existing callers keep working.

    Returns:
        ``(ctxs, preds)``: two parallel lists of int lists. Each distinct
        context appears once, in first-seen order, paired with the prediction
        of its last occurrence in the file.

    Raises:
        AssertionError: if a context line does not carry ``ctx_size`` ids.
        ValueError: if an item id is not an integer.
    """
    raw_ctxs = []
    raw_preds = []
    with open(file_name, 'r') as infile:
        for line in infile:
            strs = line.split()
            if not strs:
                # Blank line: there is no first token to inspect.
                continue
            if strs[0].startswith('row'):
                ctx = strs[2:]
                assert len(ctx) == ctx_size
                raw_ctxs.append(ctx)
            elif strs[0].startswith('recommend'):
                raw_preds.append([int(s) for s in strs[1:]])
    print('There are {} recommendations'.format(len(raw_ctxs)))

    # Key on the context tokens; a repeated context keeps its first position
    # in the dict but its prediction is overwritten by the latest one.
    ctx_preds = {}
    for ctx, pred in zip(raw_ctxs, raw_preds):
        ctx_preds[tuple(ctx)] = pred
    print('There are {} unique recommendations'.format(len(ctx_preds)))

    ctxs = [[int(s) for s in key] for key in ctx_preds]
    preds = list(ctx_preds.values())
    return ctxs, preds

# Load the de-duplicated (context, prediction) lists for context sizes 1, 2 and 3.
# The last argument (10) is the `k` parameter, which load_topk_pred does not read.
ctxs1, preds1 = load_topk_pred(data_dir + '/ctx1_hits10.txt', 1, 10)
ctxs2, preds2 = load_topk_pred(data_dir + '/ctx2_hits10.txt', 2, 10)
ctxs3, preds3 = load_topk_pred(data_dir + '/ctx3_hits10.txt', 3, 10)

There are 420884 recommendations
There are 7341 unique recommendations
There are 420884 recommendations
There are 197561 unique recommendations
There are 222035 recommendations
There are 202127 unique recommendations


In [None]:
# Transpose the training matrix so each row is an item, in CSR form so that
# rows can be selected with a list of item ids.
item_row_spm = spm.transpose().tocsr()


def evaluate_topk(item_rows, ctxs, preds):
    """Score every (context, prediction) pair with `coverage` and `similarity`.

    Pairs whose prediction list is empty are skipped.

    Args:
        item_rows: row-indexable matrix with one row per item id.
        ctxs: list of context item-id lists.
        preds: list of predicted item-id lists, parallel to ``ctxs``.

    Returns:
        ``(coverages, similarities)``: two lists with one score per pair kept.
    """
    coverages = []
    similarities = []
    for ctx, pred in zip(ctxs, preds):
        if len(pred) == 0:
            continue
        # Slice the predicted rows once; both metrics use them.
        pred_rows = item_rows[pred]
        coverages.append(coverage(item_rows[ctx], pred_rows))
        similarities.append(similarity(pred_rows))
    return coverages, similarities


# One line of output per context size. Both score lists start empty for each
# context size, so a printed mean never includes scores from another size.
for ctxs, preds in ((ctxs1, preds1), (ctxs2, preds2), (ctxs3, preds3)):
    coverages, similarities = evaluate_topk(item_row_spm, ctxs, preds)
    print(np.mean(coverages), np.mean(similarities))

0.7010834868274863 0.19721751091103693
