In [1]:
import pandas as pd
import numpy as np
import json
from scipy import sparse as sp
from tqdm.notebook import tqdm
from collections import defaultdict

In [2]:
import implicit

In [3]:
import sys
sys.path.append('../')

from src.utils import get_shard_path
from src.utils import ProductEncoder, make_coo_row
from src.metrics import normalized_average_precision

In [4]:
# Build the product encoder from ../data/raw/products.csv. Other cells in this
# notebook rely on its toIdx() / toPid() methods and its num_products attribute.
product_encoder = ProductEncoder('../data/raw/products.csv')

In [5]:
# Validation set: the first 3000 JSON records (one document per line) of shard 7,
# which is outside the shards 0-3 used to build the training matrix.
# The context manager closes the shard file, and pairing each line with
# range(3000) stops reading after the records we keep instead of parsing the
# whole shard and slicing afterwards.
with open(get_shard_path(7)) as shard_file:
    valid_data = [json.loads(line) for _, line in zip(range(3000), shard_file)]

In [15]:
def make_coo_row(transaction_history, product_encoder: ProductEncoder):
    """Encode one purchase history as a sparse ``1 x num_products`` row.

    Every product occurrence across all transactions contributes a weight of
    ``1 / total_occurrences``. Repeat purchases are stored as duplicate COO
    entries at the same column.

    Args:
        transaction_history: iterable of transactions; each must provide
            ``trans["products"]`` as a sequence of items carrying a
            ``"product_id"`` key.
        product_encoder: supplies ``toIdx(product_id)`` for column indices and
            ``num_products`` for the row width.

    Returns:
        ``scipy.sparse.coo_matrix`` of shape
        ``(1, product_encoder.num_products)`` with ``float32`` data. It has no
        stored entries when the history contains no products.
    """
    # Flatten all purchased product ids first, then translate them to columns.
    product_ids = [
        item["product_id"]
        for trans in transaction_history
        for item in trans["products"]
    ]
    columns = [product_encoder.toIdx(pid) for pid in product_ids]

    total = len(product_ids)
    # Only divide when there is something to weight; an empty history yields
    # an empty data array.
    weight = 1.0 / total if total else 0.0
    data = np.full(total, weight, dtype=np.float32)

    return sp.coo_matrix(
        (data, ([0] * total, columns)),
        shape=(1, product_encoder.num_products),
    )

In [16]:
# Build one sparse row per JSON record from shards 0-3 (the training portion).
rows = []
for shard_id in range(4):
    # The context manager closes each shard file once it has been consumed;
    # the original left every handle open until garbage collection.
    with open(get_shard_path(shard_id)) as shard_file:
        for js in tqdm(json.loads(l) for l in shard_file):
            rows.append(make_coo_row(js["transaction_history"], product_encoder))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [17]:
# Stack the per-record 1 x num_products rows into one matrix and convert it to CSR.
X_sparse = sp.vstack(rows).tocsr()

In [18]:
# Sanity check: (number of stacked rows, product_encoder.num_products).
X_sparse.shape

(40421, 43038)

# Cosine

In [21]:
# Cosine nearest-neighbours recommender with K=10 (presumably the number of
# neighbours kept per item — confirm against the installed implicit version).
model = implicit.nearest_neighbours.CosineRecommender(K=10)
# X_sparse has one row per record and one column per product, so the transpose
# passed to fit() is products x records.
# NOTE(review): the matrix orientation fit() expects differs between implicit
# releases — confirm this transpose is right for the pinned version.
model.fit(X_sparse.T)

HBox(children=(FloatProgress(value=0.0, max=43038.0), HTML(value='')))




In [22]:
# Evaluate on the validation records: mean normalized average precision at TOP_K.
# One constant drives both the number of recommendations requested and the
# metric cutoff, so the two can no longer drift apart.
TOP_K = 30

m_ap = []
for js in tqdm(valid_data):
    # Single-row CSR matrix describing this record's purchase history.
    row_sparse = make_coo_row(js["transaction_history"], product_encoder).tocsr()
    # User id 0 addresses the only row of row_sparse; already-purchased items
    # are deliberately kept as candidates (filter_already_liked_items=False).
    raw_recs = model.recommend(
        0, row_sparse, N=TOP_K, filter_already_liked_items=False, recalculate_user=True
    )
    # x[0] is taken as the item index of each recommendation
    # (assumes recommend() yields (item, score) pairs — TODO confirm for the
    # installed implicit version).
    recommended_items = product_encoder.toPid([x[0] for x in raw_recs])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=TOP_K))
print(np.mean(m_ap))

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


0.11892847087471084


In [12]:
# Make sure the output directory used for the pickled model (../tmp/i2i) exists.
! mkdir -p ../tmp/i2i

In [13]:
import pickle

# Persist the fitted cosine model. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes; the original passed a bare
# open() handle that was never closed explicitly.
# NOTE(review): this cell's execution count (13) is lower than the fit cell's
# (21), so the file on disk may hold an earlier model — re-run after fitting.
with open("../tmp/i2i/cosine.pkl", "wb") as model_file:
    pickle.dump(model, model_file)