In [1]:
import pandas as pd
import numpy as np
import json
from scipy import sparse as sp
from tqdm.notebook import tqdm
from collections import defaultdict

In [2]:
import sys
sys.path.append('../')

from src.utils import get_shard_path, ProductEncoder, make_coo_row
from src.metrics import normalized_average_precision

In [3]:
product_encoder = ProductEncoder('../data/raw/products.csv')

In [4]:
rows = []
for shard_id in range(4):
    for js in tqdm(json.loads(l) for l in open(get_shard_path(shard_id))):
        rows.append(make_coo_row(js["transaction_history"], product_encoder))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [5]:
X_sparse = sp.vstack(rows)

In [6]:
X_sparse.shape

(40421, 43038)

In [7]:
X_stored = X_sparse.tocsr()

In [8]:
from sklearn.decomposition import TruncatedSVD

In [9]:
svd = TruncatedSVD(n_components=128)
X_dense = svd.fit_transform(X_sparse)

In [10]:
from sklearn.neighbors import NearestNeighbors

In [11]:
num_neighbours = 256
knn = NearestNeighbors(n_neighbors=num_neighbours, metric="cosine")
knn.fit(X_dense)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=256, p=2,
                 radius=1.0)

In [13]:
valid_data = [json.loads(l) for l in open(get_shard_path(7))][:3000]

In [14]:
m_ap = []
for js in tqdm(valid_data):
    # just to save time
    
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    row_dense = svd.transform(row_sparse)
    knn_result = knn.kneighbors(row_dense, n_neighbors=num_neighbours)
    neighbors = knn_result[1]
    scores = np.asarray(X_stored[neighbors[0]].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    recommended_items = product_encoder.toPid(top_indices[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


0.08951838714098508


In [15]:
! mkdir -p ../tmp/u2u

In [16]:
import pickle
pickle.dump(X_stored, open('../tmp/u2u/X_stored.pkl', "wb"))
pickle.dump(svd, open('../tmp/u2u/svd.pkl', "wb"))
pickle.dump(knn, open('../tmp/u2u/knn.pkl', "wb"))

In [17]:
! ls -lah ../tmp/u2u

total 63M
drwxr-xr-x 5 jovyan users 160 Feb  1 11:30 .
drwxr-xr-x 6 jovyan users 192 Feb  1 11:30 ..
-rw-r--r-- 1 jovyan users 20M Feb  1 11:30 knn.pkl
-rw-r--r-- 1 jovyan users 22M Feb  1 11:30 svd.pkl
-rw-r--r-- 1 jovyan users 22M Feb  1 11:30 X_stored.pkl


# FAISS
[Вики faiss](https://github.com/facebookresearch/faiss/wiki)

In [22]:
!python -m pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.6.1-cp37-cp37m-manylinux2010_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 1.2 MB/s eta 0:00:01     |██▌                             | 552 kB 1.9 MB/s eta 0:00:04     |█████████████████████████▌      | 5.6 MB 2.1 MB/s eta 0:00:01     |██████████████████████████████  | 6.6 MB 1.2 MB/s eta 0:00:01
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.6.1


In [23]:
import faiss

In [24]:
index = faiss.index_factory(128, "IVF256,PQ32", faiss.METRIC_INNER_PRODUCT)
index.train(X_dense)
index.add(X_dense)

[Индексы в faiss](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes)

In [25]:
index.nprobe = 32  # чтобы смотреть в нескольких кластерах

In [28]:
m_ap = []
for js in tqdm(valid_data):
    # just to save time
    
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    row_dense = svd.transform(row_sparse)
    knn_result = index.search(row_dense, num_neighbours)
    neighbors = knn_result[1]
    scores = np.asarray(X_stored[neighbors[0]].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    recommended_items = product_encoder.toPid(top_indices[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


0.08268866919103537


In [None]:
# ???

In [29]:
faiss.write_index(index, '../tmp/u2u/faiss.idx')

In [30]:
! ls -lah ../tmp/u2u

total 65M
drwxr-xr-x 6 jovyan users  192 Feb  1 11:50 .
drwxr-xr-x 6 jovyan users  192 Feb  1 11:30 ..
-rw-r--r-- 1 jovyan users 1.8M Feb  1 11:50 faiss.idx
-rw-r--r-- 1 jovyan users  20M Feb  1 11:30 knn.pkl
-rw-r--r-- 1 jovyan users  22M Feb  1 11:30 svd.pkl
-rw-r--r-- 1 jovyan users  22M Feb  1 11:30 X_stored.pkl
