In [1]:
import scipy as sp
import numpy as np
import torch
import pickle
from collections import defaultdict
from rank_bm25 import BM25Okapi
from tqdm.notebook import tqdm

In [2]:
import sys
sys.path.append('./SR-GNN-master/pytorch_code')

In [3]:
from model import *
from utils import *

In [4]:
class Opt():
    def __init__(self, ):
        self.dataset = 'diginetica'
        self.batchSize = 100
        self.hiddenSize = 100
        self.epoch = 30
        self.lr = 0.001
        self.lr_dc = 0.1
        self.lr_dc_step = 3
        self.l2 = 1e-5
        self.step = 1
        self.patience = 10
        self.nonhybrid = False
        self.validation = False
        self.valid_portion = 0.01
        
opt = Opt()

In [5]:
train_data = pickle.load(open('./SR-GNN-master/datasets/' + opt.dataset + '/train.txt', 'rb'))
if opt.validation:
    train_data, valid_data = split_validation(train_data, opt.valid_portion)
    test_data = valid_data
else:
    test_data = pickle.load(open('./SR-GNN-master/datasets/' + opt.dataset + '/test.txt', 'rb'))

if opt.dataset == 'diginetica':
    n_node = 43098
elif opt.dataset == 'yoochoose1_64' or opt.dataset == 'yoochoose1_4':
    n_node = 37484
else:
    n_node = 310
    
train_data = [items+[last_item] for items, last_item in zip(*train_data)]
train_seqs = pickle.load(open('./SR-GNN-master/datasets/' + opt.dataset + '/all_train_seq.txt', 'rb'))

In [6]:
from __future__ import absolute_import, division, print_function, unicode_literals
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        X = sp.sparse.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float64):
            # preserve float family dtype
            X = sp.sparse.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.sparse.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.sparse.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, attributes='_idf_diag', msg='idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X

In [46]:
class EASE():
    def __init__(self, data):
        self.data = data

        set_ = set()
        for i in data:
            for j in i:
                set_.add(j)
        self.item_num = len(set_)

        item_mapping = {}
        for i, item in enumerate(set_):
            item_mapping[item] = i
 
        self.item_mapping = item_mapping

    def fit(self, reg=1):
        Dsi = defaultdict(int)
        Dis = defaultdict(int)
        items_count = defaultdict(int)
        session_count = 0
        for items in tqdm(train_seqs, total=len(train_seqs)):
            session_len = len(items)
            for i in items:
                items_count[self.item_mapping[i]] += 1
                Dsi[session_count, self.item_mapping[i]] += 1 /session_len
                Dis[self.item_mapping[i], session_count] += 1
            session_count += 1
        session_num = session_count + 1
        item_num = len(items_count)

        for row, col in Dis.keys():
            Dis[row, col] /= items_count[row]
 
        rows = np.array(list(Dsi.keys()))[:, 0]
        cols = np.array(list(Dsi.keys()))[:, 1]
        vals = np.array(list(Dsi.values()))
        Dsi_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))

        rows = np.array(list(Dis.keys()))[:, 0]
        cols = np.array(list(Dis.keys()))[:, 1]
        vals = np.array(list(Dis.values()))
        Dis_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))
        
        
        D = (Dis_sparse @ Dsi_sparse).todense()
        
        diag = ([i for i in range(self.item_num)],
                [i for i in range(self.item_num)])
        D[diag] += reg
        
        P = np.linalg.inv(D)
        
        B = -P/P[diag]
        
        B[diag] = 0

        self.W = B

    def predict(self, sessions):
        scores = []
        for session in sessions:
            session_ = list(map(self.item_mapping.get, session))
            session_ = [item for item in session_ if item is not None]
            score = np.array(self.W[session_].sum(0))
            score[0][session_] += 100
            scores.append(score)
        return np.concatenate(scores)

In [47]:
class RP3beta():
    def __init__(self, data):
        self.data = data

        set_ = set()
        for i in data:
            for j in i:
                set_.add(j)
        item_num = len(set_)

        item_mapping = {}
        for i, item in enumerate(set_):
            item_mapping[item] = i
 
        self.item_mapping = item_mapping

    def fit_bm25(self, k1=2.0, b=0.75):
        Dsi = defaultdict(int)
        Dis = defaultdict(int)
        bm25 = BM25Transformer(k1, b)
        
        session_count = 0
        for items in tqdm(train_seqs, total=len(train_seqs)):
            session_len = len(items)
            for i in items:
                Dsi[session_count, self.item_mapping[i]] += 1 
                Dis[self.item_mapping[i], session_count] += 1
            session_count += 1
            
        rows = np.array(list(Dsi.keys()))[:, 0]
        cols = np.array(list(Dsi.keys()))[:, 1]
        vals = np.array(list(Dsi.values()))
        Dsi_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))

        rows = np.array(list(Dis.keys()))[:, 0]
        cols = np.array(list(Dis.keys()))[:, 1]
        vals = np.array(list(Dis.values()))
        Dis_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))
        
        bm25.fit(Dsi_sparse)
        Dsi_sparse = bm25.transform(Dsi_sparse)
        
        bm25.fit(Dis_sparse)
        Dis_sparse = bm25.transform(Dis_sparse)
        
        W = Dis_sparse @ Dsi_sparse
        self.W = W
    
    def fit(self, alpha=1, beta=0.5):
        Dsi = defaultdict(int)
        Dis = defaultdict(int)
        items_count = defaultdict(int)
        session_count = 0
        for items in tqdm(train_seqs, total=len(train_seqs)):
            session_len = len(items)
            for i in items:
                items_count[self.item_mapping[i]] += 1
                Dsi[session_count, self.item_mapping[i]] += 1 /session_len
                Dis[self.item_mapping[i], session_count] += 1
            session_count += 1
        session_num = session_count + 1
        item_num = len(items_count)

        for row, col in Dis.keys():
            Dis[row, col] /= items_count[row]**(1+beta)
        
        # for row, col in Dis.keys():
        #     Dis[row, col] *= np.log(session_num/len(train_seqs[col]))
        #     Dsi[col, row] *= np.log(item_num/items_count[row])
 
        rows = np.array(list(Dsi.keys()))[:, 0]
        cols = np.array(list(Dsi.keys()))[:, 1]
        vals = np.array(list(Dsi.values()))**alpha
        Dsi_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))

        rows = np.array(list(Dis.keys()))[:, 0]
        cols = np.array(list(Dis.keys()))[:, 1]
        vals = np.array(list(Dis.values()))**alpha
        Dis_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))
        
        W = Dis_sparse @ Dsi_sparse
        self.W = W

    def predict(self, sessions):
        scores = []
        for session in sessions:
            session_ = list(map(self.item_mapping.get, session))
            session_ = [item for item in session_ if item is not None]
            score = np.array(self.W[session_].sum(0))
            # score[0][session_] += 100
            scores.append(score)
        return np.concatenate(scores)

In [48]:
class RandomWalk():
    def __init__(self, data):
        self.data = data

        set_ = set()
        for i in data:
            for j in i:
                set_.add(j)
        self.item_num = len(set_)

        item_mapping = {}
        for i, item in enumerate(set_):
            item_mapping[item] = i
 
        self.item_mapping = item_mapping
    
    def fit(self):
        Dii = defaultdict(int)
        Dii2 = defaultdict(int)
        for items in tqdm(train_seqs, total=len(train_seqs)):
            session_len = len(items)
            for i in range(session_len-1):
                Dii[self.item_mapping[items[i]], self.item_mapping[items[i+1]]] += 1
                
            for i in range(session_len-2):
                Dii2[self.item_mapping[items[i]], self.item_mapping[items[i+2]]] += 1

        rows = np.array(list(Dii.keys()))[:, 0]
        cols = np.array(list(Dii.keys()))[:, 1]
        vals = np.array(list(Dii.values()))
        Dii_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))
        
        rows = np.array(list(Dii2.keys()))[:, 0]
        cols = np.array(list(Dii2.keys()))[:, 1]
        vals = np.array(list(Dii2.values()))
        Dii2_sparse = sp.sparse.coo_matrix((vals, (rows, cols)))

        W1 = Dii_sparse/(Dii_sparse.sum(1)+1e-3)#/item_probs[:, None]
        self.Ws = dict()
        self.Ws[1] = W1
        self.Ws[2] = Dii2_sparse/(Dii2_sparse.sum(1)+1e-3) #W1 @ W1
        #self.Ws[3] = self.Ws[2] @ W1
        #self.Ws[4] = self.Ws[3] @ W1

    def predict(self, sessions):
        scores = []
        for session in sessions:
            session_ = list(map(self.item_mapping.get, session))
            session_ = [item for item in session_ if item is not None]
            score = np.zeros((1, self.item_num))
            for i in range(1, min(len(session_)+1, len(self.Ws)+1)):
                score += np.array(self.Ws[i][session_[-i]]) / i**1.5
            # score[0][session_] += 10
            scores.append(score)
        return np.concatenate(scores)

In [49]:
class RandomWalk2():
    def __init__(self, data):
        self.data = data

        set_ = set()
        for i in data:
            for j in i:
                set_.add(j)
        self.item_num = len(set_)

        item_mapping = {}
        for i, item in enumerate(set_):
            item_mapping[item] = i
 
        self.item_mapping = item_mapping

    def fit(self, reg=250):
        pass

    def predict(self, sessions):
        scores = []
        for session in sessions:
            session_ = list(map(self.item_mapping.get, session))
            session_ = [item for item in session_ if item is not None]
            score = np.ones((1, self.item_num))
            for i in range(1, min(len(session_)+1, len(self.Diis)+1)):
                score += np.array(self.Diis[i][session_[-i]]) / i**1.5
            #score = np.array(self.Diis[1][session_].sum(0))
            score[0][session_] += 100
            scores.append(score)
        return np.concatenate(scores)

In [50]:
def test(model, test_data):
    hit, mrr = [], []
    batch_size = min(opt.batchSize, len(test_data[0]))
    for i in tqdm(range(0, len(test_data[0]), batch_size)):
        scores = model.predict(test_data[0][i:i+batch_size])
        sub_scores = (-scores).argsort(1)[:, :20]
        targets = test_data[1][i:i+batch_size]
        for score, target in zip(sub_scores, targets):
            target = model.item_mapping.get(target)
            hit.append(np.isin(target, score))
            if len(np.where(score == target)[0]) == 0:
                mrr.append(0)
            else:
                mrr.append(1 / (np.where(score == target)[0][0] + 1))
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    print({'hit': hit, 'mrr': mrr})
    return {'hit': hit, 'mrr': mrr}

In [51]:
model = EASE(train_data)
model.fit(reg=250)

  0%|          | 0/186670 [00:00<?, ?it/s]

In [52]:
_ = test(model, test_data)

  0%|          | 0/609 [00:00<?, ?it/s]

{'hit': 40.04896644648198, 'mrr': 11.553126959027118}


In [50]:
import optuna

def objective(trial):
    k1 = trial.suggest_float('k1', 0, 20)
    b = trial.suggest_float('b', 0, 1)
    model = RP3beta(train_seqs)
    model.fit_bm25(k1, b)
    return test(model, test_data)['hit']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2023-03-07 15:55:08,518][0m A new study created in memory with name: no-name-afd3e39d-96f3-4399-a7f8-aaa64a875560[0m


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:56:23,541][0m Trial 0 finished with value: 65.1382875952628 and parameters: {'k1': 11.697319372658288, 'b': 0.18782695130492744}. Best is trial 0 with value: 65.1382875952628.[0m


{'hit': 65.1382875952628, 'mrr': 25.341513793774585}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:57:38,435][0m Trial 1 finished with value: 63.87169487280404 and parameters: {'k1': 19.91154146439115, 'b': 0.02258742096526778}. Best is trial 0 with value: 65.1382875952628.[0m


{'hit': 63.87169487280404, 'mrr': 24.96853581020437}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:58:53,182][0m Trial 2 finished with value: 65.77337292926401 and parameters: {'k1': 4.910255939165069, 'b': 0.6759753713873353}. Best is trial 2 with value: 65.77337292926401.[0m


{'hit': 65.77337292926401, 'mrr': 25.587786850537853}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 16:00:08,059][0m Trial 3 finished with value: 65.46209166696482 and parameters: {'k1': 10.851606664770614, 'b': 0.3376124659239059}. Best is trial 2 with value: 65.77337292926401.[0m


{'hit': 65.46209166696482, 'mrr': 25.457027322611886}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 16:01:23,073][0m Trial 4 finished with value: 65.74832731045834 and parameters: {'k1': 7.3752752853317745, 'b': 0.6267349858791935}. Best is trial 2 with value: 65.77337292926401.[0m


{'hit': 65.74832731045834, 'mrr': 25.57404891191451}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 16:02:38,117][0m Trial 5 finished with value: 65.78231779312318 and parameters: {'k1': 4.547221645664187, 'b': 0.7080074320691024}. Best is trial 5 with value: 65.78231779312318.[0m


{'hit': 65.78231779312318, 'mrr': 25.596532310955805}


  0%|          | 0/1922630 [00:00<?, ?it/s]

[33m[W 2023-03-07 16:02:41,042][0m Trial 6 failed with parameters: {'k1': 18.535809919995966, 'b': 0.5293155856348277} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/daniilstrunov/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3528/3991967122.py", line 7, in objective
    model.fit_bm25(k1, b)
  File "/tmp/ipykernel_3528/2383993741.py", line 108, in fit_bm25
    Dis[self.item_mapping[i], session_count] += 1
KeyboardInterrupt
[33m[W 2023-03-07 16:02:41,043][0m Trial 6 failed with value None.[0m


KeyboardInterrupt: 

In [10]:
model = RP3beta(train_seqs)
piu, pui = model.fit(**{'alpha': 1, 'beta': 0.})

  0%|          | 0/1922630 [00:00<?, ?it/s]

In [32]:
bm25.transform(piu)[:100, :100].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
piu.

<30444x1922630 sparse matrix of type '<class 'numpy.float64'>'
	with 6307201 stored elements in COOrdinate format>