In [45]:
import scipy as sp
import numpy as np
import pickle
from collections import defaultdict
from rank_bm25 import BM25Okapi
from tqdm.notebook import tqdm

In [2]:
import sys
# Put the SR-GNN reference code on the import path; presumably this is where the
# `model` and `utils` modules imported in the next cell live (confirm against the repo).
# The path is relative, so the notebook must be started from the directory that
# contains `SR-GNN-master`.
sys.path.append('./SR-GNN-master/pytorch_code')

In [3]:
from model import *
from utils import *

In [4]:
class Opt():
    """Plain attribute container holding the run settings of this notebook.

    Only `dataset`, `batchSize`, `validation` and `valid_portion` are read by
    the cells visible in this notebook; the remaining fields presumably mirror
    the SR-GNN training options (confirm against the SR-GNN code).
    """

    def __init__(self):
        settings = {
            'dataset': 'yoochoose1_4',   # sub-directory of the datasets folder to load
            'batchSize': 100,            # number of sessions scored per evaluation batch
            'hiddenSize': 100,
            'epoch': 30,
            'lr': 0.001,
            'lr_dc': 0.1,
            'lr_dc_step': 3,
            'l2': 1e-5,
            'step': 1,
            'patience': 10,
            'nonhybrid': False,
            'validation': False,         # True: evaluate on a split of the training data
            'valid_portion': 0.01,       # forwarded to split_validation when validation is on
        }
        # Assign in declaration order so the instance __dict__ keeps the same layout.
        for name, value in settings.items():
            setattr(self, name, value)


opt = Opt()

In [5]:
# Load the pickled splits of the dataset selected in `opt`.
# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# this at dataset files that come from a trusted source.
dataset_dir = './SR-GNN-master/datasets/' + opt.dataset


def _load_pickle(path):
    """Unpickle the file at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_data = _load_pickle(dataset_dir + '/train.txt')
if opt.validation:
    # Hold out a portion of the training data and evaluate on it instead of test.txt.
    train_data, valid_data = split_validation(train_data, opt.valid_portion)
    test_data = valid_data
else:
    test_data = _load_pickle(dataset_dir + '/test.txt')

# Node counts are hard-coded per dataset; any other dataset name falls back to 310.
if opt.dataset == 'diginetica':
    n_node = 43098
elif opt.dataset in ('yoochoose1_64', 'yoochoose1_4'):
    n_node = 37484
else:
    n_node = 310
train_seqs = _load_pickle(dataset_dir + '/all_train_seq.txt')

In [6]:
class EASE():
    """Dense item-to-item scorer obtained by inverting a regularised item-item matrix.

    `data` is indexed as `data[0]` (a list of item lists, one per session) and
    `data[1]` (one extra item per session, appended to the session in `fit`).
    """

    def __init__(self, data):
        """Store `data` and assign a contiguous index to every distinct item."""
        self.data = data

        prefixes, last_items = data[0], data[1]
        unique_items = set()
        for prefix in prefixes:
            for item in prefix:
                unique_items.add(item)
        for last_item in last_items:
            unique_items.add(last_item)

        self.item_num = len(unique_items)
        # original item id -> row/column index of the item-item matrix
        self.item_mapping = {item: index for index, item in enumerate(unique_items)}

    def fit(self, alpha=1, beta=0, reg=1):
        """Build the item-to-item weight matrix and store it in `self.W`.

        alpha: exponent applied element-wise to the entries of both sparse matrices.
        beta:  each item -> session entry is divided by the item's occurrence
               count raised to `1 + beta`.
        reg:   constant added to the diagonal before the matrix is inverted.
        """
        session_to_item = defaultdict(int)
        item_to_session = defaultdict(int)
        occurrences = defaultdict(int)

        pairs = zip(self.data[0], self.data[1])
        for sess_idx, (prefix, last_item) in enumerate(tqdm(pairs, total=len(self.data[0]))):
            full_session = prefix + [last_item]
            share = 1 / len(full_session)
            for item in full_session:
                item_idx = self.item_mapping[item]
                occurrences[item_idx] += 1
                session_to_item[sess_idx, item_idx] += share
                item_to_session[item_idx, sess_idx] += 1

        # Normalise every item row by how often the item occurs overall.
        for item_idx, sess_idx in item_to_session.keys():
            item_to_session[item_idx, sess_idx] /= occurrences[item_idx] ** (1 + beta)

        def as_coo(weights):
            # {(row, col): value} -> COO matrix with values raised to alpha
            index = np.array(list(weights.keys()))
            values = np.array(list(weights.values())) ** alpha
            return sp.sparse.coo_matrix((values, (index[:, 0], index[:, 1])))

        item_item = (as_coo(item_to_session) @ as_coo(session_to_item)).todense()

        positions = list(range(self.item_num))
        diag = (positions, positions)
        item_item[diag] += reg

        inverse = np.linalg.inv(item_item)

        # Divide column j by the j-th diagonal entry of the inverse, negate,
        # then remove self-similarity.
        weights = -inverse / inverse[diag]
        weights[diag] = 0

        self.W = weights

    def predict(self, sessions):
        """Return one row of item scores per session.

        The score row is the sum of the `W` rows of the session's known items;
        items unknown to `item_mapping` are ignored, and every item that is
        already in the session gets +1 added to its score.
        """
        rows = []
        for session in sessions:
            known = [self.item_mapping.get(item) for item in session]
            known = [index for index in known if index is not None]
            row = np.array(self.W[known].sum(0))
            row[0][known] += 1
            rows.append(row)
        return np.concatenate(rows)

In [67]:
class RP3beta():
    """Sparse item-to-item scorer built from session/item co-occurrence.

    `data` is a list of sessions, each a list of item ids. `fit` multiplies an
    item -> session matrix with a session -> item matrix to obtain the
    item-to-item matrix `W`; `predict` sums the `W` rows of a session's items.
    """

    def __init__(self, data):
        """Store `data` and assign a contiguous index to every distinct item."""
        self.data = data

        unique_items = set()
        for session in data:
            for item in session:
                unique_items.add(item)

        # Kept as an attribute for parity with EASE.
        self.item_num = len(unique_items)
        # original item id -> row/column index of W
        self.item_mapping = {item: index for index, item in enumerate(unique_items)}

    def fit_BM25(self):
        """Unfinished BM25 weighting experiment; it does not change the model.

        The scores are written to a local dict that is dropped when the method
        returns and `self.W` is never assigned here, so use `fit` for training.
        """
        Dsi = defaultdict(int)
        bm25 = BM25Okapi(self.data)
        for sessid, items in enumerate(self.data):
            item_scores = bm25.get_scores(items)
            # NOTE(review): rank_bm25's get_scores appears to return one score per
            # corpus document rather than per query token, so indexing it by the
            # position inside `items` is probably not what was intended - confirm.
            for i in range(len(items)):
                Dsi[sessid, items[i]] = item_scores[i]

    @staticmethod
    def _to_coo(weights, alpha, shape):
        """Turn a {(row, col): value} dict into a COO matrix holding value ** alpha."""
        # reshape keeps the two-column layout even when `weights` is empty
        index = np.array(list(weights.keys()), dtype=np.int64).reshape(-1, 2)
        values = np.array(list(weights.values()), dtype=float) ** alpha
        return sp.sparse.coo_matrix((values, (index[:, 0], index[:, 1])), shape=shape)

    def fit(self, alpha=1, beta=0.5):
        """Build the sparse item-to-item matrix from `self.data` and store it in `self.W`.

        alpha: exponent applied element-wise to the entries of both matrices.
        beta:  each item -> session entry is divided by the item's occurrence
               count raised to `1 + beta`.

        The sessions are taken from `self.data` (the data the model was built
        with) rather than from a notebook-level global.
        """
        sessions = self.data
        Dsi = defaultdict(int)      # (session, item) -> weight
        Dis = defaultdict(int)      # (item, session) -> weight
        items_count = defaultdict(int)
        for sess_idx, items in enumerate(tqdm(sessions, total=len(sessions))):
            session_len = len(items)
            for item in items:
                item_idx = self.item_mapping[item]
                items_count[item_idx] += 1
                Dsi[sess_idx, item_idx] += 1 / session_len
                Dis[item_idx, sess_idx] += 1

        n_sessions = len(sessions)
        # NOTE(review): this is one more than the number of sessions. It looks
        # like an off-by-one, but it is kept so previously reported numbers stay
        # reproducible - confirm whether the +1 is intended.
        session_num = n_sessions + 1
        item_num = len(items_count)

        # NOTE(review): both log factors turn negative when the ratio drops below 1
        # (an item occurring more than `item_num` times, or a session longer than
        # `session_num`). A negative entry raised to a non-integer `alpha` becomes
        # NaN in `_to_coo` (NumPy reports "invalid value encountered in power").
        for item_idx, sess_idx in Dis.keys():
            Dis[item_idx, sess_idx] /= items_count[item_idx] ** (1 + beta)
            Dis[item_idx, sess_idx] *= np.log(session_num / len(sessions[sess_idx]))
            Dsi[sess_idx, item_idx] *= np.log(item_num / items_count[item_idx])

        # Explicit shapes keep both factors aligned even if trailing sessions are empty.
        n_items = len(self.item_mapping)
        Dsi_sparse = self._to_coo(Dsi, alpha, (n_sessions, n_items))
        Dis_sparse = self._to_coo(Dis, alpha, (n_items, n_sessions))

        self.W = Dis_sparse @ Dsi_sparse

    def predict(self, sessions):
        """Return an array with one row of item scores per session.

        A row is the sum of the `W` rows of the session's known items. Items
        missing from `item_mapping` are ignored; a session without any known
        item gets an all-zero row, and an empty `sessions` gives a (0, n) array.
        """
        n_items = self.W.shape[1]
        scores = []
        for session in sessions:
            known = [self.item_mapping[item] for item in session
                     if item in self.item_mapping]
            if known:
                row = np.asarray(self.W[known].sum(0)).reshape(1, -1)
            else:
                row = np.zeros((1, n_items))
            scores.append(row)
        if not scores:
            return np.zeros((0, n_items))
        return np.concatenate(scores)

In [68]:
def test(model, test_data):
    hit, mrr = [], []
    batch_size = min(opt.batchSize, len(test_data[0]))
    for i in tqdm(range(0, len(test_data[0]), batch_size)):
        scores = model.predict(test_data[0][i:i+batch_size])
        sub_scores = (-scores).argsort(1)[:, :20]
        targets = test_data[1][i:i+batch_size]
        for score, target in zip(sub_scores, targets):
            target = model.item_mapping.get(target)
            hit.append(np.isin(target, score))
            if len(np.where(score == target)[0]) == 0:
                mrr.append(0)
            else:
                mrr.append(1 / (np.where(score == target)[0][0] + 1))
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    print({'hit': hit, 'mrr': mrr})
    return {'hit': hit, 'mrr': mrr}

In [65]:
# Baseline: fit on the full training sequences with alpha=1, beta=0 and evaluate.
model = RP3beta(train_seqs)
model.fit(alpha=1, beta=0.)
# Bind the returned metrics to `_` so the cell shows only what test() prints.
_ = test(model, test_data)

  0%|          | 0/1922630 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import optuna


def objective(trial):
    """Fit RP3beta with sampled `alpha`/`beta` and return the hit rate to maximise.

    NOTE(review): the score comes from `test_data`, i.e. the hyper-parameters are
    tuned on the same data that is used for reporting - consider a validation split.
    """
    params = {
        'alpha': trial.suggest_float('alpha', 0, 2),
        'beta': trial.suggest_float('beta', 0, 2),
    }
    candidate = RP3beta(train_seqs)
    candidate.fit(**params)
    metrics = test(candidate, test_data)
    return metrics['hit']


# In-memory study; each of the 100 trials refits the model from scratch.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2023-03-07 15:22:19,442][0m A new study created in memory with name: no-name-116177e7-7aeb-4374-8099-f899f9adc1ef[0m


  0%|          | 0/1922630 [00:00<?, ?it/s]

  vals = np.array(list(Dsi.values()))**alpha


  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:23:56,489][0m Trial 0 finished with value: 66.04529679058285 and parameters: {'alpha': 0.644801813476394, 'beta': 0.4325914507663402}. Best is trial 0 with value: 66.04529679058285.[0m


{'hit': 66.04529679058285, 'mrr': 25.365989669304206}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:25:33,723][0m Trial 1 finished with value: 65.58374181544957 and parameters: {'alpha': 0.5242765184665212, 'beta': 1.6662690788679464}. Best is trial 0 with value: 66.04529679058285.[0m


{'hit': 65.58374181544957, 'mrr': 24.925244021972283}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:27:11,439][0m Trial 2 finished with value: 64.67136570181403 and parameters: {'alpha': 0.7320948256837425, 'beta': 1.5404178966698165}. Best is trial 0 with value: 66.04529679058285.[0m


{'hit': 64.67136570181403, 'mrr': 24.426635841787594}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:28:48,625][0m Trial 3 finished with value: 59.87155175498229 and parameters: {'alpha': 1.550220991545721, 'beta': 1.7405519321848368}. Best is trial 0 with value: 66.04529679058285.[0m


{'hit': 59.87155175498229, 'mrr': 23.086496415214096}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

[32m[I 2023-03-07 15:30:27,822][0m Trial 4 finished with value: 64.44416615979105 and parameters: {'alpha': 0.32845384566442815, 'beta': 0.25790197341712906}. Best is trial 0 with value: 66.04529679058285.[0m


{'hit': 64.44416615979105, 'mrr': 24.952042415041763}


  0%|          | 0/1922630 [00:00<?, ?it/s]

  0%|          | 0/559 [00:00<?, ?it/s]

In [52]:
from rank_bm25 import BM25Okapi

# Three-sentence toy corpus used to try out the BM25 scorer.
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?",
]

# Split on single spaces only, so punctuation stays attached to the tokens.
tokenized_corpus = [sentence.split(" ") for sentence in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [58]:
# Score the query against the toy corpus indexed in the previous cell.
query = "Hello windy London"
# Same single-space tokenisation as used for the corpus.
tokenized_query = query.split(" ")

# The displayed array has three entries for the three corpus sentences,
# i.e. one score per document rather than one per query token.
doc_scores = bm25.get_scores(tokenized_query)
doc_scores

array([0.56134684, 0.93729472, 0.        ])

In [49]:
# Display the current query tokens. NOTE(review): the stored output of this cell
# shows two tokens although the query cell above produces three, so it looks
# stale (lower execution count) - re-run after editing `query`.
tokenized_query

['windy', 'London']