In [1]:
from itertools import product

import numpy as np
import pandas as pd
from DataProcessor import DataProcessor
from evaluation import evaluate
from lib import index

def to_latex(metrics):
    """Render a table of metric values as a single-row LaTeX tabular.

    Every cell is formatted with three decimals, the frame is flattened with
    ``unstack('topk')`` and the result is emitted as one row labelled
    ``values``; the index is not written to the output.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Numeric frame whose index has a level named ``topk``.  The
        ``to_frame`` step requires ``unstack`` to return a Series, so ``topk``
        is expected to be the only index level.

    Returns
    -------
    str
        LaTeX source produced by ``DataFrame.to_latex(index=False)``.
    """
    def three_decimals(value):
        return f"{value:.3f}"

    # Column-wise ``Series.map`` performs the same element-wise formatting as
    # ``DataFrame.applymap`` without relying on that method, which has been
    # deprecated since pandas 2.1.
    formatted = metrics.apply(lambda column: column.map(three_decimals))
    return (
        formatted
        .unstack('topk')
        .to_frame('values')
        .T
        .to_latex(index=False)
    )

In [2]:
# Path of the input CSV file handed to the project's DataProcessor.
data_file_name = "./data/lsapp.csv"

process_data = DataProcessor(
    data_file_name,
    column_names='userid,appid,timestamp',
    session_break_delta='15min',
)
process_data.prepare_data(
    usecols=['userid', 'appid', 'timestamp'],
    test_interval='14d',
    valid_interval='7d',
    min_sess_length=2,
    window="3s",
)

# Dataset dimensions and column names reused by the model-building cells.
n_users = process_data.n_users
n_items = process_data.n_items
user_col = process_data.userid
item_col = process_data.itemid

# Number of distinct global session ids, summed over the three splits.
n_sess_overall = sum(
    split.sessid_global.nunique()
    for split in (process_data.valid, process_data.train, process_data.test)
)

# Percentage of sessions in train / valid / test; as the last expression of
# the cell this tuple is what gets displayed.
tuple(
    split.sessid_global.nunique() / n_sess_overall * 100
    for split in (process_data.train, process_data.valid, process_data.test)
)

(86.66059480856131, 7.4543734893333795, 5.8850317021053)

In [3]:
class ZeroScore:
    """Baseline scorer that assigns the same (zero) score to every candidate."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return an all-zero array with the shape and dtype of ``item_pool``.

        ``uid``, ``sid`` and ``sess_items`` are accepted so the signature
        matches the other scorers in this notebook; they are not used.
        """
        constant_scores = np.zeros_like(item_pool)
        return constant_scores

class RandomGuess:
    """Baseline scorer that draws an independent uniform score per candidate."""

    def __init__(self, seed=None):
        # A private generator makes the scores reproducible for a fixed seed.
        self.random_state = np.random.RandomState(seed)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return ``len(item_pool)`` samples drawn uniformly from [0, 1).

        Only the size of ``item_pool`` matters; ``uid``, ``sid`` and
        ``sess_items`` are ignored.  Each call advances the generator.
        """
        n_candidates = len(item_pool)
        return self.random_state.random_sample(size=(n_candidates,))

class MostRecentlyUsed:
    """Scores candidates by how recently they occurred in the current session."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """
        Score every candidate with the 1-based position of its last
        occurrence in `sess_items`, so items used later in the session get
        higher scores.

        `item_pool` may hold items that never occur in `sess_items`; those
        keep the initial score of zero.  Session items for which `index`
        returns None are skipped.
        """
        scores = np.zeros(len(item_pool))
        for recency_rank, item in enumerate(sess_items, start=1):
            pool_position = index(item_pool, item)
            if pool_position is None:
                continue
            # a later occurrence overwrites the rank written by an earlier one
            scores[pool_position] = recency_rank
        return scores

class MostFrequentlyUsed:
    """Scores candidates by how often the user interacted with them in the fitted data."""

    def __init__(self, userid='userid', itemid='appid'):
        # names of the user and item columns in the training frame
        self.userid = userid
        self.itemid = itemid
        # Series of counts indexed by (user, item); populated by `fit`
        self.frequencies = None

    def fit(self, train):
        """Count the occurrences of every (user, item) pair in `train`."""
        items_by_user = train.groupby(self.userid)[self.itemid]
        pair_counts = items_by_user.value_counts(sort=False)
        self.frequencies = pair_counts.sort_index()

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return the fitted count of (`uid`, item) for each item in `item_pool`.

        Pairs that were never observed during `fit` receive -1.  `sid` and
        `sess_items` are ignored.
        """
        wanted_pairs = pd.MultiIndex.from_product([[uid], item_pool])
        aligned_counts = self.frequencies.reindex(wanted_pairs, fill_value=-1)
        return aligned_counts.values

class MarkovianI2I:
    """First-order item-to-item transition counts shared by all users."""

    def __init__(self, group_key='sessid_global', itemid='appid'):
        # column that delimits the sequences inside which transitions are counted
        self.group_key = group_key
        self.itemid = itemid
        # Series of counts indexed by (previous item, next item); set by `fit`
        self.transitions = None

    def fit(self, train):
        """Count how often each item follows each other item in `train`.

        The predecessor of an item is the preceding row of the same
        `group_key` group; the first row of a group gets the placeholder
        predecessor -1.
        """
        next_items = train[self.itemid]
        previous_items = (
            train
            .groupby(self.group_key)[self.itemid]
            .shift(fill_value=-1)
        )
        self.transitions = next_items.groupby(previous_items).value_counts(sort=False)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score each candidate by the fitted count of `sess_items[-1]` -> candidate.

        Transitions that were never observed receive -1.  `uid` and `sid`
        are ignored.
        """
        last_item = sess_items[-1]
        wanted_pairs = pd.MultiIndex.from_product([[last_item], item_pool])
        aligned_counts = self.transitions.reindex(wanted_pairs, fill_value=-1)
        return aligned_counts.values

class OnDeviceMarkovianI2I:
    """First-order item-to-item transition counts kept separately per user."""

    def __init__(self, group_key='sessid_global', userid='userid', itemid='appid'):
        # column that delimits the sequences inside which transitions are counted
        self.group_key = group_key
        self.userid = userid
        self.itemid = itemid
        # Series of counts indexed by (user, previous item, next item); set by `fit`
        self.transitions = None

    def fit(self, train):
        """Count, per user, how often each item follows each other item.

        The predecessor of an item is the preceding row of the same
        `group_key` group; the first row of a group gets the placeholder
        predecessor -1.
        """
        source_items = train.groupby(self.group_key)[self.itemid].shift(fill_value=-1)
        dest_items = train[self.itemid]
        self.transitions = dest_items.groupby([train[self.userid], source_items]).value_counts(sort=False)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score each candidate by the user's count of `sess_items[-1]` -> candidate.

        Transitions the user never made receive -1.  A `uid` that does not
        occur in the fitted data gets -1 for every candidate instead of
        raising `KeyError`, which matches how `MostFrequentlyUsed` treats
        unknown users.  `sid` is ignored.
        """
        # Address the full (user, previous item, next item) key in one reindex:
        # a missing user is handled by `fill_value` exactly like a missing
        # transition, and no intermediate per-user slice is needed.
        idx = pd.MultiIndex.from_product([[uid], [sess_items[-1]], item_pool])
        scores = self.transitions.reindex(idx, fill_value=-1).values
        return scores

In [4]:
# Stack the train and validation splits into one frame with a fresh integer
# index; the baseline models in the following cell are fitted on it.
train_valid = pd.concat([process_data.train, process_data.valid], ignore_index=True)
# Seen-interactions structure for the merged data, as produced by the
# project's DataProcessor; it is passed to `evaluate` for the test runs.
train_valid_seen_interactions = process_data.get_seen_interactions(train_valid)

In [5]:
# Baselines without a fit step: constant zero scores and seeded random scores.
zrs = ZeroScore()
rnd = RandomGuess(seed=42)

# Most-frequently-used baseline, fitted on train + valid.
mfu = MostFrequentlyUsed()
mfu.fit(train_valid)

# Most-recently-used baseline; it only looks at the current session, so no fit.
mru = MostRecentlyUsed()

# Global Markov model, previous item taken within each user's rows
# (grouping by 'userid').
i2i_ub = MarkovianI2I(group_key='userid')
i2i_ub.fit(train_valid)

# Global Markov model, previous item taken within each session
# (grouping by 'sessid_global').
i2i_sb = MarkovianI2I(group_key='sessid_global')
i2i_sb.fit(train_valid)

# Per-user Markov model, previous item taken within each user's rows.
i2i_od_ub = OnDeviceMarkovianI2I(group_key='userid')
i2i_od_ub.fit(train_valid)

# Per-user Markov model, previous item taken within each session.
i2i_od_sb = OnDeviceMarkovianI2I(group_key='sessid_global')
i2i_od_sb.fit(train_valid)

In [6]:
# Positional arguments shared by every `evaluate` call on the test split:
# the test sessions and the interactions seen in train + valid.
data_args = (process_data.test_sessions, train_valid_seen_interactions)

In [7]:
# Evaluate the per-user Markov models on the test sessions; `evaluate` returns
# a (metrics, stats) pair for each scorer.
i2i_od_ub_metrics, i2i_od_ub_stats = evaluate(i2i_od_ub.generate_scores, *data_args)
i2i_od_sb_metrics, i2i_od_sb_stats = evaluate(i2i_od_sb.generate_scores, *data_args)

In [8]:
# Evaluate the global Markov models (grouped by user and by session) on the
# test sessions.
i2i_ub_metrics, i2i_ub_stats = evaluate(i2i_ub.generate_scores, *data_args)
i2i_sb_metrics, i2i_sb_stats = evaluate(i2i_sb.generate_scores, *data_args)

In [9]:
# Evaluate the remaining baselines on the test sessions: most frequently used,
# most recently used, random scores and constant zero scores.
mfu_metrics, mfu_stats = evaluate(mfu.generate_scores, *data_args)
mru_metrics, mru_stats = evaluate(mru.generate_scores, *data_args)
rnd_metrics, rnd_stats = evaluate(rnd.generate_scores, *data_args)
zrs_metrics, zrs_stats = evaluate(zrs.generate_scores, *data_args)

# DataFrame with results:

In [10]:
# Cut-offs at which the ranking metrics are reported.
ks = [1, 3, 5]
# Column labels grouped by metric: every HR@k first, then MRR@k, then NDCG@k.
col_names = [
    *(f'HR@{k}' for k in ks),
    *(f'MRR@{k}' for k in ks),
    *(f'NDCG@{k}' for k in ks),
]
# Empty results table; the cells below add one row per model.
res_df = pd.DataFrame(columns=col_names)

# I2I models

In [11]:
# Average every metric over the `topk` index level (one row per cut-off).
res = i2i_od_sb_metrics.groupby(level='topk').mean()
# Disabled: this variant is only displayed, not added to the summary table.
#res_df.loc["i2i_od_sb", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.59924,0.59924,0.59924
3,0.806453,0.694406,0.723346
5,0.881273,0.710894,0.753626


In [12]:
# Average every metric over the `topk` index level (one row per cut-off).
res = i2i_od_ub_metrics.groupby(level='topk').mean()
# Flatten the table column by column (order='F') into 9 values, i.e. all
# cut-offs of the first metric, then of the second, ... - the same
# metric-major layout as `col_names` - and store them as row "SR(od)".
res_df.loc["SR(od)", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.612252,0.612252,0.612252
3,0.823905,0.707037,0.737117
5,0.892559,0.722393,0.765099


In [13]:
# Average every metric over the `topk` index level (one row per cut-off).
res = i2i_ub_metrics.groupby(level='topk').mean()
# Flatten column by column (order='F') into 9 values laid out like
# `col_names` and store them as row "SR" of the summary table.
res_df.loc["SR", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.610203,0.610203,0.610203
3,0.801301,0.695238,0.722513
5,0.862854,0.709368,0.74792


In [14]:
# Average every metric over the `topk` index level (one row per cut-off).
res = i2i_sb_metrics.groupby(level='topk').mean()
# Disabled: this variant is only displayed, not added to the summary table.
#res_df.loc["i2i_sb", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.610203,0.610203,0.610203
3,0.799238,0.694826,0.721698
5,0.859416,0.708658,0.746552


# MFU, MRU, RND

In [15]:
# Average every metric over the `topk` index level (one row per cut-off).
res = mfu_metrics.groupby(level='topk').mean()
# Flatten column by column (order='F') into 9 values laid out like
# `col_names` and store them as row "MFU" of the summary table.
res_df.loc["MFU", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.403459,0.403459,0.403459
3,0.669404,0.515702,0.554967
5,0.777473,0.540918,0.599931


In [16]:
# Average every metric over the `topk` index level (one row per cut-off).
res = mru_metrics.groupby(level='topk').mean()
# Flatten column by column (order='F') into 9 values laid out like
# `col_names` and store them as row "MRU" of the summary table.
res_df.loc["MRU", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.618145,0.618145,0.618145
3,0.823426,0.716068,0.743957
5,0.848681,0.721744,0.754274


In [17]:
# Average every metric over the `topk` index level (one row per cut-off).
res = rnd_metrics.groupby(level='topk').mean()
# Flatten column by column (order='F') into 9 values laid out like
# `col_names` and store them as row "Random" of the summary table.
res_df.loc["Random", :] = res.values.reshape(1, 9, order='F').squeeze()
# Last expression: show the per-cut-off averages as the cell output.
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.08884,0.08884,0.08884
3,0.239482,0.1528,0.17496
5,0.365042,0.181418,0.226606


In [18]:
# Per-cut-off averages for the constant-zero scorer; displayed only, no row is
# added to the summary table.
zrs_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,0.0,0.0
3,0.035714,0.017857,0.022533
5,0.066964,0.024107,0.034622


## PureSVD:

In [19]:
from seqmf_pp import get_conf_mtx_lap_smooth, dense_Cui
from itertools import product

def build_svd_model(
    data,
    n_users,
    n_items,
    user_col,
    item_col,
    rank,
    lap_smooth,
    gamma,
):
    """Build PureSVD item factors from the dense confidence matrix.

    `data`, `n_users`, `n_items`, `user_col`, `item_col`, `lap_smooth` and
    `gamma` are forwarded unchanged, in that order, to
    `get_conf_mtx_lap_smooth`; its two results are combined by `dense_Cui`
    (both are project helpers imported from `seqmf_pp`).  A full SVD of the
    resulting matrix is computed and the first `rank` right singular vectors
    are kept.

    Returns
    -------
    tuple
        ``((None, None), (item_factors, None))`` where `item_factors` has one
        column per retained singular vector.  The placeholders keep the
        (local_factors, global_factors) layout expected by
        `get_scores_generator`.
    """
    confidence_args = (data, n_users, n_items, user_col, item_col, lap_smooth, gamma)
    confidence, user_part = get_conf_mtx_lap_smooth(*confidence_args)
    dense_confidence = dense_Cui(confidence, user_part)

    # only the right singular vectors (third SVD output) are needed
    right_vectors = np.linalg.svd(dense_confidence)[2]
    item_factors = right_vectors[:rank].T

    local_factors = (None, None)
    global_factors = (item_factors, None)
    return local_factors, global_factors

def get_scores_generator(local_factors, global_factors):
    """Create a scoring function backed by the item factors in `global_factors`.

    `global_factors` must be a pair whose first element is the item-factor
    matrix (one row per item); the second element and `local_factors` are
    not used.

    The returned ``generate_scores(uid, sid, sess_items, item_pool)`` scores
    each item of `item_pool` by the dot product of its factor row with the
    sum of the factor rows of `sess_items`; `uid` and `sid` are ignored.
    """
    item_factors, _unused = global_factors

    def generate_scores(uid, sid, sess_items, item_pool):
        candidate_vectors = item_factors[item_pool]
        session_profile = item_factors[sess_items].sum(axis=0)
        return candidate_vectors @ session_profile

    return generate_scores

In [20]:
# Hyper-parameter grid explored for PureSVD.
n_factors_range = [24, 32, 64, 80]
gamma_range = [0.01, 0.1, 0.5, 1]
lap_smooth_range = [0.001, 0.01, 0.1, 0.5, 1.0]
# Print every new best configuration as soon as it is found.
show_result = True

best_params = None
best_hr = 0.0
for gamma, lap_smooth in product(gamma_range, lap_smooth_range):
    # One decomposition at the largest rank serves all ranks of the grid:
    # a smaller rank just takes the leading columns of the item factors.
    local_factors, global_factors_ = build_svd_model(
        process_data.train,
        n_users,
        n_items,
        user_col,
        item_col,
        max(n_factors_range),
        lap_smooth,
        gamma,
    )
    for rank in n_factors_range:
        global_factors = (global_factors_[0][:, :rank], None)
        up_generate_scores = get_scores_generator(local_factors, global_factors)
        metrics_df, user_stats = evaluate(
            up_generate_scores,
            process_data.valid_sessions,
            process_data.seen_interactions,
        )
        # per-cut-off averages of the validation metrics
        valid_results = (
            metrics_df
            .reset_index()
            .groupby(["topk"])
            .mean()[["hr", "mrr", "ndcg"]]
        )
        hr = valid_results["hr"][5]

        # model selection is driven by HR@5 on the validation sessions
        if not hr > best_hr:
            continue
        best_hr = hr
        best_params = (gamma, lap_smooth, rank)
        if show_result:
            print(
                f"PureSVD:"
                f"\nBest HR@5: {best_hr}; MRR@5: {valid_results['mrr'][5]}"
                f"\nThe best performance parameters:"
                f'\n{best_params}'
            )

PureSVD:
Best HR@5: 0.7618435890788575; MRR@5: 0.5639007672285227
The best performance parameters:
(0.01, 0.001, 24)
PureSVD:
Best HR@5: 0.7740413985365939; MRR@5: 0.5758771372372246
The best performance parameters:
(0.01, 0.001, 32)
PureSVD:
Best HR@5: 0.8779239170825854; MRR@5: 0.6311665537262153
The best performance parameters:
(0.01, 0.001, 64)
PureSVD:
Best HR@5: 0.8817417403375082; MRR@5: 0.6661177056455219
The best performance parameters:
(0.01, 1.0, 80)
PureSVD:
Best HR@5: 0.8873897739700013; MRR@5: 0.6642506032601999
The best performance parameters:
(0.1, 0.001, 80)


In [21]:
# Best configuration found by the validation grid search.
gamma, lap_smooth, rank = best_params

# Refit with the selected hyper-parameters on train + valid.
local_factors, global_factors = build_svd_model(
    data=train_valid,
    n_users=n_users,
    n_items=n_items,
    user_col=user_col,
    item_col=item_col,
    rank=rank,
    lap_smooth=lap_smooth,
    gamma=gamma,
)
up_generate_scores = get_scores_generator(local_factors, global_factors)

# Evaluate on the held-out test sessions.
metrics_df, user_stats = evaluate(
    up_generate_scores,
    process_data.test_sessions,
    train_valid_seen_interactions,
)

# Per-cut-off averages of the test metrics.
test_results = metrics_df.reset_index().groupby(["topk"]).mean()[["hr", "mrr", "ndcg"]]

res = test_results
# Disabled: PureSVD is only displayed, not added to the summary table.
#res_df.loc["PureSVD", :] = res.values.reshape(1, 9, order='F').squeeze()
res

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.530553,0.530553,0.530553
3,0.809642,0.657754,0.696942
5,0.872143,0.672347,0.722955


# Show resulting table for baselines:

In [22]:
from pathlib import Path

# Turn the model names from the index into a regular 'Models' column.  The
# guard makes the cell safe to re-run: without it a second run would apply
# reset_index to the already converted table.
if 'Models' not in res_df.columns:
    res_df = res_df.rename_axis('Models').reset_index(level=0)

# Render every metric with three decimals.  Cells that are already strings
# (from a previous run of this cell) are left untouched instead of crashing
# in the float format.  Column-wise Series.map replaces DataFrame.applymap,
# which has been deprecated since pandas 2.1.
res_df.loc[:, col_names] = res_df[col_names].apply(
    lambda column: column.map(lambda x: x if isinstance(x, str) else f"{x:.3f}")
)

# Make sure the output directory exists before writing the CSV.
baselines_csv = Path("metrics_results/lsapp/baselines.csv")
baselines_csv.parent.mkdir(parents=True, exist_ok=True)
res_df.to_csv(baselines_csv, sep=',', header=True)
print(res_df.to_latex(index=False))

\begin{tabular}{llllllllll}
\toprule
Models &  HR@1 &  HR@3 &  HR@5 & MRR@1 & MRR@3 & MRR@5 & NDCG@1 & NDCG@3 & NDCG@5 \\
\midrule
SR(od) & 0.612 & 0.824 & 0.893 & 0.612 & 0.707 & 0.722 &  0.612 &  0.737 &  0.765 \\
    SR & 0.610 & 0.801 & 0.863 & 0.610 & 0.695 & 0.709 &  0.610 &  0.723 &  0.748 \\
   MFU & 0.403 & 0.669 & 0.777 & 0.403 & 0.516 & 0.541 &  0.403 &  0.555 &  0.600 \\
   MRU & 0.618 & 0.823 & 0.849 & 0.618 & 0.716 & 0.722 &  0.618 &  0.744 &  0.754 \\
Random & 0.089 & 0.239 & 0.365 & 0.089 & 0.153 & 0.181 &  0.089 &  0.175 &  0.227 \\
\bottomrule
\end{tabular}

