In [1]:
import numpy as np

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from hooi import sparse_hooi
from dataprep import transform_indices

# Preparing Data

In [2]:
# Load the MovieLens ratings together with the timestamp column;
# the time-based split below depends on it.
mldata = get_movielens_data(include_time=True)
mldata.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Split Data

In [3]:
def split_data(data, userid='userid', itemid='movieid', timeid='timestamp', time_q=0.95, seed=None):
    """Split interactions into training, validation and final test parts by time.

    Observations at or after the `time_q` quantile of `timeid` form the test
    pool. Every user present in that pool is excluded from training, and only
    observations strictly before the cut-off are kept for training.

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with the `userid`, `itemid` and `timeid` columns.
    userid, itemid, timeid : str
        Names of the user, item and time columns.
    time_q : float
        Quantile of `timeid` used as the train/test cut-off.
    seed : int or None
        Seed for the `RandomState` handed to `leave_one_out`; with `None`
        no random state object is created.

    Returns
    -------
    tuple
        `(training, data_index), (testset_valid, holdout_valid), (testset, holdout)`.
        `training` and `data_index` come from `transform_indices`; the test
        frames have their items re-indexed with `data_index['items']`.
    """
    timepoint = data[timeid].quantile(q=time_q, interpolation='nearest')
    test_data_ = data.query(f'{timeid} >= @timepoint')
    # Users with any activity after the cut-off are kept out of training.
    warm_users = test_data_[userid].unique()
    train_data_ = data.query(f'{userid} not in @warm_users and {timeid} < @timepoint')
    training, data_index = transform_indices(train_data_.copy(), userid, itemid)

    # Only items are re-indexed here; user ids in the test data stay as they are.
    test_data = reindex(test_data_, data_index['items'])
    random_state = None if seed is None else np.random.RandomState(seed)
    # NOTE(review): with sample_top=True and target=timeid this presumably holds out
    # each user's latest interaction, and leave_one_out presumably groups by its own
    # default user column -- confirm against polara before using a custom `userid`.
    leave_one_out_config = dict(target=timeid, sample_top=True, random_state=random_state)
    # Forward the caller's column names so the alignment step uses the same
    # columns as the rest of this function instead of its own defaults.
    column_names = dict(userid=userid, itemid=itemid)
    # final test data
    testset_, holdout_ = leave_one_out(test_data, **leave_one_out_config)
    testset, holdout = align_test_data(testset_, holdout_, **column_names)
    # validation data: a second holdout taken from what remained after the first one
    testset_valid_, holdout_valid_ = leave_one_out(testset_, **leave_one_out_config)
    testset_valid, holdout_valid = align_test_data(testset_valid_, holdout_valid_, **column_names)
    return (training, data_index), (testset_valid, holdout_valid), (testset, holdout)

def align_test_data(testset, holdout, userid='userid', itemid='movieid'):
    """Keep only users present in both frames and sort both frames by user.

    Parameters
    ----------
    testset : pandas.DataFrame
        Observed interactions of test users.
    holdout : pandas.DataFrame
        Held-out interactions of test users.
    userid : str
        Name of the user column in both frames.
    itemid : str
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `testset` and `holdout` restricted to the common users, each sorted
        by `userid`.
    """
    # Look the column up by name rather than by attribute so that a
    # non-default `userid` works as well.
    test_users = np.intersect1d(testset[userid].unique(), holdout[userid].unique())
    testset = testset[testset[userid].isin(test_users)].sort_values(userid)
    holdout = holdout[holdout[userid].isin(test_users)].sort_values(userid)
    return testset, holdout

In [4]:
# Time-based split using split_data's defaults (time_q=0.95, seed=None).
# Each pack is a tuple: (training, data_index), (testset_valid, holdout_valid), (testset, holdout).
train_pack, valid_pack, test_pack = split_data(mldata)

Filtered 64 invalid observations.


## Assign positional info

In [5]:
# Number of sequence positions: used as the upper bound of assigned positions
# and as the size of the positional dimension in the model's data description.
n_pos = 200

In [6]:
def assign_positions(data, userid='userid', itemid='movieid', timeid='timestamp'):
    """Add a `pos` column with each event's position in the user's history.

    Events are ordered by `timeid`, numbered per user with `enumerate_events`,
    and rows that receive a negative position are dropped. The result is
    sorted by `userid` and then by `timeid`.
    """
    chronological = data.sort_values(timeid)
    # One position per event, computed independently within each user's group.
    positions = chronological.groupby(userid)[itemid].transform(enumerate_events)
    positioned = chronological.assign(pos=positions)
    # Negative positions mark events that do not fit into the position range.
    retained = positioned.query('pos>=0')
    return retained.sort_values([userid, timeid])

def enumerate_events(s, maxlen=None):
    """Return right-aligned positions for a sequence of events.

    The last element of `s` gets position `maxlen - 1`, the one before it
    `maxlen - 2`, and so on. When `s` is longer than `maxlen`, the earliest
    elements receive negative positions.

    Parameters
    ----------
    s : sized
        Sequence of events; only its length is used.
    maxlen : int or None
        Number of available positions. `None` means "use the current value
        of the notebook-level `n_pos`".

    Returns
    -------
    numpy.ndarray
        Consecutive integers from `maxlen - len(s)` up to `maxlen - 1`.
    """
    if maxlen is None:
        # Resolve the default at call time: a default of `maxlen=n_pos` would be
        # frozen when this cell is run and could go stale if `n_pos` is changed later.
        maxlen = n_pos
    return np.arange(maxlen - len(s), maxlen)

In [7]:
# Attach positions to the training data and to the observed (non-holdout)
# parts of the validation and final test splits.
training_data = assign_positions(train_pack[0])
testset_valid = assign_positions(valid_pack[0])
testset = assign_positions(test_pack[0])

In [8]:
# Preview the training data with the added `pos` column.
training_data.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
31,0,2925,4,978300019,147
27,0,1547,4,978300055,148
22,0,1158,5,978300055,149
37,0,939,5,978300055,150
24,0,2111,3,978300103,151


In [9]:
# Preview the observed part of the validation split with the added `pos` column.
testset_valid.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
2503,20,1529,3,1009669071,191
2512,20,1334,3,1009669071,192
2513,20,3425,2,1009669071,193
2517,20,2610,4,1009669071,194
2504,20,2399,4,1009669115,195


In [10]:
# Preview the observed part of the final test split with the added `pos` column.
testset.head()

Unnamed: 0,userid,movieid,rating,timestamp,pos
2327,19,308,4,994556598,199
2513,20,3425,2,1009669071,190
2517,20,2610,4,1009669071,191
2512,20,1334,3,1009669071,192
2503,20,1529,3,1009669071,193


# Model

In [16]:
def seqtf_model_build(config, data, data_description, projected=True):
    """Build the (user, item, position) tensor inputs and pass them to `sparse_hooi`.

    Parameters
    ----------
    config : dict
        Must provide "mlrank", "num_iters", "growth_tol" and "seed".
    data : pandas.DataFrame
        Interactions containing the user, item and position columns named in
        `data_description`.
    data_description : dict
        Column names under "userid", "itemid", "positionid" and dimension
        sizes under "n_users", "n_items", "n_pos".
    projected : bool
        When True the update order is (2, 1, 0) and the core is not
        materialized; otherwise the order is (0, 1, 2) and it is.

    Returns
    -------
    Whatever `sparse_hooi` returns.
    NOTE(review): presumably the factors of a Tucker/HOOI decomposition -- confirm in `hooi`.
    """
    user_col = data_description["userid"]
    item_col = data_description["itemid"]
    position_col = data_description["positionid"]

    tensor_shape = (
        data_description["n_users"],
        data_description["n_items"],
        data_description["n_pos"],
    )

    # One coordinate triple per interaction, each stored with a value of 1.0.
    coords = data[[user_col, item_col, position_col]].values
    weights = np.ones(len(coords), dtype='f8')

    if projected:
        mode_order = (2, 1, 0)
    else:
        mode_order = (0, 1, 2)

    return sparse_hooi(
        coords, weights, tensor_shape, config["mlrank"],
        max_iters=config["num_iters"],
        update_order=mode_order,
        materialize_core=not projected,
        growth_tol=config["growth_tol"],
        seed=config["seed"],
        verbose=True,
    )

In [17]:
# Describe the training data for the model: column names plus the size of
# each tensor dimension, taken from the index built during the split.
data_index = train_pack[1]
data_description = {
    'userid': data_index['users'].name,
    'itemid': data_index['items'].name,
    'positionid': 'pos',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
    'n_pos': n_pos,
}
data_description

{'userid': 'userid',
 'itemid': 'movieid',
 'positionid': 'pos',
 'n_users': 5227,
 'n_items': 3652,
 'n_pos': 200}

In [18]:
# Settings read by seqtf_model_build: multilinear rank, iteration limit,
# convergence tolerance and seed. "n_pos" is stored alongside them.
config = dict(
    mlrank=(30, 25, 5),
    n_pos=n_pos,
    num_iters=4,
    growth_tol=1e-6,
    seed=42,
)

In [19]:
# Baseline variant: projected=False selects update order (0, 1, 2) and materializes the core.
tf_params_old = seqtf_model_build(config, training_data, data_description, projected=False)

Step 0 growth of the core: 1.0
Step 1 growth of the core: 0.3469342641029833
Step 2 growth of the core: 0.10593666655915099
Step 3 growth of the core: 0.043192304105665026


In [20]:
# Projected variant: projected=True selects update order (2, 1, 0) and does not materialize the core.
tf_params_new = seqtf_model_build(config, training_data, data_description, projected=True)

Step 0 growth of the core: 1.0
Step 1 growth of the core: 0.26981724108442284
Step 2 growth of the core: 0.07639764225920083
Step 3 growth of the core: 0.035880103085990246
