In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, coo_matrix

In [2]:
from utils import evaluate, load_data

In [3]:
pd.set_option('display.max_rows', 100)

##### read data

In [4]:
user_item_data, user_meta_data, item_meta_data, test_pairs_data = load_data()

In [5]:
user_item_data = user_item_data.merge(
    item_meta_data.drop(columns="embeddings"), on="item_id", how="left"
)

In [6]:
user_item_data["timespent_rel"] = (
    user_item_data["timespent"] / user_item_data["duration"]
)

In [7]:
share_weight = 10
bookmarks_weight = 1
timespent_rel_weight = 50

In [8]:
user_item_data["weighted_target"] = user_item_data["like"]*(
    1 + 
    share_weight*user_item_data.share + 
    bookmarks_weight*user_item_data.bookmarks + 
    timespent_rel_weight*user_item_data.timespent_rel)

##### split data

In [9]:
ui_train, ui_val = train_test_split(user_item_data,
                                    test_size=0.15,
                                    random_state=42,
                                    shuffle=False)

In [10]:
u_train = ui_train.user_id
i_train = ui_train.item_id
likes_train = ui_train.like
dislikes_train = ui_train.dislike

u_val = ui_val.user_id
i_val = ui_val.item_id
likes_val = ui_val.like
dislikes_val = ui_val.dislike

In [11]:
weights = coo_matrix((ui_train.weighted_target, (u_train, i_train)))

In [18]:
sparse_train = csr_matrix((likes_train-dislikes_train, (u_train, i_train)))
sparse_val = coo_matrix((likes_val-dislikes_val, (u_val, i_val)))

In [13]:
from sansa import ICFGramianFactorizerConfig

factorizer_config = ICFGramianFactorizerConfig(
    factorization_shift_step = 1e-3,  # initial diagonal shift if incomplete factorization fails
    factorization_shift_multiplier = 2.0,  # multiplier for the shift for subsequent attempts
)



In [14]:
from sansa import UMRUnitLowerTriangleInverterConfig

inverter_config = UMRUnitLowerTriangleInverterConfig(
    scans=1,  # number of scans through all columns of the matrix
    finetune_steps=5,  # number of finetuning steps, targeting worst columns
)

In [15]:
from sansa import SANSAConfig

config = SANSAConfig(
    l2 = 20.0,  # regularization strength
    weight_matrix_density = 5e-5,  # desired density of weights
    gramian_factorizer_config = factorizer_config,  # factorizer configuration
    lower_triangle_inverter_config = inverter_config,  # inverter configuration
)

In [16]:
from sansa import SANSA

X = sparse_train

# Instantiate model with the config
model = SANSA(config)

# Train model on the user-item matrix
model.fit(X)

INFO:sansa.model:Computing column norms of X^TX...


MemoryError: Unable to allocate 123. GiB for an array with shape (16572477719,) and data type int64

In [19]:
# or on a precomputed symmetric item-item matrix
model.fit(X, compute_gramian=False)

INFO:sansa.model:Computing row norms of X...


AttributeError: 'coo_matrix' object has no attribute 'indptr'

In [23]:
sparse_val.indptr

AttributeError: 'coo_matrix' object has no attribute 'indptr'