In [1]:
import numpy as np
import pandas as pd
from implicit.gpu.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
# from sklearn.preprocessing import LabelEncoder

import torch
from torch import FloatTensor
from torch.utils.data import DataLoader, TensorDataset

from src.datasets import DatasetPreprocessor
from src.dataloaders import VAEDataLoader
from src.models.vae import MultiVAE
from src.trainers import MultiVAETrainer

In [2]:
args = {
        "dataset_shortname": "ml_1m",
        "min_rating": 3.5,
        "min_user_count": 5,  # default 5
        "min_item_count": 0,
    }

In [3]:
d = DatasetPreprocessor(**args).load_dataset()
# item_num, dataloader, train = VAEDataLoader(batch_size=512).get_dataloaders(**args)

In [4]:
loader = VAEDataLoader(**args, val_size=0.1, test_size=0.1, batch_size=128)
dataloader, interactions = loader.get_dataloaders()

In [5]:
d.shape

(3985274, 4)

In [8]:
unique_user_num = d.user_id.nunique() 
unique_item_num = d.item_id.nunique()

In [5]:
sorted_df_to_split = (
        d
        .sort_values(by=['user_id', 'timestamp'], ascending=True)
    )

train = (
    sorted_df_to_split
    .groupby('user_id')
    .apply(lambda row: row[:-1], include_groups=False)
    .reset_index()
    .drop(columns=['level_1'])
)
test = (
    sorted_df_to_split
    .groupby('user_id')
    .apply(lambda row: row[-1:], include_groups=False)
    .reset_index()
    .drop(columns=['level_1'])
)

In [9]:
(
    sorted_df_to_split
    .groupby('user_id')
    .apply(lambda row: row[:-1], include_groups=False)
    .reset_index()
    .drop(columns=['level_1'])
)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,22,4,978300019
1,0,15,5,978300055
2,0,19,4,978300055
3,0,28,5,978300055
4,0,27,5,978300172
...,...,...,...,...
569232,6033,99,4,997454349
569233,6033,132,4,997454367
569234,6033,700,5,997454398
569235,6033,842,4,997454429


In [47]:
te = (sorted_df_to_split
    .groupby('user_id')
    .apply(lambda row: row[:-np.ceil(len(row) * 0.2).astype('int')], include_groups=False)
    .reset_index()
    .drop(columns=['level_1']))

te

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,22,4,978300019
1,0,15,5,978300055
2,0,19,4,978300055
3,0,28,5,978300055
4,0,27,5,978300172
...,...,...,...,...
457807,6033,903,4,960972016
457808,6033,1078,4,960972032
457809,6033,336,4,960972645
457810,6033,344,4,960972693


In [58]:
def _process_to_split(self, func_to_apply):
    return (
        self.sorted_items_to_split.apply(func_to_apply, include_groups=False)
        .reset_index()
        .drop(columns=["level_1"])
    )

In [56]:
def bound(row, ratio):
    return np.ceil(len(row) * ratio).astype('int')

In [None]:
train_val_input = _process_to_split(lambda row: row.iloc[:-bound(row, val_ratio + test_ratio)])

train_val_input

In [57]:
train_val_input = (sorted_df_to_split
                    .groupby('user_id')
                    .apply(lambda row: row.iloc[:-bound(row, val_ratio + test_ratio)], include_groups=False)
                    .reset_index()
                    .drop(columns=['level_1'])
                    )

train_val_input

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,22,4,978300019
1,0,15,5,978300055
2,0,19,4,978300055
3,0,28,5,978300055
4,0,27,5,978300172
...,...,...,...,...
457807,6033,903,4,960972016
457808,6033,1078,4,960972032
457809,6033,336,4,960972645
457810,6033,344,4,960972693


In [54]:
test_ratio = 0.1
val_ratio = 0.1

train_val_splitter = lambda row: row[:-np.ceil(len(row) * val_ratio).astype('int')]
train_val_splitter = lambda row: row[-np.ceil(len(row) * val_ratio).astype('int'):[-np.ceil(len(row) * val_ratio).astype('int')]
test_bound = lambda row: np.ceil(len(row) * test_ratio).astype('int')
val_bound = lambda row: np.ceil(len(row) * val_ratio).astype('int')

SyntaxError: '[' was never closed (3089704187.py, line 5)

In [None]:
train_val_input = self.sorted_items_to_split.apply(
            lambda row: row.iloc[:-2], include_groups=False
        ).reset_index().drop(columns=["level_1"])

val_label = self.sorted_items_to_split.apply(
    lambda row: row.iloc[-2:-1], include_groups=False
).reset_index().drop(columns=["level_1"])

test_input = self.sorted_items_to_split.apply(
    lambda row: row.iloc[:-1], include_groups=False
).reset_index().drop(columns=["level_1"])

test_label = self.sorted_items_to_split.apply(
    lambda row: row.iloc[-1:], include_groups=False
).reset_index().drop(columns=["level_1"])

In [26]:
te.groupby('user_id')[['item_id']].count().value_counts()

item_id
2          839
3          760
4          548
5          447
6          389
          ... 
69           1
68           1
67           1
66           1
144          1
Name: count, Length: 76, dtype: int64

In [16]:
(sorted_df_to_split
    .groupby('user_id')
    .apply(lambda row: row[:np.ceil(len(row) * 0.9).astype('int')], include_groups=False)
    .reset_index()
    .drop(columns=['level_1']))

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,22,4,978300019
1,0,15,5,978300055
2,0,19,4,978300055
3,0,28,5,978300055
4,0,27,5,978300172
...,...,...,...,...
520496,6033,1708,4,964828652
520497,6033,253,4,964828706
520498,6033,2088,4,964828734
520499,6033,1907,4,964828734


In [9]:
def _encode_tfidf(interactions: pd.DataFrame) -> pd.DataFrame:

    rating_sum_per_user = interactions.groupby("user_id")["rating"].transform("sum")
    user_count_per_element = interactions.groupby("item_id")["user_id"].transform("size")
    
    tf = interactions["rating"].values / rating_sum_per_user.values
    idf = np.log(len(rating_sum_per_user) / user_count_per_element.values)

    tfidf_values = tf * idf

    return tfidf_values

def _convert_to_sparse(interactions_df, use_tfidf=True) -> csr_matrix:
    if use_tfidf:
        sparse_matrix_values = _encode_tfidf(interactions_df)
    else:
        sparse_matrix_values = np.ones(interactions_df.shape[0])

    # user_index = interactions_df["user_id"].astype("category").cat.codes.values
    user_index = interactions_df["user_id"].values
    item_index = interactions_df["item_id"].values
    assert len(user_index) == len(item_index)

    sparse_interactions = csr_matrix(
        (
            sparse_matrix_values,
            ([user_index, item_index]),
        ),
        shape=(unique_user_num, unique_item_num),
    )

    return sparse_interactions

In [10]:
train_sparse = _convert_to_sparse(train)

In [7]:
user_index = train["user_id"].astype("category").cat.codes.values
item_index = train["item_id"].values

In [8]:
p = {'factors': 1019,
                   'iterations': 548,
                   'regularization': 0.8558091333940743,
                   'alpha': 56},

In [11]:
from implicit.cpu.als import AlternatingLeastSquares

In [12]:
als = AlternatingLeastSquares(
        calculate_training_loss=True
        )


  check_blas_config()


In [13]:
als.fit(train_sparse)

  0%|          | 0/15 [00:00<?, ?it/s]

In [13]:
als.save('als')

In [17]:
als.load('als.npz')

<implicit.cpu.als.AlternatingLeastSquares at 0x73a125dc1030>

In [24]:
del als

In [20]:
als = als.load('als.npz')

In [43]:
train["user_id"].unique()

array([   0,    1,    2, ..., 6031, 6032, 6033])

In [29]:
type(als)

implicit.cpu.als.AlternatingLeastSquares

In [28]:
isinstance(als, None)

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union

In [31]:
train_sparse

<6034x3533 sparse matrix of type '<class 'numpy.float64'>'
	with 569237 stored elements in Compressed Sparse Row format>

In [14]:
als_recommendations_matrix, als_recommendations_scores = als.recommend(
    train["user_id"].unique(),
    train_sparse,
    N=10,
    filter_already_liked_items=True
)

In [15]:
from sklearn.metrics import ndcg_score

In [38]:
test

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,17,5,978824351
1,1,61,4,978300174
2,2,138,4,978298504
3,3,91,5,978294282
4,4,192,4,978246555
...,...,...,...,...
6029,6029,1818,4,956754710
6030,6030,670,4,956801840
6031,6031,1172,5,956717204
6032,6032,934,4,956758029


In [32]:
df = pd.DataFrame({
        'user_id': np.arange(0, len(als_recommendations_matrix)),
        'item_id': list(als_recommendations_matrix),
        'score': list(als_recommendations_scores),
    })
    
recommendations = (
        df
        .explode(['item_id', 'score'], ignore_index=True)
    )

recommendations

Unnamed: 0,user_id,item_id,score
0,0,226,0.317816
1,0,138,0.136776
2,0,337,0.099589
3,0,383,0.059741
4,0,503,0.055705
...,...,...,...
60335,6033,113,0.048448
60336,6033,338,0.048427
60337,6033,443,0.044363
60338,6033,941,0.043985


In [24]:
recommendations = (
        df
        .explode(['item_id', 'score'], ignore_index=True)
    )

recommendations

Unnamed: 0,user_id,item_id,score
0,0,226,0.075352
1,0,219,0.050722
2,0,138,0.049738
3,0,503,0.045629
4,0,282,0.043646
...,...,...,...
60335,6033,734,0.055205
60336,6033,940,0.052575
60337,6033,526,0.052311
60338,6033,941,0.050701


In [None]:
def hits_per_user(recommended, label) -> Optional[np.ndarray]:
    if predicted is None:
        return None

    # assert label is not None
    # assert len(label) > 0

    predicted_np = np.array(predicted[:metric_k])
    ground_truth_np = np.array(ground_truth)

    hits = (predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)
    return hits


In [70]:
predicted_np = np.array(recommendations.recommended[0][:10])
ground_truth_np = np.array([384])

hits = (predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)

In [71]:
(predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [72]:
ground_truth_np.reshape(1, -1)

array([[384]])

In [75]:
predicted_np.reshape(-1, 2)

array([[226, 138],
       [384, 383],
       [503, 376],
       [387, 405],
       [762,  67]])

In [77]:
ndcg_score(predicted_np.reshape(-1, 2), predicted_np.reshape(-1, 2))

0.9999999999999998

In [136]:
def hits_per_user(recommended, label, k):

    if recommended is None:
        return None

    # assert label is not None
    # assert len(label) > 0

    recommended_array = np.array(recommended[:k])
    label_array = np.array(label)

    hits = (recommended_array.reshape(-1, 1) == label_array.reshape(1, -1)).sum(axis=1)
    return hits

def recall_per_user(row, k) -> float:
    recommended = row['recommended']
    label = row['label']

    hits = hits_per_user(recommended, label, k)

    if hits is None:
        return 0.

    return hits.sum() / min(len(label), k)

def precision_per_user(row, k) -> float:
    recommended = row['recommended']
    label = row['label']

    hits = hits_per_user(recommended, label, k)

    if hits is None:
        return 0.

    return hits.sum() / k


def ndcg_per_user(row, k) -> float:
    recommended = row['recommended']
    label = row['label']

    hits = hits_per_user(recommended, label, k)

    if hits is None:
        return 0.

    recommended_len = min(len(recommended), k)
    label_len = min(len(label), k)
    
    ndcg_weights = 1. / np.log2(np.arange(2, k + 2))

    dcg = (hits * ndcg_weights[:recommended_len]).sum()
    
    idcg = ndcg_weights.cumsum()[label_len-1]

    return dcg / idcg

def compute_metrics(df, ks):
    metrics = {}
    for k in ks:
        mean_ndcg = df.apply(lambda row: ndcg_per_user(row, k=k), axis=1).mean()
        mean_recall = df.apply(lambda row: recall_per_user(row, k=k), axis=1).mean()
        mean_precision = df.apply(lambda row: precision_per_user(row, k=k), axis=1).mean()

        metrics.update({
                f"NDCG@{k}" : mean_ndcg,
                f"Recall@{k}" : mean_recall,
                f"Precision@{k}" : mean_precision
            })

    return metrics

In [91]:
# metric_k = 1
# ndcg_weights = 1.0 / np.log2(np.arange(0, metric_k) + 2)
# ndcg_idcg = ndcg_weights.cumsum()


# def hits_per_user(predicted, ground_truth):

#     if predicted is None:
#         return None

#     assert ground_truth is not None
#     assert len(ground_truth) > 0

#     predicted_np = np.array(predicted[:metric_k])
#     ground_truth_np = np.array(ground_truth)

#     hits = (predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)
#     return hits


# def recall_per_user() -> float:
#     hits = hits_per_user(pl_struct)

#     if hits is None:
#         return 0.0

#     gt_count = min(len(pl_struct["ground_truth"]), metric_k)
#     return hits.sum() / gt_count


# def ndcg_per_user(predicted, ground_truth) -> float:
#     hits = hits_per_user(predicted, ground_truth)

#     if hits is None:
#         return 0.0

#     predicted_count = min(len(predicted), metric_k)
#     gt_count = min(len(ground_truth), metric_k)

#     dcg = (hits * ndcg_weights[:predicted_count]).sum()
#     idcg = ndcg_idcg[gt_count - 1]
#     ndcg = dcg / idcg

#     return ndcg


def compute_metrics(submission, gt):
    submission_postprocessed = (
        submission
        .select(
            pl.col("user_id").cast(pl.Int32),
            pl.col("track_id").str.split(" ").cast(pl.List(pl.Int32)).alias("predicted"),
        )
        .unique(subset="user_id")

        # Remove duplicated items
        # IMPORTANT: We keep original order of items, because it affects metric value
        .with_columns(
            pl.col("predicted").list.unique(maintain_order=True)
        )
    )
    ground_truth = gt.groupby("user_id").agg(pl.col("track_id").alias("ground_truth"))
    submission_with_gt = ground_truth.join(submission_postprocessed, on="user_id", how="left")

    metrics_per_user = submission_with_gt.select(
        pl.col("user_id"),
        pl.struct("predicted", "ground_truth").apply(ndcg_per_user).alias("ndcg"),
        pl.struct("predicted", "ground_truth").apply(recall_per_user).alias("recall"),
    )

    mean_ndcg = metrics_per_user.select(pl.col("ndcg").mean())["ndcg"][0]
    mean_recall = metrics_per_user.select(pl.col("recall").mean())["recall"][0]

    return {
        f"ndcg@{metric_k}": mean_ndcg,
        f"recall@{metric_k}": mean_recall,
    }

In [141]:
ndcg

0.03151486642670776

In [140]:
ndcg = labelled_recommendations.apply(lambda row: ndcg_per_user(row, k=10), axis=1).mean()
recall = labelled_recommendations.apply(lambda row: recall_per_user(row, k=10), axis=1)
precision = labelled_recommendations.apply(lambda row: precision_per_user(row, k=10), axis=1)

In [36]:
recommendations = pd.DataFrame({
        'user_id': np.arange(0, len(als_recommendations_matrix)),
        'recommended': list(als_recommendations_matrix.tolist()),
        # 'score': list(als_recommendations_scores),
    })
recommendations

Unnamed: 0,user_id,recommended
0,0,"[226, 138, 384, 383, 503, 376, 387, 405, 762, 67]"
1,1,"[118, 124, 256, 263, 30, 169, 526, 158, 362, 432]"
2,2,"[9, 377, 528, 937, 406, 335, 515, 43, 348, 368]"
3,3,"[118, 91, 637, 47, 67, 817, 417, 746, 259, 508]"
4,4,"[443, 315, 326, 897, 308, 631, 256, 627, 640, ..."
...,...,...
6029,6029,"[148, 216, 508, 624, 113, 142, 569, 557, 817, ..."
6030,6030,"[111, 1435, 671, 567, 197, 446, 826, 1489, 750..."
6031,6031,"[323, 112, 377, 515, 132, 335, 314, 995, 406, ..."
6032,6032,"[826, 6, 827, 778, 323, 931, 38, 941, 111, 798]"


In [37]:
recommendations.recommended[0]

[226, 138, 384, 383, 503, 376, 387, 405, 762, 67]

In [17]:
from sklearn.metrics import ndcg_score, recall_score, precision_score

In [38]:
label = test.groupby('user_id')[['item_id']].agg(list).rename(columns={'item_id' : 'label'})
label

Unnamed: 0_level_0,label
user_id,Unnamed: 1_level_1
0,[17]
1,[61]
2,[138]
3,[91]
4,[192]
...,...
6029,[1818]
6030,[670]
6031,[1172]
6032,[934]


In [96]:
labelled_recommendations = recommendations.merge(label, on='user_id', how='left')

In [100]:
labelled_recommendations

Unnamed: 0,user_id,recommended,label
0,0,"[226, 138, 384, 383, 503, 376, 387, 405, 762, 67]",[17]
1,1,"[118, 124, 256, 263, 30, 169, 526, 158, 362, 432]",[61]
2,2,"[9, 377, 528, 937, 406, 335, 515, 43, 348, 368]",[138]
3,3,"[118, 91, 637, 47, 67, 817, 417, 746, 259, 508]",[91]
4,4,"[443, 315, 326, 897, 308, 631, 256, 627, 640, ...",[192]
...,...,...,...
6029,6029,"[148, 216, 508, 624, 113, 142, 569, 557, 817, ...",[1818]
6030,6030,"[111, 1435, 671, 567, 197, 446, 826, 1489, 750...",[670]
6031,6031,"[323, 112, 377, 515, 132, 335, 314, 995, 406, ...",[1172]
6032,6032,"[826, 6, 827, 778, 323, 931, 38, 941, 111, 798]",[934]


In [None]:
k=1
labelled_recommendations.apply(
            lambda row: ndcg_score(row['recommended'][:k], row['label'], k=k),
            axis=1,
            result_type='expand'
        )
        # .rename(columns={ 0: 'recall', 1: 'map' })

In [27]:
def compute_recsys_metrics(
    predicted: pd.DataFrame,
    test: pd.DataFrame,
    k: int = 10,
    user_key = 'user_id',
    item_key = 'item_id'
):
    assert user_key in predicted.columns
    assert item_key in predicted.columns
    assert user_key in test.columns
    assert item_key in test.columns
    assert k > 0

    predicted_grouped = (
        predicted
        .groupby(user_key)
        .agg({item_key: list})
        .rename(columns={item_key: 'predicted'})
    )

    test_grouped = (
        test
        .groupby(user_key)
        .agg({item_key: set})
        .rename(columns={item_key: 'ground_truth'})
    )

    items_to_compare = predicted_grouped.merge(
        test_grouped,
        on=user_key,
        how='left'
    )

    metrics = (
        items_to_compare
        .apply(
            lambda row: _metrics(row['predicted'], row['ground_truth'], k),
            axis=1,
            result_type='expand'
        )
        .rename(columns={ 0: 'recall', 1: 'map' })
    )

    return metrics.mean().to_dict()


def _metrics(predicted: Sequence, ground_truth: Set, k: int):
    if not ground_truth:
        return 0.0, 0.0

    predicted_k = predicted[:k]

    # Recall@k
    intersection = ground_truth.intersection(predicted_k)
    recall = len(intersection) / min(len(ground_truth), len(predicted_k))

    # Average Precision (AP@k)
    num_hits = 0.0
    ap_sum = 0.0
    for i, pr in enumerate(predicted_k):
        if pr in ground_truth and pr not in predicted[:i]:
            num_hits += 1
            ap_sum += num_hits / (i + 1.0)
    ap_score = ap_sum / min(len(ground_truth), len(predicted_k))

    return recall, ap_score

In [None]:
def recommendations_to_df(
    recommendations_matrix: np.ndarray,
    recommendations_scores: np.ndarray
    ) -> pd.DataFrame:
    
    recommendations_indices = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    # user_mapping = pd.DataFrame({
    #     'user_index': np.arange(0, len(user_encoder.classes_)),
    #     'user_id': user_encoder.classes_,
    # })

    # item_mapping = pd.DataFrame({
    #     'item_index': np.arange(0, len(item_encoder.classes_)),
    #     'track_id': item_encoder.classes_,
    # })

    recommendations = (
        recommendations_indices
        .merge(
            user_mapping,
            on='user_index',
            how='left',
        )
        .drop(columns=['user_index'])
        .explode(['item_index', 'score'], ignore_index=True)
        .merge(
            item_mapping,
            on='item_index',
            how='left',
        )
        .drop(columns=['item_index'])
    )

    return recommendations

In [23]:
m = torch.load("models/ml-1m/val/multivae.pth", map_location=torch.device('cuda'), weights_only=True)

In [31]:
vae = MultiVAE(3533, 200, 1, 600)
mo = vae.load_state_dict(m)
mo

<All keys matched successfully>

In [9]:
from torchmetrics import MetricCollection
from torchmetrics.retrieval import (
    RetrievalNormalizedDCG,
    RetrievalPrecision,
    RetrievalRecall,
)

In [20]:
d.values()

dict_values([8, 6])

In [1]:
d = {'1' : 8, '2' : 6}
d.keys()

dict_keys(['1', '2'])

In [12]:
{k : 0 for k in metrics.keys(keep_base=True)}

{'RetrievalNormalizedDCG@1': 0,
 'RetrievalRecall@1': 0,
 'RetrievalPrecision@1': 0,
 'RetrievalNormalizedDCG@10': 0,
 'RetrievalRecall@10': 0,
 'RetrievalPrecision@10': 0}

In [13]:
{k : c[k] / len(d) for k in c}

{'1': 4.0, '2': 3.0}

In [5]:
from collections import Counter

d = {'1' : 8, '2' : 6}
c = {'1' : 8, '2' : 6}
cd = Counter(d) + Counter(c)
cd += Counter(cd)
cd

Counter({'1': 32, '2': 24})

In [3]:
d = {'1' : 8, '2' : 6}
d.update(d)
d

{'1': 8, '2': 6}

In [16]:
avg=0.7885
print(f'{avg:.4f}')

0.7885


In [10]:
metrics = MetricCollection(
        [MetricCollection([
            RetrievalNormalizedDCG(top_k=k), 
            RetrievalRecall(top_k=k), 
            RetrievalPrecision(top_k=k, adaptive_k=True)
        ], postfix=f'@{k}') 
        for k in [1, 10]
        ])

for metric, value in metrics.items(keep_base=True):
    print(metric, value)

RetrievalNormalizedDCG@1 RetrievalNormalizedDCG()
RetrievalRecall@1 RetrievalRecall()
RetrievalPrecision@1 RetrievalPrecision()
RetrievalNormalizedDCG@10 RetrievalNormalizedDCG()
RetrievalRecall@10 RetrievalRecall()
RetrievalPrecision@10 RetrievalPrecision()


In [2]:
from tempfile import gettempdir
gettempdir()

'/tmp'

In [3]:
import os
os.getcwd()

'/mnt/hdd0/files/diploma/VAE-RecSys'

In [10]:
print(ratings['user_id'].nunique())
print(ratings['item_id'].nunique())

6034
3533


In [15]:
user_label_index = val["user_id"].astype("category").cat.codes.values
label_index = val["label_item_id"].values
assert len(user_label_index) == len(label_index)


interactions_exploaded = val.explode("input_item_id")

user_input_index = interactions_exploaded["user_id"].astype("category").cat.codes.values
input_index = interactions_exploaded["input_item_id"].values
assert len(user_input_index) == len(input_index)

sparse_label = csr_matrix(
    (
        np.ones(val.shape[0]),
        ([user_label_index, label_index]),
    ),
    shape=(val.user_id.nunique(), ratings.item_id.nunique()),
)


sparse_input = csr_matrix(
    (
        np.ones(interactions_exploaded.shape[0]),
        ([user_input_index, input_index]),
    ),
    shape=(val.user_id.nunique(), ratings.item_id.nunique()),
)


dataloader = DataLoader(
    TensorDataset(
        FloatTensor(sparse_input.toarray()), FloatTensor(sparse_label.toarray())
    ),
    batch_size=16,
    shuffle=False,
)

In [33]:
def make_sparse(grouped_ratings):
    ratings = grouped_ratings.explode()

    user_encoder = LabelEncoder()
    user_index = user_encoder.fit_transform(ratings.index.to_numpy())

    item_encoder = LabelEncoder()
    item_index = item_encoder.fit_transform(ratings.values)

    user_num = len(grouped_ratings)
    item_num = len(np.unique(item_index))

    sparse_matrix = csr_matrix(
        (np.ones(len(user_index)), (user_index, item_index)),
        shape=(user_num, item_num),
    )

    return sparse_matrix

In [34]:
train_csr = make_sparse(train["item_id"])

In [35]:
train_csr

<6034x3525 sparse matrix of type '<class 'numpy.float64'>'
	with 563203 stored elements in Compressed Sparse Row format>

In [36]:
als = AlternatingLeastSquares(factors=1024, random_state=0)
als.fit(train_csr)

  0%|          | 0/15 [00:00<?, ?it/s]

In [10]:
rec_matrix, rec_scores = als.recommend(
    range(0, len(train)), train_csr, N=50, filter_already_liked_items=True
)