# ALS applications

## Dzen dataset

Data comes from [dzen.ru](https://dzen.ru/) site and consists of likes which users put to text articles

### Columns
1. item_id - unique id of an item (article)
2. user_id - unique id of a user
3. source_id - unique id of an author. If two items have same source_id, then they come from one author
4. Name of item is name of the article
5. Raw dataset represents user_id and list of item_ids which user liked

In [1]:
!curl -O -J -L 'https://www.dropbox.com/s/ia4bvhuqg8kesee/zen_dataset.zip?dl=1'
!unzip zen_dataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   132  100   132    0     0    297      0 --:--:-- --:--:-- --:--:--   297
100    17  100    17    0     0     19      0 --:--:-- --:--:-- --:--:--    19
100   496    0   496    0     0    262      0 --:--:--  0:00:01 --:--:--   802
100 24.0M  100 24.0M    0     0  7701k      0  0:00:03  0:00:03 --:--:-- 37.2M
unzip:  cannot find or open zen_dataset.zip, zen_dataset.zip.zip or zen_dataset.zip.ZIP.


In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm.notebook import tqdm
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn

In [3]:
gpu = torch.device('cuda')
print(gpu)

cuda


In [None]:
item_names = pd.read_csv("zen_item_to_name.csv")
item_sources = pd.read_csv("zen_item_to_source.csv")
# dataset = pd.read_csv("zen_ratings.csv", converters={'item_ids': ast.literal_eval})
dataset = pd.read_csv('zen_ratings.csv',converters={'item_ids': ast.literal_eval},encoding='ISO-8859-1')

In [6]:
dataset = dataset[:5000]

In [7]:
dataset.head(3)

Unnamed: 0,user_id,item_ids
0,993675863667353526,"[15267, 61075, 81203, 17066, 25471, 88427, 638..."
1,4250619547882954185,"[4555, 94644, 84972, 17774, 94962, 78217, 2485..."
2,3847785305345691076,"[1898, 26703, 16525, 86939, 55017, 31069, 4035..."


In [8]:
item_names

Unnamed: 0,id,name
0,94962,Что обычно ожидало русских казачек в руках у к...
1,3972,Почему Россия решила строить новую скоростную ...
2,94644,"5 неприличных фактов об Андрее Макаревиче, кот..."
3,82518,"Что стало с красавицей Хмельницкой, которую му..."
4,53264,"Понять и Простить: Почему угонщики, бежавшие и..."
...,...,...
104498,36769,"Плюс один источник мифа о рыцарях, неспособных..."
104499,9190,Мой сад - малоуходный
104500,52731,Купил первую в жизни циркулярную пилу. Честный...
104501,72660,Решили предложить Марине помощь в лечении ч.10


In [9]:
item_sources

Unnamed: 0,id,source
0,94962,2919814402697966089
1,3972,3263022753228392991
2,94644,-3857390427602554682
3,82518,-9036908390349249792
4,53264,3353856219169766284
...,...,...
104498,36769,3818746211375738614
104499,9190,4975535765688979937
104500,52731,3720366796439288909
104501,72660,-7860042973720636310


In [10]:
dataset

Unnamed: 0,user_id,item_ids
0,993675863667353526,"[15267, 61075, 81203, 17066, 25471, 88427, 638..."
1,4250619547882954185,"[4555, 94644, 84972, 17774, 94962, 78217, 2485..."
2,3847785305345691076,"[1898, 26703, 16525, 86939, 55017, 31069, 4035..."
3,1785181112918558233,"[75601, 102458, 28716, 100694, 5757, 47104, 60..."
4,5078748097863903181,"[72260, 40825, 2615, 42549, 379, 100818, 56827..."
...,...,...
4995,1783247335571225144,"[58649, 40423, 8706, 80540, 48042, 6527, 82238..."
4996,435401615695014005,"[96085, 16607, 100349, 104327, 86713, 99974, 5..."
4997,2324486976962366344,"[54393, 1527, 38651, 38488, 93694, 54932, 5925..."
4998,1084392302990541991,"[54372, 93575, 9661, 69953, 73912, 44480, 1566..."


In [11]:
total_interactions_count = dataset.item_ids.map(len).sum()
user_coo = np.zeros(total_interactions_count, dtype=np.int64)
item_coo = np.zeros(total_interactions_count, dtype=np.int64)
pos = 0

for user_id, item_ids in enumerate(tqdm(dataset.item_ids)):
    user_coo[pos : pos + len(item_ids)] = user_id
    item_coo[pos : pos + len(item_ids)] = item_ids
    pos += len(item_ids)

shape = (max(user_coo) + 1, max(item_coo) + 1)
user_item_matrix = sp.coo_matrix(
    (np.ones(len(user_coo)), (user_coo, item_coo)), shape=shape
)
user_item_matrix = user_item_matrix.tocsr()
sp.save_npz("data_train.npz", user_item_matrix)
# Cleanup memory. Later you need just data_train.npz
del user_coo
del item_coo
del dataset

  0%|          | 0/5000 [00:00<?, ?it/s]

In [12]:
# you could start here if you already done precomputing
user_item_matrix = sp.load_npz("data_train.npz")

In [13]:
user_item_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 378009 stored elements and shape (5000, 104503)>

In [14]:
def sparce_matrix_report(matrix):
    print('Size of raw data:', matrix.data.nbytes / 10**6, 'Mb')
    print('Feedback matrix size:', matrix.shape)

In [15]:
sparce_matrix_report(user_item_matrix)

Size of raw data: 3.024072 Mb
Feedback matrix size: (5000, 104503)


In [16]:
item_weights = np.array(user_item_matrix.tocsc().sum(0))[0]
top_to_bottom_order = np.argsort(-item_weights)
item_mapping = np.empty(top_to_bottom_order.shape, dtype=int)
item_mapping[top_to_bottom_order] = np.arange(len(top_to_bottom_order))
total_item_count = (item_weights > 0).sum()
total_user_count = user_item_matrix.shape[0]


def build_debug_dataset(user_item_matrix, item_pct: float, user_pct: float):
    '''Get given percent of top rated items and given percent of random users'''
    user_count = int(total_user_count * user_pct),
    item_count = int(total_item_count * item_pct)
    item_ids = top_to_bottom_order[:item_count]
    user_ids = np.random.choice(
        np.arange(user_item_matrix.shape[0]), size=user_count, replace=False
    )
    train = user_item_matrix[user_ids]
    train = train[:, item_ids]
    return train

In [17]:
debug_dataset = build_debug_dataset(user_item_matrix, 0.05, 0.05)

sparce_matrix_report(debug_dataset)

Size of raw data: 0.060312 Mb
Feedback matrix size: (250, 3222)


This is useful for debugging (just to save time).

**Final answers should use full dataset!!!**

## Split dataset matrix (5 points)

in the following way: for 20% of users (random) remove one like - this will be test data. The rest is train data.

In [18]:
def split_data(ratings):
    # your code here
    train = ratings.copy().tolil()
    test = sp.lil_matrix(ratings.shape)

    for user in range(ratings.shape[0]):
        item_indices = ratings[user].nonzero()[1]
        if len(item_indices) == 0:
            continue
        test_item = np.random.choice(item_indices)

        # Set that one item in test set
        test[user, test_item] = ratings[user, test_item]

        # Remove it from train set
        train[user, test_item] = 0

    return train.tocsr(), test.tocsr()

In [19]:
train_ratings, test_ratings = split_data(user_item_matrix)

In [20]:
print(train_ratings.nnz, test_ratings.nnz)

373011 4998


## Implement IALS (10 points each)

Note that due to size of data you need to implement algorithm with _sparce matrices_!

You are welcome to use classes like on the seminar:)

In [21]:
def ials(ratings, k=20, lam=0.1, alpha=40, n_iters=10):
    '''Implicit Alternating Least Squares algorithm

    Args:
        ratings: sparce matrix of ratings
        k: size of embeddings
        lam: regularization term

    Returns:
        two matrices: of user embeddings and of item embeddings
    '''
    # your code here
    users, items = ratings.shape
    X = torch.randn(users, k, device=gpu) * (1. / k)
    Y = torch.randn(items, k, device=gpu) * (1. / k)

    ratings_coo = ratings.tocoo()
    values = torch.tensor(ratings_coo.data, dtype=torch.float32, device=gpu)
    row = torch.tensor(ratings_coo.row, device=gpu)
    col = torch.tensor(ratings_coo.col, device=gpu)

    Cui = torch.sparse_coo_tensor(
        torch.stack([row, col]),
        1 + alpha * values,
        size=(users, items)
    ).coalesce()

    for iteration in range(n_iters):
        # Update user factors
        for u in tqdm(range(users)):
            idx = ratings[u].indices
            if len(idx) == 0:
                continue
            Y_i = Y[idx]
            Cu = (1 + alpha) * torch.ones(len(idx), device=gpu)
            A = (Y_i.t() @ (Cu[:, None] * Y_i)) + lam * torch.eye(k, device=gpu)
            b = (Y_i.t() @ Cu)
            X[u] = torch.linalg.solve(A, b)

        # Update item factors
        for i in tqdm(range(items)):
            idx = ratings[:, i].indices
            if len(idx) == 0:
                continue
            X_u = X[idx]
            Ci = (1 + alpha) * torch.ones(len(idx), device=gpu)
            A = (X_u.t() @ (Ci[:, None] * X_u)) + lam * torch.eye(k, device=gpu)
            b = (X_u.t() @ Ci)
            Y[i] = torch.linalg.solve(A, b)

    return X, Y

## Compute MRR@100 metric for test users

For ALS and IALS algorithms.

**Don't forget to use full dataset!**

In [22]:
def mrr(predictions, test_matrix, k=100):
    mrr_total = 0.0
    n_users = test_matrix.shape[0]

    for u in range(n_users):
        true_items = test_matrix[u].indices
        if len(true_items) == 0:
            continue

        scores = predictions[u]
        top_k = torch.topk(scores, k=k).indices.cpu().numpy()

        rank = np.where(top_k == true_items[0])[0]
        if len(rank) > 0:
            mrr_total += 1.0 / (rank[0] + 1)

    return mrr_total / n_users


In [23]:
X, Y = ials(train_ratings, k=20, lam=0.1, alpha=40, n_iters=10)
ials_predictions = X @ Y.T
mrr_ials = mrr(ials_predictions, test_ratings)
print("MRR@100 for IALS:", mrr_ials)

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

MRR@100 for IALS: 0.0009177865612648222


## Adjust hyperparameters of IALS to maximize MRR (10 points)

Main hyperparameters are regularization and weights for implicit case.

In [24]:
# your code here
best_mrr = 0
best_params = {}

for k in [20, 40]:
    for lam in [0.1, 1]:
        for alpha in [10, 40]:
            X, Y = ials(train_ratings, k=k, lam=lam, alpha=alpha, n_iters=5)
            pred = X @ Y.T
            score = mrr(pred, test_ratings)
            if score > best_mrr:
                best_mrr = score
                best_params = {'k': k, 'lam': lam, 'alpha': alpha}

print("Best IALS params:", best_params, "with MRR@100:", best_mrr)


  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/104503 [00:00<?, ?it/s]

Best IALS params: {'k': 20, 'lam': 1, 'alpha': 10} with MRR@100: 0.004053450915556655



Optimal parameters of IALS are:

....

## Get similarities from item2item CF (10 points)

Item2item can be taken from the first homework, SLIM was implemented in the class.

Alternatively you could use libraries, but in this case you will need to convert dataset to their format.

You need to compute only item similarities, not predictions for users.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity

def item2item_similarity(user_item_matrix):
    item_vectors = user_item_matrix.T
    return cosine_similarity(item_vectors)

i2i_similarities = item2item_similarity(user_item_matrix)

NameError: name 'user_item_matrix' is not defined

## Compare similarities from four algorithms (20 points)

* plot distributions
* compute metrics (which you think are relevant)
* look at several top similar lists

Make conclusion how these methods differ in computing similarities

In [None]:
# your code here
methods = {
    "User-Item Matrix": user_item_matrix.T,
    "IALS Embeddings": Y
}

for name, mat in methods.items():
    sim = cosine_similarity(mat)
    sim_flat = sim.flatten()
    sns.histplot(sim_flat, bins=50, kde=True, label=name)

plt.legend()
plt.title("Similarity Distribution Comparison")
plt.xlabel("Cosine Similarity")
plt.ylabel("Frequency")
plt.show()


In [None]:
def print_top_similar_items(sim_matrix, item_names, item_id, top_k=5):
    top_items = np.argsort(-sim_matrix[item_id])[:top_k+1]
    print(f"Top similar items to: {item_names[item_id]}")
    for i in top_items[1:]:  # skip itself
        print(f"   {item_names[i]}")

item_id_example = 1000  # Change as needed
print_top_similar_items(i2i_similarities, item_names['name'].values, item_id_example)

Conclusion:

....