In [10]:
%load_ext autoreload
%autoreload 2

In [1]:
! pip install --quiet "pytorch-lightning >=2.0,<2.6" "matplotlib" "torch >=1.8.1,<2.8" "seaborn" "torchmetrics >=1.0,<1.8" "numpy <3.0" "torchvision"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Standard libraries
import math
import os
import urllib.request
from functools import partial
from urllib.error import HTTPError

# Plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np

# PyTorch Lightning
import pytorch_lightning as pl
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Torchvision
import torchvision
from pytorch_lightning.callbacks import ModelCheckpoint
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm.notebook import tqdm

plt.set_cmap("cividis")
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
sns.reset_orig()

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
print("Device:", device)

Seed set to 42


Device: mps


<Figure size 640x480 with 0 Axes>

In [None]:
def scaled_dot_product(q:torch.Tensor, k:torch.Tensor, v:torch.Tensor, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attn_logits, v)
    return values, attention

In [5]:
seq_len, d_k = 3, 2
pl.seed_everything(42)
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)
v = torch.randn(seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)

Seed set to 42


Q
 tensor([[ 0.3367,  0.1288],
        [ 0.2345,  0.2303],
        [-1.1229, -0.1863]])
K
 tensor([[ 2.2082, -0.6380],
        [ 0.4617,  0.2674],
        [ 0.5349,  0.8094]])
V
 tensor([[ 1.1103, -1.6898],
        [-0.9890,  0.9580],
        [ 1.3221,  0.8172]])
Values
 tensor([[ 0.5698, -0.1520],
        [ 0.5379, -0.0265],
        [ 0.2246,  0.5556]])
Attention
 tensor([[0.4028, 0.2886, 0.3086],
        [0.3538, 0.3069, 0.3393],
        [0.1303, 0.4630, 0.4067]])


In [29]:
class MultiheadAttention(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Stack all weight matrices 1...h together for efficiency
        # Note that in many implementations you see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3 * embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        # Original Transformer initialization, see PyTorch documentation
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x:torch.Tensor, mask=None, return_attention=False):
        batch_size, seq_length, embed_dim = x.size()
        qkv:torch.Tensor = self.qkv_proj(x)

        # Separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)

        # Determine value outputs
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3)  # [Batch, SeqLen, Head, Dims]
        values = values.reshape(batch_size, seq_length, embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [30]:
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        """EncoderBlock.

        Args:
            input_dim: Dimensionality of the input
            num_heads: Number of heads to use in the attention block
            dim_feedforward: Dimensionality of the hidden layer in the MLP
            dropout: Dropout probability to use in the dropout layers

        """
        super().__init__()

        # Attention layer
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # Two-layer MLP
        self.linear_net = nn.Sequential(
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_feedforward, input_dim),
        )

        # Layers to apply in between the main layers
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention part
        attn_out = self.self_attn(x, mask=mask)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)

        # MLP part
        linear_out = self.linear_net(x)
        x = x + self.dropout(linear_out)
        x = self.norm2(x)

        return x

In [31]:
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, **block_args):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(**block_args) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x

    def get_attention_maps(self, x, mask=None):
        attention_maps = []
        for layer in self.layers:
            _, attn_map = layer.self_attn(x, mask=mask, return_attention=True)
            attention_maps.append(attn_map)
            x = layer(x)
        return attention_maps

In [32]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        """Positional Encoding.

        Args:
            d_model: Hidden dimensionality of the input.
            max_len: Maximum length of a sequence to expect.

        """
        super().__init__()

        # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        # register_buffer => Tensor which is not a parameter, but should be part of the modules state.
        # Used for tensors that need to be on the same device as the module.
        # persistent=False tells PyTorch to not add the buffer to the state dict (e.g. when we save the model)
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x:torch.Tensor):
        x = x + self.pe[:, : x.size(1)]
        return x

In [12]:
max_len = 100
d_model = 32
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)

In [16]:
pe[:, :20].size()

torch.Size([1, 20, 32])

In [33]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup, max_iters):
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        if epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [34]:
class TransformerPredictor(pl.LightningModule):
    def __init__(
        self,
        input_dim,
        model_dim,
        num_classes,
        num_heads,
        num_layers,
        lr,
        warmup,
        max_iters,
        dropout=0.0,
        input_dropout=0.0,
    ):
        """TransformerPredictor.

        Args:
            input_dim: Hidden dimensionality of the input
            model_dim: Hidden dimensionality to use inside the Transformer
            num_classes: Number of classes to predict per sequence element
            num_heads: Number of heads to use in the Multi-Head Attention blocks
            num_layers: Number of encoder blocks to use.
            lr: Learning rate in the optimizer
            warmup: Number of warmup steps. Usually between 50 and 500
            max_iters: Number of maximum iterations the model is trained for. This is needed for the CosineWarmup scheduler
            dropout: Dropout to apply inside the model
            input_dropout: Dropout to apply on the input features

        """
        super().__init__()
        self.save_hyperparameters()
        self._create_model()

    def _create_model(self):
        # Input dim -> Model dim
        self.input_net = nn.Sequential(
            nn.Dropout(self.hparams.input_dropout), nn.Linear(self.hparams.input_dim, self.hparams.model_dim)
        )
        # Positional encoding for sequences
        self.positional_encoding = PositionalEncoding(d_model=self.hparams.model_dim)
        # Transformer
        self.transformer = TransformerEncoder(
            num_layers=self.hparams.num_layers,
            input_dim=self.hparams.model_dim,
            dim_feedforward=2 * self.hparams.model_dim,
            num_heads=self.hparams.num_heads,
            dropout=self.hparams.dropout,
        )
        # Output classifier per sequence element
        self.output_net = nn.Sequential(
            nn.Linear(self.hparams.model_dim, self.hparams.model_dim),
            nn.LayerNorm(self.hparams.model_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(self.hparams.dropout),
            nn.Linear(self.hparams.model_dim, self.hparams.num_classes),
        )

    def forward(self, x, mask=None, add_positional_encoding=True):
        """
        Args:
            x: Input features of shape [Batch, SeqLen, input_dim]
            mask: Mask to apply on the attention outputs (optional)
            add_positional_encoding: If True, we add the positional encoding to the input.
                                      Might not be desired for some tasks.
        """
        x = self.input_net(x)
        if add_positional_encoding:
            x = self.positional_encoding(x)
        x = self.transformer(x, mask=mask)
        x = self.output_net(x)
        return x

    @torch.no_grad()
    def get_attention_maps(self, x, mask=None, add_positional_encoding=True):
        """Function for extracting the attention matrices of the whole Transformer for a single batch.

        Input arguments same as the forward pass.

        """
        x = self.input_net(x)
        if add_positional_encoding:
            x = self.positional_encoding(x)
        attention_maps = self.transformer.get_attention_maps(x, mask=mask)
        return attention_maps

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)

        # We don't return the lr scheduler because we need to apply it per iteration, not per epoch
        self.lr_scheduler = CosineWarmupScheduler(
            optimizer, warmup=self.hparams.warmup, max_iters=self.hparams.max_iters
        )
        return optimizer

    def optimizer_step(self, *args, **kwargs):
        super().optimizer_step(*args, **kwargs)
        self.lr_scheduler.step()  # Step per iteration

    def training_step(self, batch, batch_idx):
        raise NotImplementedError

    def validation_step(self, batch, batch_idx):
        raise NotImplementedError

    def test_step(self, batch, batch_idx):
        raise NotImplementedError

In [35]:
class ReverseDataset(data.Dataset):
    def __init__(self, num_categories, seq_len, size):
        super().__init__()
        self.num_categories = num_categories
        self.seq_len = seq_len
        self.size = size

        self.data = torch.randint(self.num_categories, size=(self.size, self.seq_len))

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        inp_data = self.data[idx]
        labels = torch.flip(inp_data, dims=(0,))
        return inp_data, labels

In [36]:
dataset = partial(ReverseDataset, 10, 16)
train_loader = data.DataLoader(dataset(50000), batch_size=128, shuffle=True, drop_last=True, pin_memory=True)
val_loader = data.DataLoader(dataset(1000), batch_size=128)
test_loader = data.DataLoader(dataset(10000), batch_size=128)

In [21]:
inp_data, labels = train_loader.dataset[0]
print("Input data:", inp_data)
print("Labels:    ", labels)

Input data: tensor([9, 6, 2, 0, 6, 2, 7, 9, 7, 3, 3, 4, 3, 7, 0, 9])
Labels:     tensor([9, 0, 7, 3, 4, 3, 3, 7, 9, 7, 2, 6, 0, 2, 6, 9])


In [37]:
class ReversePredictor(TransformerPredictor):
    def _calculate_loss(self, batch, mode="train"):
        # Fetch data and transform categories to one-hot vectors
        inp_data, labels = batch
        inp_data = F.one_hot(inp_data, num_classes=self.hparams.num_classes).float()

        # Perform prediction and calculate loss and accuracy
        preds = self.forward(inp_data, add_positional_encoding=True)
        loss = F.cross_entropy(preds.view(-1, preds.size(-1)), labels.view(-1))
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        # Logging
        self.log(f"{mode}_loss", loss)
        self.log(f"{mode}_acc", acc)
        return loss, acc

    def training_step(self, batch, batch_idx):
        loss, _ = self._calculate_loss(batch, mode="train")
        return loss

    def validation_step(self, batch, batch_idx):
        _ = self._calculate_loss(batch, mode="val")

    def test_step(self, batch, batch_idx):
        _ = self._calculate_loss(batch, mode="test")

In [38]:
def train_reverse(**kwargs):
    trainer = pl.Trainer(
        accelerator="mps",
        devices=1,
        max_epochs=10,
        gradient_clip_val=5,
    )
    trainer.logger._default_hp_metric = None  # Optional logging argument that we don't need
    
    model = ReversePredictor(max_iters=trainer.max_epochs * len(train_loader), **kwargs)
    trainer.fit(model, train_loader, val_loader)

    # Test best model on validation and test set
    val_result = trainer.test(model, dataloaders=val_loader, verbose=False)
    test_result = trainer.test(model, dataloaders=test_loader, verbose=False)
    result = {"test_acc": test_result[0]["test_acc"], "val_acc": val_result[0]["test_acc"]}

    model = model.to(device)
    return model, result

In [39]:
reverse_model, reverse_result = train_reverse(
    input_dim=train_loader.dataset.num_categories,
    model_dim=32,
    num_heads=1,
    num_classes=train_loader.dataset.num_categories,
    num_layers=1,
    dropout=0.0,
    lr=5e-4,
    warmup=50,
)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | input_net           | Sequential         | 352    | train
1 | positional_encoding | PositionalEncoding | 0      | train
2 | transformer         | TransformerEncoder | 8.5 K  | train
3 | output_net          | Sequential         | 1.4 K  | train
-------------------------------------------------------------------
10.3 K    Trainable params
0         Non-trainable params
10.3 K    Total params
0.041     Total estimated model params size (MB)
24        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 390/390 [00:03<00:00, 128.20it/s, v_num=9]        

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 390/390 [00:03<00:00, 127.71it/s, v_num=9]
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 281.36it/s]
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 79/79 [00:00<00:00, 385.64it/s]


In [40]:
print("Val accuracy:  %4.2f%%" % (100.0 * reverse_result["val_acc"]))
print("Test accuracy: %4.2f%%" % (100.0 * reverse_result["test_acc"]))

Val accuracy:  73.88%
Test accuracy: 73.92%


In [1]:
import pandas as pd
import os
import numpy as np
import joblib

In [2]:
folder = "datasets/ml-32m"

ratings_path = os.path.join(folder, 'ratings.csv')
movies_path = os.path.join(folder, 'movies.csv')
tags_path = os.path.join(folder, 'tags.csv')

rating_column_names = ['userId', 'movieId', 'rating', 'timestamp']
movies_column_names = ['movieId', 'title', 'genres']
tags_column_names = ['userId', 'movieId', 'tag', 'timestamp']

df_ratings = pd.read_csv(ratings_path, sep=',', names=rating_column_names, dtype={'userId':'int32', 'movieId':'int32', 'rating':float, 'timestamp':'int64'}, header=0)
df_movies = pd.read_csv(movies_path, sep=',', names=movies_column_names, dtype={'movieId':'int32', 'title':'object', 'genres':'object'}, header=0)
df_tags = pd.read_csv(tags_path, sep=',', names=tags_column_names, dtype={'userId':'int32', 'movieId':'int32', 'tag':'object', 'timestamp':'int64'}, header=0)

df_ratings.dropna(inplace=True, subset=['userId', 'movieId', 'rating'])
df_movies.dropna(inplace=True, subset=['movieId', 'title', 'genres'])
df_tags.dropna(inplace=True, subset=['userId', 'movieId', 'tag'])
df_tags.drop(columns=["userId","timestamp"], inplace=True)

# Extract movie genres
df_movies['genres'] = df_movies['genres'].apply(lambda x: x.lower().split('|'))

# Extract movie year from title
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
def remove_stop(x):
    out = []
    for y in x:
        if len(y) > 0 and y not in stopwords:
            out += [y]
    return out

def flatten_lists(x):
    out = set()
    for y in x:
        out.update(y.split(" "))
    out = list(out)
    return out

df_movies['movie_year'] = df_movies['title'].str.extract(r'\((\d{4})\)').fillna("2025").astype('int')

df_movies['title'] = df_movies['title'].str.replace(r'\((\d{4})\)', '', regex=True)
df_movies['title'] = df_movies['title'].str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
df_movies['title'] = df_movies['title'].apply(lambda x: x.strip().lower().split(" "))
df_movies['title'] = df_movies['title'].apply(lambda x: remove_stop(x))

df_tags['tag'] = df_tags['tag'].str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
df_tags['tag'] = df_tags['tag'].apply(lambda x: x.strip().lower())
df_tags = df_tags.groupby("movieId").agg(set).reset_index()
df_tags['tag'] = df_tags['tag'].apply(list)
df_tags['tag'] = df_tags['tag'].apply(lambda x: flatten_lists(x))
df_tags['tag'] = df_tags['tag'].apply(lambda x: remove_stop(x))
df_tags['tag'] = df_tags['tag'].astype("object")

df_movies = df_movies.merge(df_tags, on=['movieId'], how='left')
df_movies["tag"] = df_movies["tag"].fillna({i: [""] for i in df_movies.index})
df_movies["description"] = df_movies["title"] + df_movies["tag"]
df_movies.drop(columns=["tag"], inplace=True)
df_movies.drop(columns=["title"], inplace=True)

In [3]:
df_ratings[:100]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858
...,...,...,...,...
95,1,1944,2.0,943231120
96,1,1952,4.0,944253272
97,1,1960,1.0,943231236
98,1,1961,1.0,944250182


In [4]:
df_movies[:100]

Unnamed: 0,movieId,genres,movie_year,description
0,1,"[adventure, animation, children, comedy, fantasy]",1995,"[toy, story, robot, antenna, slow, karate, cha..."
1,2,"[adventure, children, fantasy]",1995,"[jumanji, coming, effect, small, poison, blund..."
2,3,"[comedy, romance]",1995,"[grumpier, old, men, duringcreditsstinger, mid..."
3,4,"[comedy, drama, romance]",1995,"[waiting, exhale, slurs, divorce, flick, inter..."
4,5,[comedy],1995,"[father, bride, part, ii, aging, parent, famil..."
...,...,...,...,...
95,97,"[crime, drama]",1995,"[hate, haine, la, biting, jockey, cow, disc, t..."
96,98,"[action, thriller]",1994,"[shopping, want, law, directorial, debut, jude]"
97,99,[documentary],1995,"[heidi, fleiss, hollywood, madam, hollywood, p..."
98,100,"[drama, thriller]",1996,"[city, hall, election, politics, investigation..."


In [5]:
def normalize_ratings(df:pd.DataFrame):
    df2 = df[["userId", "rating"]].groupby(by=["userId"]).agg(mean_user_rating=('rating', 'mean'), std_user_rating=('rating', 'std'))
    df = df.merge(df2, on=["userId"], how="inner")
    df["normalized_rating"] = (df["rating"] - df["mean_user_rating"])/df["std_user_rating"]
    df["normalized_rating"] = df["normalized_rating"].fillna(df["rating"])
    df.drop(columns=["mean_user_rating", "std_user_rating", "rating"], inplace=True)
    return df

In [6]:
df_ratings = normalize_ratings(df_ratings)

In [7]:
df_ratings['label'] = [0 for _ in range(len(df_ratings))]
df_ratings['label'] = np.where(df_ratings["normalized_rating"] > 0, 1, df_ratings['label'])

In [8]:
df_ratings[:100]

Unnamed: 0,userId,movieId,timestamp,normalized_rating,label
0,1,17,944249077,0.304372,1
1,1,25,944250228,-1.646377,0
2,1,29,943230976,-0.996127,0
3,1,30,944249077,0.954622,1
4,1,32,943228858,0.954622,1
...,...,...,...,...,...
95,1,1944,943231120,-0.996127,0
96,1,1952,944253272,0.304372,1
97,1,1960,943231236,-1.646377,0
98,1,1961,944250182,-1.646377,0


In [9]:
def split_train_test(df:pd.DataFrame, min_rated=10, test_ratio=0.8, val_ratio=0.8):
    print("Splitting data into train test and validation...")
    # Split data into training, testing and validation
    df = df.sort_values(by='timestamp')
    df2 = df[["userId", "movieId"]].groupby(by=["userId"]).agg(list).reset_index()

    # Filter all user_ids who have rated more than 'min_rated' movies
    df2 = df2[df2.movieId.apply(len) > min_rated]
    df = df.merge(df2, on=["userId"], how="inner", suffixes=("", "_right"))
    df.drop(columns=['movieId_right'], inplace=True)

    n = df.shape[0]
    m = int(test_ratio*n)

    df_train_val = df[:m]
    df_test = df[m:]

    k = int(val_ratio*m)
    df_train = df_train_val[:k]
    df_val = df_train_val[k:]

    return df_train, df_val, df_test

In [10]:
df_ratings_train, df_ratings_val, df_ratings_test = split_train_test(df_ratings, min_rated=1)

Splitting data into train test and validation...


In [11]:
df_ratings_train.sort_values(by=["userId", "timestamp"], inplace=True)
df_ratings_val.sort_values(by=["userId", "timestamp"], inplace=True)
df_ratings_test.sort_values(by=["userId", "timestamp"], inplace=True)

In [12]:
def transform(x, vocab):
    if isinstance(x, list):
        out = []
        for y in x:
            out += [vocab[y]] if y in vocab else [0]
        return out
    else:
        return vocab[x] if x in vocab else 0

In [13]:
def categorical_encoding(df:pd.DataFrame, col:str, max_vocab_size=1000):
    all_vals = df[col].tolist()
    unique_vals = {}

    if len(all_vals) > 0 and isinstance(all_vals[0], list):
        for v in all_vals:
            for x in v:
                if x not in unique_vals:
                    unique_vals[x] = 0
                unique_vals[x] += 1
    else:
        for x in all_vals:
            if x not in unique_vals:
                unique_vals[x] = 0
            unique_vals[x] += 1
    
    unique_vals = sorted(unique_vals.items(), key=lambda item: item[1], reverse=True)
    unique_vals = dict(unique_vals[:min(max_vocab_size, len(unique_vals))])
    unique_vals = sorted(unique_vals.keys())
    vocab = {unique_vals[i] : i+1 for i in range(len(unique_vals))}
        
    df[col] = df[col].apply(lambda x: transform(x, vocab))
    return df[col], vocab

In [14]:
vocabulary = {}
max_vocab_size = {'userId':1e100, 'movieId':1e100, 'description':1e5, 'genres':100, 'movie_year':1e100}

for col in ['userId']:
    print(col)
    df_ratings_train[col], v = categorical_encoding(df_ratings_train, col, max_vocab_size[col])
    vocabulary[col] = v

for col in ['movieId', 'description', 'genres', 'movie_year']:
    print(col)
    df_movies[col], v = categorical_encoding(df_movies, col, max_vocab_size[col])
    vocabulary[col] = v

userId
movieId
description
genres
movie_year


In [15]:
df_ratings_val = df_ratings_val.reset_index()
for col in ['userId']:
    print(col)
    df_ratings_val[col] = df_ratings_val[col].apply(lambda x: transform(x, vocabulary[col]))

userId


In [16]:
df_ratings_test = df_ratings_test.reset_index()
for col in ['userId']:
    print(col)
    df_ratings_test[col] = df_ratings_test[col].apply(lambda x: transform(x, vocabulary[col]))

userId


In [17]:
del df_ratings

In [None]:
def get_historical_user_features(df:pd.DataFrame, max_hist=20):
	df["seq_id"] = list(range(df.shape[0]))
	df2 = df[["seq_id", "userId", "movieId", "normalized_rating", "timestamp"]].sort_values(by=["userId", "timestamp"])
	
	df2 = df2[["userId", "movieId", "normalized_rating", "seq_id"]].groupby(by=["userId"]).agg(list).reset_index()
	df2.rename(columns={"movieId":"prev_movie_ids", "normalized_rating":"prev_ratings", "seq_id":"prev_seq_ids"}, inplace=True)

	user_ids = []
	p_m_ids = []
	p_r_ids = []
	p_seq_ids = []

	for i in range(df2.shape[0]):
		seq_id = df2.loc[i, "prev_seq_ids"]
		u_id   = df2.loc[i, "userId"]
		m_ids  = df2.loc[i, "prev_movie_ids"]
		r_ids  = df2.loc[i, "prev_ratings"]

		for j in range(len(m_ids)):
			user_ids += [u_id]
			p_seq_ids += [seq_id[j]]
			p_m_ids += [m_ids[:j][-max_hist:]] if j > 0 else [[]]
			p_r_ids += [r_ids[:j][-max_hist:]] if j > 0 else [[]]
	
	df3 = pd.DataFrame({"userId":user_ids, "prev_movie_ids":p_m_ids, "prev_ratings":p_r_ids, "seq_id":p_seq_ids})
	df = df.merge(df3, on=["userId", "seq_id"], how="left")
	df.drop(columns=["seq_id"], inplace=True)

In [18]:
import importlib
import ml_32m_py
import numpy as np

importlib.reload(ml_32m_py)

def get_historical_user_features_cpp(df:pd.DataFrame, max_hist=20):
        user_ids = df['userId'].to_numpy().astype(np.uint32)
        movie_ids = df['movieId'].to_numpy().astype(np.uint32)
        ratings = df['normalized_rating'].to_numpy().astype(np.float32)
        timestamps = df['timestamp'].to_numpy().astype(np.uint64)

        prev_movie_ids, prev_ratings  = ml_32m_py.py_get_historical_features(user_ids, movie_ids, timestamps, ratings, df.shape[0], max_hist)

        df["prev_movie_ids"] = prev_movie_ids
        df["prev_ratings"] = prev_ratings

In [19]:
get_historical_user_features_cpp(df_ratings_train)

In [20]:
get_historical_user_features_cpp(df_ratings_val)

In [21]:
get_historical_user_features_cpp(df_ratings_test)

In [22]:
import joblib
joblib.dump(vocabulary, "vocabulary.pkl")
joblib.dump(df_ratings_train, "df_ratings_train.pkl")
joblib.dump(df_ratings_val, "df_ratings_val.pkl")
joblib.dump(df_ratings_test, "df_ratings_test.pkl")
joblib.dump(df_movies, "df_movies.pkl")

['df_movies.pkl']

In [24]:
df_ratings_train[:100]

Unnamed: 0,userId,movieId,timestamp,normalized_rating,label,prev_movie_ids,prev_ratings
3237871,1,2997,943226846,0.304372,1,[],[]
3237872,1,2966,943226846,-1.646377,0,[2997],[0.30437225103378296]
3237887,1,2890,943226916,0.304372,1,"[2997, 2966]","[0.30437225103378296, -1.6463772058486938]"
3237908,1,3078,943226986,-0.996127,0,"[2997, 2966, 2890]","[0.30437225103378296, -1.6463772058486938, 0.3..."
3237980,1,2882,943227458,-1.646377,0,"[2997, 2966, 2890, 3078]","[0.30437225103378296, -1.6463772058486938, 0.3..."
...,...,...,...,...,...,...,...
3305726,1,835,944248888,-0.345878,0,"[1885, 1080, 176, 2973, 2243, 2502, 1060, 933,...","[0.30437225103378296, -1.6463772058486938, 0.3..."
3305727,1,608,944248943,-0.996127,0,"[1080, 176, 2973, 2243, 2502, 1060, 933, 1270,...","[-1.6463772058486938, 0.30437225103378296, 0.9..."
3305728,1,2268,944248943,-1.646377,0,"[176, 2973, 2243, 2502, 1060, 933, 1270, 1259,...","[0.30437225103378296, 0.9546220898628235, -1.6..."
3305729,1,80,944248943,0.954622,1,"[2973, 2243, 2502, 1060, 933, 1270, 1259, 915,...","[0.9546220898628235, -1.6463772058486938, -0.3..."


In [25]:
import random
num_parts = 32
df_ratings_train["partition"] = [random.randint(1, num_parts) for _ in range(len(df_ratings_train))]
df_ratings_val["partition"]   = [random.randint(1, num_parts) for _ in range(len(df_ratings_val))]
df_ratings_test["partition"]  = [random.randint(1, num_parts) for _ in range(len(df_ratings_test))]

In [None]:
import shutil
out_path = "parquet_dataset_ml_32m/"
if os.path.exists(out_path):
    try:
        shutil.rmtree(out_path)
    except:
        pass
os.makedirs(out_path, exist_ok=True)
joblib.dump(vocabulary, f"parquet_dataset_ml_32m/vocabulary.pkl")
df_ratings_train.to_parquet(out_path + "train/", partition_cols=["partition"])
df_ratings_val.to_parquet(out_path + "validation/", partition_cols=["partition"])
df_ratings_test.to_parquet(out_path + "test/", partition_cols=["partition"])
df_movies.to_parquet(out_path + "movies.parquet")

In [32]:
df_movies

Unnamed: 0,movieId,genres,movie_year,description
0,1,"[3, 4, 5, 6, 10]",114,"[67865, 63820, 56112, 3754, 61552, 35107, 1212..."
1,2,"[3, 5, 10]",114,"[34536, 14196, 20591, 61609, 51356, 8627, 5045..."
2,3,"[6, 16]",114,"[28008, 47432, 42191, 20134, 42734, 1766, 4148..."
3,4,"[6, 9, 16]",114,"[71951, 22280, 61583, 18900, 24140, 32889, 126..."
4,5,[6],114,"[23069, 9709, 49188, 31779, 2197, 49084, 22827..."
...,...,...,...,...
87580,87581,[9],141,"[43805, 2056, 1]"
87581,87582,"[6, 9]",142,"[60008, 62100, 1]"
87582,87583,[9],142,"[47819, 1]"
87583,87584,[9],87,"[3529, 9621, 1]"


In [31]:
df_ratings_train[:100]

Unnamed: 0,userId,movieId,timestamp,normalized_rating,label,prev_movie_ids,prev_ratings,partition
3237871,1,2997,943226846,0.304372,1,[],[],13
3237872,1,2966,943226846,-1.646377,0,[2997],[0.30437225103378296],11
3237887,1,2890,943226916,0.304372,1,"[2997, 2966]","[0.30437225103378296, -1.6463772058486938]",9
3237908,1,3078,943226986,-0.996127,0,"[2997, 2966, 2890]","[0.30437225103378296, -1.6463772058486938, 0.3...",7
3237980,1,2882,943227458,-1.646377,0,"[2997, 2966, 2890, 3078]","[0.30437225103378296, -1.6463772058486938, 0.3...",18
...,...,...,...,...,...,...,...,...
3305726,1,835,944248888,-0.345878,0,"[1885, 1080, 176, 2973, 2243, 2502, 1060, 933,...","[0.30437225103378296, -1.6463772058486938, 0.3...",29
3305727,1,608,944248943,-0.996127,0,"[1080, 176, 2973, 2243, 2502, 1060, 933, 1270,...","[-1.6463772058486938, 0.30437225103378296, 0.9...",31
3305728,1,2268,944248943,-1.646377,0,"[176, 2973, 2243, 2502, 1060, 933, 1270, 1259,...","[0.30437225103378296, 0.9546220898628235, -1.6...",29
3305729,1,80,944248943,0.954622,1,"[2973, 2243, 2502, 1060, 933, 1270, 1259, 915,...","[0.9546220898628235, -1.6463772058486938, -0.3...",18


In [38]:
df_ratings_train_mmap = np.memmap("df_ratings_train.mmap", dtype=np.object_, mode="w+", shape=df_ratings_train.shape)
df_ratings_train_mmap[:,:] = df_ratings_train.to_numpy()

In [40]:
df_ratings_val_mmap = np.memmap("df_ratings_val.mmap", dtype=np.object_, mode="w+", shape=df_ratings_val.shape)
df_ratings_val_mmap[:,:] = df_ratings_val.to_numpy()

In [41]:
df_ratings_test_mmap = np.memmap("df_ratings_test.mmap", dtype=np.object_, mode="w+", shape=df_ratings_test.shape)
df_ratings_test_mmap[:,:] = df_ratings_test.to_numpy()

In [26]:
!gsutil -m cp -R parquet_dataset_ml_32m gs://r6-ae-dev-adperf-adintelligence-data/amondal/

/bin/bash: gsutil: command not found


In [None]:
import math
import os
import numpy as np
import pandas as pd
import random
import uuid
import joblib
import pytorch_lightning as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
print("Device:", device)

def checkpoint(model:nn.Module, optimizer:torch.optim.Optimizer, filename):
    torch.save({'optimizer':optimizer.state_dict(), 'model':model.state_dict()}, filename)
    
def load_model(filename):
    chkpt = torch.load(filename, weights_only=False)
    return chkpt['model'], chkpt['optimizer']

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 42


Device: mps


In [None]:
def attention(q:torch.Tensor, k:torch.Tensor, v:torch.Tensor, mask=None):
    d_k = q.size()[-1] # q,k,v : (batch, head, seq_len, embed_size_per_head)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) # (batch, head, seq_len, seq_len)
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v) # (batch, head, seq_len, embed_size_per_head)
    return values, attention


def init_weights(x:nn.Linear):
    with torch.no_grad():
        nn.init.xavier_uniform_(x.weight)
        x.bias.data.fill_(0)
		
        
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()

        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model) # (seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x:torch.Tensor):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)   
        return self.dropout(x)
	
    
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, input_dim:int, d_model: int, h: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h

        self.w_q = nn.Linear(input_dim, d_model) # Wq
        self.w_k = nn.Linear(input_dim, d_model) # Wk
        self.w_v = nn.Linear(input_dim, d_model) # Wv
        self.w_o = nn.Linear(d_model, d_model) # Wo

        init_weights(self.w_q)
        init_weights(self.w_k)
        init_weights(self.w_v)
        init_weights(self.w_o)

    def forward(self, q_x:torch.Tensor, k_x:torch.Tensor, v_x:torch.Tensor, mask=None):
        q:torch.Tensor = self.w_q(q_x) # (batch, seq_len, d_model)
        k:torch.Tensor = self.w_k(k_x) # (batch, seq_len, d_model)
        v:torch.Tensor = self.w_v(v_x) # (batch, seq_len, d_model)

        q_h = q.reshape(q.shape[0], q.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        k_h = k.reshape(k.shape[0], k.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        v_h = v.reshape(v.shape[0], v.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)

        attn_out, _ = attention(q_h, k_h, v_h, mask) # (batch, head, seq_len, embed_size_per_head)
        attn_out = attn_out.transpose(1, 2) # (batch, seq_len, head, embed_size_per_head)
        attn_out = attn_out.reshape(attn_out.shape[0], attn_out.shape[1], attn_out.shape[2]*attn_out.shape[3]) # (batch, seq_len, d_model)

        return self.w_o(attn_out) # (batch, seq_len, d_model)
    
    
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()

        self.self_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)

        self.ffn_1 = nn.Linear(input_dim, dim_feedforward)
        self.ffn_2 = nn.Linear(dim_feedforward, input_dim)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)

        self.ffn = nn.Sequential(
            self.ffn_1,
            nn.Dropout(dropout),
            nn.GELU(),
            self.ffn_2,
        )

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.self_attn(x, x, x, mask=mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(attn_out) # (batch, seq_len, input_dim)
        x = self.norm1(x) # (batch, seq_len, input_dim)

        ffn_out = self.ffn(x) # (batch, seq_len, input_dim)
        x = x + self.dropout(ffn_out) # (batch, seq_len, input_dim)
        x = self.norm2(x) # (batch, seq_len, input_dim)

        return x
	
    
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x
	
    
class CrossFeatureLayer(nn.Module):
    def __init__(self, input_dim, num_layers,dropout=0.0) -> None:
        super(CrossFeatureLayer, self).__init__()
        
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)

        self.cross_layer_params = []
        self.cross_layer_norms = []
        
        for _ in range(num_layers):
            h = nn.Linear(input_dim, input_dim)
            init_weights(h)
            self.cross_layer_params += [h]

            g = nn.Sequential(
                nn.GELU(),
                nn.Dropout(dropout),
                nn.LayerNorm(input_dim)
            )

            self.cross_layer_norms += [g]

        self.cross_layer_params = nn.ModuleList(self.cross_layer_params)
        self.cross_layer_norms = nn.ModuleList(self.cross_layer_norms)

    def forward(self, x):
        x_initial = torch.Tensor(x) # (batch, ..., input_dim)

        for i in range(self.num_layers):
            x = x_initial*self.cross_layer_params[i](x) + x # (batch, ..., input_dim)
            x = self.cross_layer_norms[i](x) # (batch, ..., input_dim)
        
        return x

In [None]:
class MovieId:
    movie_id_emb = None

    def __new__(cls, movie_id_size, emb_size=512):
        if cls.movie_id_emb is None:
            cls.movie_id_emb = nn.Embedding(movie_id_size, emb_size, padding_idx=0)
        return cls.movie_id_emb

In [None]:
def emb_averaging(inp:torch.Tensor, emb_layer:nn.Module, padding_idx:int=0):
    # inp : (batch, num_tokens)
    embeddings = emb_layer(inp) # (batch, num_tokens, emb_size)
    mask = (inp != padding_idx).float().unsqueeze(-1) # (batch, num_tokens, 1)
    masked_embeddings = embeddings * mask # (batch, num_tokens, emb_size)
    sum_embeddings = 1.0 + torch.sum(masked_embeddings, dim=1) # (batch, emb_size)
    sequence_lengths = 1.0 + torch.sum(mask, dim=1) # (batch, 1) # prevents division by zero by + 1
    averaged_embeddings = sum_embeddings / sequence_lengths # (batch, emb_size)
    return averaged_embeddings

class MovieEncoder(nn.Module):
    def __init__(
            self, 
            movie_id_size, 
            movie_desc_size,
            movie_genres_size,
            movie_year_size, 
            embedding_size, 
            dropout=0.0
        ) -> None:
        
        super(MovieEncoder, self).__init__()
        
        self.movie_id_emb = MovieId(movie_id_size, 512)
        self.movie_desc_emb = nn.Embedding(movie_desc_size, 1024, padding_idx=0)
        self.movie_genres_emb = nn.Embedding(movie_genres_size, 8, padding_idx=0)
        self.movie_year_emb = nn.Embedding(movie_year_size, 16, padding_idx=0)

        self.fc_concat = nn.Linear(1560, embedding_size)
        init_weights(self.fc_concat)

        self.fc = nn.Sequential(
            self.fc_concat,
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(embedding_size)
        )

        self.cross_features = CrossFeatureLayer(1560, 3, 0.0)

    def forward(
            self, 
            ids:torch.Tensor, 
            descriptions:torch.Tensor, 
            genres:torch.Tensor, 
            years:torch.Tensor
        ):
        id_emb = self.movie_id_emb(ids) # (batch, 512)
        desc_emb = emb_averaging(descriptions, self.movie_desc_emb) # (batch, 1024)
        genres_emb = emb_averaging(genres, self.movie_genres_emb) # (batch, 8)
        years_emb = self.movie_year_emb(years) # (batch, 16)

        movie_embedding = torch.concat([id_emb, desc_emb, genres_emb, years_emb], dim=-1) # (batch, 1560)
        movie_embedding = self.cross_features(movie_embedding) + movie_embedding # (batch, 1560)
        movie_embedding = self.fc(movie_embedding) # (batch, emb_size)

        return movie_embedding

In [None]:
class UserEncoder(nn.Module):
    def __init__(
            self, 
            user_id_size, 
            user_prev_rated_movie_ids_size,
            embedding_size, 
            prev_rated_seq_len, 
            num_encoder_layers, 
            num_heads=3, 
            dim_ff=512,
            dropout=0.0
        ) -> None:

        super(UserEncoder, self).__init__()

        self.user_id_emb = nn.Embedding(user_id_size, 512, padding_idx=0)
        self.movie_id_emb = MovieId(user_prev_rated_movie_ids_size, 512)

        self.positional_encoding = PositionalEncoding(512, prev_rated_seq_len, 0.0)
        self.encoder_block = Encoder(num_encoder_layers, 512, num_heads, dim_ff, 0.0)

        self.fc_concat = nn.Linear(1024, embedding_size)
        init_weights(self.fc_concat)

        self.fc = nn.Sequential(
            self.fc_concat,
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(embedding_size)
        )

        self.num_heads = num_heads


    def forward(
            self, 
            user_ids:torch.Tensor, 
            prev_rated_movie_ids:torch.Tensor, 
            prev_ratings:torch.Tensor
        ):
        user_id_emb:torch.Tensor = self.user_id_emb(user_ids) # (batch, 512)

        mask = (prev_rated_movie_ids != 0).float().unsqueeze(-1) # (batch, prev_rated_seq_len, 1)
        mask = torch.matmul(mask, mask.transpose(-2,-1)).unsqueeze(1).repeat(1,self.num_heads,1,1) # (batch, num_heads, prev_rated_seq_len, prev_rated_seq_len)
        
        rated_movie_emb = self.movie_id_emb(prev_rated_movie_ids)   # (batch, prev_rated_seq_len, 512)
        rated_movie_emb = self.positional_encoding(rated_movie_emb) # (batch, prev_rated_seq_len, 512)
        rated_movie_emb = self.encoder_block(rated_movie_emb, mask) # (batch, prev_rated_seq_len, 512)

        rated_movie_ratings = prev_ratings.unsqueeze(1) # (batch, 1, prev_rated_seq_len)
        # weighted sum of ratings
        rated_movie_emb_weighted = torch.matmul(rated_movie_ratings, rated_movie_emb).squeeze(1) # (batch, 512)

        user_embedding = torch.concat([user_id_emb, rated_movie_emb_weighted], dim=-1) # (batch, 1024)
        user_embedding = self.fc(user_embedding) # (batch, emb_size)

        return user_embedding

In [None]:
class RecommenderSystem(nn.Module):
    def __init__(
            self, 
            user_id_size, 
            user_prev_rated_movie_ids_size,
            user_embedding_size, 
            user_prev_rated_seq_len, 
            user_num_encoder_layers, 
            user_num_heads, 
            user_dim_ff,
            user_dropout,
            movie_id_size, 
            movie_desc_size,
            movie_genres_size,
            movie_year_size, 
            movie_embedding_size, 
            movie_dropout,
            embedding_size,
            dropout=0.0
        ) -> None:

        super(RecommenderSystem, self).__init__()

        self.movie_encoder = \
            MovieEncoder\
            (
                movie_id_size, 
                movie_desc_size,
                movie_genres_size,
                movie_year_size, 
                movie_embedding_size, 
                movie_dropout
            )
        
        self.user_encoder = \
            UserEncoder\
            (
                user_id_size, 
                user_prev_rated_movie_ids_size,
                user_embedding_size, 
                user_prev_rated_seq_len, 
                user_num_encoder_layers, 
                user_num_heads, 
                user_dim_ff,
                user_dropout
            )

        self.fc_concat = nn.Linear(user_embedding_size + movie_embedding_size, embedding_size)
        init_weights(self.fc_concat)

        self.fc = nn.Sequential(
            self.fc_concat,
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(embedding_size)
        )

        self.cross_features = CrossFeatureLayer(embedding_size, 3, 0.0)

        self.fc_out = nn.Linear(embedding_size, 1)
        init_weights(self.fc_out)

        self.out = nn.Sequential(
            self.fc_out,
            nn.Sigmoid()
        )


    def forward(
            self, 
            user_ids:torch.Tensor, # (batch,)
            user_prev_rated_movie_ids:torch.Tensor, 
            user_prev_ratings:torch.Tensor,
            movie_ids:torch.Tensor, 
            movie_descriptions:torch.Tensor, 
            movie_genres:torch.Tensor, 
            movie_years:torch.Tensor
        ):
        
        movie_embeddings = \
            self.movie_encoder\
            (
                movie_ids, 
                movie_descriptions, 
                movie_genres, 
                movie_years
            ) # (batch, 1, embedding_size)
        
        user_embeddings = \
            self.user_encoder\
                (
                    user_ids, 
                    user_prev_rated_movie_ids, 
                    user_prev_ratings,
                )                     # (batch, 1, embedding_size), (batch, movie_seq_len, embedding_size)
        
        emb_concat = torch.concat([movie_embeddings, user_embeddings], dim=-1)
        
        emb  = self.fc_concat(emb_concat)
        emb  = self.cross_features(emb)
        out  = self.out(emb)

        return out

In [None]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup, max_iters):
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        if epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [None]:
movie_ids = df_movies["movieId"].to_numpy(dtype=np.uint32)
movie_descriptions = df_movies["description"].to_numpy(dtype=np.uint32)
movie_genres = df_movies["genres"].to_numpy(dtype=np.uint8)
movie_years = df_movies["movie_year"].to_numpy(dtype=np.uint16)



In [44]:
g = df_ratings_train["prev_movie_ids"].to_numpy()

In [57]:
import torch

In [64]:
def pad_batch(values, dtype, max_seq_len=None):
    if max_seq_len is None:
        max_seq_len = max([len(x) for x in values])
    
    arr = np.zeros((len(values), max_seq_len), dtype=dtype)

    for i in range(len(values)):
        k = max_seq_len-1
        for j in range(len(values[i])-1, -1, -1):
            arr[i,k] = values[i][j]
            k -= 1
    
    return arr


In [None]:
def prepare_batches(n, batch_size=128):
    max_seq_len = 20
    columns = df_ratings_train.columns
    df_ratings_train_mmap = np.memmap("df_ratings_train.mmap", dtype=np.object_, mode="r", shape=df_ratings_train.shape)

    while True:
        for i in range(0, n, batch_size):
            df_ratings_batch = df_ratings_train_mmap[i:min(n,i+batch_size)]

            df_ratings_batch_df = pd.DataFrame(df_ratings_batch, columns=columns)
            df_ratings_batch_df = df_ratings_batch_df.merge(df_movies, on=["movieId"], how="left")

            user_ids = df_ratings_batch_df["userId"].to_numpy(dtype=np.uint32)
            user_prev_rated_movie_ids = pad_batch(df_ratings_batch_df["prev_movie_ids"].to_numpy(), dtype=np.uint32, max_seq_len=max_seq_len)
            user_prev_ratings = pad_batch(df_ratings_batch_df["prev_ratings"].to_numpy(), dtype=np.float32, max_seq_len=max_seq_len)

            movie_ids = df_ratings_batch_df["movieId"].to_numpy(dtype=np.uint32)
            movie_descriptions = pad_batch(df_ratings_batch_df["description"].to_numpy(), dtype=np.uint32)
            movie_genres = pad_batch(df_ratings_batch_df["genres"].to_numpy(), dtype=np.uint8)
            movie_years = df_ratings_batch_df["movie_year"].to_numpy(dtype=np.uint16)

            user_ids = torch.from_numpy(user_ids)
            user_prev_rated_movie_ids = torch.from_numpy(user_prev_rated_movie_ids)
            user_prev_ratings = torch.from_numpy(user_prev_ratings)

            movie_ids = torch.from_numpy(movie_ids)
            movie_descriptions = torch.from_numpy(movie_descriptions)
            movie_genres = torch.from_numpy(movie_genres)
            movie_years = torch.from_numpy(movie_years)

            labels = torch.from_numpy(df_ratings_batch_df["label"].to_numpy(dtype=np.uint32))

            yield [user_ids, user_prev_rated_movie_ids, user_prev_ratings, movie_ids, movie_descriptions, movie_genres, movie_years], labels
    

In [70]:
prepare_batches(df_ratings_train.shape[0])

tensor([[    0,     0,     0,  ..., 52087, 52378, 16288],
        [    0,     0,     0,  ..., 64905,  4044, 58330],
        [    0,     0,     0,  ..., 17668, 27702, 44365],
        ...,
        [    0,     0,     0,  ..., 37732, 13745, 34444],
        [    0,     0,     0,  ..., 39769,  4037, 70090],
        [    0,     0,     0,  ..., 54393, 51458,  6928]], dtype=torch.uint32)
torch.Size([128, 634])


AssertionError: 