In [10]:
# Enable the autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
! pip install --quiet "pytorch-lightning >=2.0,<2.6" "matplotlib" "torch >=1.8.1,<2.8" "seaborn" "torchmetrics >=1.0,<1.8" "numpy <3.0" "torchvision"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Standard libraries
import math
import os
import urllib.request
from functools import partial
from urllib.error import HTTPError

# Plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np

# PyTorch Lightning
import pytorch_lightning as pl
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Torchvision
import torchvision
from pytorch_lightning.callbacks import ModelCheckpoint
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm.notebook import tqdm

# Plot styling: default colormap, inline rendering, vector export formats, line width.
plt.set_cmap("cividis")
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
# Undo any seaborn styling so the matplotlib rcParams are restored to their original values.
sns.reset_orig()

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pick the accelerator to run on: CUDA first, then Apple MPS, then CPU.
# torch.backends.mps is the documented availability check; the getattr guard keeps
# this cell working on older torch releases that ship without an MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Device:", device)

Seed set to 42


Device: mps


<Figure size 640x480 with 0 Axes>

In [None]:
def scaled_dot_product(q:torch.Tensor, k:torch.Tensor, v:torch.Tensor, mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: Queries; the size of the last dimension is used as the scaling factor ``d_k``.
        k: Keys; its last two dimensions are transposed before the matmul with ``q``.
        v: Values that are combined using the attention weights.
        mask: Optional tensor broadcastable to the attention logits. Positions
            where ``mask == 0`` are excluded from the softmax.

    Returns:
        Tuple ``(values, attention)``: the attention-weighted values and the
        softmax-normalized attention weights over the last dimension.
    """
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        # A very large negative logit drives the softmax weight of masked positions to ~0.
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    # Weight the values with the normalized attention weights, not the raw logits.
    values = torch.matmul(attention, v)
    return values, attention

In [5]:
# Sanity check: run scaled_dot_product on small seeded random Q/K/V matrices
# (3 positions, 2 features each) and print the inputs and both outputs.
seq_len, d_k = 3, 2
pl.seed_everything(42)
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)
v = torch.randn(seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)

Seed set to 42


Q
 tensor([[ 0.3367,  0.1288],
        [ 0.2345,  0.2303],
        [-1.1229, -0.1863]])
K
 tensor([[ 2.2082, -0.6380],
        [ 0.4617,  0.2674],
        [ 0.5349,  0.8094]])
V
 tensor([[ 1.1103, -1.6898],
        [-0.9890,  0.9580],
        [ 1.3221,  0.8172]])
Values
 tensor([[ 0.5698, -0.1520],
        [ 0.5379, -0.0265],
        [ 0.2246,  0.5556]])
Attention
 tensor([[0.4028, 0.2886, 0.3086],
        [0.3538, 0.3069, 0.3393],
        [0.1303, 0.4630, 0.4067]])


In [29]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention with a fused Q/K/V projection and an output projection."""

    def __init__(self, input_dim, embed_dim, num_heads):
        """
        Args:
            input_dim: Size of the last dimension of the input features.
            embed_dim: Total size of the attention embedding across all heads.
            num_heads: Number of attention heads; must divide ``embed_dim``.
        """
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Stack all weight matrices 1...h together for efficiency
        # Note that in many implementations you see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3 * embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform initialize both projection weights and zero their biases."""
        # Original Transformer initialization, see PyTorch documentation
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x:torch.Tensor, mask=None, return_attention=False):
        """Apply multi-head self-attention to ``x``.

        Args:
            x: Input of shape [Batch, SeqLen, input_dim].
            mask: Optional mask forwarded unchanged to ``scaled_dot_product``.
            return_attention: If True, also return the attention weights.

        Returns:
            Output of shape [Batch, SeqLen, embed_dim]; when ``return_attention``
            is True, a tuple of that output and the attention weights of shape
            [Batch, Head, SeqLen, SeqLen].
        """
        batch_size, seq_length, _ = x.size()
        qkv:torch.Tensor = self.qkv_proj(x)

        # Separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)

        # Determine value outputs
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3)  # [Batch, SeqLen, Head, Dims]
        # Merge the heads back into the embedding size. This must be self.embed_dim:
        # the last dimension of x is input_dim, which may differ from embed_dim.
        values = values.reshape(batch_size, seq_length, self.embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [30]:
class EncoderBlock(nn.Module):
    """One Transformer encoder layer: self-attention and an MLP, each wrapped in a
    residual connection followed by layer normalization (post-norm)."""

    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        """Build the sub-layers of the block.

        Args:
            input_dim: Feature size of the input, which is also the output size.
            num_heads: Number of heads used by the self-attention layer.
            dim_feedforward: Width of the hidden layer of the MLP.
            dropout: Probability used by every dropout layer in the block.
        """
        super().__init__()

        # Self-attention keeps the feature size: input_dim in, input_dim out.
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # Position-wise feed-forward network: expand, dropout, ReLU, project back.
        feedforward_layers = [
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_feedforward, input_dim),
        ]
        self.linear_net = nn.Sequential(*feedforward_layers)

        # One LayerNorm per residual branch, plus a dropout shared by both branches.
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is passed on to the attention layer."""
        # Residual branch 1: self-attention, then normalize.
        attended = self.self_attn(x, mask=mask)
        hidden = self.norm1(x + self.dropout(attended))

        # Residual branch 2: feed-forward network, then normalize.
        transformed = self.linear_net(hidden)
        return self.norm2(hidden + self.dropout(transformed))

In [31]:
class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` EncoderBlock modules applied one after another."""

    def __init__(self, num_layers, **block_args):
        """
        Args:
            num_layers: Number of EncoderBlock instances to stack.
            **block_args: Keyword arguments forwarded unchanged to every EncoderBlock.
        """
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(**block_args) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer in order, giving each layer the same ``mask``."""
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x

    def get_attention_maps(self, x, mask=None):
        """Collect the attention weights of every layer for the input ``x``.

        Returns:
            List with one attention tensor per layer, in layer order.
        """
        attention_maps = []
        for layer in self.layers:
            _, attn_map = layer.self_attn(x, mask=mask, return_attention=True)
            attention_maps.append(attn_map)
            # Propagate with the same mask that forward() uses; otherwise the maps of
            # deeper layers would be computed from activations the model never produces.
            x = layer(x, mask=mask)
        return attention_maps

In [32]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of sequences."""

    def __init__(self, d_model, max_len=5000):
        """Positional Encoding.

        Args:
            d_model: Hidden dimensionality of the input (even or odd).
            max_len: Maximum length of a sequence to expect.

        """
        super().__init__()

        # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns, so trim the
        # frequencies; for even d_model the slice covers all of div_term.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)

        # register_buffer => Tensor which is not a parameter, but should be part of the modules state.
        # Used for tensors that need to be on the same device as the module.
        # persistent=False tells PyTorch to not add the buffer to the state dict (e.g. when we save the model)
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x:torch.Tensor):
        """Add the encoding of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence length (at most ``max_len``)
                and whose last dimension is ``d_model``.
        """
        x = x + self.pe[:, : x.size(1)]
        return x

In [12]:
# Scratch cell: rebuild the sinusoidal table with the same statements as
# PositionalEncoding.__init__, for a small max_len/d_model, to inspect it directly.
max_len = 100
d_model = 32
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)  # add a leading axis of size 1 so the table broadcasts over a batch

In [16]:
# Slicing the first 20 positions keeps the leading axis of size 1.
pe[:, :20].size()

torch.Size([1, 20, 32])

In [33]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    """Cosine learning-rate decay with a linear warmup at the start."""

    def __init__(self, optimizer, warmup, max_iters):
        """
        Args:
            optimizer: Optimizer whose learning rates are scheduled.
            warmup: Number of scheduler steps over which the factor ramps up
                linearly; 0 disables the warmup.
            max_iters: Number of steps after which the cosine factor reaches 0.
        """
        # Both attributes must be set before the base constructor runs,
        # because it already performs an initial step that calls get_lr().
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        """Return every base learning rate scaled by the factor of the current step."""
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        """Return the multiplicative factor for step ``epoch``: the cosine decay,
        additionally scaled by ``epoch / warmup`` while ``epoch <= warmup``."""
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        # Skip the ramp entirely when warmup is 0 to avoid dividing by zero at step 0.
        if self.warmup > 0 and epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [34]:
class TransformerPredictor(pl.LightningModule):
    """Lightning module combining an input projection, positional encoding, a
    Transformer encoder and a per-position classification head.

    Subclasses supply the task by implementing the training/validation/test steps.
    """

    def __init__(
        self,
        input_dim,
        model_dim,
        num_classes,
        num_heads,
        num_layers,
        lr,
        warmup,
        max_iters,
        dropout=0.0,
        input_dropout=0.0,
    ):
        """Store the hyperparameters and build the network.

        Args:
            input_dim: Feature size of the raw input.
            model_dim: Feature size used inside the Transformer.
            num_classes: Number of classes predicted for every sequence element.
            num_heads: Number of attention heads per encoder block.
            num_layers: Number of stacked encoder blocks.
            lr: Learning rate handed to the optimizer.
            warmup: Number of warmup steps of the learning-rate scheduler.
            max_iters: Total number of training iterations, required by the
                cosine warmup scheduler.
            dropout: Dropout probability used inside the model.
            input_dropout: Dropout probability applied to the input features.
        """
        super().__init__()
        self.save_hyperparameters()
        self._create_model()

    def _create_model(self):
        """Instantiate all sub-networks from the saved hyperparameters."""
        hp = self.hparams
        model_dim = hp.model_dim

        # Project the input features to the model dimension.
        self.input_net = nn.Sequential(
            nn.Dropout(hp.input_dropout),
            nn.Linear(hp.input_dim, model_dim),
        )
        # Sinusoidal position information for the sequence elements.
        self.positional_encoding = PositionalEncoding(d_model=model_dim)
        # Encoder stack; the feed-forward width is twice the model dimension.
        self.transformer = TransformerEncoder(
            num_layers=hp.num_layers,
            input_dim=model_dim,
            dim_feedforward=2 * model_dim,
            num_heads=hp.num_heads,
            dropout=hp.dropout,
        )
        # Classification head applied to every sequence element.
        self.output_net = nn.Sequential(
            nn.Linear(model_dim, model_dim),
            nn.LayerNorm(model_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(hp.dropout),
            nn.Linear(model_dim, hp.num_classes),
        )

    def forward(self, x, mask=None, add_positional_encoding=True):
        """Compute per-position class scores.

        Args:
            x: Input features of shape [Batch, SeqLen, input_dim].
            mask: Optional mask passed on to the attention layers.
            add_positional_encoding: Whether to add the positional encoding to
                the projected input; some tasks may not want it.
        """
        features = self.input_net(x)
        if add_positional_encoding:
            features = self.positional_encoding(features)
        encoded = self.transformer(features, mask=mask)
        return self.output_net(encoded)

    @torch.no_grad()
    def get_attention_maps(self, x, mask=None, add_positional_encoding=True):
        """Return the attention matrices of all encoder layers for one batch.

        Takes the same arguments as the forward pass; runs without gradients.
        """
        features = self.input_net(x)
        if add_positional_encoding:
            features = self.positional_encoding(features)
        return self.transformer.get_attention_maps(features, mask=mask)

    def configure_optimizers(self):
        """Create the Adam optimizer and keep the scheduler on the module."""
        optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)

        # The scheduler is stored rather than returned: it has to advance once per
        # iteration (see optimizer_step), not once per epoch.
        self.lr_scheduler = CosineWarmupScheduler(
            optimizer, warmup=self.hparams.warmup, max_iters=self.hparams.max_iters
        )
        return optimizer

    def optimizer_step(self, *args, **kwargs):
        """Run the regular optimizer step, then advance the scheduler by one iteration."""
        super().optimizer_step(*args, **kwargs)
        self.lr_scheduler.step()  # one scheduler step per training iteration

    def training_step(self, batch, batch_idx):
        """Task-specific; must be provided by a subclass."""
        raise NotImplementedError

    def validation_step(self, batch, batch_idx):
        """Task-specific; must be provided by a subclass."""
        raise NotImplementedError

    def test_step(self, batch, batch_idx):
        """Task-specific; must be provided by a subclass."""
        raise NotImplementedError

In [35]:
class ReverseDataset(data.Dataset):
    """Synthetic dataset of random integer sequences, each paired with its reversal."""

    def __init__(self, num_categories, seq_len, size):
        """
        Args:
            num_categories: Exclusive upper bound of the integer values drawn.
            seq_len: Length of every sequence.
            size: Number of sequences in the dataset.
        """
        super().__init__()
        self.num_categories, self.seq_len, self.size = num_categories, seq_len, size

        # All sequences are drawn once up front as a [size, seq_len] integer tensor.
        self.data = torch.randint(num_categories, size=(size, seq_len))

    def __len__(self):
        """Number of sequences."""
        return self.size

    def __getitem__(self, idx):
        """Return ``(sequence, reversed sequence)`` for the entry at ``idx``."""
        sequence = self.data[idx]
        return sequence, torch.flip(sequence, dims=(0,))

In [36]:
# Factory for ReverseDataset with num_categories=10 and seq_len=16; only the size varies.
dataset = partial(ReverseDataset, 10, 16)
# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = data.DataLoader(dataset(50000), batch_size=128, shuffle=True, drop_last=True, pin_memory=True)
val_loader = data.DataLoader(dataset(1000), batch_size=128)
test_loader = data.DataLoader(dataset(10000), batch_size=128)

In [21]:
# Show one training example: the labels are the input sequence in reverse order.
inp_data, labels = train_loader.dataset[0]
print("Input data:", inp_data)
print("Labels:    ", labels)

Input data: tensor([9, 6, 2, 0, 6, 2, 7, 9, 7, 3, 3, 4, 3, 7, 0, 9])
Labels:     tensor([9, 0, 7, 3, 4, 3, 3, 7, 9, 7, 2, 6, 0, 2, 6, 9])


In [37]:
class ReversePredictor(TransformerPredictor):
    """TransformerPredictor trained to output its integer input sequence in reverse."""

    def _calculate_loss(self, batch, mode="train"):
        """Compute and log cross-entropy loss and accuracy for one batch.

        Args:
            batch: Pair ``(inp_data, labels)`` of integer tensors.
            mode: Prefix for the logged metric names (``{mode}_loss``, ``{mode}_acc``).

        Returns:
            Tuple ``(loss, acc)``.
        """
        tokens, targets = batch
        # The integer categories are fed to the model as one-hot float vectors.
        one_hot_tokens = F.one_hot(tokens, num_classes=self.hparams.num_classes).float()

        logits = self.forward(one_hot_tokens, add_positional_encoding=True)
        # Flatten batch and sequence dimensions so every position is one sample.
        flat_logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.view(-1))
        hits = logits.argmax(dim=-1) == targets
        acc = hits.float().mean()

        self.log(f"{mode}_loss", loss)
        self.log(f"{mode}_acc", acc)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Return the training loss for the batch."""
        return self._calculate_loss(batch, mode="train")[0]

    def validation_step(self, batch, batch_idx):
        """Log validation metrics; returns nothing."""
        self._calculate_loss(batch, mode="val")

    def test_step(self, batch, batch_idx):
        """Log test metrics; returns nothing."""
        self._calculate_loss(batch, mode="test")

In [38]:
def train_reverse(**kwargs):
    """Train a ReversePredictor and evaluate it on the validation and test loaders.

    Reads the notebook globals ``train_loader``, ``val_loader``, ``test_loader``
    and ``device``.

    Args:
        **kwargs: Forwarded to ``ReversePredictor``; ``max_iters`` is supplied
            here from the number of epochs and the training loader length.

    Returns:
        Tuple ``(model, result)`` where ``result`` maps ``"test_acc"`` and
        ``"val_acc"`` to the measured accuracies and ``model`` is on ``device``.
    """
    trainer = pl.Trainer(
        # Let Lightning pick the available accelerator instead of requiring Apple MPS,
        # so the notebook also runs on CUDA and CPU-only machines.
        accelerator="auto",
        devices=1,
        max_epochs=10,
        gradient_clip_val=5,
    )
    trainer.logger._default_hp_metric = None  # Optional logging argument that we don't need

    model = ReversePredictor(max_iters=trainer.max_epochs * len(train_loader), **kwargs)
    trainer.fit(model, train_loader, val_loader)

    # Evaluate the final weights on the validation and test sets. Both runs go through
    # test_step, so the metric is logged as "test_acc" in either case.
    val_result = trainer.test(model, dataloaders=val_loader, verbose=False)
    test_result = trainer.test(model, dataloaders=test_loader, verbose=False)
    result = {"test_acc": test_result[0]["test_acc"], "val_acc": val_result[0]["test_acc"]}

    model = model.to(device)
    return model, result

In [39]:
# Train a small single-layer, single-head model. Input and output sizes both equal
# the number of categories because the inputs are one-hot encoded.
reverse_model, reverse_result = train_reverse(
    input_dim=train_loader.dataset.num_categories,
    model_dim=32,
    num_heads=1,
    num_classes=train_loader.dataset.num_categories,
    num_layers=1,
    dropout=0.0,
    lr=5e-4,
    warmup=50,
)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | input_net           | Sequential         | 352    | train
1 | positional_encoding | PositionalEncoding | 0      | train
2 | transformer         | TransformerEncoder | 8.5 K  | train
3 | output_net          | Sequential         | 1.4 K  | train
-------------------------------------------------------------------
10.3 K    Trainable params
0         Non-trainable params
10.3 K    Total params
0.041     Total estimated model params size (MB)
24        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 390/390 [00:03<00:00, 128.20it/s, v_num=9]        

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 390/390 [00:03<00:00, 127.71it/s, v_num=9]
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:00<00:00, 281.36it/s]
Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 79/79 [00:00<00:00, 385.64it/s]


In [40]:
# Report the accuracies (stored as fractions) as percentages.
print("Val accuracy:  %4.2f%%" % (100.0 * reverse_result["val_acc"]))
print("Test accuracy: %4.2f%%" % (100.0 * reverse_result["test_acc"]))

Val accuracy:  73.88%
Test accuracy: 73.92%


In [2]:
import pandas as pd
import os

In [7]:
# Location of the dataset CSV files, relative to the notebook's working directory.
folder = "datasets/ml-32m"

ratings_path = os.path.join(folder, 'ratings.csv')
genres_path = os.path.join(folder, 'movies.csv')
tags_path = os.path.join(folder, 'tags.csv')

rating_column_names = ['userId', 'movieId', 'rating', 'timestamp']
genres_column_names = ['movieId', 'title', 'genres']
tags_column_names = ['userId', 'movieId', 'tag', 'timestamp']

# header=0 together with names= replaces the header row of each file with the names
# above; explicit dtypes keep the id columns at 32 bits.
df_rating = pd.read_csv(ratings_path, sep=',', names=rating_column_names, dtype={'userId':'int32', 'movieId':'int32', 'rating':float, 'timestamp':'int64'}, header=0)
df_genres = pd.read_csv(genres_path, sep=',', names=genres_column_names, dtype={'movieId':'int32', 'title':'object', 'genres':'object'}, header=0)
df_tags = pd.read_csv(tags_path, sep=',', names=tags_column_names, dtype={'userId':'int32', 'movieId':'int32', 'tag':'object', 'timestamp':'int64'}, header=0)

# Drop rows missing any of the listed columns; for tags only movieId and tag are kept.
df_rating.dropna(inplace=True, subset=['userId', 'movieId', 'rating'])
df_genres.dropna(inplace=True, subset=['movieId', 'title', 'genres'])
df_tags.dropna(inplace=True, subset=['userId', 'movieId', 'tag'])
df_tags.drop(columns=["userId","timestamp"], inplace=True)

# Extract movie genres: lower-case the string and split it on '|' into a list
df_genres['genres'] = df_genres['genres'].apply(lambda x: x.lower().split('|'))

# English stop words that remove_stop() filters out of title and tag tokens
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
def remove_stop(x):
    """Return the tokens of ``x`` that are non-empty and not listed in ``stopwords``,
    keeping their original order."""
    return [token for token in x if len(token) > 0 and token not in stopwords]

def flatten_lists(x):
    """Split the first 20 strings of ``x`` on single spaces and return the distinct
    pieces as a list (in the iteration order of the underlying set)."""
    distinct_words = set().union(*(phrase.split(" ") for phrase in x[:20]))
    return list(distinct_words)

# Extract the movie year: the first 4-digit number in parentheses in the title.
# Titles without one get the placeholder year 2025.
df_genres['movie_year'] = df_genres['title'].str.extract(r'\((\d{4})\)').fillna("2025").astype('int')

# Tokenize titles: remove the "(YYYY)" groups, strip every character that is not a
# letter, digit or whitespace, lower-case, split on single spaces, drop stop words.
df_genres['title'] = df_genres['title'].str.replace(r'\((\d{4})\)', '', regex=True)
df_genres['title'] = df_genres['title'].str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
df_genres['title'] = df_genres['title'].apply(lambda x: x.strip().lower().split(" "))
df_genres['title'] = df_genres['title'].apply(lambda x: remove_stop(x))

# Clean tags the same way, then collect the distinct tags of every movie.
df_tags['tag'] = df_tags['tag'].str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
df_tags['tag'] = df_tags['tag'].apply(lambda x: x.strip().lower())
df_tags = df_tags.groupby("movieId").agg(set).reset_index()
df_tags['tag'] = df_tags['tag'].apply(list)
# NOTE(review): flatten_lists keeps only the first 20 tags of a list that was built
# from a set, so which tags survive for heavily tagged movies is not a stable choice.
df_tags['tag'] = df_tags['tag'].apply(lambda x: flatten_lists(x))
df_tags['tag'] = df_tags['tag'].apply(lambda x: remove_stop(x))
df_tags['tag'] = df_tags['tag'].astype("object")

# Attach movie features and tags to every rating; left joins keep all ratings.
df = df_rating.merge(df_genres, on=['movieId'], how='left')
df = df.merge(df_tags, on=['movieId'], how='left')
# Movies without tags get the placeholder list [""]. The fill value is a dict with
# one entry per row index. NOTE(review): this builds one dict entry per rating row and
# leaves an empty-string token in the description — confirm both are intended.
df["tag"] = df["tag"].fillna({i: [""] for i in df.index})
# List concatenation: title tokens followed by tag tokens.
df["description"] = df["title"] + df["tag"]
df.drop(columns=["tag"], inplace=True)
df.drop(columns=["title"], inplace=True)

In [8]:
# Preview the first 100 rows of the merged ratings frame.
df[:100]

Unnamed: 0,userId,movieId,rating,timestamp,genres,movie_year,description
0,1,17,4.0,944249077,"[drama, romance]",1995,"[sense, sensibility, decorum, 18th, bibliothek..."
1,1,25,1.0,944250228,"[drama, romance]",1995,"[leaving, las, vegas, existential, enough, lov..."
2,1,29,2.0,943230976,"[adventure, drama, fantasy, mystery, sci-fi]",1995,"[city, lost, children, cit, des, enfants, perd..."
3,1,30,5.0,944249077,"[crime, drama]",1995,"[shanghai, triad, yao, yao, yao, dao, waipo, q..."
4,1,32,5.0,943228858,"[mystery, sci-fi, thriller]",1995,"[twelve, monkeys, aka, 12, monkeys, theater, t..."
...,...,...,...,...,...,...,...
95,1,1944,2.0,943231120,"[drama, romance, war]",1953,"[eternity, military, prostitute, boxing, infid..."
96,1,1952,4.0,944253272,[drama],1969,"[midnight, cowboy, clash, sex, frontal, cultur..."
97,1,1960,1.0,943231236,[drama],1987,"[last, emperor, bibliothek, theater, better, c..."
98,1,1961,1.0,944250182,[drama],1988,"[rain, man, love, savant, best, brief, story, ..."


In [9]:
def normalize_ratings(df:pd.DataFrame):
    df2 = df[["userId", "rating"]].groupby(by=["userId"]).agg(mean_user_rating=('rating', 'mean'), std_user_rating=('rating', 'std'))
    df = df.merge(df2, on=["userId"], how="inner")
    df["normalized_rating"] = (df["rating"] - df["mean_user_rating"])/df["std_user_rating"]
    df["normalized_rating"] = df["normalized_rating"].fillna(df["rating"])
    return df

In [10]:
# Keep only the normalized rating: drop the raw rating and the per-user statistics.
df = normalize_ratings(df)
df.drop(columns=["mean_user_rating", "std_user_rating", "rating"], inplace=True)

In [11]:
# Preview the first 100 rows after rating normalization.
df[:100]

Unnamed: 0,userId,movieId,timestamp,genres,movie_year,description,normalized_rating
0,1,17,944249077,"[drama, romance]",1995,"[sense, sensibility, decorum, 18th, bibliothek...",0.304372
1,1,25,944250228,"[drama, romance]",1995,"[leaving, las, vegas, existential, enough, lov...",-1.646377
2,1,29,943230976,"[adventure, drama, fantasy, mystery, sci-fi]",1995,"[city, lost, children, cit, des, enfants, perd...",-0.996127
3,1,30,944249077,"[crime, drama]",1995,"[shanghai, triad, yao, yao, yao, dao, waipo, q...",0.954622
4,1,32,943228858,"[mystery, sci-fi, thriller]",1995,"[twelve, monkeys, aka, 12, monkeys, theater, t...",0.954622
...,...,...,...,...,...,...,...
95,1,1944,943231120,"[drama, romance, war]",1953,"[eternity, military, prostitute, boxing, infid...",-0.996127
96,1,1952,944253272,[drama],1969,"[midnight, cowboy, clash, sex, frontal, cultur...",0.304372
97,1,1960,943231236,[drama],1987,"[last, emperor, bibliothek, theater, better, c...",-1.646377
98,1,1961,944250182,[drama],1988,"[rain, man, love, savant, best, brief, story, ...",-1.646377


In [12]:
def split_train_test(df:pd.DataFrame, min_rated=10, test_ratio=0.8, val_ratio=0.8):
    """Split ratings chronologically into train, validation and test frames.

    Rows are sorted by ``timestamp``, users with at most ``min_rated`` ratings are
    removed, and the remaining rows are cut by position, oldest first.

    Args:
        df: Ratings with at least ``userId``, ``movieId`` and ``timestamp`` columns.
        min_rated: Users are kept only if they have strictly more ratings than this.
        test_ratio: Despite the name, the fraction of rows that goes to
            train + validation; the remaining newest rows form the test set.
        val_ratio: Despite the name, the fraction of the train + validation rows
            that goes to training; the rest forms the validation set.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of slices of one frame with a fresh
        0..n-1 index.
    """
    print("Splitting data into train test and validation...")
    # Split data into training, testing and validation
    df = df.sort_values(by='timestamp')

    # Keep only users who have rated more than 'min_rated' movies. Counting with
    # transform("size") avoids materializing a Python list of movie ids per user.
    ratings_per_user = df.groupby("userId")["movieId"].transform("size")
    df = df[ratings_per_user > min_rated].reset_index(drop=True)

    n = df.shape[0]
    m = int(test_ratio*n)

    df_train_val = df.iloc[:m]
    df_test = df.iloc[m:]

    k = int(val_ratio*m)
    df_train = df_train_val.iloc[:k]
    df_val = df_train_val.iloc[k:]

    return df_train, df_val, df_test

In [13]:
# min_rated=5 keeps users with more than 5 ratings (the filter is a strict ">").
df_train, df_val, df_test = split_train_test(df, min_rated=5)

Splitting data into train test and validation...


In [9]:
def transform(x, vocab):
    """Map ``x`` to vocabulary ids, using 0 for anything missing from ``vocab``.

    A list is encoded element by element into a new list; any other value is
    looked up directly and a single id is returned.
    """
    if not isinstance(x, list):
        return vocab.get(x, 0)
    return [vocab.get(token, 0) for token in x]

In [10]:
def categorical_encoding(df:pd.DataFrame, col:str, max_vocab_size=1000):
    """Replace the values of ``df[col]`` by integer ids and return the vocabulary.

    The ``max_vocab_size`` most frequent values are kept (ties keep first-seen
    order), sorted, and numbered from 1. Id 0 is used for every other value.
    If the first entry of the column is a list, every entry is treated as a list
    of values. ``df[col]`` is overwritten in place.

    Args:
        df: Frame containing ``col``.
        col: Name of the column to encode.
        max_vocab_size: Maximum number of distinct values to keep. May be a float
            (for example ``1e100`` as "no limit"); it is truncated to an int.

    Returns:
        Tuple ``(encoded column, vocab)`` where ``vocab`` maps value -> id.
    """
    from collections import Counter
    from itertools import chain

    all_vals = df[col].tolist()

    if len(all_vals) > 0 and isinstance(all_vals[0], list):
        counts = Counter(chain.from_iterable(all_vals))
    else:
        counts = Counter(all_vals)

    # Stable sort by descending frequency, so ties keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slice bounds must be integers; callers pass floats such as 1e6 or 1e100.
    limit = int(min(max_vocab_size, len(ranked)))
    kept = sorted(value for value, _ in ranked[:limit])
    vocab = {value: idx for idx, value in enumerate(kept, start=1)}

    df[col] = df[col].apply(lambda x: transform(x, vocab))
    return df[col], vocab

In [11]:
# Build one vocabulary per categorical column from the training split only.
vocabulary = {}
# Per-column vocabulary caps; the very large float values act as "no limit".
max_vocab_size = {'userId':1e100, 'movieId':1e100, 'title':1e6, 'genres':100, 'movie_year':1e100}

# NOTE(review): this loop expects a 'title' column, while the preprocessing cell drops
# 'title' after building 'description' — confirm which version of the frame it ran on.
for col in ['userId', 'movieId', 'title', 'genres', 'movie_year']:
    print(col)
    df_train[col], v = categorical_encoding(df_train, col, max_vocab_size[col])
    vocabulary[col] = v

userId
movieId
title
genres
movie_year


In [12]:
# Encode the validation split with the training vocabularies; values not seen in
# training map to id 0.
# NOTE(review): reset_index() without drop=True keeps the old index as a new "index"
# column — confirm that column is wanted downstream.
df_val = df_val.reset_index()
for col in ['userId', 'movieId', 'title', 'genres', 'movie_year']:
    print(col)
    df_val[col] = df_val[col].apply(lambda x: transform(x, vocabulary[col]))

In [13]:
# Encode the test split with the training vocabularies; values not seen in
# training map to id 0.
# NOTE(review): reset_index() without drop=True keeps the old index as a new "index"
# column — confirm that column is wanted downstream.
df_test = df_test.reset_index()
for col in ['userId', 'movieId', 'title', 'genres', 'movie_year']:
    print(col)
    df_test[col] = df_test[col].apply(lambda x: transform(x, vocabulary[col]))

userId
movieId
title
genres
movie_year


In [None]:
# Persist the vocabularies and the encoded splits to the working directory so that
# later cells can reload them without repeating the preprocessing.
import joblib
joblib.dump(vocabulary, "vocabulary.pkl")
joblib.dump(df_train, "df_train.pkl")
joblib.dump(df_val, "df_val.pkl")
joblib.dump(df_test, "df_test.pkl")

['df_test.pkl']

In [16]:
# Preview the first 100 rows of the encoded training split.
df_train[:100]

Unnamed: 0,userId,movieId,timestamp,title,genres,movie_year,normalized_rating
0,18045,1149,789652004,"[12326, 24446, 45006, 12326, 45130, 10759, 455...","[9, 10, 16]",97,0.367212
1,22172,1053,789652009,"[15214, 6782, 45686, 4086, 17848, 18863, 33258...","[6, 7]",94,-0.484696
2,22172,47,789652009,"[37792, 1482, 37218, 37448, 4821, 9882, 46613,...","[15, 18]",101,1.741315
3,22172,21,789652009,"[16837, 38290, 31210, 14120, 18863, 14687, 115...","[6, 7, 18]",101,-0.484696
4,27825,2,822873600,"[22045, 31210, 18863, 15192, 35686, 4857, 2836...","[3, 5, 10]",101,0.291454
...,...,...,...,...,...,...,...
95,33616,66,823185246,"[23938, 25738, 401, 4769, 10331, 17848, 18863,...","[2, 17, 18]",102,-0.243857
96,33616,74,823185247,"[4377, 35829, 15375, 35700, 46670, 15447, 3479...","[9, 16]",102,-0.243857
97,33616,73,823185247,"[27541, 24252, 23737, 29713, 1103, 31134, 2394...","[9, 19]",101,0.614943
98,33616,9,823185248,"[40951, 10793, 17848, 18863, 20127, 32585, 146...",[2],101,-0.243857


In [3]:
def get_historical_user_features(df:pd.DataFrame, max_hist=20):
    """Attach to every rating the user's most recent earlier ratings.

    For each row, the user's ratings are ordered by ``timestamp`` and the up to
    ``max_hist`` ratings that precede the row in that order are collected.

    Args:
        df: Frame with ``userId``, ``movieId``, ``normalized_rating`` and
            ``timestamp`` columns. It is not modified.
        max_hist: Maximum history length per row; 0 yields empty histories.

    Returns:
        A new frame with the rows of ``df`` in their original order, a fresh
        0..n-1 index, and two extra list-valued columns: ``prev_movie_ids`` and
        ``prev_ratings`` (oldest first, empty for a user's first rating).
    """
    # Tag every row with its position on a copy, so the caller's frame is left untouched.
    work = df.assign(seq_id=range(df.shape[0]))
    ordered = work[["seq_id", "userId", "movieId", "normalized_rating", "timestamp"]].sort_values(by=["userId", "timestamp"])

    # One row per user holding that user's movies, ratings and row positions in time order.
    per_user = ordered[["userId", "movieId", "normalized_rating", "seq_id"]].groupby(by=["userId"]).agg(list).reset_index()

    user_ids = []
    p_m_ids = []
    p_r_ids = []
    p_seq_ids = []

    for u_id, m_ids, r_ids, seq_ids in zip(
        per_user["userId"].to_numpy(),
        per_user["movieId"],
        per_user["normalized_rating"],
        per_user["seq_id"],
    ):
        for j in range(len(m_ids)):
            # Window of the max_hist entries right before position j. Slicing with an
            # explicit start avoids copying the whole prefix for every row.
            start = max(0, j - max_hist)
            user_ids.append(u_id)
            p_seq_ids.append(seq_ids[j])
            p_m_ids.append(m_ids[start:j])
            p_r_ids.append(r_ids[start:j])

    history = pd.DataFrame({"userId":user_ids, "prev_movie_ids":p_m_ids, "prev_ratings":p_r_ids, "seq_id":p_seq_ids})
    # Map the histories back to the original row order through the row position.
    merged = work.merge(history, on=["userId", "seq_id"], how="left")
    merged.drop(columns=["seq_id"], inplace=True)

    return merged

In [4]:
import importlib
import ml_32m_py
import numpy as np

# Re-import ml_32m_py so changes to it are picked up in the running kernel.
# NOTE(review): if ml_32m_py is a compiled extension, importlib.reload may not load a
# rebuilt binary — confirm, or restart the kernel after rebuilding.
importlib.reload(ml_32m_py)

def get_historical_user_features_cpp(df:pd.DataFrame, max_hist=20):
    """Compute the rating-history columns through the ``ml_32m_py`` module.

    The id, rating and timestamp columns are converted to fixed-width NumPy arrays
    and handed to ``ml_32m_py.py_get_historical_features``. Its two results are
    stored on ``df`` as ``prev_movie_ids`` and ``prev_ratings``. ``df`` is modified
    in place and also returned.

    NOTE(review): presumably mirrors get_historical_user_features — the helper's
    exact semantics are not visible here; verify against its source.
    """
    def as_array(column, dtype):
        # Fresh array copy of one column in the dtype passed to the helper.
        return df[column].to_numpy().astype(dtype)

    users = as_array('userId', np.uint32)
    movies = as_array('movieId', np.uint32)
    scores = as_array('normalized_rating', np.float32)
    times = as_array('timestamp', np.uint64)

    history = ml_32m_py.py_get_historical_features(users, movies, times, scores, df.shape[0], max_hist)
    prev_movie_ids, prev_ratings = history

    df["prev_movie_ids"] = prev_movie_ids
    df["prev_ratings"] = prev_ratings

    return df

In [5]:
# Reload the vocabularies and encoded splits written by the earlier joblib.dump cell.
# SECURITY: joblib.load unpickles its input; only load files from a trusted source.
import joblib
vocabulary = joblib.load("vocabulary.pkl")
df_train = joblib.load("df_train.pkl")
df_val = joblib.load("df_val.pkl")
df_test = joblib.load("df_test.pkl")

In [6]:
# Add the history columns to each split; the prints mark how far the cell has progressed.
df_train = get_historical_user_features_cpp(df_train)
print("here1")
df_val = get_historical_user_features_cpp(df_val)
print("here2")
df_test = get_historical_user_features_cpp(df_test)
print("here3")

: 

In [None]:
# Overwrite the cached splits, now including the history columns.
joblib.dump(df_train, "df_train.pkl")
joblib.dump(df_val, "df_val.pkl")
joblib.dump(df_test, "df_test.pkl")

In [19]:
# Preview the first 100 training rows with their history columns.
df_train[:100]

Unnamed: 0,userId,movieId,timestamp,title,genres,movie_year,normalized_rating,prev_movie_ids,prev_ratings
0,1,2905,943226846,"[21839, 25689, 27697, 25787, 35686, 34951, 404...","[6, 9, 10]",105,0.304372,[],[]
1,1,2874,943226846,"[40573, 40547, 34790, 27555, 489, 43580, 42083...","[3, 9]",105,-1.646377,[2905],[0.3043722566393317]
2,1,2798,943226916,"[42431, 22828, 22828, 9185, 9393, 20127, 12873...","[2, 3, 6, 9, 19]",105,0.304372,"[2905, 2874]","[0.3043722566393317, -1.6463772063672941]"
3,1,2985,943226986,"[24401, 18835, 1189, 18789, 21685, 32055, 6138...",[9],105,-0.996127,"[2905, 2874, 2798]","[0.3043722566393317, -1.6463772063672941, 0.30..."
4,1,2790,943227458,"[21424, 24375, 36963, 28874, 46289, 16876, 202...",[9],105,-1.646377,"[2905, 2874, 2798, 2985]","[0.3043722566393317, -1.6463772063672941, 0.30..."
...,...,...,...,...,...,...,...,...,...
95,1,818,944248888,"[15774, 22114, 24254, 1339, 11053, 15020, 4185...",[9],102,-0.345878,"[1796, 1054, 175, 2881, 2152, 2411, 1037, 913,...","[0.3043722566393317, -1.6463772063672941, 0.30..."
96,1,601,944248943,"[14614, 29541, 18863, 40209, 46432, 9934, 4645...","[6, 7, 9, 18]",102,-0.996127,"[1054, 175, 2881, 2152, 2411, 1037, 913, 1238,...","[-1.6463772063672941, 0.3043722566393317, 0.95..."
97,1,2177,944248943,"[17337, 26815, 43604, 14120, 7283, 14769, 387,...","[7, 9, 18]",98,-1.646377,"[175, 2881, 2152, 2411, 1037, 913, 1238, 1227,...","[0.3043722566393317, 0.9546220776415403, -1.64..."
98,1,80,944248943,"[46146, 3863, 3725, 37367, 17276, 11609, 8394,...","[5, 9]",101,0.954622,"[2881, 2152, 2411, 1037, 913, 1238, 1227, 895,...","[0.9546220776415403, -1.6463772063672941, -0.3..."


In [23]:
# Give every row a random partition id in [1, num_parts] (random.randint includes both
# ends); the column is used as the partition key when the splits are written to parquet.
import random
num_parts = 32
df_train["partition"] = [random.randint(1, num_parts) for _ in range(len(df_train))]
df_val["partition"] = [random.randint(1, num_parts) for _ in range(len(df_val))]
df_test["partition"] = [random.randint(1, num_parts) for _ in range(len(df_test))]

In [24]:
# Preview the first 100 training rows with their partition ids.
df_train[:100]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year,normalized_rating,prev_movie_ids,prev_ratings,partition
0,1,2906,4.0,943226846,"[466, 547]","[6, 9, 10]",118,0.304372,[],[],1
1,1,2875,1.0,943226846,"[0, 842]","[3, 9]",118,-1.646377,[2906],[0.3043722566393317],24
2,1,2799,4.0,943226916,"[879, 494]","[2, 3, 6, 9, 19]",118,0.304372,"[2906, 2875]","[0.3043722566393317, -1.6463772063672941]",24
3,1,2986,2.0,943226986,"[0, 0]",[9],118,-0.996127,"[2906, 2875, 2799]","[0.3043722566393317, -1.6463772063672941, 0.30...",21
4,1,2791,1.0,943227458,"[0, 523]",[9],118,-1.646377,"[2906, 2875, 2799, 2986]","[0.3043722566393317, -1.6463772063672941, 0.30...",31
...,...,...,...,...,...,...,...,...,...,...,...
95,1,818,3.0,944248888,[0],[9],115,-0.345878,"[1797, 1054, 175, 2882, 2153, 2412, 1037, 913,...","[0.3043722566393317, -1.6463772063672941, 0.30...",29
96,1,601,2.0,944248943,[298],"[6, 7, 9, 18]",115,-0.996127,"[1054, 175, 2882, 2153, 2412, 1037, 913, 1238,...","[-1.6463772063672941, 0.3043722566393317, 0.95...",15
97,1,2178,1.0,944248943,"[366, 564]","[7, 9, 18]",111,-1.646377,"[175, 2882, 2153, 2412, 1037, 913, 1238, 1227,...","[0.3043722566393317, 0.9546220776415403, -1.64...",11
98,1,80,5.0,944248943,"[965, 0, 0, 0]","[5, 9]",114,0.954622,"[2882, 2153, 2412, 1037, 913, 1238, 1227, 895,...","[0.9546220776415403, -1.6463772063672941, -0.3...",11


In [25]:
# Write each split as a parquet dataset directory, partitioned by the "partition" column.
out_path = "parquet_dataset_ml_32m/"
df_train.to_parquet(out_path + "train/", partition_cols=["partition"])
df_val.to_parquet(out_path + "validation/", partition_cols=["partition"])
df_test.to_parquet(out_path + "test/", partition_cols=["partition"])

In [None]:
# Store the vocabularies next to the parquet splits so they are uploaded together.
import joblib
joblib.dump(vocabulary, f"parquet_dataset_ml_32m/vocabulary.pkl")

In [26]:
# Copy the dataset directory recursively to the GCS bucket; requires gsutil on PATH.
!gsutil -m cp -R parquet_dataset_ml_32m gs://r6-ae-dev-adperf-adintelligence-data/amondal/

/bin/bash: gsutil: command not found


In [28]:
import math
import os
import numpy as np
import pandas as pd
import random
import uuid
import joblib
import pytorch_lightning as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
# (benchmark=False stops cuDNN from auto-tuning its algorithm choice between runs)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pick the accelerator to run on: CUDA first, then Apple MPS, then CPU.
# torch.backends.mps is the documented availability check; the getattr guard keeps
# this cell working on older torch releases that ship without an MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Device:", device)

def checkpoint(model:nn.Module, optimizer:torch.optim.Optimizer, filename):
    """Save the state dicts of ``optimizer`` and ``model`` to ``filename`` as one
    dictionary with the keys ``'optimizer'`` and ``'model'``."""
    state = {'optimizer': optimizer.state_dict()}
    state['model'] = model.state_dict()
    torch.save(state, filename)

    
def load_model(filename, map_location=None):
    """Load a checkpoint written by ``checkpoint`` and return its state dicts.

    Args:
        filename: Path of the checkpoint file.
        map_location: Optional device remapping forwarded to ``torch.load``, for
            example ``"cpu"`` to load a checkpoint that was saved on an accelerator
            that is not available here. ``None`` keeps the default behavior.

    Returns:
        Tuple ``(model_state_dict, optimizer_state_dict)``.
    """
    # SECURITY: weights_only=False unpickles arbitrary Python objects; only load
    # checkpoint files from a trusted source.
    chkpt = torch.load(filename, map_location=map_location, weights_only=False)
    return chkpt['model'], chkpt['optimizer']

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 42


Device: mps
