In [1]:
import pandas as pd
from torch.utils.data import Dataset
import torch

In [2]:
df = pd.read_csv("../data/ml-100k/u.data", sep="\t", header=None)

In [3]:
# removing negative ratings
df = df[df[2] > 2]

In [4]:
df.drop(columns=[2], inplace=True)

In [5]:
df.sort_values(by=[0, 3], ascending=True, inplace=True)

In [6]:
# split data grouping by user, where 20% of the data ordered by timestamp is used for testing
train = df.groupby(0).apply(lambda x: x.iloc[:-int(len(x) * 0.2)])
test = df.groupby(0).apply(lambda x: x.iloc[-int(len(x) * 0.2):])

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

  train = df.groupby(0).apply(lambda x: x.iloc[:-int(len(x) * 0.2)])
  test = df.groupby(0).apply(lambda x: x.iloc[-int(len(x) * 0.2):])


In [7]:
# format data into rows of user, item
train = train.groupby(0).apply(lambda x: x[1].tolist()).tolist()
test = test.groupby(0).apply(lambda x: x[1].tolist()).tolist()

  train = train.groupby(0).apply(lambda x: x[1].tolist()).tolist()
  test = test.groupby(0).apply(lambda x: x[1].tolist()).tolist()


In [8]:
# if list has more than 10 items, split it into multiple lists of 10 items

def split_inner_lists(train, chunk_size):
    """Splits each inner list in train into sublists of specified chunk size."""
    result = [sublist[i:i + chunk_size] for sublist in train for i in range(0, len(sublist), chunk_size)]
    return result

train = split_inner_lists(train, 10)
test = split_inner_lists(test, 10)

In [13]:
class MovieLensDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs = torch.tensor(self.data[idx][:-1], dtype=torch.long)
        target = torch.tensor(self.data[idx][1:], dtype=torch.long)

        return inputs, target

    def collate_fn(self, batch):
        inputs, target = zip(*batch)
        max_len = max(map(len, inputs))
        inputs = torch.stack([torch.nn.functional.pad(i, (0, max_len - len(i))) for i in inputs])
        target = torch.stack([torch.nn.functional.pad(t, (0, max_len - len(t))) for t in target])

        return inputs, target


In [11]:
import torch.nn as nn
import torch.nn.functional as F


class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size=33278, ninp=200, nhid=200, nlayers=2, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(vocab_size, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout, batch_first=True)
        self.decoder = nn.Linear(nhid, vocab_size)

        self.nlayers = nlayers
        self.nhid = nhid
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.vocab_size)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (
            weight.new_zeros(self.nlayers, bsz, self.nhid),
            weight.new_zeros(self.nlayers, bsz, self.nhid),
        )

In [14]:
import torch
from torch.utils.data import DataLoader
from lightning.pytorch.demos import SequenceSampler
import lightning as L


class LanguageModel(L.LightningModule):
    def __init__(self, vocab_size=33278):
        super().__init__()
        self.model = SimpleLSTM(vocab_size)
        self.hidden = None

    def on_train_epoch_end(self):
        self.hidden = None

    def training_step(self, batch, batch_idx):
        input, target = batch
        if self.hidden is None:
            self.hidden = self.model.init_hidden(input.size(0))
        self.hidden = (self.hidden[0].detach(), self.hidden[1].detach())
        output, self.hidden = self.model(input, self.hidden)
        loss = torch.nn.functional.nll_loss(output, target.view(-1), ignore_index=0)


        self.log("train_loss", loss, prog_bar=True)
        return loss

    def predict(self, input):
        hidden = self.model.init_hidden(1)
        output, _ = self.model(input, hidden)
        return output.squeeze().exp()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=0.001)

dataset = MovieLensDataset(train)
dataloader = DataLoader(dataset, batch_sampler=SequenceSampler(dataset, batch_size=64), collate_fn=dataset.collate_fn)
model = LanguageModel(vocab_size=1683)
trainer = L.Trainer(gradient_clip_val=0.25, max_epochs=20)
trainer.fit(model, dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | SimpleLSTM | 1.3 M  | train
---------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.272     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
/Users/dominykas.seputis/github/msc-thesis/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 110/110 [00:00<00:00, 125.96it/s, v_num=55, train_loss=5.080]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 110/110 [00:00<00:00, 123.62it/s, v_num=55, train_loss=5.080]


In [15]:
movies = pd.read_csv("../data/ml-100k/u.item", sep="|", header=None, encoding="latin-1")

In [16]:
# create dictionary of movie id to movie name
movie_dict = dict(zip(movies[0], movies[1]))

In [17]:
# remove rows with less than 3 records
test = [i for i in test if len(i) > 3]

In [None]:
# evaluate
model.eval()


# randomly shuffle test data
import random
random.shuffle(test)

test_dataset = MovieLensDataset(test)

test_dataloader = DataLoader(test_dataset, batch_sampler=SequenceSampler(test_dataset, batch_size=1))


acc_at_10 = []

for batch in test_dataloader:
    input, target = batch
    # get next target
    target = target[0][-1]
    output = model.predict(input)[0]
    target = input[0][-1].item()
    topk = torch.topk(output, 10).indices.tolist()
    acc_at_10.append(target in topk)


print(f"Accuracy at 10: {sum(acc_at_10) / len(acc_at_10)}")

Accuracy at 10: 0.040804918949133594


In [20]:
# get some sample predictions

import random
random.shuffle(test)

for i in range(10):
    input = torch.tensor(test[i][:-1]).unsqueeze(0)
    target = test[i][0]

    output = model.predict(input)[0]
    topk = torch.topk(output, 10).indices.tolist()

    print("Last three movies watched:")
    for movie in input[0][-3:]:
        print(movie_dict[movie.item()])

    print("\n")

    print("Next movie to watch:")
    print(movie_dict[target])

    print("\n")

    print("Top 10 recommendations:")
    for movie in topk:
        print(movie_dict[movie])

    print("--------")

Last three movies watched:
Sleepers (1996)
Ghost (1990)
Aliens (1986)


Next movie to watch:
Boys of St. Vincent, The (1993)


Top 10 recommendations:
Carrie (1976)
Omen, The (1976)
Cape Fear (1991)
American Werewolf in London, An (1981)
Copycat (1995)
Cinderella (1950)
Sound of Music, The (1965)
Mary Poppins (1964)
Groundhog Day (1993)
Dolores Claiborne (1994)
--------
Last three movies watched:
Bananas (1971)
Clerks (1994)
Dances with Wolves (1990)


Next movie to watch:
Back to the Future (1985)


Top 10 recommendations:
Empire Strikes Back, The (1980)
Pulp Fiction (1994)
Raiders of the Lost Ark (1981)
Princess Bride, The (1987)
Back to the Future (1985)
Blues Brothers, The (1980)
Aliens (1986)
Terminator, The (1984)
Jaws (1975)
Sting, The (1973)
--------
Last three movies watched:
Caught (1996)
Mr. Holland's Opus (1995)
Night on Earth (1991)


Next movie to watch:
Go Fish (1994)


Top 10 recommendations:
Great White Hype, The (1996)
Maverick (1994)
Grease (1978)
Nine Months (1995)


In [459]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from lightning.pytorch.demos import SequenceSampler
import lightning as L

class MovieLensDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        watched_sequence = torch.tensor(self.data[idx][:-1])
        next_movie = torch.tensor(self.data[idx][-1])
        return watched_sequence, next_movie

    def collate_fn(self, batch):
        watched_sequences, next_movies = zip(*batch)
        max_len = max(map(len, watched_sequences))
        watched_sequences = torch.stack([torch.nn.functional.pad(i, (0, max_len - len(i))) for i in watched_sequences])

        return watched_sequences, torch.tensor(next_movies)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)


class SimpleDecoderTransformer(nn.Module):
    def __init__(self, vocab_size=33278, ninp=200, nhid=200, nhead=2, nlayers=2, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(vocab_size, ninp)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        decoder_layer = nn.TransformerDecoderLayer(d_model=ninp, nhead=nhead, dim_feedforward=nhid, dropout=dropout, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, nlayers)
        self.decoder = nn.Linear(ninp, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embed.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src):
        embedded = self.embed(src) * math.sqrt(self.embed.embedding_dim)
        embedded = self.pos_encoder(embedded)

        seq_len = embedded.size(1)

        # Causal mask
        causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(embedded.device)

        # Because we lack an encoder, create a dummy memory filled with zeros
        dummy_memory = torch.zeros_like(embedded)

        output = self.transformer_decoder(embedded, dummy_memory, tgt_mask=causal_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)


class LanguageModel(L.LightningModule):
    def __init__(self, vocab_size=33278):
        super().__init__()
        self.model = SimpleDecoderTransformer(vocab_size)

    def training_step(self, batch, batch_idx):
        inputs, target = batch
        output = self.model(inputs)
        # We only care about the prediction after the last observed movie in sequences
        last_output = output[:, -1, :]
        loss = torch.nn.functional.nll_loss(last_output, target, ignore_index=0)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=0.001)

    def predict_next(self, input_sequence):
        # Assume input_sequence is padded alike training inputs with trailing 0s if needed
        self.eval()
        with torch.no_grad():
            output = self.model(input_sequence)
            return output[:, -1, :]


dataset = MovieLensDataset(train)
dataloader = DataLoader(dataset, batch_sampler=SequenceSampler(dataset, batch_size=64), collate_fn=dataset.collate_fn)
model = LanguageModel(vocab_size=1683)
trainer = L.Trainer(max_epochs=100)
trainer.fit(model, dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type                     | Params | Mode 
-----------------------------------------------------------
0 | model | SimpleDecoderTransformer | 1.5 M  | train
-----------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.925     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: No training batches.


In [453]:
# evaluate
model.eval()
test_dataset = MovieLensDataset(test)
test_dataloader = DataLoader(test_dataset, batch_sampler=SequenceSampler(test_dataset, batch_size=1))

acc_at_10 = []
for batch in test_dataloader:
    input, target = batch
    target = target[0].item()
    # get next target
    output = model.predict_next(input)
    topk = torch.topk(output, 10).indices.tolist()[0]
    acc_at_10.append(target in topk)


print(f"Accuracy at 10: {sum(acc_at_10) / len(acc_at_10)}")

Accuracy at 10: 0.016568047337278107


In [None]:
# get some sample predictions

import random
random.shuffle(test)

for i in range(10):
    input = torch.tensor(test[i][:-1]).unsqueeze(0)
    target = test[i][0]

    output = model.predict_next(input)[0]
    topk = torch.topk(output, 10).indices.tolist()

    print("Last three movies watched:")
    for movie in input[0][-3:]:
        print(movie_dict[movie.item()])

    print("\n")

    print("Next movie to watch:")
    print(movie_dict[target])

    print("\n")

    print("Top 10 recommendations:")
    for movie in topk:
        print(movie_dict[movie])

    print("--------")

In [452]:
input = torch.tensor([1, 94, 261, 422, 477])
out = model.predict_next(input.unsqueeze(0))

recommendations = torch.topk(out, 10).indices.squeeze().tolist()

print("Top 10 recommendations:")
for movie in recommendations:
    print(movie_dict[movie])

Top 10 recommendations:
Ulee's Gold (1997)
M (1931)
Candidate, The (1972)
Mulholland Falls (1996)
Last Supper, The (1995)
Ransom (1996)
Spawn (1997)
Lone Star (1996)
Basic Instinct (1992)
Full Monty, The (1997)
