In [None]:
# Install the deep-learning dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so every fresh run installs the latest
# releases — consider pinning them to keep the notebook reproducible.
%pip install torch
%pip install pytorch-lightning

In [None]:
# Mount Google Drive under /content/gdrive (Colab only); the dataset files used
# below are read from that mount.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True, timeout_ms=120000)

In [4]:
# Switch to the dataset folder on the mounted drive and list its files.
%cd /content/gdrive/MyDrive/movie_recommendation/movie_dataset_bert/
%ls

/content/gdrive/MyDrive/movie_recommendation/movie_dataset_bert
genome-scores.csv  links.csv   ratings.csv  tags.csv
genome-tags.csv    movies.csv  README.txt


In [7]:
import torch
import pytorch_lightning as pl
import pandas as pd
import numpy as np
import random
import torch.nn as nn

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader
from torch.nn import Linear
from torch.nn import functional as F

In [18]:
def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Accuracy of the arg-max predictions, counted only at selected positions.

    Args:
        y_pred: scores; the predicted class is the arg-max over dimension 1.
        y_true: target class indices, one per row of ``y_pred``.
        mask: boolean tensor choosing which positions contribute.

    Returns:
        Scalar float64 tensor: the fraction of selected positions whose
        prediction equals the target (NaN when nothing is selected).
    """
    predicted_ids = torch.max(y_pred, dim=1).indices

    kept_targets = y_true.masked_select(mask)
    kept_predictions = predicted_ids.masked_select(mask)

    hits = torch.eq(kept_targets, kept_predictions).to(torch.float64)
    return hits.mean()


def masked_ce(y_pred, y_true, mask):
    """Cross-entropy averaged over the positions selected by ``mask``.

    The per-position losses are zeroed wherever ``mask`` is false, summed, and
    divided by the number of selected positions. A small epsilon in the
    denominator keeps the result finite (zero) when nothing is selected.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction="none")
    selected_total = (per_position * mask).sum()
    selected_count = mask.sum() + 1e-8
    return selected_total / selected_count


class Recommender(pl.LightningModule):
    """Transformer encoder that predicts the item hidden behind each mask token.

    Item ids are embedded, summed with learned position embeddings, passed
    through a 6-layer, 4-head ``nn.TransformerEncoder`` and projected to
    ``vocab_size`` logits per position. Loss and accuracy are computed only at
    positions whose input id equals ``mask``.

    Args:
        vocab_size: number of distinct ids the model can read and predict.
        channels: width of the embeddings and of the encoder.
        cap: stored on the instance; not read by any method of this class.
        mask: id that marks a masked position in the input sequence.
        dropout: dropout probability for the encoder layers and ``self.do``.
        lr: learning rate handed to Adam.

    Note:
        The position-embedding table has 512 rows, so sequences longer than
        512 items cannot be encoded.
    """

    def __init__(
        self,
        vocab_size,
        channels=128,
        cap=0,
        mask=1,
        dropout=0.4,
        lr=1e-4,
    ):
        super().__init__()

        self.cap = cap
        self.mask = mask

        self.lr = lr
        self.dropout = dropout
        self.vocab_size = vocab_size

        self.item_embeddings = torch.nn.Embedding(
            self.vocab_size, embedding_dim=channels
        )

        self.input_pos_embedding = torch.nn.Embedding(512, embedding_dim=channels)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=6)

        self.linear_out = Linear(channels, self.vocab_size)

        self.do = nn.Dropout(p=self.dropout)

    def encode_src(self, src_items):
        """Encode a ``(batch, seq)`` tensor of item ids into ``(batch, seq, channels)``."""
        item_vectors = self.item_embeddings(src_items)

        seq_len = item_vectors.size(1)
        positions = torch.arange(0, seq_len, device=item_vectors.device)
        # (seq, channels) broadcasts over the batch dimension, so every sequence
        # in the batch receives the same position vectors.
        encoded = item_vectors + self.input_pos_embedding(positions)

        # The encoder layers are built without ``batch_first``, so they expect
        # (seq, batch, channels); swap the axes on the way in and back out.
        encoded = self.encoder(encoded.permute(1, 0, 2))

        return encoded.permute(1, 0, 2)

    def forward(self, src_items):
        """Return per-position logits of shape ``(batch, seq, vocab_size)``."""
        return self.linear_out(self.encode_src(src_items))

    def _shared_step(self, batch, stage):
        """Compute and log the masked loss/accuracy for one batch.

        ``stage`` is the metric-name prefix (``'train'``, ``'valid'`` or
        ``'test'``); metrics are logged as ``{stage}_loss`` and
        ``{stage}_accuracy``. Returns the loss tensor.
        """
        src_items, y_true = batch

        y_pred = self(src_items)

        # Flatten batch and sequence so each position is one classification row.
        y_pred = y_pred.reshape(-1, y_pred.size(2))
        y_true = y_true.reshape(-1)

        # Only positions that were masked in the input are scored.
        mask = src_items.reshape(-1) == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log(f'{stage}_loss', loss)
        self.log(f'{stage}_accuracy', accuracy)

        return loss

    def training_step(self, batch, batch_index):
        """Lightning hook: logs ``train_loss`` / ``train_accuracy``, returns the loss."""
        return self._shared_step(batch, stage='train')

    def validation_step(self, batch, batch_index):
        """Lightning hook: logs ``valid_loss`` / ``valid_accuracy``, returns the loss."""
        return self._shared_step(batch, stage='valid')

    def test_step(self, batch, batch_index):
        """Lightning hook: logs ``test_loss`` / ``test_accuracy``, returns the loss."""
        return self._shared_step(batch, stage='test')

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau schedule driven by ``valid_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'valid_loss',
        }

In [73]:
def mask_list(l1, p=0.8):
    """Return a copy of ``l1`` with elements randomly replaced by ``MASK``.

    Each element is kept when a fresh ``random.random()`` draw is below ``p``
    and is replaced by the module-level ``MASK`` id otherwise.
    """
    masked = []
    for item in l1:
        keep = random.random() < p
        masked.append(item if keep else MASK)
    return masked


def mask_last_elements_list(l1, val_context_size: int = 5):
    """Randomly mask items within the last ``val_context_size`` positions only.

    The leading part of the list is passed through untouched; each item in the
    trailing window is handed to ``mask_list`` with ``p=0.5``.

    Args:
        l1: list of item ids.
        val_context_size: length of the trailing window eligible for masking.
            A value of zero or less leaves the list unchanged.

    Returns:
        A new list of the same length as ``l1``.
    """
    if val_context_size <= 0:
        # ``l1[:-0]`` is empty and ``l1[-0:]`` is the whole list, so without
        # this guard a zero-sized window would expose the entire history to
        # masking instead of none of it.
        return list(l1)

    head, tail = l1[:-val_context_size], l1[-val_context_size:]
    return head + mask_list(tail, p=0.5)


class Dataset(torch.utils.data.Dataset):
    """One training example per group: a masked input sequence and its target.

    Args:
        groups: list of group keys; ``idx`` indexes into this list.
        group_by: object whose ``get_group(key)`` returns that group's rows,
            which must carry a ``movieId_mapped`` column.
        split: ``'train'`` masks positions anywhere in the window; any other
            value masks only within the trailing positions.
        history_size: length both sequences are padded to.
    """

    def __init__(self, groups, group_by, split, history_size=120):
        self.groups, self.group_by = groups, group_by
        self.split, self.history_size = split, history_size

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        rows = self.group_by.get_group(self.groups[idx])
        window = get_context(rows, split=self.split, context_size=self.history_size)
        targets = window['movieId_mapped'].tolist()

        # Training masks across the whole window; other splits only mask the tail.
        masker = mask_list if self.split == 'train' else mask_last_elements_list
        inputs = masker(targets)

        # A single draw decides the padding side for both sequences, which
        # keeps inputs and targets aligned position by position.
        side = 'left' if random.random() < 0.5 else 'right'
        targets = pad_list(targets, history_size=self.history_size, mode=side)
        inputs = pad_list(inputs, history_size=self.history_size, mode=side)

        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )


def train(
    data_csv_path: str,
    log_dir: str = 'recommender_logs',
    model_dir: str = 'recommender_models',
    batch_size: int = 32,
    epochs: int = 2000,
    history_size: int = 120,
):
    """Train a ``Recommender`` on a ratings CSV and evaluate the best checkpoint.

    Args:
        data_csv_path: CSV with at least ``userId``, ``movieId`` and
            ``timestamp`` columns.
        log_dir: directory for TensorBoard logs.
        model_dir: directory where the best checkpoint is written.
        batch_size: batch size for both data loaders.
        epochs: maximum number of training epochs.
        history_size: sequence length fed to the model.

    Returns:
        Dict with ``val_loss`` (the ``test_loss`` measured on the validation
        loader) and ``best_model_path``.
    """
    data = pd.read_csv(data_csv_path)

    # Chronological order, so each user's rows are in viewing order once grouped.
    data.sort_values(by='timestamp', inplace=True)

    data, mapping, inverse_mapping = map_column(data, col_name='movieId')

    group_by_trains = data.groupby(by='userId')

    groups = list(group_by_trains.groups)

    # Both splits iterate over the same users; they differ in which part of the
    # history is used and how it is masked (see ``Dataset`` / ``get_context``).
    train_data = Dataset(groups=groups, group_by=group_by_trains, split='train', history_size=history_size,)
    val_data = Dataset(groups=groups, group_by=group_by_trains, split='val', history_size=history_size,)

    print(f'len(train_data): {len(train_data)}')
    print(f'len(val_data): {len(val_data)}')

    train_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=10,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=batch_size,
        num_workers=10,
        shuffle=False,
    )

    model = Recommender(
        # +2 because mapped ids start at 2 (``map_column`` offsets them by two).
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )

    logger = TensorBoardLogger(
        save_dir=log_dir,
    )

    checkpoint_callback = ModelCheckpoint(
        monitor='valid_loss',
        mode='min',
        dirpath=model_dir,
        filename='recommender',
    )

    trainer = pl.Trainer(
        max_epochs=epochs,
        # ``gpus=1`` was removed from recent Lightning releases; this is the
        # equivalent single-GPU configuration.
        accelerator='gpu',
        devices=1,
        logger=logger,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    # ``test_dataloaders=`` was likewise removed; ``dataloaders=`` replaces it.
    result_val = trainer.test(dataloaders=val_loader)

    output_json = {
        'val_loss': result_val[0]['test_loss'],
        'best_model_path': checkpoint_callback.best_model_path,
    }

    print(output_json)

    return output_json

In [None]:
# Command-line entry point. For real training runs, move this function and the
# model/dataset/`train` definitions above into a standalone *.py module (inside
# a package with an `__init__.py` file) and call `training()` under an
# `if __name__ == '__main__':` guard.
def training():
    """Parse ``--data_csv_path`` / ``--epochs`` from the command line and run ``train``."""
    import argparse

    parser = argparse.ArgumentParser(description="Train the movie recommender.")
    # Required: without it ``None`` would be forwarded to ``train`` and fail
    # later with a far less helpful error.
    parser.add_argument(
        "--data_csv_path",
        required=True,
        help="path to the ratings CSV file",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=500,
        help="maximum number of training epochs (default: 500)",
    )
    args = parser.parse_args()

    train(
        data_csv_path=args.data_csv_path,
        epochs=args.epochs,
    )

In [None]:
# These are tests
# Tests are meant to run in a separate *.py file, not in a notebook like this one.

# preprocessing test
import pytest

@pytest.fixture
def data():
    """Ten-row frame with a single ``id`` column holding 1000 through 1009."""
    return pd.DataFrame({'id': range(1000, 1010)})


def test_map_column(data):
    """``map_column`` numbers the sorted unique values consecutively from 2."""
    _, mapping, inverse_mapping = map_column(data, col_name='id')

    raw_ids = range(1000, 1010)
    mapped_ids = range(2, 12)

    assert mapping == dict(zip(raw_ids, mapped_ids))
    assert inverse_mapping == dict(zip(mapped_ids, raw_ids))


# model test 
def test_recommender():
    """Smoke-test the forward pass and a single training step on random ids."""
    n_items = 1000
    batch_size, seq_len = 32, 30

    recommender = Recommender(vocab_size=n_items)

    src_items = torch.randint(low=0, high=n_items, size=(batch_size, seq_len))

    # Put the mask id (1, the model's default) in the first column so the
    # masked loss is computed over a non-empty set of positions.
    src_items[:, 0] = 1

    trg_out = torch.randint(low=0, high=n_items, size=(batch_size, seq_len))

    out = recommender(src_items)

    assert out.shape == torch.Size([batch_size, seq_len, n_items])

    # Passed positionally: the method's parameter is named ``batch_index``, so
    # calling it with the keyword ``batch_idx=`` raises TypeError.
    loss = recommender.training_step((src_items, trg_out), 1)

    assert isinstance(loss, torch.Tensor)

    assert not torch.isnan(loss).any()

    assert loss.size() == torch.Size([])

In [10]:
# Reserved token ids. Real items are numbered from 2 upwards by `map_column`,
# so these two values never collide with a movie id.
PAD = 0   # default filler used by `pad_list`
MASK = 1  # substituted by `mask_list` for positions the model must predict

In [12]:
def map_column(df: pd.DataFrame, col_name: str):
    """Assign consecutive integer ids (starting at 2) to a column's values.

    The unique values are sorted and numbered 2, 3, 4, ...; the translated
    column is written to ``df`` in place as ``<col_name>_mapped``.

    Returns:
        ``(df, mapping, inverse_mapping)`` where ``mapping`` goes from original
        value to id and ``inverse_mapping`` goes back.
    """
    unique_values = sorted(df[col_name].unique())

    mapping = {value: index for index, value in enumerate(unique_values, start=2)}
    inverse_mapping = dict(zip(mapping.values(), mapping.keys()))

    df[f'{col_name}_mapped'] = df[col_name].map(mapping)

    return df, mapping, inverse_mapping


def get_context(df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 5):
    """Slice a history frame into a context window of at most ``context_size`` rows.

    Args:
        df: rows of one history, already in the desired order.
        split: ``'train'`` ends the window at a random row that leaves the last
            ``val_context_size`` rows untouched; ``'val'`` / ``'test'`` end it
            at the final row.
        context_size: maximum number of rows in the returned window.
        val_context_size: number of trailing rows withheld from training.

    Returns:
        The positional slice ``df[start:end]``. It is empty for a training
        history that has no rows left after withholding the tail.

    Raises:
        ValueError: if ``split`` is not ``'train'``, ``'val'`` or ``'test'``.
    """
    n_rows = df.shape[0]

    if split == 'train':
        upper = max(n_rows - val_context_size, 0)
        # The window normally ends no earlier than row 10. For short histories
        # the lower bound is clamped so ``randint`` never receives an empty
        # range, which would raise ValueError.
        lower = min(10, upper)
        end_index = random.randint(lower, upper)
    elif split in ('val', 'test'):
        end_index = n_rows
    else:
        raise ValueError(f"unknown split {split!r}; expected 'train', 'val' or 'test'")

    start_index = max(0, end_index - context_size)

    return df.iloc[start_index:end_index]


def pad_array(array: np.ndarray, expected_size: int = 30):
    """Grow a 2-D array to ``expected_size`` rows by repeating its first row in front."""
    missing_rows = expected_size - array.shape[0]
    # Pad only before axis 0; ``mode='edge'`` copies the edge row into the gap.
    pad_width = ((missing_rows, 0), (0, 0))
    return np.pad(array, pad_width, mode='edge')


def pad_list(list_integers, history_size: int, pad_val: int = PAD, mode='left'):
    """Pad a list with ``pad_val`` up to ``history_size`` elements.

    ``mode='left'`` puts the padding in front; any other value appends it.
    A list that already has ``history_size`` or more elements is returned
    as-is (it is never truncated).
    """
    missing = history_size - len(list_integers)
    if missing <= 0:
        return list_integers

    filler = [pad_val] * missing
    if mode == 'left':
        return filler + list_integers
    return list_integers + filler


def df_to_np(df, expected_size=30):
    """Convert ``df`` to a NumPy array and front-pad it via ``pad_array``."""
    return pad_array(np.array(df), expected_size=expected_size)


def genome_mapping(genome):
    """Map every ``movieId`` to its list of ``relevance`` values, ordered by ``tagId``.

    Note: ``genome`` is sorted in place by ``(movieId, tagId)`` as a side effect.
    """
    genome.sort_values(by=['movieId', 'tagId'], inplace=True)

    per_movie = genome.groupby('movieId')['relevance'].agg(list).reset_index()

    return dict(zip(per_movie['movieId'], per_movie['relevance']))

In [13]:
# Both CSV files live in the same dataset folder on the mounted Google Drive.
DATA_DIR = '/content/gdrive/MyDrive/movie_recommendation/movie_dataset_bert'
data_path = DATA_DIR + '/ratings.csv'
movie_path = DATA_DIR + '/movies.csv'

In [35]:
# Path to a trained checkpoint. Left empty because training is costly and no
# trained model is available, so the predictions further down are inaccurate.
# Background on state dicts:
# https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html
saved_model = ''

In [14]:
# Load the ratings table and the movie metadata table.
data = pd.read_csv(data_path)
movies = pd.read_csv(movie_path)

In [15]:
# Sort the ratings chronologically, in place.
data.sort_values(by='timestamp', inplace=True)

In [16]:
# Add a `movieId_mapped` column (ids start at 2) and keep both lookup tables.
data, mapping, inverse_mapping = map_column(data, col_name = 'movieId')
# One group per user.
group_by_trains = data.groupby(by='userId')

In [45]:
# Preview the first five rows of every user group (GroupBy.head), as plain text.
print(group_by_trains.head())

          userId  movieId  rating   timestamp  movieId_mapped
326761      2262       21     3.0   789652009              22
326810      2262     1079     3.0   789652009            1054
326767      2262       47     5.0   789652009              48
15845015  102689        1     4.0   822873600               2
15845023  102689       39     5.0   822873600              40
...          ...      ...     ...         ...             ...
13207835   85523      318     5.0  1574325400             316
13207852   85523    58559     4.0  1574325420           12218
13207846   85523    33794     4.0  1574325422           10002
13207848   85523    44191     4.5  1574325426           10679
13207849   85523    48516     4.5  1574325429           11124

[812705 rows x 5 columns]


In [61]:
# Show 10 randomly chosen group keys (user ids).
random.sample(list(group_by_trains.groups), k = 10)

[78869, 28165, 120446, 18011, 25044, 121316, 6741, 31004, 13680, 144183]

In [68]:
# Build the model with the same hyper-parameters `train` uses and switch it to
# evaluation mode.
model = Recommender(vocab_size = len(mapping) + 2, dropout = 0.3, lr = 0.0001)
model.eval()
# To use trained weights, point `saved_model` at a checkpoint and uncomment:
# model.load_state_dict(torch.load(saved_model)['state_dict']) # load a pre-trained and saved model here
# No saved model exists yet, so an error is to be expected if the line above is
# uncommented; the model therefore keeps its randomly initialised weights.
# NOTE(review): the original remark here called the results from "10 samples"
# very bad — presumably it refers to the recommendations printed below.

Recommender(
  (item_embeddings): Embedding(59049, 128)
  (input_pos_embedding): Embedding(512, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (linear_out): Linear(in_features=128, out_features=59049, bias=True)
  (do): Dropout(p=0.3, inplace=False)
)

In [69]:
# Title -> mapped item id, restricted to movies whose id occurs in `mapping`.
# Titles are the dictionary keys, so movies sharing a title collapse to one entry.
movie_to_index = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
# Reverse lookup: mapped item id -> title.
index_to_movie = {v: k for k, v in movie_to_index.items()}

In [64]:
def predict(list_movies, model, movie_to_index, index_to_movie, history_size=120, top_k=30):
    """Recommend titles that could follow a viewing history.

    The history is turned into ids, left-padded with ``PAD`` to
    ``history_size - 1`` positions and closed with one ``MASK`` token; the
    model's scores at that final position rank the candidates.

    Args:
        list_movies: titles already watched, oldest first.
        model: callable mapping a ``(1, seq)`` long tensor to
            ``(1, seq, vocab)`` scores.
        movie_to_index: title -> item id.
        index_to_movie: item id -> title.
        history_size: total sequence length fed to the model. When the history
            is longer, only its most recent ``history_size - 1`` titles are fed
            in, but every watched title is still excluded from the result.
        top_k: maximum number of titles to return.

    Returns:
        Up to ``top_k`` titles, best score first.

    Raises:
        KeyError: if a title in ``list_movies`` is missing from ``movie_to_index``.
    """
    watched_ids = [movie_to_index[title] for title in list_movies]

    max_titles = max(history_size - 1, 0)
    context_ids = watched_ids[-max_titles:] if max_titles else []
    ids = [PAD] * (max_titles - len(context_ids)) + context_ids + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # Scores at the last position, i.e. the MASK slot (``.cpu()`` is a no-op
    # for tensors that already live on the CPU).
    masked_pred = prediction[0, -1].cpu().numpy()

    ranked_ids = np.argsort(masked_pred).tolist()[::-1]

    # A set gives O(1) membership checks while walking the whole vocabulary.
    excluded = set(ids) | set(watched_ids)

    recommendations = []
    for item_id in ranked_ids:
        if len(recommendations) >= top_k:
            break
        # Filter before counting: ids without a known title are skipped instead
        # of using up one of the ``top_k`` slots.
        if item_id in excluded or item_id not in index_to_movie:
            continue
        recommendations.append(index_to_movie[item_id])

    return recommendations

In [70]:
# Query: the first four Harry Potter films.
list_movies = ['Harry Potter and the Sorcerer\'s Stone (a.k.a. Harry Potter and the Philosopher\'s Stone) (2001)',
               'Harry Potter and the Chamber of Secrets (2002)',
               'Harry Potter and the Prisoner of Azkaban (2004)',
               'Harry Potter and the Goblet of Fire (2005)']

# Recommendations for the query above, displayed as the cell output.
top_movie_adventure_fantasy = predict(list_movies, model, movie_to_index, index_to_movie)
top_movie_adventure_fantasy

['Stand Up and Cheer! (1934)',
 'Reluctant Saint, The (1962)',
 'Fury of Achilles (1962)',
 'Le noeud cravate (2008)',
 'Take the Trash (2008)',
 'Questi fantasmi (1962)',
 'Taken (2002)',
 'Far Cry (2008)',
 'Blue Monkey (1987)',
 'Strayed (2009)',
 'Who Am I (Kein System Ist Sicher) (2014)',
 'Today We Kill, Tomorrow We Die! (1968)',
 'My Old Lady (2014)',
 'American Outlaws (2001)',
 'The Witch Files (2018)',
 'Little Galicia (2015)',
 'Nomis (2018)',
 'Players, The (Les infidèles) (2012)',
 'Bardelys the Magnificent (1926)',
 'Jumper (2008)',
 'Cry in the Dark, A (1988)',
 "Rooster's Breakfast (Petelinji zajtrk) (2007)",
 'The Royal Tailor (2014)',
 'Daughters Courageous (1939)',
 'The Tunnel (1933)',
 'The Great Martian War 1913 - 1917 (2013)',
 'No Escape (1994)',
 'Bis (2015)',
 'Tabasco Road (1957)',
 'The Parent Trap II (1986)']

In [71]:
# Query: a set of superhero films.
list_movies = ["Black Panther (2017)",
               "Avengers, The (2012)",
               "Avengers: Infinity War - Part I (2018)",
               "Logan (2017)",
               "Spider-Man (2002)",
               "Spider-Man 3 (2007)",
               "Spider-Man: Far from Home (2019)"]
# Recommendations for the query above, displayed as the cell output.
top_movie_action_adv = predict(list_movies, model, movie_to_index, index_to_movie)
top_movie_action_adv

['Stand Up and Cheer! (1934)',
 'Reluctant Saint, The (1962)',
 'Fury of Achilles (1962)',
 'Le noeud cravate (2008)',
 'Take the Trash (2008)',
 'Questi fantasmi (1962)',
 'Blue Monkey (1987)',
 'Taken (2002)',
 'Far Cry (2008)',
 'Strayed (2009)',
 'Who Am I (Kein System Ist Sicher) (2014)',
 'Today We Kill, Tomorrow We Die! (1968)',
 'My Old Lady (2014)',
 'The Tunnel (1933)',
 'Little Galicia (2015)',
 'Players, The (Les infidèles) (2012)',
 'American Outlaws (2001)',
 'The Witch Files (2018)',
 'Nomis (2018)',
 'The Parent Trap II (1986)',
 "Rooster's Breakfast (Petelinji zajtrk) (2007)",
 'Bis (2015)',
 'Jumper (2008)',
 'A Girl Named Sooner (1975)',
 'Tabasco Road (1957)',
 'Daughters Courageous (1939)',
 'No Escape (1994)',
 'The Great Martian War 1913 - 1917 (2013)',
 'Bardelys the Magnificent (1926)',
 'Cry in the Dark, A (1988)']

In [72]:
# Query: animated and comedy titles.
list_movies = ["Zootopia (2016)",
               "Toy Story 3 (2010)",
               "Toy Story 4 (2019)",
               "Finding Nemo (2003)",
               "Ratatouille (2007)",
               "The Lego Movie (2014)",
               "Ghostbusters (a.k.a. Ghost Busters) (1984)",
               "Ace Ventura: When Nature Calls (1995)"]
# Recommendations for the query above, displayed as the cell output.
top_movie_comedy = predict(list_movies, model, movie_to_index, index_to_movie)
top_movie_comedy

['Stand Up and Cheer! (1934)',
 'Reluctant Saint, The (1962)',
 'Fury of Achilles (1962)',
 'Take the Trash (2008)',
 'Le noeud cravate (2008)',
 'Questi fantasmi (1962)',
 'Blue Monkey (1987)',
 'Far Cry (2008)',
 'Who Am I (Kein System Ist Sicher) (2014)',
 'Taken (2002)',
 'Strayed (2009)',
 'My Old Lady (2014)',
 'Today We Kill, Tomorrow We Die! (1968)',
 'American Outlaws (2001)',
 'Little Galicia (2015)',
 'The Tunnel (1933)',
 'The Parent Trap II (1986)',
 'Players, The (Les infidèles) (2012)',
 'Nomis (2018)',
 'Bis (2015)',
 'Bardelys the Magnificent (1926)',
 'A Girl Named Sooner (1975)',
 'Cry in the Dark, A (1988)',
 'Ardh Satya (1983)',
 'The Witch Files (2018)',
 'No Escape (1994)',
 'Tabasco Road (1957)',
 'Jumper (2008)',
 'Daughters Courageous (1939)',
 "Rooster's Breakfast (Petelinji zajtrk) (2007)"]

['How to Use Guys with Secret Tips (2013)',
 'More Than a Miracle (1967)',
 'Once Upon a Warrior (Anaganaga O Dheerudu) (2011)',
 'Inside Man: Most Wanted (2019)',
 'Celeste and Jesse Forever (Celeste & Jesse Forever) (2012)',
 'Last Unicorn, The (1982)',
 '48 Christmas Wishes (2017)',
 'Piranha II: The Spawning (1981)',
 'All Cheerleaders Die (2013)',
 'San Francisco International (1970)',
 'Big Gold Dream: Scottish Post-Punk and Infiltrating the Mainstream (2015)',
 'Brotherhood of Death (1976)',
 'P.S. (2004)',
 'Pope John Paul II (2005)',
 'Malice@Doll (2004)',
 'Eddie Izzard: Circle (2002)',
 "Everybody's Fine (Stanno tutti bene) (1990)",
 "You Can't Kill Stephen King (2013)",
 'Before the Rain (Pred dozhdot) (1994)',
 'That still Karloson! (2012)',
 'Asteria (2017)',
 'Resina (2018)',
 'Gary Gulman: In This Economy? (2012)',
 'The Auction (2013)',
 'Sleep No More (2018)',
 'Tim (1979)',
 'The Wild Country (1970)',
 'Learning to Lie (2003)',
 "McCinsey's Island (1998)",
 'Infiltra