In [2]:
# Standard libraries
import math
import os
import urllib.request
from functools import partial
from urllib.error import HTTPError

# Plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np

# PyTorch Lightning
import pytorch_lightning as pl
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Torchvision
import torchvision
from pytorch_lightning.callbacks import ModelCheckpoint
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm.notebook import tqdm

plt.set_cmap("cividis")
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
sns.reset_orig()

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
print("Device:", device)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 42


Device: mps


<Figure size 640x480 with 0 Axes>

In [7]:
a = torch.randint(0, 99, (128, 30, 10))
b = nn.Embedding(100, 60)
c = b(a)
d = torch.mean(c, dim=2)
print(d.shape)

torch.Size([128, 30, 60])


In [448]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()

        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model) # (seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x:torch.Tensor):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)   
        return self.dropout(x)

In [468]:
def attention(q:torch.Tensor, k:torch.Tensor, v:torch.Tensor, ratings:torch.Tensor, mask=None):
    d_k = q.size()[-1] # q,k,v : (batch, head, seq_len, embed_size_per_head)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) # (batch, head, seq_len, seq_len)
    if ratings is not None:
        attn_logits = attn_logits*ratings.unsqueeze(1).unsqueeze(2)
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attn_logits, v) # (batch, head, seq_len, embed_size_per_head)
    return values, attention

def init_weights(x:nn.Linear):
    with torch.no_grad():
        nn.init.xavier_uniform_(x.weight)
        x.bias.data.fill_(0)

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, input_dim:int, d_model: int, h: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h

        self.w_q = nn.Linear(input_dim, d_model) # Wq
        self.w_k = nn.Linear(input_dim, d_model) # Wk
        self.w_v = nn.Linear(input_dim, d_model) # Wv
        self.w_o = nn.Linear(d_model, d_model) # Wo

        init_weights(self.w_q)
        init_weights(self.w_k)
        init_weights(self.w_v)
        init_weights(self.w_o)

    def forward(self, q_x:torch.Tensor, k_x:torch.Tensor, v_x:torch.Tensor, ratings:torch.Tensor, mask=None):
        q:torch.Tensor = self.w_q(q_x) # (batch, seq_len, d_model)
        k:torch.Tensor = self.w_k(k_x) # (batch, seq_len, d_model)
        v:torch.Tensor = self.w_v(v_x) # (batch, seq_len, d_model)

        q_h = q.reshape(q.shape[0], q.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        k_h = k.reshape(k.shape[0], k.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        v_h = v.reshape(v.shape[0], v.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)

        attn_out, _ = attention(q_h, k_h, v_h, ratings, mask) # (batch, head, seq_len, embed_size_per_head)
        attn_out = attn_out.transpose(1, 2) # (batch, seq_len, head, embed_size_per_head)
        attn_out = attn_out.reshape(attn_out.shape[0], attn_out.shape[1], attn_out.shape[2]*attn_out.shape[3]) # (batch, seq_len, d_model)

        return self.w_o(attn_out) # (batch, seq_len, d_model)

In [469]:
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()

        self.self_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)

        self.ffn_1 = nn.Linear(input_dim, dim_feedforward)
        self.ffn_2 = nn.Linear(dim_feedforward, input_dim)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)

        self.ffn = nn.Sequential(
            self.ffn_1,
            nn.Dropout(dropout),
            nn.GELU(),
            self.ffn_2,
        )

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, ratings, mask=None):
        attn_out = self.self_attn(x, x, x, ratings, mask=mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(attn_out) # (batch, seq_len, input_dim)
        x = self.norm1(x) # (batch, seq_len, input_dim)

        ffn_out = self.ffn(x) # (batch, seq_len, input_dim)
        x = x + self.dropout(ffn_out) # (batch, seq_len, input_dim)
        x = self.norm2(x) # (batch, seq_len, input_dim)

        return x

In [470]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, x, ratings, mask=None):
        for layer in self.layers:
            x = layer(x, ratings, mask=mask)
        return x

In [471]:
class DecoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0)->None:
        super().__init__()

        self.self_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)
        self.crss_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)

        self.ffn_1 = nn.Linear(input_dim, dim_feedforward)
        self.ffn_2 = nn.Linear(dim_feedforward, input_dim)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)
        
        self.ffn = nn.Sequential(
            self.ffn_1,
            nn.Dropout(dropout),
            nn.GELU(),
            self.ffn_2,
        )

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.norm3 = nn.LayerNorm(input_dim)
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_ratings, encoder_output, pred_mask, pad_mask):
        self_attn_out = self.self_attn(x, x, x, None, mask=pred_mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(self_attn_out) # (batch, seq_len, input_dim)
        x = self.norm1(x) # (batch, seq_len, input_dim)

        crss_attn_out = self.crss_attn(x, encoder_output, encoder_output, enc_ratings, mask=pad_mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(crss_attn_out) # (batch, seq_len, input_dim)
        x = self.norm2(x) # (batch, seq_len, input_dim)

        ffn_out = self.ffn(x) # (batch, seq_len, input_dim)
        x = x + self.dropout(ffn_out) # (batch, seq_len, input_dim)
        x = self.norm3(x) # (batch, seq_len, input_dim)

        return x

In [472]:
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout):
        super().__init__()
        self.layers = nn.ModuleList([DecoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, x, enc_ratings, encoder_output, pred_mask=None, pad_mask=None):
        for layer in self.layers:
            x = layer(x, enc_ratings, encoder_output, pred_mask, pad_mask)
        return x

In [473]:
class Transformer(nn.Module):
    def __init__(self, user_vocab_size, interval_vocab_size, genres_vocab_size, years_vocab_size, src_movie_vocab_size, tgt_movie_vocab_size, src_seq_len, tgt_seq_len, d_model, num_heads, dim_feedforward, num_encoder_layers, num_decoder_layers, dropout=0.0) -> None:
        super(Transformer, self).__init__()

        self.user_embedding = nn.Embedding(user_vocab_size, d_model)
        self.interval_embedding = nn.Embedding(interval_vocab_size, d_model)
        self.years_embedding = nn.Embedding(years_vocab_size, d_model)
        self.encoder_embedding = nn.Embedding(src_movie_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_movie_vocab_size, d_model)

        self.src_positional_encoding = PositionalEncoding(d_model, src_seq_len, dropout)
        self.tgt_positional_encoding = PositionalEncoding(d_model, tgt_seq_len, dropout)

        self.encoder_block = Encoder(num_encoder_layers, d_model, num_heads, dim_feedforward, dropout)
        self.decoder_block = Decoder(num_decoder_layers, d_model, num_heads, dim_feedforward, dropout)

        self.genres_encoder = nn.Linear(genres_vocab_size, d_model)
        init_weights(self.genres_encoder)

        self.fc_encoder = nn.Linear(5*d_model, d_model)
        init_weights(self.fc_encoder)

        self.fc_decoder = nn.Linear(5*d_model, d_model)
        init_weights(self.fc_decoder)

        self.fc = nn.Linear(d_model, tgt_movie_vocab_size)
        init_weights(self.fc)

        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=-1)       


    def generate_mask(self, src:torch.Tensor, tgt:torch.Tensor):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2) # (batch, 1, 1, seq_len)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3) # (batch, 1, seq_len, 1)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device=device) # (1, seq_len, seq_len)
        tgt_mask = tgt_mask & nopeak_mask # (batch, 1, seq_len, seq_len)
        return src_mask, tgt_mask
    

    def forward(
            self, 
            user_ids:torch.Tensor, 
            interval:torch.Tensor, 
            genres:torch.Tensor, 
            years:torch.Tensor,
            ratings:torch.Tensor, 
            src_movie_ids:torch.Tensor, 
            tgt_movie_ids:torch.Tensor):
        
        src_mask, tgt_mask = self.generate_mask(src_movie_ids, tgt_movie_ids)

        src_movie_embedding = self.encoder_embedding(src_movie_ids) # (batch, seq_len, d_model)
        tgt_movie_embedding = self.decoder_embedding(tgt_movie_ids) # (batch, seq_len, d_model)

        interval_embedding = self.interval_embedding(interval) # (batch, seq_len, d_model)
        years_embedding = self.years_embedding(years) # (batch, seq_len, d_model)
        genres_embedding = self.genres_encoder(genres) # (batch, seq_len, d_model)
        user_embed = self.user_embedding(user_ids) # (batch, seq_len, d_model)

        src_movie_embedding = torch.concat([src_movie_embedding, user_embed, interval_embedding, years_embedding, genres_embedding], dim=2) # (batch, seq_len, 5*d_model)
        src_movie_embedding = self.fc_encoder(src_movie_embedding) # (batch, seq_len, d_model)

        src_movie_embedding = self.src_positional_encoding(src_movie_embedding) # (batch, seq_len, d_model)
        tgt_movie_embedding = self.tgt_positional_encoding(tgt_movie_embedding) # (batch, seq_len, d_model)

        src_movie_embedding = self.dropout(src_movie_embedding) # (batch, seq_len, d_model)
        tgt_movie_embedding = self.dropout(tgt_movie_embedding) # (batch, seq_len, d_model)

        enc_output = self.encoder_block(src_movie_embedding, ratings, src_mask) # (batch, seq_len, d_model)
        dec_output = self.decoder_block(tgt_movie_embedding, ratings, enc_output, tgt_mask, src_mask) # (batch, seq_len,d_model)

        return self.fc(dec_output) # (batch, seq_len, tgt_vocab_size)

In [474]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup, max_iters):
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        if epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [12]:
import pandas as pd

ratings_path = '/Users/amondal/recsys/datasets/ml-32m/ratings.csv'
genres_path = '/Users/amondal/recsys/datasets/ml-32m/movies.csv'

rating_column_names = ['userId', 'movieId', 'rating', 'timestamp']
genres_column_names = ['movieId', 'title', 'genres']

df_rating = pd.read_csv(ratings_path, sep=',', names=rating_column_names, dtype={'userId':'int32', 'movieId':'int32', 'rating':float, 'timestamp':'int64'}, header=0)
df_genres = pd.read_csv(genres_path, sep=',', names=genres_column_names, dtype={'movieId':'int32', 'title':'object', 'genres':'object'}, header=0)

df_rating.dropna(inplace=True, subset=['userId', 'movieId', 'rating'])
df_genres.dropna(inplace=True, subset=['movieId', 'title', 'genres'])

df_genres['genres'] = df_genres['genres'].apply(lambda x: x.split('|'))
df_genres['movie_year'] = df_genres['title'].str.extract(r'\((\d{4})\)').fillna("1").astype('int')
df_genres.drop(columns=['title'], inplace=True)

df = df_rating.merge(df_genres, on=['movieId'], how='left')

In [13]:
df

Unnamed: 0,userId,movieId,rating,timestamp,genres,movie_year
0,1,17,4.0,944249077,"[Drama, Romance]",1995
1,1,25,1.0,944250228,"[Drama, Romance]",1995
2,1,29,2.0,943230976,"[Adventure, Drama, Fantasy, Mystery, Sci-Fi]",1995
3,1,30,5.0,944249077,"[Crime, Drama]",1995
4,1,32,5.0,943228858,"[Mystery, Sci-Fi, Thriller]",1995
...,...,...,...,...,...,...
32000199,200948,79702,4.5,1294412589,"[Action, Comedy, Fantasy, Musical, Romance]",2010
32000200,200948,79796,1.0,1287216292,"[Action, Adventure, Drama, Thriller, War]",2010
32000201,200948,80350,0.5,1294412671,[Comedy],2010
32000202,200948,80463,3.5,1350423800,[Drama],2010


In [14]:
all_genres = df['genres'].tolist()

genres_set = set()
for x in all_genres:
    genres_set.update(set(x))

genres_set = list(genres_set)
inv_idx = {genres_set[i]:i for i in range(len(genres_set))}

genres_mh = []
for x in all_genres:
    h = [0]*len(genres_set)
    for y in x:
        h[inv_idx[y]] = 1
    genres_mh += [h]

df['genres_mh'] = genres_mh
df.drop(columns=['genres'], inplace=True)

In [15]:
df

Unnamed: 0,userId,movieId,rating,timestamp,movie_year,genres_mh
0,1,17,4.0,944249077,1995,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,25,1.0,944250228,1995,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,29,2.0,943230976,1995,"[0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, ..."
3,1,30,5.0,944249077,1995,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ..."
4,1,32,5.0,943228858,1995,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."
...,...,...,...,...,...,...
32000199,200948,79702,4.5,1294412589,2010,"[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, ..."
32000200,200948,79796,1.0,1287216292,2010,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, ..."
32000201,200948,80350,0.5,1294412671,2010,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
32000202,200948,80463,3.5,1350423800,2010,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# import pandas as pd
# import os

# path = '/Users/amondal/recsys/datasets/ml-1m/ratings.dat'
# user_ids, movie_ids, ratings, timestamps = [], [], [], []
# with open(path) as f:
#     data = f.readlines()
#     data = [x.rstrip().split('::') for x in data]
#     for x in data:
#         user_ids += [int(x[0])]
#         movie_ids += [int(x[1])]
#         ratings += [float(x[2])]
#         timestamps += [int(x[3])]

# df = pd.DataFrame(data={'userId':user_ids, 'movieId':movie_ids, 'rating':ratings, 'timestamp':timestamps})

In [None]:
df = df.sort_values(by='timestamp')
df2 = df[["userId", "movieId"]].groupby(by=["userId"]).agg(list).reset_index()
df2 = df2[df2.movieId.apply(len) > 20]
df = df.merge(df2, on=["userId"], how="inner", suffixes=("", "_right"))
df['timestamp'] = df['timestamp']/86400
df['timestamp'] = df['timestamp'].astype(int)
df.drop(columns=['movieId_right'], inplace=True)

n = df.shape[0]
m = int(0.8*n)

df_train = df[:m]
df_test = df[m:]

In [394]:
df_train

Unnamed: 0,userId,movieId,rating,timestamp,movie_year,genres_mh
0,85028,32,5.0,9524,1995,"[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,85028,39,5.0,9533,1995,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,85028,25,5.0,9547,1995,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,85028,111,5.0,9548,1976,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ..."
4,35011,446,4.0,9556,1993,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...
2567279,150312,3114,3.0,17814,1999,"[0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
2567280,150312,74580,3.5,17814,2010,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2567281,150312,8372,0.5,17814,2004,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2567282,150312,4340,2.0,17814,2001,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [343]:
df['rating'].unique()

array([4. , 3. , 5. , 1. , 2. , 4.5, 3.5, 2.5, 1.5, 0.5])

In [489]:
import random

src_seq_len = 20
tgt_seq_len = 5
max_len = src_seq_len + tgt_seq_len

def get_movies_data(df:pd.DataFrame):
    df2 = df.groupby(by=["userId"]).agg(list).reset_index()

    user_ids = []
    intervals = []
    genres = []
    ratings = []
    years = []
    movie_ids_src, movie_ids_tgt = [], []
    interval_vocab_size = 0

    for i in range(df2.shape[0]):
        movie_ids_seq = df2.loc[i, 'movieId']
        user_id = df2.loc[i, 'userId']
        ts_seq = df2.loc[i, 'timestamp']
        genres_seq = df2.loc[i, 'genres_mh']
        ratings_seq = df2.loc[i, 'rating']
        years_seq = df2.loc[i, 'movie_year']

        for j in range(len(movie_ids_seq)-src_seq_len-1):
            m_src = movie_ids_seq[j:j+src_seq_len]
            m_tgt = movie_ids_seq[j+src_seq_len:min(len(movie_ids_seq), j+max_len)]
            m_tgt += [0]*(tgt_seq_len-len(m_tgt))

            user_ids += [[user_id]*src_seq_len]
            genres += [genres_seq[j:j+src_seq_len]]
            ratings += [ratings_seq[j:j+src_seq_len]]
            years += [years_seq[j:j+src_seq_len]]

            u = ts_seq[j:j+src_seq_len]
            v = u[:]
            k = len(u)-1
            while k >= 0:
                if k == len(u)-1:
                    u[k] = 1
                else:
                    u[k] = v[-1]-v[k]+1
                k -= 1
            
            interval_vocab_size = max(interval_vocab_size, max(u)+1)

            movie_ids_src += [m_src]
            movie_ids_tgt += [m_tgt]
            intervals += [u]
        
    movie_ids_src = torch.tensor(movie_ids_src, dtype=torch.int32)
    movie_ids_tgt = torch.tensor(movie_ids_tgt, dtype=torch.int32)
    user_ids = torch.tensor(user_ids, dtype=torch.int32)
    intervals = torch.tensor(intervals, dtype=torch.int64)
    ratings = torch.tensor(ratings, dtype=torch.float32)
    genres = torch.tensor(genres, dtype=torch.int8)
    years = torch.tensor(years, dtype=torch.int32)

    return user_ids, intervals, ratings, genres, years, movie_ids_src, movie_ids_tgt, interval_vocab_size

In [490]:
user_id_vocab_size = int(df_train["userId"].max()+1)
movie_id_vocab_size = int(df_train["movieId"].max()+1)
genres_vocab_size = len(genres_set)
years_vocab_size = int(df_train["movie_year"].max()+1)

In [491]:
user_ids_train, intervals_train, ratings_train, genres_train, years_train, movie_ids_src_train, movie_ids_tgt_train, interval_vocab_size_train = get_movies_data(df_train)

: 

In [None]:
user_ids_test, intervals_test, ratings_test, genres_test, years_test, movie_ids_src_test, movie_ids_tgt_test, interval_vocab_size_test = get_movies_data(df_test)

In [464]:
interval_vocab_size = interval_vocab_size_train

In [465]:
interval_vocab_size

4704

In [479]:
1024/32

32.0

In [484]:
d_model = 256
num_heads = 8
num_layers = 1
d_ff = 32
src_seq_length = 20
tgt_seq_length = 5
dropout = 0.0

transformer = Transformer(user_id_vocab_size, interval_vocab_size, genres_vocab_size, years_vocab_size, movie_id_vocab_size, movie_id_vocab_size, src_seq_length, tgt_seq_length, d_model, num_heads, d_ff, num_layers, num_layers, dropout).to(device=device)

In [485]:
n_epochs = 50    # number of epochs to run
batch_size = 128  # size of each batch
batches_per_epoch = movie_ids_src_train.shape[0] // batch_size

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.001)
lr_scheduler = CosineWarmupScheduler(optimizer, warmup=50, max_iters=batches_per_epoch*n_epochs)

transformer.train()

for epoch in range(n_epochs):
    indices = torch.randperm(movie_ids_src_train.shape[0])

    for i in range(batches_per_epoch):
        optimizer.zero_grad()
        start = i * batch_size
        batch_indices = indices[start:start+batch_size]

        movie_ids_src_train_batch = movie_ids_src_train[batch_indices]
        movie_ids_tgt_train_batch = movie_ids_tgt_train[batch_indices]

        user_ids_batch = user_ids_train[batch_indices]
        intervals_train_batch = intervals_train[batch_indices]
        genres_train_batch = genres_train[batch_indices].to(dtype=torch.float32)
        years_train_batch = years_train[batch_indices]
        ratings_train_batch = ratings_train[batch_indices]

        output:torch.Tensor = transformer(user_ids_batch.to(device=device), intervals_train_batch.to(device=device), genres_train_batch.to(device=device), years_train_batch.to(device=device), ratings_train_batch.to(device=device), movie_ids_src_train_batch.to(device=device), movie_ids_tgt_train_batch[:, :-1].to(device=device))
        loss:torch.Tensor = criterion(output.contiguous().view(-1, movie_id_vocab_size), movie_ids_tgt_train_batch[:, 1:].to(device=device).contiguous().view(-1))

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item()}")

Epoch: 1, Batch: 1, Loss: 12.185012817382812
Epoch: 1, Batch: 2, Loss: 12.18105411529541
Epoch: 1, Batch: 3, Loss: 12.181358337402344
Epoch: 1, Batch: 4, Loss: 12.179984092712402
Epoch: 1, Batch: 5, Loss: 12.173215866088867
Epoch: 1, Batch: 6, Loss: 12.1746244430542
Epoch: 1, Batch: 7, Loss: 12.17248821258545
Epoch: 1, Batch: 8, Loss: 12.163264274597168
Epoch: 1, Batch: 9, Loss: 12.160335540771484
Epoch: 1, Batch: 10, Loss: 12.142532348632812
Epoch: 1, Batch: 11, Loss: 12.130736351013184
Epoch: 1, Batch: 12, Loss: 12.096434593200684
Epoch: 1, Batch: 13, Loss: 12.074976921081543
Epoch: 1, Batch: 14, Loss: 12.013662338256836
Epoch: 1, Batch: 15, Loss: 11.977794647216797
Epoch: 1, Batch: 16, Loss: 11.897197723388672
Epoch: 1, Batch: 17, Loss: 11.833232879638672
Epoch: 1, Batch: 18, Loss: 11.759215354919434
Epoch: 1, Batch: 19, Loss: 11.63644790649414
Epoch: 1, Batch: 20, Loss: 11.540830612182617
Epoch: 1, Batch: 21, Loss: 11.456860542297363
Epoch: 1, Batch: 22, Loss: 11.394157409667969
Ep

KeyboardInterrupt: 

In [None]:
df_train = df.groupby(by=["userId"]).agg(list).reset_index()

In [None]:
3900

In [32]:
df2[df2.movieId.apply(len) > 100]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,"[2997, 2966, 2890, 3078, 2882, 541, 838, 1136,...","[4.0, 1.0, 4.0, 2.0, 1.0, 5.0, 5.0, 1.0, 5.0, ...","[943226846, 943226846, 943226916, 943226986, 9..."
2,3,"[2012, 466, 2268, 168, 1544, 4306, 1485, 2617,...","[3.0, 1.0, 4.0, 3.5, 4.0, 3.5, 4.0, 4.0, 3.5, ...","[1084484354, 1084484362, 1084484382, 108448438..."
9,10,"[48, 2006, 1954, 1909, 1690, 5218, 858, 733, 4...","[3.5, 2.0, 3.5, 2.5, 2.0, 4.0, 5.0, 4.5, 3.5, ...","[1169260535, 1169260570, 1169260574, 116926059..."
15,16,"[1196, 2571, 7153, 2353, 3994, 2006, 1198, 499...","[0.5, 2.0, 4.0, 4.5, 3.5, 1.0, 3.5, 4.0, 4.0, ...","[1517020327, 1517020360, 1517020362, 151702040..."
17,18,"[3252, 1894, 2467, 3159, 4823, 4681, 64839, 48...","[4.0, 0.5, 4.0, 4.5, 1.0, 4.0, 3.5, 3.0, 2.5, ...","[1251917373, 1251917516, 1251917545, 125191760..."
...,...,...,...,...
200939,200940,"[2020, 2915, 2064, 830, 637, 743, 2116, 3704, ...","[4.0, 3.5, 3.5, 2.5, 2.0, 2.0, 0.5, 1.0, 4.5, ...","[1194106282, 1194106296, 1194106299, 119410632..."
200942,200943,"[1957, 4321, 2478, 2686, 1779, 2046, 2528, 309...","[3.0, 2.0, 1.0, 4.5, 2.0, 1.5, 2.0, 2.5, 2.0, ...","[1225217623, 1225217626, 1225217651, 122521766..."
200943,200944,"[260, 1196, 318, 2571, 1291, 7153, 1210, 13413...","[4.0, 3.5, 5.0, 5.0, 3.5, 5.0, 4.0, 5.0, 3.0, ...","[1454247309, 1454247312, 1454247318, 145424732..."
200944,200945,"[318, 8874, 2762, 92259, 79132, 593, 1246, 168...","[5.0, 2.5, 4.0, 5.0, 5.0, 4.0, 3.5, 4.0, 4.0, ...","[1517070023, 1517070056, 1517070090, 151707009..."


In [22]:
2**32

4294967296

In [25]:
user_ids = torch.tensor(df2['userId'].tolist(), dtype=torch.uint32).unsqueeze(1)

In [40]:
len(df2.loc[1, 'movieId'][-100:])

52

In [47]:
movie_ids_src, movie_ids_tgt = [], []
for i in range(df2.shape[0]):
    h = df2.loc[i, 'movieId'][-100:]
    h = h + [0]*(100-len(h))
    movie_ids_src += [h[:50]]
    movie_ids_tgt += [h[50:]]
    
movie_ids_src = torch.tensor(movie_ids_src, dtype=torch.uint32)
movie_ids_tgt = torch.tensor(movie_ids_tgt, dtype=torch.uint32)

In [48]:
ts_src, ts_tgt = [], []
for i in range(df2.shape[0]):
    h = df2.loc[i, 'timestamp'][-100:]
    for j in range(len(h)-1, 0, -1):
        h[j] = h[j]-h[j-1]+1
    h[0] = 1
    h = h + [0]*(100-len(h))
    ts_src += [h[:50]]
    ts_tgt += [h[50:]]
    
ts_src = torch.tensor(ts_src, dtype=torch.uint64)
ts_tgt = torch.tensor(ts_tgt, dtype=torch.uint64)

In [None]:
n = df2.shape[0]
m = int(0.8*n)

user_ids

tensor([  1,   2,   1,  36,   1,   1,   1,  30,   1,   1,  19,  25,   1,   1,
        122,   1,  17,   1,   1,  25,  44,   1,  91,   1,  23,  54, 194,   1,
          1,  17,   1,   1,  17,   1,  20,   1,  15,  25,  12,  15,   1,  15,
          1,  13,  18,   1,  53,   1,  13,  19,  17, 117,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0], dtype=torch.uint64)

In [None]:
class UserNetwork(nn.Module):
    def __init__(self, user_vocab_size, user_d_model, user_ffd, dropout=0.0) -> None:
        super(Transformer, self).__init__()

        self.user_embedding = nn.Embedding(user_vocab_size, user_d_model)

        self.ffn_1 = nn.Linear(user_d_model, user_ffd)
        self.ffn_2 = nn.Linear(user_ffd, user_d_model)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)

        self.ffn = \
            nn.Sequential(
                self.ffn_1,
                nn.Dropout(dropout),
                nn.ReLU(inplace=True),
                self.ffn_2
            ) 

    def forward(self, x:torch.Tensor):
        user_embed = self.user_embedding(x)
        return self.ffn(user_embed)

In [None]:
user_id, predict next sequence of movies to watch
user_id, predict next sequence of movies to watch (attention scores weighted by ratings)
user_id  predict rating for next movie based on watch history



In [None]:
longest increasing subsequence

In [48]:
def longest_increasing_subsequence(arr):
    f = [float("inf")]*len(arr) # f[i] - smallest value corresponding to last element for i+1 length increasing subsequence
    g = [0]*len(arr)

    max_p = 0
    for i in range(len(arr)):
        u = arr[i]
        left, right = 0, len(f)-1
        p = -1
        while left <= right:
            mid = int((left+right)/2)
            if f[mid] <= u:
                p = mid
                left = mid+1
            else:
                right = mid-1

        f[p+1] = min(f[p+1], u)
        max_p = max(max_p, p+2)
        g[i] = p+2

    out = []
    h = max_p
    for i in range(len(arr)-1, -1, -1):
        if g[i] == h and (len(out) == 0 or arr[i] <= out[-1]):
            out += [arr[i]]
            h -= 1

    return out[::-1]

In [49]:
import random
h = random.sample(range(1, 1000), k=100)
print(h)

[501, 925, 159, 179, 292, 610, 643, 234, 553, 793, 742, 132, 842, 250, 770, 348, 758, 624, 236, 650, 435, 815, 790, 558, 51, 823, 923, 35, 153, 708, 178, 623, 930, 756, 182, 666, 781, 693, 652, 773, 894, 996, 319, 711, 822, 133, 748, 575, 354, 998, 45, 792, 49, 492, 52, 131, 297, 549, 255, 601, 215, 352, 785, 281, 266, 999, 599, 715, 527, 683, 72, 130, 302, 224, 543, 873, 759, 936, 197, 468, 430, 926, 96, 37, 412, 136, 315, 256, 583, 422, 491, 434, 723, 617, 864, 476, 544, 698, 561, 363]


In [50]:
longest_increasing_subsequence(h)

[159, 179, 234, 250, 348, 435, 558, 623, 666, 693, 711, 748, 785, 873, 926]