In [53]:
# Standard libraries
import math
import os
import urllib.request
from functools import partial
from urllib.error import HTTPError

# Plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np

# PyTorch Lightning
import pytorch_lightning as pl
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# Torchvision
import torchvision
from pytorch_lightning.callbacks import ModelCheckpoint
from torchvision import transforms
from torchvision.datasets import CIFAR100
from tqdm.notebook import tqdm

plt.set_cmap("cividis")
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
sns.reset_orig()

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = None
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
print("Device:", device)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 42


Device: mps


<Figure size 640x480 with 0 Axes>

In [54]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()

        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model) # (seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model)) # (seq_len, d_model)
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x:torch.Tensor):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

In [55]:
def attention(q:torch.Tensor, k:torch.Tensor, v:torch.Tensor, mask=None):
    d_k = q.size()[-1] # q,k,v : (batch, head, seq_len, embed_size_per_head)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) # (batch, head, seq_len, seq_len)
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attn_logits, v) # (batch, head, seq_len, embed_size_per_head)
    return values, attention

def init_weights(x:nn.Linear):
    with torch.no_grad():
        nn.init.xavier_uniform_(x.weight)
        x.bias.data.fill_(0)

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, input_dim:int, d_model: int, h: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h

        self.w_q = nn.Linear(input_dim, d_model) # Wq
        self.w_k = nn.Linear(input_dim, d_model) # Wk
        self.w_v = nn.Linear(input_dim, d_model) # Wv
        self.w_o = nn.Linear(d_model, d_model) # Wo

        init_weights(self.w_q)
        init_weights(self.w_k)
        init_weights(self.w_v)
        init_weights(self.w_o)

    def forward(self, q_x:torch.Tensor, k_x:torch.Tensor, v_x:torch.Tensor, mask=None):
        q:torch.Tensor = self.w_q(q_x) # (batch, seq_len, d_model)
        k:torch.Tensor = self.w_k(k_x) # (batch, seq_len, d_model)
        v:torch.Tensor = self.w_v(v_x) # (batch, seq_len, d_model)

        q_h = q.reshape(q.shape[0], q.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        k_h = k.reshape(k.shape[0], k.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)
        v_h = v.reshape(v.shape[0], v.shape[1], self.h, self.d_k).transpose(1, 2) # (batch, head, seq_len, d_k)

        attn_out, _ = attention(q_h, k_h, v_h, mask) # (batch, head, seq_len, embed_size_per_head)
        attn_out = attn_out.transpose(1, 2) # (batch, seq_len, head, embed_size_per_head)
        attn_out = attn_out.reshape(attn_out.shape[0], attn_out.shape[1], attn_out.shape[2]*attn_out.shape[3]) # (batch, seq_len, d_model)

        return self.w_o(attn_out) # (batch, seq_len, d_model)

In [69]:
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()

        self.self_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)

        self.ffn_1 = nn.Linear(input_dim, dim_feedforward)
        self.ffn_2 = nn.Linear(dim_feedforward, input_dim)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)

        self.ffn = nn.Sequential(
            self.ffn_1,
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            self.ffn_2,
        )

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # attn_out = self.self_attn(x, x, x, mask=mask) # (batch, seq_len, input_dim)
        # x = x + self.dropout(attn_out) # (batch, seq_len, input_dim)
        # x = self.norm1(x) # (batch, seq_len, input_dim)

        ffn_out = self.ffn(x) # (batch, seq_len, input_dim)
        x = x + self.dropout(ffn_out) # (batch, seq_len, input_dim)
        x = self.norm2(x) # (batch, seq_len, input_dim)

        return x

In [70]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout=0.0):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x

In [103]:
class DecoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0)->None:
        super().__init__()

        self.self_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)
        self.crss_attn = MultiHeadAttentionBlock(input_dim, input_dim, num_heads)

        self.ffn_1 = nn.Linear(input_dim, dim_feedforward)
        self.ffn_2 = nn.Linear(dim_feedforward, input_dim)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)
        
        self.ffn = nn.Sequential(
            self.ffn_1,
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            self.ffn_2,
        )

        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.norm3 = nn.LayerNorm(input_dim)
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, pred_mask, pad_mask):
        self_attn_out = self.self_attn(x, x, x, mask=pred_mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(self_attn_out) # (batch, seq_len, input_dim)
        x = self.norm1(x) # (batch, seq_len, input_dim)

        crss_attn_out = self.crss_attn(x, encoder_output, encoder_output, mask=pad_mask) # (batch, seq_len, input_dim)
        x = x + self.dropout(crss_attn_out) # (batch, seq_len, input_dim)
        x = self.norm2(x) # (batch, seq_len, input_dim)

        ffn_out = self.ffn(x) # (batch, seq_len, input_dim)
        x = x + self.dropout(ffn_out) # (batch, seq_len, input_dim)
        x = self.norm3(x) # (batch, seq_len, input_dim)

        return x

In [104]:
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_feedforward, dropout):
        super().__init__()
        self.layers = nn.ModuleList([DecoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    def forward(self, x, encoder_output, pred_mask=None, pad_mask=None):
        for layer in self.layers:
            x = layer(x, encoder_output, pred_mask, pad_mask)
        return x

In [203]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model, num_heads, dim_feedforward, num_encoder_layers, num_decoder_layers, dropout=0.0) -> None:
        super(Transformer, self).__init__()

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        self.src_positional_encoding = PositionalEncoding(d_model, src_seq_len, dropout)
        self.tgt_positional_encoding = PositionalEncoding(d_model, tgt_seq_len, dropout)

        self.encoder_block = Encoder(num_encoder_layers, d_model, num_heads, dim_feedforward, dropout)
        self.decoder_block = Decoder(num_decoder_layers, d_model, num_heads, dim_feedforward, dropout)

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        init_weights(self.fc)

        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=-1)       


    def generate_mask(self, src:torch.Tensor, tgt:torch.Tensor):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2) # (batch, 1, 1, seq_len)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3) # (batch, 1, seq_len, 1)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device=device) # (1, seq_len, seq_len)
        tgt_mask = tgt_mask & nopeak_mask # (batch, 1, seq_len, seq_len)
        return src_mask, tgt_mask
    

    def forward(self, src:torch.Tensor, tgt:torch.Tensor):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.encoder_embedding(src) # (batch, seq_len, d_model)
        tgt_embedded = self.decoder_embedding(tgt) # (batch, seq_len, d_model)

        src_embedded = self.src_positional_encoding(src_embedded) # (batch, seq_len, d_model)
        tgt_embedded = self.tgt_positional_encoding(tgt_embedded) # (batch, seq_len, d_model)

        src_embedded = self.dropout(src_embedded) # (batch, seq_len, d_model)
        tgt_embedded = self.dropout(tgt_embedded) # (batch, seq_len, d_model)

        enc_output = self.encoder_block(src_embedded, src_mask) # (batch, seq_len, d_model)
        dec_output = self.decoder_block(tgt_embedded, enc_output, tgt_mask, src_mask) # (batch, seq_len, d_model)

        return self.fc(dec_output) # (batch, seq_len, tgt_vocab_size)

In [204]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, warmup, max_iters):
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        if epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [None]:
import random

def generate_data(n=10000, start_rand=100, max_seq_length=100):
    vocab_size = 0
    data_src, data_tgt = [], []

    for _ in range(n):
        sumv = 0
        maxv = 0
        seq = []

        for j in range(2*max_seq_length):
            if j <= 1:
                d = random.randint(0, start_rand)
            else:
                d = abs(sumv-maxv)

            while d > 1000:
                d = d/2 if d % 2 == 0 else (d+1)/2
                d = int(d)

            vocab_size = max(vocab_size, d+1)
            seq += [d]

            sumv += d
            maxv = max(maxv, d)

        data_src += [seq[:max_seq_length]]
        data_tgt += [seq[max_seq_length:]]
    
    return torch.tensor(data_src, dtype=torch.int64), torch.tensor(data_tgt, dtype=torch.int64), vocab_size, vocab_size

In [None]:
def longest_increasing_subsequence(arr):
    f = [float("inf")]*len(arr) # f[i] - smallest value corresponding to last element for i+1 length increasing subsequence
    g = [0]*len(arr)

    max_p = 0
    for i in range(len(arr)):
        u = arr[i]
        left, right = 0, len(f)-1
        p = -1
        while left <= right:
            mid = int((left+right)/2)
            if f[mid] <= u:
                p = mid
                left = mid+1
            else:
                right = mid-1

        f[p+1] = min(f[p+1], u)
        max_p = max(max_p, p+2)
        g[i] = p+2

    out = []
    h = max_p
    for i in range(len(arr)-1, -1, -1):
        if g[i] == h and (len(out) == 0 or arr[i] <= out[-1]):
            out += [arr[i]]
            h -= 1

    return out[::-1]

def generate_data_lis(n=10000, src_seq_length=100, tgt_seq_length=101):
    data_src = []
    data_tgt = []
    src_vocab_size = 500
    tgt_vocab_size = 501

    for _ in range(n):
        h = random.sample(range(1, src_vocab_size), k=src_seq_length)
        res = longest_increasing_subsequence(h)
        res += [tgt_vocab_size-1]
        res = res + [0]*(tgt_seq_length-len(res))
        data_src += [h]
        data_tgt += [res]
    
    return torch.tensor(data_src, dtype=torch.uint16), torch.tensor(data_tgt, dtype=torch.uint16), src_vocab_size, tgt_vocab_size


In [206]:
n = 100000
m = int(0.8*n)
data_src, data_tgt, src_vocab_size, tgt_vocab_size = generate_data_lis(n)

data_src_train, data_src_test = data_src[:m], data_src[m:]
data_tgt_train, data_tgt_test = data_tgt[:m], data_tgt[m:]

In [207]:
data_src_train[0]

tensor([238, 117, 496,  34, 179, 267,  30,  79, 333, 329, 392, 287, 458, 353,
        254, 176, 345, 366, 258, 252,  22, 171, 313,   6, 135, 367, 101,  67,
        360, 396, 163, 234, 495,  56, 285, 315, 208,  18, 210, 355, 446, 264,
         69, 296, 497, 236,  19, 480, 130, 456, 283, 104, 146, 180, 230, 363,
        405, 102, 419, 112, 107, 268,  40, 312, 481, 214, 169,  31,  45, 445,
        199, 322, 153, 474, 241, 198,   4, 226, 307,  44,  46, 305, 414, 309,
         23, 240, 314, 272, 257, 483, 158, 489, 380, 444, 182,   7, 142, 188,
         96, 374])

In [208]:
data_tgt_train[0]

tensor([ 30,  79, 101, 163, 208, 210, 236, 283, 363, 405, 419, 445, 474, 483,
        489, 500,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0])

In [209]:
d_model = 128
num_heads = 4
num_layers = 1
d_ff = 32
src_seq_length = 100
tgt_seq_length = 101
dropout = 0.0

transformer = Transformer(src_vocab_size, tgt_vocab_size, src_seq_length, tgt_seq_length, d_model, num_heads, d_ff, num_layers, num_layers, dropout).to(device=device)

In [210]:
n_epochs = 50    # number of epochs to run
batch_size = 128  # size of each batch
batches_per_epoch = data_tgt_train.shape[0] // batch_size

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.001)
lr_scheduler = CosineWarmupScheduler(optimizer, warmup=50, max_iters=batches_per_epoch*n_epochs)

transformer.train()

for epoch in range(n_epochs):
    for i in range(batches_per_epoch):
        optimizer.zero_grad()
        start = i * batch_size

        data_src_train_batch = data_src_train[start:start+batch_size]
        data_tgt_train_batch = data_tgt_train[start:start+batch_size]

        output:torch.Tensor = transformer(data_src_train_batch.to(device=device), data_tgt_train_batch[:, :-1].to(device=device))
        loss:torch.Tensor = criterion(output.contiguous().view(-1, tgt_vocab_size), data_tgt_train_batch[:, 1:].to(device=device).contiguous().view(-1))

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item()}")

Epoch: 1, Batch: 1, Loss: 6.43637228012085
Epoch: 1, Batch: 2, Loss: 6.425315856933594
Epoch: 1, Batch: 3, Loss: 6.423421382904053
Epoch: 1, Batch: 4, Loss: 6.40769100189209
Epoch: 1, Batch: 5, Loss: 6.389381408691406
Epoch: 1, Batch: 6, Loss: 6.389106750488281
Epoch: 1, Batch: 7, Loss: 6.378807544708252
Epoch: 1, Batch: 8, Loss: 6.343634128570557
Epoch: 1, Batch: 9, Loss: 6.3392133712768555
Epoch: 1, Batch: 10, Loss: 6.308279991149902
Epoch: 1, Batch: 11, Loss: 6.292165279388428
Epoch: 1, Batch: 12, Loss: 6.238747596740723
Epoch: 1, Batch: 13, Loss: 6.241719722747803
Epoch: 1, Batch: 14, Loss: 6.210365295410156
Epoch: 1, Batch: 15, Loss: 6.204035758972168
Epoch: 1, Batch: 16, Loss: 6.17858362197876
Epoch: 1, Batch: 17, Loss: 6.17988920211792
Epoch: 1, Batch: 18, Loss: 6.189539432525635
Epoch: 1, Batch: 19, Loss: 6.179861068725586
Epoch: 1, Batch: 20, Loss: 6.176087379455566
Epoch: 1, Batch: 21, Loss: 6.142276763916016
Epoch: 1, Batch: 22, Loss: 6.120796203613281
Epoch: 1, Batch: 23, L

In [None]:
def get_before_end_token(seq, end_token):
    out = []
    for i in range(len(seq)):
        h = []
        for j in range(len(seq[i])):
            if seq[i][j] != end_token:
                h += [seq[i][j]]
            else:
                break
        out += [h]
    return out

def check_lis(pred, actual):
    if len(pred) != len(actual):
        return False
    for i in range(1, len(pred)):
        if pred[i] <= pred[i-1]:
            return False
    return True

def predict(model:nn.Module, n=100):
    model.eval()
    with torch.no_grad():
        preds:torch.Tensor = model(data_src_test[:n,:].to(device=device), data_tgt_test[:n, :-1].to(device=device))
        preds = preds.argmax(dim=-1)
        return preds
    
def evaluate(model:nn.Module, n=100):
    preds:torch.Tensor = predict(model, n)
    preds = preds.tolist()
    actuals = data_tgt_test[:n, 1:].tolist()
    preds = get_before_end_token(preds, tgt_vocab_size-1)
    actuals = get_before_end_token(actuals, tgt_vocab_size-1)
    s = 0
    u = 0
    for i in range(len(preds)):
        s += 1 if check_lis(preds[i], actuals[i]) else 0
        u += 1
    return s/u

In [232]:
evaluate(transformer, n=data_tgt_test.shape[0])

0.723

In [211]:
preds:torch.Tensor = transformer(data_src_test[100:101,:].to(device=device), data_tgt_test[100:101, :-1].to(device=device))
preds = preds.argmax(dim=-1)
preds

tensor([[ 33,  42,  89,  93, 106, 126, 133, 243, 263, 279, 324, 400, 425, 441,
         446, 499, 500, 209,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
          13,  13]], device='mps:0')

In [212]:
data_tgt_test[100:101, 1:]

tensor([[ 33,  42,  89,  93, 106, 126, 133, 243, 263, 279, 324, 400, 425, 441,
         446, 499, 500,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]])

In [None]:
import pandas as pd

path = '/Users/amondal/recsys/datasets/ml-32m/ratings.csv'
column_names = ['userId', 'movieId', 'rating', 'timestamp']
df = pd.read_csv(path, sep=',', names=column_names, dtype={'userId':'int32', 'movieId':'int32', 'rating':float, 'timestamp':'int64'}, header=0)
df.dropna(inplace=True)

In [6]:
df = df.sort_values(by='timestamp')
df2 = df.groupby(by=["userId"]).agg(list).reset_index()

In [9]:
df2[:10]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,"[2997, 2966, 2890, 3078, 2882, 541, 838, 1136,...","[4.0, 1.0, 4.0, 2.0, 1.0, 5.0, 5.0, 1.0, 5.0, ...","[943226846, 943226846, 943226916, 943226986, 9..."
1,2,"[592, 296, 380, 344, 153, 588, 349, 318, 231, ...","[4.0, 1.0, 5.0, 1.0, 3.0, 5.0, 3.0, 5.0, 2.0, ...","[836423201, 836423202, 836423202, 836423237, 8..."
2,3,"[2012, 466, 2268, 168, 1544, 4306, 1485, 2617,...","[3.0, 1.0, 4.0, 3.5, 4.0, 3.5, 4.0, 4.0, 3.5, ...","[1084484354, 1084484362, 1084484382, 108448438..."
3,4,"[2745, 1833, 1210, 1272, 2115, 1327, 2826, 268...","[3.0, 2.0, 3.0, 4.0, 5.0, 3.0, 2.0, 3.0, 2.0, ...","[960485234, 960485234, 960485234, 960485281, 9..."
4,5,"[592, 590, 150, 380, 296, 349, 344, 165, 588, ...","[4.0, 3.0, 3.0, 5.0, 1.0, 4.0, 3.0, 4.0, 3.0, ...","[840763913, 840763914, 840763914, 840763915, 8..."
5,6,"[3753, 2167, 5445, 3082, 5418, 2948, 2949, 501...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[1100059902, 1100059947, 1100059952, 110006001..."
6,7,"[590, 592, 296, 150, 380, 588, 344, 153, 165, ...","[4.0, 3.0, 5.0, 5.0, 2.0, 4.0, 4.0, 3.0, 3.0, ...","[840385452, 840385452, 840385452, 840385452, 8..."
7,8,"[858, 527, 2959, 1221, 593, 3481, 2712, 5995, ...","[4.0, 4.0, 5.0, 3.0, 5.0, 3.0, 4.0, 5.0, 4.0, ...","[1553765387, 1553765389, 1553765392, 155376539..."
8,9,"[3897, 2700, 1500, 5060, 4034, 2194, 1639, 200...","[4.0, 3.5, 4.5, 4.0, 4.5, 4.0, 3.5, 3.5, 4.0, ...","[1138474042, 1138474047, 1138474066, 113847407..."
9,10,"[48, 2006, 1954, 1909, 1690, 5218, 858, 733, 4...","[3.5, 2.0, 3.5, 2.5, 2.0, 4.0, 5.0, 4.5, 3.5, ...","[1169260535, 1169260570, 1169260574, 116926059..."


In [None]:
class UserNetwork(nn.Module):
    def __init__(self, user_vocab_size, user_d_model, user_ffd, dropout=0.0) -> None:
        super(Transformer, self).__init__()

        self.user_embedding = nn.Embedding(user_vocab_size, user_d_model)

        self.ffn_1 = nn.Linear(user_d_model, user_ffd)
        self.ffn_2 = nn.Linear(user_ffd, user_d_model)

        init_weights(self.ffn_1)
        init_weights(self.ffn_2)

        self.ffn = \
            nn.Sequential(
                self.ffn_1,
                nn.Dropout(dropout),
                nn.ReLU(inplace=True),
                self.ffn_2
            ) 

    def forward(self, x:torch.Tensor):
        user_embed = self.user_embedding(x)
        return self.ffn(user_embed)

In [None]:
user_id, predict next sequence of movies to watch
user_id, predict next sequence of movies to watch (attention scores weighted by ratings)
user_id  predict rating for next movie based on watch history



In [None]:
longest increasing subsequence

In [48]:
def longest_increasing_subsequence(arr):
    f = [float("inf")]*len(arr) # f[i] - smallest value corresponding to last element for i+1 length increasing subsequence
    g = [0]*len(arr)

    max_p = 0
    for i in range(len(arr)):
        u = arr[i]
        left, right = 0, len(f)-1
        p = -1
        while left <= right:
            mid = int((left+right)/2)
            if f[mid] <= u:
                p = mid
                left = mid+1
            else:
                right = mid-1

        f[p+1] = min(f[p+1], u)
        max_p = max(max_p, p+2)
        g[i] = p+2

    out = []
    h = max_p
    for i in range(len(arr)-1, -1, -1):
        if g[i] == h and (len(out) == 0 or arr[i] <= out[-1]):
            out += [arr[i]]
            h -= 1

    return out[::-1]

In [49]:
import random
h = random.sample(range(1, 1000), k=100)
print(h)

[501, 925, 159, 179, 292, 610, 643, 234, 553, 793, 742, 132, 842, 250, 770, 348, 758, 624, 236, 650, 435, 815, 790, 558, 51, 823, 923, 35, 153, 708, 178, 623, 930, 756, 182, 666, 781, 693, 652, 773, 894, 996, 319, 711, 822, 133, 748, 575, 354, 998, 45, 792, 49, 492, 52, 131, 297, 549, 255, 601, 215, 352, 785, 281, 266, 999, 599, 715, 527, 683, 72, 130, 302, 224, 543, 873, 759, 936, 197, 468, 430, 926, 96, 37, 412, 136, 315, 256, 583, 422, 491, 434, 723, 617, 864, 476, 544, 698, 561, 363]


In [50]:
longest_increasing_subsequence(h)

[159, 179, 234, 250, 348, 435, 558, 623, 666, 693, 711, 748, 785, 873, 926]