In [1]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!pip install kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d grouplens/movielens-20m-dataset
!unzip /content/movielens-20m-dataset.zip


Saving kaggle.json to kaggle.json
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading movielens-20m-dataset.zip to /content
 90% 175M/195M [00:04<00:00, 38.2MB/s]
100% 195M/195M [00:04<00:00, 49.5MB/s]
Archive:  /content/movielens-20m-dataset.zip
  inflating: genome_scores.csv       
  inflating: genome_tags.csv         
  inflating: link.csv                
  inflating: movie.csv               
  inflating: rating.csv              
  inflating: tag.csv                 


In [2]:
import pandas as pd

df_transactions = pd.read_csv('rating.csv')
df_transactions

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [3]:
n_transactions = int(4e6)

df_transactions = df_transactions[:n_transactions]

In [4]:
train_size = int(0.9 * df_transactions.shape[0])
test_size = df_transactions.shape[0] - train_size

df_train, df_test = df_transactions[:train_size], df_transactions[train_size:]

In [12]:
len(df_test['userId'].unique())

2634

In [5]:
import torch 
import torch.nn as nn

class ConvNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, 3,1,1),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(out_channels, out_channels, 3,1,1)
        )

    def forward(self, input):
        return input + self.model(input)


In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [7]:
class SASRec(nn.Module):
    def __init__(self, num_blocks, num_embeddings, embedding_size, num_heads, sentence_len):
        super().__init__()

        self.num_blocks = num_blocks

        self.embeddings = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_size)

        self.attention_layernorms = torch.nn.ModuleList()
        self.attention_layers = torch.nn.ModuleList()
        self.forward_layernorms = torch.nn.ModuleList()
        self.forward_layers = torch.nn.ModuleList()

        for _ in range(num_blocks):
            new_attn_layernorm = torch.nn.LayerNorm(embedding_size, eps=1e-8)
            self.attention_layernorms.append(new_attn_layernorm)

            new_attn_layer =  torch.nn.MultiheadAttention(embedding_size,
                                                            num_heads,
                                                            0.1, batch_first=True)
            self.attention_layers.append(new_attn_layer)

            new_fwd_layernorm = torch.nn.LayerNorm(embedding_size, eps=1e-8)
            self.forward_layernorms.append(new_fwd_layernorm)

            new_fwd_layer = ConvNet(sentence_len, sentence_len)
            self.forward_layers.append(new_fwd_layer)

        self.last = nn.Linear(embedding_size, num_embeddings)
        self.softmax = nn.Softmax(2)

    def forward(self, input):
        hidden = self.embeddings(input)
        tl = hidden.shape[1]
        attention_mask = ~torch.tril(torch.ones((tl, tl), dtype=torch.bool, device=device))

        for i in range(self.num_blocks):
            Q = self.attention_layernorms[i](hidden)

            output, _ = self.attention_layers[i](Q, hidden, hidden, attn_mask=attention_mask)
            
            hidden = Q + output
            hidden = self.forward_layernorms[i](hidden)
            hidden = self.forward_layers[i](hidden)
        return self.softmax(self.last(hidden))

In [8]:
NUM_PREDICTIONS = 20

In [9]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.auto import tqdm

class TransactionDataset(Dataset):
    def __init__(self, df_train, min_sample_len = 10):
        self.df_train = df_train
        self.df_train['timestamp'] = pd.to_datetime(self.df_train['timestamp'])
        self.df_train = self.df_train.sort_values(by=['timestamp'])
        users = set(self.df_train['userId'])
        self.transactions = []
        for user_id in tqdm(users):
            users_transactions = np.array(self.df_train[self.df_train['userId'] == user_id]['movieId'].values)
            if len(users_transactions) < min_sample_len:
                continue
            for i in range(len(users_transactions) - min_sample_len + 1):
                end_pos = i + min_sample_len
                self.transactions.append(users_transactions[i:end_pos])
        
    def __len__(self):
        return len(self.transactions)

    def __getitem__(self, index):
        return torch.LongTensor(self.transactions[index]).to(device)


In [13]:
dataset = TransactionDataset(df_train, NUM_PREDICTIONS)
print(len(dataset))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/24587 [00:00<?, ?it/s]

3132849


In [14]:
dataloader = DataLoader(dataset, batch_size=30, shuffle=True,num_workers=0)

for batch in dataloader: 
    print(batch)
    break

tensor([[33677, 34072, 30949,  8622, 44788, 46850,  7926,  8768,  7932,  5690,
          7022,  1274, 48322,  1217, 31658,  8607,  3996,  5072, 27604,   741],
        [  597,  1614,  2141,  2406,  1777,   852,   372,   216,  2145,  2252,
          2413,   224,  2247,  1391,  1485,   333,  3033,  2004,  2335,  2407],
        [ 5218, 44665, 46723,  1270,  2011,  1682, 45720, 49278,  7254,  4306,
           593,   253, 40278,   902,  2712, 36525, 45186, 27808, 27839, 34143],
        [ 1953,  3676,  1094,   541,  2997, 43679,  5426, 50872, 96216, 96218,
         26195, 61037, 80424,  3066,  2013, 67422,   307, 75395, 71304, 25852],
        [ 2571,  1387,  1953,  1291,  1610,  3527,  2000,  1210,  2947,  2194,
          3703,  1222,  2529,  2872,  4085,  3104,   377,  1580,   592,  3702],
        [  344,   349,   588,   153,    10,     1,   595,   161,   318,   316,
           165,   434,   329,   292,   253,    32,   110,   225,    47,    95],
        [  597,   587,   367,   292,   500,   

In [15]:
model = SASRec(4, max(df_transactions['movieId'].unique())+10, 40, 2, NUM_PREDICTIONS - 1).to(device) 
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

In [20]:
from tqdm.auto import tqdm

num_epochs = 5

log_window = 20000
sum_loss = 0
cnt_loss = 0


for epoch in tqdm(range(num_epochs)):
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        batch_len = batch.shape[1]

        output = model(batch[:, :(batch_len - 1)])
        target = batch[:, 1:].reshape(-1)

        loss = criterion(output.reshape(target.shape[0], -1), target)
        
        sum_loss += loss.item()
        cnt_loss += 1

        if cnt_loss == log_window:
            print(sum_loss / cnt_loss) 
            sum_loss, cnt_loss = 0, 0
        loss.backward()
        optimizer.step()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/104429 [00:00<?, ?it/s]

11.780143740415573
11.780119441509246


KeyboardInterrupt: ignored

In [21]:
def apk(actual, predicted, k = 10):
    predicted_ = predicted
    if len(actual) == 0:
        return 0
    if len(predicted) >= k:
        predicted_ = predicted[:k]

    ans, cnt = 0, 0
    total = min(len(actual), k)
    for i in range(len(predicted_)):
        if predicted_[i] in actual:
            cnt += 1
            ans += cnt / (i + 1)
    return ans / total

def mapk(actual, predicted, k = 10):
    ans = 0
    cnt = 0
    for i in range(min(len(predicted), len(actual))):
        ans += apk(actual[i], predicted[i], k)
        cnt += 1
    return ans / cnt

In [30]:
users = set(df_test['userId'])

df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

sum_mapk = 0
cnt_mapk = 0

K = 10000

for user_id in tqdm(users): 
    last_action = df_test[df_test['userId'] == user_id]['movieId'].values[-1]
    input_tensor = torch.LongTensor([[last_action] * (NUM_PREDICTIONS - 1)]).to(device)
    recommendations = set()
    for i in range(10):
        with torch.no_grad():
            output = model(input_tensor)
            positions = output.argmax(dim=2)
            for j in range(len(positions)):
                recommendations.add(int(positions[0][j].item()))
            input_tensor = positions
    target = df_test[df_test['userId'] == user_id]['movieId'].values
    sum_mapk += mapk(actual=[list(target)], predicted=[list(recommendations)], k=K)
    cnt_mapk += 1

print(f"Mean mapk : {sum_mapk / cnt_mapk}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/2634 [00:00<?, ?it/s]

Mean mapk : 0.005561204227013438
