In [1]:
import torch
import numpy as np

import pandas as pd
import torch.nn as nn
import torch.nn.functional as f
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from functools import lru_cache
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from IPython.display import clear_output
from collections import defaultdict

### Summary

У нас есть таблица рейтингов, тегов и фильмов.

Основная идея в том, чтобы обучить модель, которая по описанию фильма выдаёт его embedding,
сделать модель, которая по оценке фильма выдаёт embedding пользователя, и усреднить такие эмбеддинги по пользователю.

Как это обучать??


Для этого заведём модель, которая по эмбеддингам фильма и пользователя будет предсказывать оценку этого фильма.

In [2]:
# Folder with the CSV tables; the trailing slash matters because file names
# are appended by plain string concatenation.
DATA_DIR = "ml-latest/"

# Read the three tables in one pass: ratings, movie metadata and user tags.
ratings, movies, tags = (
    pd.read_csv(DATA_DIR + file_name)
    for file_name in ("ratings.csv", "movies.csv", "tags.csv")
)

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


Соединяем таблицу tags и таблицу ratings, убирая лишние для нас столбцы.

In [5]:
# Left-join tags onto ratings by (userId, movieId) and discard both timestamp
# columns (the merge suffixes them with _x / _y).
# NOTE(review): a rating row is repeated once per matching tag row, so the
# joined table can hold more rows than `ratings` — confirm this is intended.
big_table = ratings.merge(
    tags,
    how="left",
    left_on=("userId", "movieId"),
    right_on=("userId", "movieId"),
).drop(columns=['timestamp_x', 'timestamp_y'])
big_table.head()

Unnamed: 0,userId,movieId,rating,tag
0,1,307,3.5,
1,1,481,3.5,
2,1,1091,1.5,
3,1,1257,4.5,
4,1,1449,4.5,


In [6]:
ratings.shape, tags.shape, big_table.shape

((27753444, 4), (1108997, 4), (28363496, 4))

In [7]:
# Hold out 10% of the joined rows for evaluation; the fixed seed keeps the
# shuffled split reproducible between runs.
big_table_train, big_table_test = train_test_split(
    big_table,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

big_table_train.shape, big_table_test.shape

((25527146, 4), (2836350, 4))

In [8]:
# Renumber the rows of both splits 0..n-1 after the shuffled split.
# `drop=True` discards the old shuffled index instead of storing it as an
# extra "index" column, and keeps this cell safe to re-run (without it a
# repeated run inserts "level_0" and then fails on the name clash).
big_table_train = big_table_train.reset_index(drop=True)
big_table_test = big_table_test.reset_index(drop=True)

Больше нам эти таблицы не пригодятся.

In [9]:
# Release the source tables; only the train/test splits and `movies` are used below.
del ratings, tags, big_table

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Создадим множество всех жанров.

In [11]:
# Every distinct genre label found in the '|'-separated `genres` column.
genres_vocab = {
    label
    for entry in movies.genres
    for label in entry.split('|')
}

genres_vocab

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [12]:
# Two-way lookup between a genre label and its position in the iteration
# order of `genres_vocab`.
genre2idx = {genre: position for position, genre in enumerate(genres_vocab)}
idx2genre = dict(enumerate(genres_vocab))

Функция возвращает pd.DataFrame с one-hot представлениями жанров.

In [13]:
def one_hot_genre(data: pd.DataFrame, genres=None) -> pd.DataFrame:
    """Build a 0/1 indicator table of genres for every row of ``data``.

    Args:
        data: Frame with a ``genres`` column holding '|'-separated genre labels.
        genres: Optional iterable of genre labels to use as columns, in that
            order. Defaults to the module-level ``genres_vocab``.

    Returns:
        An int64 DataFrame indexed like ``data`` with one column per genre;
        a cell is 1 when the row lists that genre and 0 otherwise. An empty
        ``data`` yields an empty frame that still has every genre column.
    """
    if genres is None:
        genres = genres_vocab
    genres = list(genres)

    # Split each row once instead of re-splitting it for every genre.
    row_genres = [set(entry.split('|')) for entry in data['genres']]
    indicators = {
        genre: [int(genre in present) for present in row_genres]
        for genre in genres
    }
    return pd.DataFrame(indicators, index=data.index, columns=genres, dtype='int64')

one_hot_genre(movies)

Unnamed: 0,Documentary,Sci-Fi,Horror,Romance,Animation,Comedy,Fantasy,Crime,War,IMAX,Musical,Children,Mystery,Adventure,Western,(no genres listed),Film-Noir,Drama,Thriller,Action
0,0,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
58094,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
58096,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


Заведем словарь из всех встречающихся слов

In [14]:
tokenizer = get_tokenizer('basic_english')

max_len_title = 0

def yield_tokens(it_list):
    """Lazily tokenize every text of ``it_list``, yielding one token list per text.

    Side effect: raises the module-level ``max_len_title`` to the largest
    token count seen so far. Progress is reported through ``tqdm``.
    """
    global max_len_title

    for raw_text in tqdm(it_list):
        pieces = tokenizer(raw_text)
        # Keep a running maximum of the tokenized length.
        max_len_title = max(max_len_title, len(pieces))
        yield pieces

def flat_list(list_):
    """Concatenate the elements of every iterable in ``list_`` into one flat list."""
    return [item for chunk in list_ for item in chunk]


# Build the token vocabulary from all movie titles. Two special tokens are
# reserved up front, and unknown tokens resolve to "<unk>".
# (No `vocab = {}` placeholder: it was overwritten immediately and, had the
# build failed, would have left a plain dict without the vocabulary API.)
print('Building vocabs:')
vocab = build_vocab_from_iterator(
    yield_tokens(movies.title),
    specials=["<unk>", "<pad>"],
)
vocab.set_default_index(vocab["<unk>"])

Building vocabs:


  0%|          | 0/58098 [00:00<?, ?it/s]

Заведем отдельные токены для неизвестных слов и для паддингов

In [15]:
# Vocabulary indices of the two special tokens.
unk_id, pad_id = (vocab[special] for special in ('<unk>', '<pad>'))

print(f'<unk> ~ {unk_id}\n<pad> ~ {pad_id}')

<unk> ~ 0
<pad> ~ 1


In [16]:
max_len_title 

41

Напишем функцию как мы будем укладывать предложения в тензоры.

In [17]:
def text_pipeline(x):
    """Tokenize ``x`` and map every token to its vocabulary index."""
    return vocab(tokenizer(x))


def collate_batch_padding(batch):
    """Pack (title, genres) items into one integer feature tensor.

    Args:
        batch: Iterable of items where ``item[0]`` is a title string and
            ``item[1]`` is a genre tensor accepted by ``torch.vstack``.

    Returns:
        int64 tensor with one row per item: exactly ``max_len_title`` token
        ids (right-padded with ``pad_id``, or truncated when the title is
        longer) followed by the genre values.
    """
    token_rows = []
    genre_rows = []
    for item in batch:
        # Truncate first so the padding count below can never go negative,
        # which would otherwise produce rows of unequal length.
        token_ids = text_pipeline(item[0])[:max_len_title]
        token_rows.append(token_ids + [pad_id] * (max_len_title - len(token_ids)))
        genre_rows.append(item[1])

    genre_tensor = torch.vstack(genre_rows).to(int)
    token_tensor = torch.tensor(token_rows, dtype=torch.int64)

    return torch.cat([token_tensor, genre_tensor], dim=-1)

In [18]:
collate_batch_padding([["Hello world", torch.Tensor([[1, 1, 1, 1, 1, 2]])], 
                       ["Hello world", torch.Tensor([[1, 1, 1, 1, 1, 2]])]])

tensor([[918, 103,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   2],
        [918, 103,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   2]])

In [19]:
# Movie feature width: padded title token ids followed by the one-hot genre
# flags (41 + 20 = 61 for the data loaded above).
input_size = max_len_title + len(genres_vocab)
hidden_size = 128   # LSTM hidden state width
num_layers = 1      # number of stacked LSTM layers
output_size = 64    # embedding width produced by the movie and user models
batch_size = 256    # rows per DataLoader batch

Напишем класс, который будет по фильму возвращать его embedding

In [20]:
class MovieNN(nn.Module):
    """LSTM followed by a linear projection; used to turn movie features into embeddings.

    Args:
        input_size: Number of features per LSTM input step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the projected output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int,
                 output_size: int):
        super().__init__()

        # The recurrent layer is created before the projection so parameter
        # initialisation order stays the same.
        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**lstm_options)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input_):
        """Cast the input to float, run the LSTM and project each of its outputs."""
        lstm_out, _state = self.lstm(input_.float())
        return self.linear(lstm_out.float())

In [21]:
movie_model = MovieNN(input_size, hidden_size, num_layers, output_size)

In [22]:
def get_movie_embedding(idx: int) -> torch.Tensor:
    """Embed one movie with ``movie_model``.

    Args:
        idx: Value of ``movieId`` to look up in the ``movies`` table.

    Returns:
        The squeezed output of ``movie_model`` for the movie's title tokens
        and one-hot genres.

    Raises:
        KeyError: If ``movies`` has no row with this ``movieId``.
    """
    movie = movies.loc[movies.movieId == idx]
    if movie.empty:
        raise KeyError(f"movieId {idx} not found in `movies`")

    one_hot_genres = torch.Tensor(one_hot_genre(movie).to_numpy())
    # Take the title value itself: str() of the whole Series would produce
    # its printed repr (row label, "Name: title, dtype: object", ...), and
    # that text would then be tokenized instead of the title.
    title = str(movie.title.iloc[0])
    dataitem = collate_batch_padding([[title, one_hot_genres]])
    embedding = movie_model(dataitem).squeeze()
    return embedding

In [23]:
get_movie_embedding(100)

tensor([-0.0389,  0.2797,  0.6131, -0.1651,  0.3682, -0.6260,  0.2635, -0.0521,
         0.2103,  0.1322,  0.0935, -0.2009,  0.1525,  0.3379, -0.0659,  0.0589,
        -0.1638,  0.1516,  0.0981,  0.0638,  0.3476,  0.2553, -0.0292, -0.0158,
         0.2014,  0.2763,  0.0204, -0.2656,  0.0546, -0.5616,  0.1552, -0.1194,
         0.0840,  0.1173, -0.1658, -0.2450, -0.0837,  0.1019,  0.0259,  0.2462,
        -0.1744,  0.1590, -0.1708, -0.0515,  0.0896, -0.4333, -0.1324,  0.1036,
         0.1982,  0.1563,  0.0916, -0.2395,  0.1603, -0.3533,  0.1527,  0.3896,
         0.0609,  0.1816,  0.0479, -0.3128, -0.0683, -0.1326, -0.4809, -0.1288],
       grad_fn=<SqueezeBackward0>)

In [24]:
def text_pipeline_user(x):
    """Token ids of a tag; a float value (a missing tag read as NaN) gives the ids of ""."""
    if isinstance(x, float):
        return vocab(tokenizer(""))
    return vocab(tokenizer(x))


def rating_pipeline(x):
    """Return ``x`` unchanged, or -1 when it is NaN."""
    return -1 if np.isnan(x) else x


def collate_user_batch_padding(batch, movie2emb = None):
    """Pack (tag, movieId, rating) triples into one feature tensor.

    Args:
        batch: Iterable of ``(tag, movie_id, rating)`` triples.
        movie2emb: Optional mapping ``movie_id -> embedding tensor``. When it
            is None the embedding is computed with ``get_movie_embedding``.

    Returns:
        Tensor with one row per triple: ``max_len_title`` tag token ids
        (padded with ``pad_id`` or truncated), then the movie embedding
        values, then the rating.
    """
    text_list = []
    ratings_list = []
    movie_items = []
    for text, movie, rating in batch:
        # Truncate before padding so every row has exactly max_len_title ids.
        token_ids = text_pipeline_user(text)[:max_len_title]
        text_list.append(token_ids + [pad_id] * (max_len_title - len(token_ids)))

        ratings_list.append(rating_pipeline(rating))
        if movie2emb is not None:
            movie_items.append(movie2emb[movie])
        else:
            movie_items.append(get_movie_embedding(movie))

    ratings_list = torch.tensor(ratings_list)[:, None]
    # Keep the embeddings as floats: casting them to integers truncated
    # values in (-1, 1) to 0 and erased the movie information. detach()
    # preserves the old behaviour of not back-propagating into these
    # (possibly cached) embeddings.
    movie_items = torch.vstack(movie_items).detach()
    text_list = torch.tensor(text_list, dtype=torch.int64)

    return torch.cat([text_list, movie_items, ratings_list], dim=-1)

In [25]:
class UserNN(nn.Module):
    """LSTM followed by a linear projection; used to turn user features into embeddings.

    Args:
        input_size: Number of features per LSTM input step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the projected output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int,
                 output_size: int):
        super().__init__()

        # The recurrent layer is created before the projection so parameter
        # initialisation order stays the same.
        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**lstm_options)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input_):
        """Cast the input to float, run the LSTM and project each of its outputs."""
        lstm_out, _state = self.lstm(input_.float())
        return self.linear(lstm_out.float())

In [26]:
# User feature width = padded tag token ids + one movie embedding + the rating
# (41 + 64 + 1 = 106 with the settings above); derived instead of hard-coded so
# it follows any change of `max_len_title` or `output_size`.
user_model = UserNN(max_len_title + output_size + 1, hidden_size, num_layers, output_size)

Чтобы не пересчитывать эмбеддинги фильмов при каждом вычислении эмбеддингов пользователей, мы будем раз в несколько итераций заново вычислять эмбеддинги сразу для всех фильмов.

In [27]:
movie2emb = {}

In [28]:
def get_user_embedding(idx: int):
    """Embed one user from their rows in ``big_table_train``.

    Every (tag, movieId, rating) row of the user is turned into a feature row,
    passed through ``user_model``, and the outputs are averaged over rows.

    Args:
        idx: Value of ``userId`` to look up.

    Returns:
        The mean over the first axis of the ``user_model`` output. A user
        with no rows in ``big_table_train`` gets a zero vector whose length is
        the output width of ``user_model.linear``.
    """
    user = big_table_train.loc[big_table_train.userId == idx]
    if user.empty:
        # E.g. a user whose ratings all landed in the test split: there is
        # nothing to encode, so fall back to a neutral embedding instead of
        # failing while stacking an empty list.
        return torch.zeros(user_model.linear.out_features)

    # An empty cache (the initial `movie2emb = {}`) cannot serve lookups;
    # passing None makes the collate function compute embeddings directly.
    cache = movie2emb if movie2emb else None
    dataitems = collate_user_batch_padding(
        zip(user.tag, user.movieId, user.rating),
        cache,
    )
    embeddings = user_model(dataitems)

    return embeddings.mean(dim=0)

In [29]:
def update_movie():
    """Recompute the embedding of every movie with the current ``movie_model``.

    Returns:
        defaultdict mapping each distinct ``movieId`` of ``movies`` to its
        embedding tensor (computed without autograd tracking). A missing id
        resolves to a zero vector of the embedding width.
    """
    embedding_size = movie_model.linear.out_features

    # default_factory is called with NO arguments; the previous one-argument
    # lambda raised TypeError on the first missing key.
    mov2emb = defaultdict(lambda: torch.zeros(embedding_size))

    # The result is only a lookup cache, so skip building one autograd graph
    # per movie.
    with torch.no_grad():
        for m in tqdm(np.unique(movies.movieId)):
            # Reuse the single-movie helper so cached and on-the-fly
            # embeddings are always computed the same way.
            mov2emb[m] = get_movie_embedding(m)

    return mov2emb

In [30]:
movie2emb = update_movie()
len(movie2emb)

  0%|          | 0/58098 [00:00<?, ?it/s]

58098

In [31]:
class LearningDataset(Dataset):
    """Dataset of (user embedding + movie embedding, rating) pairs.

    Args:
        big_table: Frame with ``rating``, ``movieId`` and ``userId`` columns;
            one sample per row.
    """

    def __init__(self, big_table: pd.DataFrame):
        # Store plain arrays so samples are addressed by POSITION. Indexing the
        # Series by label only worked when the frame's index was 0..n-1.
        self.rating = big_table.rating.to_numpy()
        self.movies = big_table.movieId.to_numpy()
        self.users = big_table.userId.to_numpy()
        self.ln = len(self.rating)

    def __getitem__(self, index):
        """Return ``(features, rating)`` for the row at position ``index``.

        ``features`` is the user embedding concatenated with the movie
        embedding along the last axis.
        """
        user_emb = get_user_embedding(self.users[index])
        movie_emb = get_movie_embedding(self.movies[index])
        data = torch.cat([user_emb, movie_emb], dim=-1)
        return data, self.rating[index]

    def __len__(self):
        """Number of rows in the table given at construction."""
        return self.ln
    
# Wrap each split in a dataset, then in a shuffling loader that yields
# `batch_size` samples per batch.
train_dataset, test_dataset = (
    LearningDataset(split) for split in (big_table_train, big_table_test)
)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [32]:
next(iter(train_dataloader))

[tensor([[ 0.0626, -0.0891,  0.0213,  ...,  0.1376, -0.2347,  0.0820],
         [ 0.0733, -0.0856,  0.0219,  ...,  0.1927, -0.0145, -0.0911],
         [ 0.0747, -0.0851,  0.0216,  ...,  0.2345,  0.0407,  0.2059],
         ...,
         [ 0.0742, -0.0852,  0.0217,  ...,  0.2080,  0.1159, -0.1020],
         [ 0.0682, -0.0873,  0.0218,  ..., -0.1048, -0.2987, -0.0324],
         [ 0.0696, -0.0865,  0.0211,  ...,  0.3236, -0.0912, -0.2323]],
        grad_fn=<StackBackward0>),
 tensor([3.0000, 2.0000, 4.0000, 4.0000, 2.5000, 5.0000, 4.0000, 5.0000, 3.5000,
         3.0000, 5.0000, 4.0000, 4.0000, 3.5000, 5.0000, 4.0000, 3.5000, 4.0000,
         3.0000, 4.0000, 2.0000, 5.0000, 4.0000, 4.5000, 1.0000, 4.0000, 4.5000,
         4.5000, 4.0000, 4.0000, 4.0000, 2.0000, 4.0000, 4.0000, 4.0000, 5.0000,
         5.0000, 3.5000, 3.0000, 3.5000, 4.0000, 1.0000, 4.0000, 2.0000, 3.0000,
         4.5000, 1.5000, 2.0000, 3.0000, 4.0000, 3.0000, 5.0000, 5.0000, 3.0000,
         3.0000, 4.0000, 5.0000, 3.500

In [33]:
class PredictModel(nn.Module):
    """Two-layer MLP that maps a feature vector to a rating in (0, 5).

    Args:
        input_size: Width of the input feature vector.
    """

    def __init__(self, input_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            # Sigmoid squashes the single output into (0, 1). The previous
            # ReLU + Softmax(dim=-1) over a single unit always produced
            # exactly 1, so every prediction was 5.0 and no gradient reached
            # the weights.
            nn.Sigmoid(),
        )

    def forward(self, input_):
        """Return predictions of shape ``(..., 1)`` scaled to the 0-5 range."""
        return self.model(input_) * 5

In [34]:
def validation_loss(model, dataloader = None):
    """Average the per-batch mean squared error of ``model`` over ``dataloader``.

    Args:
        model: Callable mapping a feature batch to predictions (one value per
            row, with or without a trailing singleton axis).
        dataloader: Iterable of ``(data, label)`` batches. Defaults to the
            module-level ``test_dataloader``, resolved at call time.

    Returns:
        Python float: the mean of the batch MSE values, or NaN when the
        dataloader yields no batches.
    """
    if dataloader is None:
        dataloader = test_dataloader

    batch_losses = []
    with torch.no_grad():
        for data, label in tqdm(dataloader):
            # Flatten both sides: subtracting a (batch,) label from a
            # (batch, 1) prediction would broadcast to (batch, batch).
            prediction = model(data).reshape(-1)
            target = label.reshape(-1).to(prediction.dtype)

            loss = ((prediction - target) ** 2).mean()
            batch_losses.append(loss.item())

    if not batch_losses:
        return float('nan')
    # The losses are plain floats, so average them in Python (torch.mean
    # does not accept a list).
    return sum(batch_losses) / len(batch_losses)

In [None]:

learning_model = PredictModel(output_size * 2)

# Optimize the rating head TOGETHER with both embedding networks. With only
# `learning_model.parameters()` the movie and user models were never updated,
# so the periodic `update_movie()` refresh below had nothing new to pick up.
trainable_params = (
    list(learning_model.parameters())
    + list(movie_model.parameters())
    + list(user_model.parameters())
)
optimizer = torch.optim.Adam(trainable_params, lr=1e-3)

epochs = 5

history_loss = []   # training loss of every batch
val_loss = []       # validation loss, one entry per epoch
update_freq = 100   # batches between plot / movie-embedding cache refreshes


for epoch in range(epochs):
    for i, (data, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        # Flatten both sides: a (batch, 1) prediction minus a (batch,) label
        # would broadcast to a (batch, batch) matrix and corrupt the loss.
        prediction = learning_model(data).reshape(-1)
        target = label.reshape(-1).to(prediction.dtype)

        loss = ((prediction - target) ** 2).mean()
        history_loss.append(loss.item())
        loss.backward()
        optimizer.step()

        if i % update_freq == 0:
            clear_output(wait=True)
            plt.plot(history_loss)
            plt.show()  # flush the figure now instead of at the end of the cell
            movie2emb = update_movie()

    # Append instead of overwriting, so the per-epoch history is kept.
    val_loss.append(validation_loss(learning_model))

    print(f"Epoch#{epoch}   validation_loss == {val_loss[-1]}")


  0%|          | 0/58098 [00:00<?, ?it/s]