In [1]:
import pandas as pd
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW

In [2]:
use_small = True
if use_small:
    folder_name = "ml-latest-small"
else:
    folder_name = "ml-latest"
    genome_scores = pd.read_csv(folder_name + "/genome-scores.csv")
    genome_tags = pd.read_csv(folder_name + "/genome-tags.csv")
    genome_scores.head()

In [3]:
if not use_small:
    genome_tags.head()

In [4]:
tags = pd.read_csv(folder_name + "/tags.csv")
movies = pd.read_csv(folder_name + "/movies.csv", index_col="movieId")
ratings = pd.read_csv(folder_name + "/ratings.csv")

In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [7]:
tags.dropna(inplace=True)

In [8]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [9]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [10]:
from nltk.stem.snowball import SnowballStemmer

In [11]:
stemmer = SnowballStemmer(language="english")
tags["tag"] = tags["tag"].astype(str).apply(lambda x: stemmer.stem(x.lower()))

In [12]:
unique_tags = tags["tag"].unique()
tags_to_idx = {t: i+1 for i, t in enumerate(unique_tags)}
tags["tag"] = tags["tag"].map(tags_to_idx)
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,1,1445714994
1,2,60756,2,1445714996
2,2,60756,3,1445714992
3,2,89774,4,1445715207
4,2,89774,5,1445715200


In [13]:
tags_concatenated = tags.groupby("movieId")["tag"].agg(list)

In [14]:
tags_concatenated.head()

movieId
1       [58, 58, 1181]
2    [31, 32, 33, 543]
3           [368, 369]
5           [544, 545]
7                [545]
Name: tag, dtype: object

У фильмов может быть несколько одинаковых тегов, но не будем это исправлять. Это поможет получить более точное представление фильмов.

In [15]:
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [16]:
movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))

In [17]:
movies["tags"] = tags_concatenated

In [18]:
movies.isna().sum()

title        0
genres       0
tags      8170
dtype: int64

In [19]:
movies["tags"] = movies["tags"].apply(lambda x: [0] if not isinstance(x, list) else x)

In [20]:
movies.head()

Unnamed: 0_level_0,title,genres,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[58, 58, 1181]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[31, 32, 33, 543]"
3,Grumpier Old Men (1995),"[Comedy, Romance]","[368, 369]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[0]
5,Father of the Bride Part II (1995),[Comedy],"[544, 545]"


In [21]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [22]:
def train_test_split(group, train_size):
    threshold = int(len(group) * train_size)
    return group.iloc[:threshold]

In [23]:
ratings.sort_values(["userId", "timestamp"], inplace=True)
train_ratings = ratings.groupby("userId").apply(train_test_split, 0.7).reset_index(drop=True)
val_test_ratings = pd.concat([ratings, train_ratings]).drop_duplicates(keep=False).reset_index(drop=True)

In [24]:
train_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,804,4.0,964980499
1,1,1210,5.0,964980499
2,1,2018,5.0,964980523
3,1,2628,4.0,964980523
4,1,2826,4.0,964980523


In [25]:
val_ratings = val_test_ratings.groupby("userId").apply(train_test_split, 2/3).reset_index(drop=True)
test_ratings = pd.concat([val_test_ratings, val_ratings]).drop_duplicates(keep=False).reset_index(drop=True)

In [26]:
val_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1030,3.0,964982903
1,1,2033,5.0,964982903
2,1,4006,4.0,964982903
3,1,50,5.0,964982931
4,1,608,5.0,964982931


In [27]:
test_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1270,5.0,964983705
1,1,1240,5.0,964983723
2,1,1206,5.0,964983737
3,1,3702,5.0,964983737
4,1,3033,5.0,964983762


In [28]:
train_ratings.shape[0], val_ratings.shape[0], test_ratings.shape[0]

(70312, 20162, 10362)

In [29]:
ratings.shape

(100836, 4)

TODO: добавить теги из genome_scores с учетом их релевантности

In [30]:
genres = [
    "(no genres listed)",
    "Action", 
    "Adventure", 
    "Animation", 
    "Children", 
    "Comedy", 
    "Crime", 
    "Documentary", 
    "Drama", 
    "Fantasy", 
    "Film-Noir", 
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "IMAX"
]
genres_to_idx = {genre: index for index, genre in enumerate(genres)}

## Создание датасета

In [31]:
class MovieDataset(Dataset):
    def __init__(self, movies_data, user_data):
        self.user_idx = user_data["userId"]
        self.ratings = user_data["rating"]
        self.movie_idx = user_data["movieId"]
        self.tag_idx = movies_data["tags"]
        self.genre_idx = movies_data["genres"].apply(lambda x: [genres_to_idx[genre] for genre in x])
    
    def __getitem__(self, index):
        user_id = self.user_idx[index]
        rating = self.ratings[index]
        movie_id = self.movie_idx[index]
        tags = torch.Tensor(self.tag_idx[movie_id]).int()
        genres = torch.Tensor(self.genre_idx[movie_id]).int()
        return tags, genres, user_id, rating, index
    
    def __len__(self):
        return self.user_idx.shape[0]
    
    def collate_function(self, batch):
        tags, genres, user_id, rating, index = zip(*batch)
        batched_tags = pad_sequence(tags, batch_first=True, padding_value=0)
        batched_genres = pad_sequence(genres, batch_first=True, padding_value=0)
        user_id = torch.Tensor(user_id).int()
        rating = torch.Tensor(rating).int()
        return {"user_id": user_id, "tags": batched_tags, "genres": batched_genres, "rating": rating, "index": index}

In [32]:
torch.manual_seed(42)

train_dataset = MovieDataset(movies, train_ratings)
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=64, 
    collate_fn=train_dataset.collate_function,
    shuffle=True,
)

val_dataset = MovieDataset(movies, val_ratings)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=64,
    collate_fn=val_dataset.collate_function,
    shuffle=True,
)

test_dataset = MovieDataset(movies, test_ratings)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=64,
    collate_fn=val_dataset.collate_function,
    shuffle=True,
)

## Обучение

In [33]:
class MovieModel(nn.Module):
    def __init__(self, tag_vocab_size, genre_vocab_size, user_vocab_size, embedding_dim):
        super(MovieModel, self).__init__()
        
        self.tag_embedder = nn.Embedding(tag_vocab_size, embedding_dim, padding_idx=0)
        self.genre_embedder = nn.Embedding(genre_vocab_size, embedding_dim, padding_idx=0)
        self.user_embedder = nn.Embedding(user_vocab_size, embedding_dim)
        
        self.fc = nn.Linear(2 * embedding_dim, embedding_dim)
        
    def forward(self, input_batch):
        tag_emb = torch.mean(self.tag_embedder(input_batch["tags"]), dim=1)
        genre_emb = torch.mean(self.genre_embedder(input_batch["genres"]), dim=1)
        tag_genre_emb = torch.cat((tag_emb, genre_emb), dim = 1)
        movie_emb = self.fc(tag_genre_emb)
        
        user_emb = self.user_embedder(input_batch["user_id"])
        
        return user_emb, movie_emb

In [34]:
def compute_loss(user_emb, movie_emb, rating):
    cos_sim = nn.CosineSimilarity(dim=1)
    
    sim_norm = (cos_sim(user_emb, movie_emb) + 1) * 5
    rmse_loss = torch.sqrt(torch.mean((sim_norm - rating) ** 2))
    
    return rmse_loss

In [35]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"

In [40]:
embedding_dim = 500
lr = 1e-5
epochs = 1000

model = MovieModel(unique_tags.shape[0]+1, len(genres), ratings["userId"].unique().shape[0]+1, embedding_dim)
model.to(device)

optimizer = AdamW(model.parameters(), lr)

pbar = tqdm(total=len(train_dataloader)*epochs)

num_iter = 1
val_every = 100_000

for epoch in range(epochs):
    for batch in train_dataloader:
        model.train()

        for key, val in batch.items():
            if key != "index":
                batch[key] = val.to(device)

        optimizer.zero_grad()

        user_emb, movie_emb = model(batch)
        loss = compute_loss(user_emb, movie_emb, batch["rating"])

        if torch.isnan(loss).item():
            print(batch)
            break

        loss.backward()
        optimizer.step()

        pbar.desc = f"Epoch {epoch+1}/{epochs}. Train loss: {round(loss.item(), 3)}"
        pbar.update()
        
        num_iter += 1
        
        if num_iter % val_every == 0:
            model.eval()
            val_loss = 0
            
            for batch in tqdm(val_dataloader):
                for key, val in batch.items():
                    if key != "index":
                        batch[key] = val.to(device)
                user_emb, movie_emb = model(batch)
                val_loss += compute_loss(user_emb, movie_emb, batch["rating"])
                
            print(f"Validation loss: {round(val_loss.item()/len(val_dataloader), 3)}")
                
        
pbar.close()

  0%|          | 0/1099000 [00:00<?, ?it/s]

  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 1.132


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 1.028


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.995


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.984


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.975


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.971


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.967


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.966


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.962


  0%|          | 0/316 [00:00<?, ?it/s]

Validation loss: 0.957


TODO: добавить остальные теги с весами, разбить на обучающую, валидационную и тестовую. Если модель более менее обучится - добавить код для теста. На обучении сделать ранний выход по лоссу. Добавить пояснения везде.