In [182]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

Load our data.

In [183]:
path_movie = "dataset/netflix-movie/Netflix_Dataset_Movie.csv"
path_rating = "dataset/netflix-movie/Netflix_Dataset_Rating.csv"


movie_data = pd.read_csv(path_movie).loc[lambda df_: df_["Year"] >= 2005]
rating_data = pd.read_csv(path_rating).loc[lambda df_: df_["Movie_ID"].isin(movie_data["Movie_ID"].
values)]

In [184]:
movie_data.columns = movie_data.columns.str.lower().str.replace(" ", "_")
rating_data.columns = rating_data.columns.str.lower().str.replace(" ", "_")

In [185]:
print(f"Number of movie: {movie_data['movie_id'].nunique()}")
print(f"Number of user: {rating_data['user_id'].nunique()}")
print(f"Number of rating: {rating_data.shape[0]}")

Number of movie: 512
Number of user: 82224
Number of rating: 222272


In [186]:
movie_data.head()

Unnamed: 0,movie_id,year,name
16,17,2005,7 Seconds
84,85,2005,Elfen Lied
90,91,2005,WWE: Royal Rumble 2005
148,149,2005,The Edward R. Murrow Collection
150,151,2005,Sleepover Nightmare


In [187]:
rating_data.head()

Unnamed: 0,user_id,rating,movie_id
13420,2187374,4,17
13421,2503129,5,17
13422,263315,3,17
13423,608309,3,17
13424,2336678,2,17


However, the movie_id and user_id do not start with 0. We need to encode them to make them start with 0 and increase by 1.

In [188]:
movie_encoder = LabelEncoder()
movie_data["movie_id_encoded"] = movie_encoder.fit_transform(movie_data["movie_id"])

movie_data.head()

Unnamed: 0,movie_id,year,name,movie_id_encoded
16,17,2005,7 Seconds,0
84,85,2005,Elfen Lied,1
90,91,2005,WWE: Royal Rumble 2005,2
148,149,2005,The Edward R. Murrow Collection,3
150,151,2005,Sleepover Nightmare,4


In [189]:
user_encoder = LabelEncoder()

rating_data["user_id_encoded"] = user_encoder.fit_transform(rating_data["user_id"])
rating_data["movie_id_encoded"] = movie_encoder.transform(rating_data["movie_id"])

rating_data.head()

Unnamed: 0,user_id,rating,movie_id,user_id_encoded,movie_id_encoded
13420,2187374,4,17,67864,0
13421,2503129,5,17,77654,0
13422,263315,3,17,8119,0
13423,608309,3,17,18998,0
13424,2336678,2,17,72409,0


After that we can prepare dataset and dataloader.

In [190]:
class MovieDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, index):
        user = self.users[index]
        movie = self.movies[index]
        rating = self.ratings[index]
        return {
            "user": torch.tensor(user, dtype=torch.int64),
            "movie": torch.tensor(movie, dtype=torch.int64),
            "rating": torch.tensor(rating, dtype=torch.float32),
        }


df_train, df_val = train_test_split(
    rating_data, test_size=0.1, random_state=1, stratify=rating_data["movie_id_encoded"].values
)

train_dataset = MovieDataset(
    df_train["user_id_encoded"].values, 
    df_train["movie_id_encoded"].values, 
    df_train["rating"].values
)
val_dataset = MovieDataset(
    df_val["user_id_encoded"].values, 
    df_val["movie_id_encoded"].values, 
    df_val["rating"].values
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


print(f"Train shape = {df_train.shape}")
print(f"Val shape = {df_val.shape}")

Train shape = (200044, 5)
Val shape = (22228, 5)


In [191]:
for batch_idx, x in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print(f"User: {x.get('user')}") 
    print(f"Movie: {x.get('movie')}") 
    print(f"Rating: {x.get('rating')}") 

    break

Batch 1
User: tensor([49335, 49008, 80465,  5854, 79475, 33496, 80439,  6379, 79265,  1180,
        67068, 55800, 27298,  5695, 33772, 21328, 13410, 15575, 22467, 75336,
        75897,  2278, 15226, 74245, 49009,  1728, 53636, 16328,  3983, 10815,
        17917, 31590])
Movie: tensor([120,  34,  15,  21, 120,  12,  12,  12,  35,  21,  35, 120,  15,  34,
         15, 120,  35,  12,  21,   0,  21, 120, 128,  15, 120,  21,  12,  12,
         35,  12,  81,  15])
Rating: tensor([5., 4., 1., 5., 3., 3., 4., 4., 2., 3., 3., 2., 4., 3., 4., 4., 4., 4.,
        4., 2., 1., 5., 5., 3., 4., 3., 5., 4., 4., 5., 4., 5.])


In [192]:
for batch_idx, x in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print(f"User: {x.get('user').shape}") 
    print(f"Movie: {x.get('movie').shape}") 
    print(f"Rating: {x.get('rating').shape}") 

    break

Batch 1
User: torch.Size([32])
Movie: torch.Size([32])
Rating: torch.Size([32])


Building model.

In [193]:
class RecSys(nn.Module):
    def __init__(self, num_users, num_movies):
        super().__init__()
        self.user_embeds = nn.Embedding(num_embeddings=num_users, embedding_dim=32)
        self.movie_embeds = nn.Embedding(num_embeddings=num_movies, embedding_dim=32)
        self.linear = nn.Linear(in_features=64, out_features=1)

    def forward(self, user_id, movie_id):
        user_embeds = self.user_embeds(user_id)
        movie_embeds = self.movie_embeds(movie_id)
        output = torch.cat([user_embeds, movie_embeds,], dim=1)
        output = self.linear(output) 
        return output

Intuition: embedding layer

In [194]:
# sample forward  
sample_model = RecSys(num_users=3, num_movies=5)
zero = torch.zeros(1, dtype=torch.long)
sample_model(user_id=zero, movie_id=zero)

tensor([[1.3258]], grad_fn=<AddmmBackward0>)

In [195]:
sample_model.user_embeds(torch.tensor(0))

tensor([ 0.3448,  0.7694, -2.0348, -0.4750,  0.1040, -1.2572,  0.2662, -1.8374,
        -1.7436, -2.2077, -0.6095, -1.3973, -1.0913,  1.9447, -1.4663,  0.0667,
         1.1932, -1.8131,  0.1837, -1.0335,  0.8421,  0.2798,  1.3908,  0.2931,
         2.3502, -0.4646, -1.9585, -2.0980,  2.2357, -0.5690,  1.0673,  1.6572],
       grad_fn=<EmbeddingBackward0>)

In [196]:
sample_model.user_embeds(torch.tensor(1))

tensor([ 1.0169,  0.0246,  0.7755,  1.0175, -0.6570,  0.4776,  1.5278,  1.9648,
        -0.3315,  0.8699,  1.5464,  0.4878,  1.4138, -1.3487, -0.1113,  0.2078,
         0.6565, -0.3666, -0.5509,  0.5137,  0.9235, -0.6937,  0.8053,  0.8830,
         0.0453, -0.4520, -1.1806,  0.1417,  0.8241, -0.7719, -1.7992,  0.2932],
       grad_fn=<EmbeddingBackward0>)

In [197]:
sample_model.user_embeds(torch.tensor(2))

tensor([-0.9677, -2.1952,  0.4824,  1.6661, -0.7796, -0.4394,  1.4581, -0.4228,
        -0.2366, -1.6898, -0.2047, -0.6702,  1.1447, -0.4591,  1.4517, -1.6578,
         0.3572,  0.0407, -0.0359,  0.1122, -0.2783,  0.6673, -0.4803, -0.3347,
         0.0487, -0.9800,  0.0970, -0.0344,  0.8089,  0.9759, -0.0218,  0.1360],
       grad_fn=<EmbeddingBackward0>)

Start building training loop.

In [198]:
num_users = rating_data["user_id_encoded"].nunique()
num_movies = movie_data["movie_id_encoded"].nunique()
num_train_samples = df_train.shape[0]
num_val_samples = df_val.shape[0]
recsys = RecSys(num_movies=num_movies, num_users=num_users)

criterion = nn.MSELoss()
lr = 0.001
optimizer = torch.optim.Adam(lr=lr, params=recsys.parameters())

epochs = 5
print_step = 1


def train_one_step(model, optimizer, user_batch, movie_batch, rating_batch):
    optimizer.zero_grad()
    output = model(user_batch, movie_batch)
    loss = criterion(output.flatten(), rating_batch)
    num_samples = output.shape[0]
    loss.backward()
    optimizer.step()
    return loss.item() * num_samples


def val_one_step(model, user_batch, movie_batch, rating_batch):
    output = model(user_batch, movie_batch)
    loss = criterion(output.flatten(), rating_batch)
    num_samples = output.shape[0]
    return loss.item() * num_samples


for epoch in range(epochs):

    total_train_squared_error = 0.0
    total_val_squared_error = 0.0

    for batch_idx, x in enumerate(train_loader):
        user_batch = x.get("user")
        movie_batch = x.get("movie")
        rating_batch = x.get("rating")
        recsys.train()
        loss = train_one_step(recsys, optimizer, user_batch, movie_batch, rating_batch)
        total_train_squared_error += loss

    for batch_idx, x in enumerate(val_loader):
        user_batch = x.get("user")
        movie_batch = x.get("movie")
        rating_batch = x.get("rating")
        recsys.eval()
        with torch.no_grad():
            loss = val_one_step(recsys, user_batch, movie_batch, rating_batch)
            total_val_squared_error += loss

    if (epoch == 0) or ((epoch + 1) % print_step == 0):
        # print train and validation result
        train_mse = total_train_squared_error / num_train_samples
        val_mse = total_val_squared_error / num_val_samples
        train_rmse = np.sqrt(train_mse)
        val_rmse = np.sqrt(val_mse)
        print(
            f"Epoch {epoch+1: <3}/{epochs} | train RMSE = {train_rmse: .8f} | val RMSE = {val_rmse: .8f}"
        )

Epoch 1  /5 | train RMSE =  1.12147819 | val RMSE =  0.95866363
Epoch 2  /5 | train RMSE =  0.95949892 | val RMSE =  0.95830130
Epoch 3  /5 | train RMSE =  0.93308881 | val RMSE =  0.96095761
Epoch 4  /5 | train RMSE =  0.89334105 | val RMSE =  0.96643824
