In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

# Data Loading

In [2]:
path_movie = "dataset/netflix-movie/Netflix_Dataset_Movie.csv"
path_rating = "dataset/netflix-movie/Netflix_Dataset_Rating.csv"


movie_data = pd.read_csv(path_movie).loc[lambda df_: df_["Year"] >= 2005]
rating_data = pd.read_csv(path_rating).loc[lambda df_: df_["Movie_ID"].isin(movie_data["Movie_ID"].
values)]

In [3]:
movie_data.columns = movie_data.columns.str.lower().str.replace(" ", "_")
rating_data.columns = rating_data.columns.str.lower().str.replace(" ", "_")

In [4]:
print(f"Number of movie: {movie_data['movie_id'].nunique()}")
print(f"Number of user: {rating_data['user_id'].nunique()}")
print(f"Number of rating: {rating_data.shape[0]}")

Number of movie: 512
Number of user: 82224
Number of rating: 222272


In [5]:
movie_data.head()

Unnamed: 0,movie_id,year,name
16,17,2005,7 Seconds
84,85,2005,Elfen Lied
90,91,2005,WWE: Royal Rumble 2005
148,149,2005,The Edward R. Murrow Collection
150,151,2005,Sleepover Nightmare


In [6]:
rating_data.head()

Unnamed: 0,user_id,rating,movie_id
13420,2187374,4,17
13421,2503129,5,17
13422,263315,3,17
13423,608309,3,17
13424,2336678,2,17


# Label Encoding

However, the movie_id and user_id do not start with 0. We need to encode them to make them start with 0 and increase by 1.

In [7]:
movie_encoder = LabelEncoder()
movie_data["movie_id_encoded"] = movie_encoder.fit_transform(movie_data["movie_id"])

movie_data.head()

Unnamed: 0,movie_id,year,name,movie_id_encoded
16,17,2005,7 Seconds,0
84,85,2005,Elfen Lied,1
90,91,2005,WWE: Royal Rumble 2005,2
148,149,2005,The Edward R. Murrow Collection,3
150,151,2005,Sleepover Nightmare,4


In [8]:
user_encoder = LabelEncoder()

rating_data["user_id_encoded"] = user_encoder.fit_transform(rating_data["user_id"])
rating_data["movie_id_encoded"] = movie_encoder.transform(rating_data["movie_id"])

rating_data.head()

Unnamed: 0,user_id,rating,movie_id,user_id_encoded,movie_id_encoded
13420,2187374,4,17,67864,0
13421,2503129,5,17,77654,0
13422,263315,3,17,8119,0
13423,608309,3,17,18998,0
13424,2336678,2,17,72409,0


# Dataset and Dataloader Preparation

After that we can prepare dataset and dataloader.

In [9]:
class MovieDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, index):
        user = self.users[index]
        movie = self.movies[index]
        rating = self.ratings[index]
        return {
            "user": torch.tensor(user, dtype=torch.int64),
            "movie": torch.tensor(movie, dtype=torch.int64),
            "rating": torch.tensor(rating, dtype=torch.float32),
        }


df_train, df_val = train_test_split(
    rating_data, test_size=0.05, random_state=1, shuffle=True,
)

train_dataset = MovieDataset(
    df_train["user_id_encoded"].values, 
    df_train["movie_id_encoded"].values, 
    df_train["rating"].values
)
val_dataset = MovieDataset(
    df_val["user_id_encoded"].values, 
    df_val["movie_id_encoded"].values, 
    df_val["rating"].values
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


print(f"Train shape = {df_train.shape}")
print(f"Val shape = {df_val.shape}")

Train shape = (211158, 5)
Val shape = (11114, 5)


In [10]:
for batch_idx, x in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print(f"User: {x.get('user')}") 
    print(f"Movie: {x.get('movie')}") 
    print(f"Rating: {x.get('rating')}") 

    break

Batch 1
User: tensor([79945, 73558, 15187, 65311, 69461, 35161, 11705,  3652, 60936, 63387,
        17241, 12927, 45297, 58741, 70594, 80414, 12858, 14489, 30031, 32766,
        70980, 51573, 17784, 17474, 45709, 58321, 82023, 10965, 54236, 26300,
        42007, 25841])
Movie: tensor([117,  15, 123,  92, 120, 120,  24, 120,  35,  15,  35,  38, 120, 120,
         92,  35, 120,  15,  15,  15,  35, 116, 116, 116,  12, 120,  35,  12,
         12,  21,  15, 120])
Rating: tensor([3., 4., 4., 4., 4., 4., 3., 5., 4., 3., 5., 5., 4., 5., 2., 4., 5., 3.,
        4., 3., 5., 3., 3., 5., 3., 4., 1., 3., 5., 5., 4., 5.])


In [11]:
for batch_idx, x in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print(f"User: {x.get('user').shape}") 
    print(f"Movie: {x.get('movie').shape}") 
    print(f"Rating: {x.get('rating').shape}") 

    break

Batch 1
User: torch.Size([32])
Movie: torch.Size([32])
Rating: torch.Size([32])


# Model Architecture

In [12]:
class RecSys(nn.Module):
    def __init__(self, num_users, num_movies):
        super().__init__()
        self.user_embeds = nn.Embedding(num_embeddings=num_users, embedding_dim=32)
        self.movie_embeds = nn.Embedding(num_embeddings=num_movies, embedding_dim=32)
        self.linear = nn.Linear(in_features=64, out_features=1)

    def forward(self, user_id, movie_id):
        user_embeds = self.user_embeds(user_id)
        movie_embeds = self.movie_embeds(movie_id)
        output = torch.cat([user_embeds, movie_embeds,], dim=1)
        output = self.linear(output) 
        return output

Intuition: embedding layer

In [13]:
# sample forward  
sample_model = RecSys(num_users=3, num_movies=5)
zero = torch.zeros(1, dtype=torch.long)
sample_model(user_id=zero, movie_id=zero)

tensor([[-0.2082]], grad_fn=<AddmmBackward0>)

In [14]:
sample_model.user_embeds(torch.tensor(0))

tensor([ 0.4413,  1.4442, -0.0590,  0.1726,  1.0295,  0.6764, -0.2685,  1.7788,
         1.0331,  0.8247,  1.2816,  2.3189,  0.5206,  0.9737, -0.0908,  0.2900,
        -0.7403,  1.7751, -1.3632,  0.7035,  1.0892, -1.5456, -0.9757,  1.1953,
         1.1891,  0.6957, -1.4244,  1.2019, -1.7380,  0.6180, -0.4272, -1.2800],
       grad_fn=<EmbeddingBackward0>)

In [15]:
sample_model.user_embeds(torch.tensor(1))

tensor([ 0.9873, -0.0590,  1.7936,  1.0566, -1.1143,  0.2417, -0.2459, -0.4595,
         0.5420, -0.0959,  0.6436,  1.4494, -0.1323, -0.7115, -1.1255,  0.0782,
        -1.0961, -0.5254, -1.7225, -0.5179,  0.7030,  1.4099,  1.3415,  1.4291,
        -0.1547, -1.1236,  0.8359,  1.2500,  0.0871, -0.2413,  0.2581, -0.3197],
       grad_fn=<EmbeddingBackward0>)

In [16]:
sample_model.user_embeds(torch.tensor(2))

tensor([ 1.0172,  1.3604,  0.8126,  1.5725, -1.1071,  1.3814, -0.3294, -0.4838,
        -1.7807,  1.0019, -0.0116,  0.3569,  2.8579,  0.3632, -2.1023,  0.6500,
         1.2915,  0.2603,  0.0479, -0.1491,  0.8740, -0.2258, -0.0701, -0.6733,
        -1.6698,  1.0064, -1.1168, -0.2634, -1.7533,  0.0965,  1.6248, -0.5973],
       grad_fn=<EmbeddingBackward0>)

# Training and Validation

In [17]:
num_users = rating_data["user_id_encoded"].nunique()
num_movies = movie_data["movie_id_encoded"].nunique()
num_train_samples = df_train.shape[0]
num_val_samples = df_val.shape[0]
recsys = RecSys(num_movies=num_movies, num_users=num_users)

criterion = nn.MSELoss()
lr = 0.001
optimizer = torch.optim.Adam(lr=lr, params=recsys.parameters())

epochs = 5
print_step = 1


def train_one_step(model, optimizer, user_batch, movie_batch, rating_batch):
    optimizer.zero_grad()
    output = model(user_batch, movie_batch)
    loss = criterion(output.flatten(), rating_batch)
    num_samples = output.shape[0]
    loss.backward()
    optimizer.step()
    return loss.item() * num_samples


def val_one_step(model, user_batch, movie_batch, rating_batch):
    output = model(user_batch, movie_batch)
    loss = criterion(output.flatten(), rating_batch)
    num_samples = output.shape[0]
    return loss.item() * num_samples


for epoch in range(epochs):

    total_train_squared_error = 0.0
    total_val_squared_error = 0.0

    for batch_idx, x in enumerate(train_loader):
        user_batch = x.get("user")
        movie_batch = x.get("movie")
        rating_batch = x.get("rating")
        recsys.train()
        loss = train_one_step(recsys, optimizer, user_batch, movie_batch, rating_batch)
        total_train_squared_error += loss

    for batch_idx, x in enumerate(val_loader):
        user_batch = x.get("user")
        movie_batch = x.get("movie")
        rating_batch = x.get("rating")
        recsys.eval()
        with torch.no_grad():
            loss = val_one_step(recsys, user_batch, movie_batch, rating_batch)
            total_val_squared_error += loss

    if (epoch == 0) or ((epoch + 1) % print_step == 0):
        # print train and validation result
        train_mse = total_train_squared_error / num_train_samples
        val_mse = total_val_squared_error / num_val_samples
        train_rmse = np.sqrt(train_mse)
        val_rmse = np.sqrt(val_mse)
        print(
            f"Epoch {epoch+1: <3}/{epochs} | train RMSE = {train_rmse: .8f} | val RMSE = {val_rmse: .8f}"
        )

Epoch 1  /5 | train RMSE =  1.12176563 | val RMSE =  0.96423549
Epoch 2  /5 | train RMSE =  0.95660068 | val RMSE =  0.96612627
Epoch 3  /5 | train RMSE =  0.92620938 | val RMSE =  0.97035144
Epoch 4  /5 | train RMSE =  0.88416434 | val RMSE =  0.97591637
Epoch 5  /5 | train RMSE =  0.83994390 | val RMSE =  0.98304811


# Recommendation

In [18]:
def movie_id_encoded_to_title(id):
    return movie_data.loc[movie_data["movie_id_encoded"] == id, "name"].values[0]

def recommend(user_id_to_pred):
    # all his past rating history
    past_user_ratings = rating_data.loc[rating_data["user_id_encoded"] == user_id_to_pred]
    # set of movies already rated by the user
    rated_movie_ids_encoded = set(past_user_ratings["movie_id_encoded"].values)

    # all of our movies
    all_movie_ids_encoded = movie_data.loc[:, "movie_id_encoded"].values
    all_movie_ids_encoded = torch.from_numpy(all_movie_ids_encoded)

    user_id = torch.tensor(user_id_to_pred, dtype=torch.int64)
    user_id_expanded = user_id.expand(size=all_movie_ids_encoded.shape)

    recsys.eval()
    with torch.no_grad():
        predicted_ratings = recsys(movie_id=all_movie_ids_encoded, user_id=user_id_expanded)

    # filter out movies already rated by user
    top_recommendations = [
        (movie_id_encoded.item(), rating.item())
        for movie_id_encoded, rating in zip(all_movie_ids_encoded, predicted_ratings)
        if movie_id_encoded.item() not in rated_movie_ids_encoded
    ]

    # sort and get top 10 recommendation
    top_recommendations.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = top_recommendations[:10]

    # max the rating to 5
    top_recommendations = [(movie_id_encoded, min(rating, 5.0)) for (movie_id_encoded, rating) in top_recommendations]

    print(f"User {user_id_to_pred}")
    print("======================================================")
    print(f"Past History:")
    for row in past_user_ratings.itertuples(index=True, name="Pandas"):
        movie_name = movie_id_encoded_to_title(row.movie_id_encoded)
        print(f"Movie: {movie_name} (rating {row.rating})")

    print("======================================================")
    print(f"Recommendation:")
    for encoded_id, pred_rating in top_recommendations:
        movie_name = movie_id_encoded_to_title(encoded_id)
        print(f"Movie: {movie_name} (predicted rating {pred_rating:.2f})")

In [19]:
recommend(0)

User 0
Past History:
Movie: The Hitchhiker's Guide to the Galaxy (rating 4)
Movie: The Amityville Horror (rating 3)
Movie: Batman Begins (rating 3)
Recommendation:
Movie: Land of the Dead (predicted rating 3.99)
Movie: Pooh's Heffalump Halloween Movie (predicted rating 3.93)
Movie: Alias: Season 4 (predicted rating 3.81)
Movie: Coach Carter (predicted rating 3.68)
Movie: The L Word: Season 2 (predicted rating 3.53)
Movie: Pooh's Heffalump Movie (predicted rating 3.46)
Movie: Mermaid Forest (predicted rating 3.41)
Movie: Faith of My Fathers (predicted rating 3.35)
Movie: Bigger Than the Sky (predicted rating 3.32)
Movie: Hostage (predicted rating 3.30)


In [24]:
recommend(3)

User 3
Past History:
Movie: The Pacifier (rating 4)
Movie: Hostage (rating 4)
Movie: Coach Carter (rating 5)
Recommendation:
Movie: Land of the Dead (predicted rating 4.99)
Movie: Pooh's Heffalump Halloween Movie (predicted rating 4.93)
Movie: Alias: Season 4 (predicted rating 4.82)
Movie: Batman Begins (predicted rating 4.80)
Movie: The L Word: Season 2 (predicted rating 4.53)
Movie: Pooh's Heffalump Movie (predicted rating 4.47)
Movie: Mermaid Forest (predicted rating 4.42)
Movie: Faith of My Fathers (predicted rating 4.36)
Movie: Bigger Than the Sky (predicted rating 4.33)
Movie: Saving Face (predicted rating 4.29)


# Check the Embedding

In [25]:
recsys.user_embeds(torch.tensor(4))

tensor([ 0.3512, -1.2781, -0.5436, -1.3368,  0.2616,  0.4616,  0.0638,  1.3292,
        -0.1842, -1.0550, -0.4159,  0.6357, -0.4987, -1.3834, -0.6750, -0.2105,
        -0.7722, -0.3345, -0.1555, -1.2376, -1.0561,  0.8264,  1.0725, -0.8379,
        -0.9016,  1.6871, -0.4200, -0.2159,  1.4906, -0.8288, -0.6790, -0.1686],
       grad_fn=<EmbeddingBackward0>)

In [29]:
recsys.user_embeds(torch.tensor(1002))

tensor([ 1.0885,  0.1655, -0.5696, -0.2680,  0.3881,  0.0510, -1.9215,  0.7248,
         0.9678, -0.7449,  2.2349, -1.9566, -1.4311,  1.0081,  0.4885, -0.3890,
        -1.1664,  0.7785,  0.0940, -1.1736,  0.3443,  1.2190, -1.5367,  0.0814,
         1.2966, -0.1840, -1.0546,  0.0117, -2.4107,  0.7779, -1.6893, -0.7434],
       grad_fn=<EmbeddingBackward0>)

In [30]:
recsys.movie_embeds(torch.tensor(4))

tensor([ 0.1664,  0.1603,  0.1392,  0.5608,  0.7516,  0.3981, -0.1408,  0.7497,
         0.7126, -1.6032,  0.6615,  1.7132,  0.0111,  2.3691, -0.3437, -1.1289,
         1.5248, -0.0262, -0.5698, -0.7279, -1.2728,  1.5623,  0.1241,  1.0471,
         1.2422, -1.3977,  0.6566,  1.3975, -1.3887, -1.0352,  0.8116,  0.4505],
       grad_fn=<EmbeddingBackward0>)

In [31]:
recsys.movie_embeds(torch.tensor(400))

tensor([-0.2841, -0.7781, -1.6210, -0.3829, -1.0402,  0.6156,  0.3045, -0.4976,
        -1.6081, -1.9383, -2.2449,  2.4617, -0.0662,  0.8831,  0.9313, -0.8876,
        -1.7698, -1.2476, -0.6703, -0.4683, -0.1761, -0.1463,  0.7554,  0.7601,
         0.1288, -0.5142,  0.8479, -2.2829,  0.4148,  1.7130,  1.9453,  2.0603],
       grad_fn=<EmbeddingBackward0>)