In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import LabelEncoder

#### loading dataset

In [2]:
path = "dataset/ml-latest-small/"

df = pd.read_csv(path + "ratings.csv")

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df.userId.nunique()

610

In [5]:
df.movieId.nunique()

9724

In [6]:
df.shape

(100836, 4)

In [7]:
sparsity = len(df) / (df.userId.nunique() * df.movieId.nunique())

print(f"sparsity of dataset is {sparsity * 100}%")

sparsity of dataset is 1.6999683055613624%


#### matrix decomposition technique using SGD

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


class RatingDataset():
    
    def __init__(self, data):
        self.data = data
        self.users = torch.tensor(data['userId'].values - 1) 

        # Use LabelEncoder to transform non-contiguous movieId into contiguous integers
        self.movie_encoder = LabelEncoder()
        self.movies = torch.tensor(self.movie_encoder.fit_transform(data['movieId']), dtype=torch.long)

        self.ratings = torch.tensor(data['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]


class MatrixFactorizationNN(nn.Module):
    def __init__(self, num_users, num_items, num_factors):
        super().__init__()

        #Embedding layers for users and items
        self.user_embedding = nn.Embedding(num_users, num_factors)  
        self.item_embedding = nn.Embedding(num_items, num_factors)

        #if we want a more complex model instead of a simple dot product, can consider:
        # self.out = nn.Linear(num_factors, 1) 

    def forward(self, user, item):
        # Get user and item embeddings
        user_embedded = self.user_embedding(user)
        item_embedded = self.item_embedding(item)

        # Predicted rating is the dot product of the user and item embeddings
        # Perform a element wise multiplication then sum them up together. 
        predicted_rating = (user_embedded * item_embedded).sum(1)
        return predicted_rating


dataset = RatingDataset(df)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)

num_users = df['userId'].nunique()
num_items = df['movieId'].nunique()


num_factors = 20  #latent factors
model = MatrixFactorizationNN(num_users, num_items, num_factors)


loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    model.train()

    running_rmse = 0.0
    for batch_users, batch_items, batch_ratings in dataloader:
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(batch_users, batch_items)
        loss = loss_function(predictions, batch_ratings)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_rmse += torch.sqrt(loss).item()

    print(f"Epoch {epoch+1}/{num_epochs}, RMSE Loss: {running_rmse/len(dataloader):.4f}")

# # After training, we can now predict ratings using the trained model

# model.eval()

# with torch.no_grad():
#     predictions = model(input_data)


Epoch 1/25, RMSE Loss: 4.5050
Epoch 2/25, RMSE Loss: 3.5213
Epoch 3/25, RMSE Loss: 2.7860
Epoch 4/25, RMSE Loss: 2.2924
Epoch 5/25, RMSE Loss: 1.9893
Epoch 6/25, RMSE Loss: 1.7896
Epoch 7/25, RMSE Loss: 1.6446
Epoch 8/25, RMSE Loss: 1.5354
Epoch 9/25, RMSE Loss: 1.4472
Epoch 10/25, RMSE Loss: 1.3745
Epoch 11/25, RMSE Loss: 1.3137
Epoch 12/25, RMSE Loss: 1.2627
Epoch 13/25, RMSE Loss: 1.2188
Epoch 14/25, RMSE Loss: 1.1803
Epoch 15/25, RMSE Loss: 1.1440
Epoch 16/25, RMSE Loss: 1.1145
Epoch 17/25, RMSE Loss: 1.0854
Epoch 18/25, RMSE Loss: 1.0613
Epoch 19/25, RMSE Loss: 1.0379
Epoch 20/25, RMSE Loss: 1.0163
Epoch 21/25, RMSE Loss: 0.9972
Epoch 22/25, RMSE Loss: 0.9789
Epoch 23/25, RMSE Loss: 0.9639
Epoch 24/25, RMSE Loss: 0.9477
Epoch 25/25, RMSE Loss: 0.9337


In [9]:
print(dir(dataloader))

['_DataLoader__initialized', '_DataLoader__multiprocessing_context', '_IterableDataset_len_called', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_collation', '_dataset_kind', '_get_iterator', '_index_sampler', '_iterator', 'batch_sampler', 'batch_size', 'check_worker_number_rationality', 'collate_fn', 'dataset', 'drop_last', 'generator', 'multiprocessing_context', 'num_workers', 'persistent_workers', 'pin_memory', 'pin_memory_device', 'prefetch_factor', 'sampler', 'timeout', 'worker_init_fn']


In [10]:
print(dir(dataset))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'data', 'movie_encoder', 'movies', 'ratings', 'users']
