In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import sys


In [None]:
# Pick the fastest available backend: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU when neither accelerator is usable.
# getattr() guards against PyTorch builds that do not ship torch.backends.mps.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the ratings table from the MovieLens "ml-latest-small" folder.
# The path is relative to the notebook's working directory.
df = pd.read_csv('ml-latest-small/ratings.csv')

# Preview the first rows as the cell output.
df.head()


In [None]:
# Summary statistics of the ratings table, shown as the cell output.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# (distinct users, distinct movies) present in the ratings table.
df["userId"].nunique(), df["movieId"].nunique()

In [None]:
# How many times each rating value occurs.
df["rating"].value_counts()

In [None]:
# Histogram of the rating column, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df["rating"])
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
ax.set_title("Count of Ratings")
plt.show()

## MovieLens Dataset

In [None]:
from torch.utils.data import Dataset
import torch
class MovieLensDataset(Dataset):
    """
    Map-style dataset over parallel sequences of users, movies and ratings.

    Sample ``i`` is built from ``users[i]``, ``movies[i]`` and ``ratings[i]``.
    """

    def __init__(self, users, movies, ratings):
        """
        Args:
            users (sequence): User ids, one entry per sample.
            movies (sequence): Movie ids, aligned index-for-index with ``users``.
            ratings (sequence): Ratings, aligned index-for-index with ``users``.
        """
        self.users, self.movies, self.ratings = users, movies, ratings

    def __len__(self):
        # The number of samples is taken from the users sequence.
        return len(self.users)

    def __getitem__(self, idx):
        """
        Build the sample stored at position ``idx``.

        Returns:
            dict: ``"user"`` and ``"movie"`` as ``torch.long`` tensors and
            ``"rating"`` as a ``torch.float`` tensor.
        """
        # (output key, backing sequence, tensor dtype) for each field of a sample.
        fields = (
            ("user", self.users, torch.long),
            ("movie", self.movies, torch.long),
            ("rating", self.ratings, torch.float),
        )
        return {
            key: torch.tensor(values[idx], dtype=dtype)
            for key, values, dtype in fields
        }

## Movie Recommendation System

In [None]:

class MovieRecommendationSystem(nn.Module):
    """
    Rating predictor built on learned user and movie embeddings.

    A user vector and a movie vector are looked up, concatenated, passed
    through one hidden layer (ReLU followed by dropout) and projected to a
    single output value by a final linear layer.
    """

    def __init__(self, num_users, num_movies, embedding_size=256, hidden_dim=256, dropout_rate=0.2):
        """
        Args:
            num_users (int): Number of rows in the user embedding table.
            num_movies (int): Number of rows in the movie embedding table.
            embedding_size (int): Width of each user/movie embedding vector.
            hidden_dim (int): Number of units in the hidden layer.
            dropout_rate (float): Dropout probability applied after the hidden activation.
        """
        super().__init__()

        # Keep the hyper-parameters on the instance for later inspection.
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.hidden_dim = hidden_dim

        # One lookup table per entity type.
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_size)
        self.movie_embedding = nn.Embedding(num_embeddings=num_movies, embedding_dim=embedding_size)

        # The hidden layer takes the two embeddings side by side, hence 2 * embedding_size inputs.
        self.fc1 = nn.Linear(2 * embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

        # Regularisation and non-linearity used between fc1 and fc2.
        self.dropout = nn.Dropout(p=dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, users, movies):
        """
        Score every (user, movie) pair in the batch.

        Args:
            users: Index tensor fed to the user embedding table.
            movies: Index tensor fed to the movie embedding table.

        Returns:
            The raw output of the final linear layer (last dimension of size 1).
        """
        # Join the two embeddings along dim 1 to form the hidden layer's input.
        pair_features = torch.cat(
            [self.user_embedding(users), self.movie_embedding(movies)], dim=1
        )

        # fc1 -> ReLU -> dropout, then the single-unit output layer.
        hidden = self.dropout(self.relu(self.fc1(pair_features)))
        return self.fc2(hidden)

## Data Preprocessing and Encoding

In [None]:
from sklearn import preprocessing

# Re-encode the raw user and movie ids with LabelEncoder so they can be used
# as embedding indices. The columns of `df` are overwritten with the encoded
# values; the fitted encoders are kept for the class counts.
le_user = preprocessing.LabelEncoder()
le_movie = preprocessing.LabelEncoder()
df["userId"] = le_user.fit_transform(df["userId"].values)
df["movieId"] = le_movie.fit_transform(df["movieId"].values)

# Hold out 10% of the rows for validation, stratified on the rating value,
# with a fixed random_state so the split is repeatable.
df_train, df_val = model_selection.train_test_split(
    df,
    test_size=0.1,
    random_state=3,
    stratify=df["rating"].values,
)

# Wrap each split's columns (as arrays) in a dataset: users, movies, ratings.
train_dataset = MovieLensDataset(
    df_train["userId"].values,
    df_train["movieId"].values,
    df_train["rating"].values,
)

valid_dataset = MovieLensDataset(
    df_val["userId"].values,
    df_val["movieId"].values,
    df_val["rating"].values,
)

## Initializing Data Loaders

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

# NOTE(review): num_workers=8 starts worker processes. On platforms that use
# the "spawn" start method, a Dataset class defined inside the notebook may not
# be importable by the workers -- confirm this runs, or drop to num_workers=0.

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8
)
# Validation is evaluation-only, so keep a fixed, repeatable order instead of
# paying for a shuffle that has no effect on the metrics.
val_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8
)

## Training the Model

In [None]:
# Size the embedding tables from the fitted encoders' class counts.
n_users = len(le_user.classes_)
n_movies = len(le_movie.classes_)

recommendation_model = MovieRecommendationSystem(
    num_users=n_users,
    num_movies=n_movies,
    embedding_size=64,
    hidden_dim=128,
    dropout_rate=0.1,
)
# Move the parameters onto the selected device before building the optimizer.
recommendation_model = recommendation_model.to(device)

# Adam over all model parameters, trained against mean-squared error.
optimizer = torch.optim.Adam(recommendation_model.parameters(), lr=1e-3)
loss_func = nn.MSELoss()

In [None]:
EPOCHS = 2


def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    """
    Write a one-line progress report to stderr and record the average loss.

    Args:
        epoch (int): Zero-based epoch index; displayed as ``epoch + 1``.
        step: Progress counter shown before the slash.
        total_loss (float): Loss accumulated since the previous report.
        log_progress_step (int): Divisor that turns ``total_loss`` into an average.
        data_size: Total shown after the slash.
        losses (list): Receives the computed average (mutated in place).
    """
    mean_loss = total_loss / log_progress_step
    # The leading "\r" makes each report overwrite the previous console line.
    message = (
        f"\r{epoch+1:02d}/{EPOCHS:02d} | Step: {step}/{data_size} | Avg Loss: {mean_loss:<6.9f}"
    )
    sys.stderr.write(message)
    sys.stderr.flush()
    losses.append(mean_loss)

In [None]:
total_loss = 0  # sum of per-batch losses since the last progress report
batches_since_log = 0  # number of batches that contributed to total_loss
log_progress_step = 100  # report every this many batches
losses = []  # one averaged loss per progress report
train_dataset_size = len(train_dataset)
print(f"Training on {train_dataset_size} samples...")

recommendation_model.train()
for e in range(EPOCHS):
    step_count = 0  # samples seen so far in this epoch
    for i, train_data in enumerate(train_loader):
        # The keys must match the dict returned by MovieLensDataset.__getitem__
        # ("user", "movie", "rating").
        users = train_data["user"].to(device)
        movies = train_data["movie"].to(device)
        ratings = train_data["rating"].to(torch.float32).to(device)

        # Drop only the trailing singleton dimension so that a final batch of
        # size 1 still has the same shape as its target.
        output = recommendation_model(users, movies).squeeze(-1)

        loss = loss_func(output, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batches_since_log += 1
        # Progress is displayed in samples, so advance by the actual batch size.
        step_count += len(users)

        # Report every `log_progress_step` batches and at the end of each epoch.
        if (i + 1) % log_progress_step == 0 or i == len(train_loader) - 1:
            # Average over the batches actually accumulated: the last window of
            # an epoch is usually shorter than `log_progress_step`.
            log_progress(
                e, step_count, total_loss, batches_since_log, train_dataset_size, losses
            )
            total_loss = 0
            batches_since_log = 0