# Imports

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import requests
import zipfile
import io
import pickle


# Load, Preprocess, Split - Dataset

In [None]:

def load_data():
  """Load the MovieLens 1M ratings and movies tables as pandas DataFrames.

  The archive is downloaded and extracted into the current working directory
  only when the extracted ``ml-1m`` files are not already present, so calling
  this function a second time does not hit the network again.

  Returns:
      tuple: ``(ratings, movies)`` where ``ratings`` has the columns
      userId, movieId, rating, timestamp and ``movies`` has the columns
      movieId, title, genres.

  Raises:
      requests.HTTPError: if the download answers with an error status.
  """
  import os  # local import: only this function needs it

  # Location of the dataset archive and of the two files read from it
  url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
  ratings_path = 'ml-1m/ratings.dat'
  movies_path = 'ml-1m/movies.dat'

  if not (os.path.exists(ratings_path) and os.path.exists(movies_path)):
    # Download the dataset; the timeout prevents an indefinite hang and
    # raise_for_status stops an HTTP error page from being fed to zipfile
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Extract the dataset into the current working directory
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
      zip_ref.extractall()

  # Load ratings.dat and movies.dat ('::'-separated, no header row) into dataframes
  ratings = pd.read_csv(ratings_path, sep='::', header=None, names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')
  movies = pd.read_csv(movies_path, sep='::', header=None, names=['movieId', 'title', 'genres'], engine='python', encoding='latin-1')

  return ratings, movies




In [None]:
# Fetch both tables via load_data(); later cells read the notebook-level
# names `ratings` and `movies` bound here.
ratings, movies = load_data()

# Print the first few rows of each dataframe
print('Ratings dataframe:')
print(ratings.head())
print('\nMovies dataframe:')
print(movies.head())

In [None]:
# Count how often each rating value occurs, ordered by rating value
ratings_dist = ratings['rating'].value_counts().sort_index()

# Draw one bar per rating value
fig, ax = plt.subplots()
bars = ax.bar(ratings_dist.index, ratings_dist.values)

# Annotate each bar with its count, centred horizontally on top of the bar
for bar, freq in zip(bars, ratings_dist.values):
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, height, freq, ha='center', va='bottom', fontsize=10)

# Axis labels and title in a single call
ax.set(xlabel='Rating', ylabel='Count', title='Distribution of Movie Ratings')

# Render the figure
plt.show()


In [None]:
import pandas as pd
import numpy as np
import math

# Headline statistics derived from the loaded frames
num_movies = len(movies)
num_users = ratings['userId'].nunique()
num_ratings = len(ratings)
avg_ratings_per_user = ratings.groupby('userId').size().mean()
avg_ratings_per_movie = ratings.groupby('movieId').size().mean()

# Two display formats: thousands-separated counts and two-decimal averages
count_fmt = "{:,}".format
mean_fmt = "{:.2f}".format
num_movies_str = count_fmt(num_movies)
num_users_str = count_fmt(num_users)
num_ratings_str = count_fmt(num_ratings)
avg_ratings_per_user_str = mean_fmt(avg_ratings_per_user)
avg_ratings_per_movie_str = mean_fmt(avg_ratings_per_movie)

# One row per statistic: [label, formatted value]
data = [
    ['Number of Movies', num_movies_str],
    ['Number of Users', num_users_str],
    ['Number of Ratings', num_ratings_str],
    ['Average Ratings per User', avg_ratings_per_user_str],
    ['Average Ratings per Movie', avg_ratings_per_movie_str],
]
df = pd.DataFrame(data, columns=['Statistic', 'Value'])

# Plain-text rendering of the table
print(df)


In [None]:
# Bare expression: the notebook renders the statistics table built in the previous cell
df

In [None]:
# Split the pipe-separated genre strings into one column per genre slot,
# then count how often each genre occurs across all movies
genres = movies['genres'].str.split('|', expand=True)
genre_counts = genres.stack().value_counts()

# One pie slice per genre, labelled with the genre name
fig, ax = plt.subplots()
ax.pie(genre_counts.to_numpy(), labels=genre_counts.index)
ax.set(title='Genre Distribution of Movies')

# Render the figure
plt.show()


In [None]:
# Encode the raw user and movie IDs as contiguous 0-based indices, numbered
# in order of first appearance in the ratings table
user_encoder = {u: i for i, u in enumerate(ratings['userId'].unique())}
movie_encoder = {m: i for i, m in enumerate(ratings['movieId'].unique())}

# NOTE: the raw ID columns are overwritten in place, so from here on
# `ratings` holds encoded indices; re-running this cell on the already
# encoded frame would build the title map below from the wrong IDs.
ratings['userId'] = ratings['userId'].map(user_encoder)
ratings['movieId'] = ratings['movieId'].map(movie_encoder)


# Map the encoded movie IDs to their titles. A single raw-ID -> title dict
# replaces one full scan of `movies` per rated movie; drop_duplicates keeps
# the first row per movieId, matching the previous `.values[0]` lookup.
title_by_raw_id = movies.drop_duplicates('movieId').set_index('movieId')['title'].to_dict()
movie_titles = {encoded_id: title_by_raw_id[raw_id] for raw_id, encoded_id in movie_encoder.items()}

# Split the data into training and validation sets; the fixed seed makes the
# split (and every metric computed from it) reproducible across runs
train, val = train_test_split(ratings, test_size=0.4, random_state=42)




In [None]:
# Preview the ratings frame after the userId/movieId columns were replaced by encoded indices
ratings.head()

# MovieLens Dataset Class

In [None]:
# Define the Movielens dataset
class MovielensDataset(Dataset):
    """Map-style dataset yielding ``((user_id, movie_id), rating)`` samples.

    ``data`` must provide the columns ``userId``, ``movieId`` and ``rating``;
    each column is stored once as an array and indexed positionally.
    """

    def __init__(self, data):
        # Extract the three columns up front so item access is plain array indexing
        user_col, movie_col, rating_col = (
            data[column].values for column in ('userId', 'movieId', 'rating')
        )
        self.user_ids = user_col
        self.movie_ids = movie_col
        self.ratings = rating_col

    def __len__(self):
        # All three arrays have the same length; the user column is used as reference
        return len(self.user_ids)

    def __getitem__(self, idx):
        features = (self.user_ids[idx], self.movie_ids[idx])
        target = self.ratings[idx]
        return features, target


# Matrix Factorization Model

In [None]:
# Define the matrix factorization model
class MatrixFactorizationModel(torch.nn.Module):
    """Biased matrix-factorization rating predictor.

    The predicted rating is the dot product of the user and movie embeddings
    plus a per-user bias, a per-movie bias and a learnable global bias.

    Args:
        num_users: number of rows in the user embedding/bias tables.
        num_movies: number of rows in the movie embedding/bias tables.
        embedding_dim: size of each latent factor vector.
        global_mean: initial value of the global bias. When omitted, the mean
            of the notebook-level ``ratings['rating']`` column is used, which
            keeps existing three-argument calls working.
    """

    def __init__(self, num_users, num_movies, embedding_dim, global_mean=None):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(num_users, embedding_dim)
        self.movie_embeddings = torch.nn.Embedding(num_movies, embedding_dim)
        self.bias_user = torch.nn.Embedding(num_users, 1)
        self.bias_movie = torch.nn.Embedding(num_movies, 1)
        if global_mean is None:
            # Backward-compatible fallback: read the notebook-level `ratings` frame
            global_mean = ratings['rating'].mean()
        # Build the bias explicitly as float32 so it matches the embedding
        # tables instead of inheriting float64 from a NumPy scalar
        self.global_bias = torch.nn.Parameter(torch.tensor(float(global_mean), dtype=torch.float32))

    def forward(self, user_ids, movie_ids):
        """Return one predicted rating per (user_ids[i], movie_ids[i]) pair."""
        user_embeds = self.user_embeddings(user_ids)
        movie_embeds = self.movie_embeddings(movie_ids)
        user_bias = self.bias_user(user_ids)
        movie_bias = self.bias_movie(movie_ids)
        # Reduce over the embedding axis only
        dot_product = torch.sum(user_embeds * movie_embeds, dim=-1)
        # squeeze(-1) drops just the trailing size-1 bias axis, so a batch of
        # one sample keeps its batch dimension
        output = dot_product + user_bias.squeeze(-1) + movie_bias.squeeze(-1) + self.global_bias
        return output

# Define the training loop
def train_model(model, train_loader,loss_list, criterion, optimizer, num_epochs):
    """Run ``num_epochs`` passes over ``train_loader`` and record the loss.

    Each batch is expected as ``((user_ids, movie_ids), ratings)``. After every
    epoch the mean of the per-batch losses is appended to ``loss_list`` and
    printed.
    """
    for epoch in range(num_epochs):
        model.train()
        batch_losses = []
        for features, targets in train_loader:
            user_ids, movie_ids = features[0], features[1]
            # Clear gradients left over from the previous step before the new backward pass
            optimizer.zero_grad()
            predictions = model(user_ids, movie_ids)
            # The criterion needs floating-point targets
            loss = criterion(predictions, targets.float())
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        # Epoch loss is the average over batches, not over individual samples
        train_loss = sum(batch_losses) / len(train_loader)
        loss_list.append(train_loss)
        print('Epoch: {} Train Loss: {:.4f}'.format(epoch+1, train_loss))

# Create Data Loader for train and validation set

In [None]:
# Instantiate the dataset and data loaders
train_dataset = MovielensDataset(train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = MovielensDataset(val)
# Validation data is only read, never trained on: keep a fixed order so the
# metrics that are computed batch by batch come out the same on every run
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Build & Train Model

In [None]:
# Size the embedding tables from the number of distinct encoded IDs, then
# set up the squared-error loss and the Adam optimizer
model = MatrixFactorizationModel(
    num_users=ratings['userId'].nunique(),
    num_movies=ratings['movieId'].nunique(),
    embedding_dim=64,
)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Filled by train_model with one mean loss per epoch
loss_list = []

# Train the model
train_model(model, train_loader, loss_list, criterion, optimizer, num_epochs=15)


In [None]:
# Persist the whole trained model object to model.pkl in the working directory.
# NOTE(review): pickling the full module ties the file to this exact class
# definition; saving model.state_dict() is the usual PyTorch alternative.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Rebind `model` to the object stored in model.pkl.
# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only load a model.pkl that this notebook wrote itself.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

# Training loss Plot

In [None]:
fig, ax = plt.subplots()

# One point per epoch: the values collected in loss_list during training
ax.plot(loss_list)

# Axis labels and title in a single call
ax.set(xlabel='Epoch', ylabel='Loss', title='Training Loss')

# Render the figure
plt.show()

# Compute MSE, RMSE

In [None]:
from sklearn.metrics import mean_squared_error

# Per-sample predictions and targets for the whole validation set
model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for batched_data in val_loader:
        model_output = model(batched_data[0][0],
                             batched_data[0][1])
        target_rating = batched_data[1]

        # Keep every individual prediction/target pair. Comparing per-batch
        # averages instead would let errors inside a batch cancel out and
        # report a far smaller error than the model actually makes.
        model_output_list.extend(model_output.reshape(-1).tolist())
        target_rating_list.extend(target_rating.reshape(-1).tolist())


# MSE over all validation samples; RMSE is its square root. Taking the root
# here avoids the `squared=` keyword, which newer scikit-learn releases
# no longer accept.
mse = mean_squared_error(target_rating_list, model_output_list)
rmse = float(np.sqrt(mse))
print(f"mse: {mse}")
print(f"rmse: {rmse}")

In [None]:
# import matplotlib.pyplot as plt

# Table rows: a header row followed by one [label, value] row per metric
data = [['', 'Loss Function'],
        ['MSE', mse],
        ['RMSE', rmse]]

# Header row is light gray in both columns; metric rows pass None for the
# label cell and light gray for the value cell
cell_colours = [['lightgray'] * 2] + [[None, 'lightgray'] for _ in data[1:]]

# Draw the table on an axis with the frame and ticks hidden
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
ax.axis('tight')
table = ax.table(cellText=data, cellColours=cell_colours, cellLoc='center', loc='center', colLabels=None, fontsize=14)

# Fit both columns to their content and double the row height
table.auto_set_column_width(col=[0, 1])
table.scale(1, 2)

# Render the figure
plt.show()





# Evaluation Metrics

In [None]:
def evaluate_model(model, val_loader, threshold):
    """Compute recall, precision, f_score, and ndcg of the model on the validation set.

    A rating (true or predicted) counts as "relevant" when it is at least
    ``threshold``. Predictions above the top of the rating scale are still
    relevant: they are the model's most confident recommendations.

    Args:
        model: callable ``model(user_ids, movie_ids)`` returning one predicted
            rating per pair; must provide ``eval()``.
        val_loader: iterable of ``((user_ids, movie_ids), ratings)`` batches.
        threshold: minimum rating treated as relevant.

    Returns:
        tuple: ``(recall, precision, f_score, ndcg)``. Any ratio whose
        denominator is zero is reported as 0.0.

    NOTE(review): NDCG is computed per batch with k=10 and then averaged, so
    each "ranking" mixes whichever (user, movie) pairs share a batch - confirm
    that this is the intended definition.
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    ndcg_total = 0.0
    ndcg_batches = 0

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        for batch in val_loader:
            # Unpack the batch elements
            user_ids = batch[0][0]
            movie_ids = batch[0][1]
            true_ratings = batch[1].reshape(-1)

            # reshape(-1) keeps a 1-D tensor even for a single-sample batch
            preds = model(user_ids, movie_ids).reshape(-1)

            # Binarise predictions and targets with the same rule
            predicted_relevant = preds >= threshold
            actually_relevant = true_ratings >= threshold

            # Accumulate the confusion counts
            true_positives += (predicted_relevant & actually_relevant).sum().item()
            false_positives += (predicted_relevant & ~actually_relevant).sum().item()
            false_negatives += (~predicted_relevant & actually_relevant).sum().item()

            # ndcg_score needs 2-D input with more than one document per row,
            # so single-sample batches are left out of the NDCG average
            if preds.numel() > 1:
                ndcg_total += float(ndcg_score(true_ratings.cpu().numpy().reshape(1, -1),
                                               preds.cpu().numpy().reshape(1, -1),
                                               k=10))
                ndcg_batches += 1

    # Guard every ratio against an empty denominator
    relevant_total = true_positives + false_negatives
    predicted_total = true_positives + false_positives
    recall = true_positives / relevant_total if relevant_total > 0 else 0.0
    precision = true_positives / predicted_total if predicted_total > 0 else 0.0
    f_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
    ndcg = ndcg_total / ndcg_batches if ndcg_batches > 0 else 0.0

    return recall, precision, f_score, ndcg


# Compute Recall, Precision, F-score, NDCG

In [None]:
# Evaluate on the validation loader with 3.5 as the threshold passed to evaluate_model
recall, precision, f_score, ndcg = evaluate_model(model, val_loader, 3.5)
print(f"Recall:  {recall}, Precision: {precision}, F_score: {f_score}, Ndcg: {ndcg}")

In [None]:
# Table rows: a header row followed by one [label, value] row per metric
data = [['', 'Evaluation Metrics'],
        ['Recall', recall],
        ['Precision', precision],
        ['F_Score', f_score],
        ['NDCG', ndcg]]

# Header row is light gray in both columns; metric rows pass None for the
# label cell and light gray for the value cell
colors = [['lightgray'] * 2] + [[None, 'lightgray'] for _ in data[1:]]

# Draw the table
fig, ax = plt.subplots(figsize=(4, 3))
table = ax.table(cellText=data, cellColours=colors, cellLoc='center', loc='center', colLabels=None, fontsize=14)

# Fit both columns to their content and double the row height
table.auto_set_column_width(col=[0, 1])
table.scale(1, 2)

# Hide the axis frame and ticks
ax.axis('off')

# Render the figure
plt.show()





# Compute Recall@K, Precision@K



In [None]:
from collections import defaultdict

# Per user: a list of (predicted rating, actual rating, movie id) triples
user_est_true = defaultdict(list)

# Walk the validation data once and file every prediction under its user.
# The batch tensors get their own names (batch_*) so the notebook-level
# `ratings` and `movies` DataFrames are not overwritten by this cell.
model.eval()
with torch.no_grad():
    for batched_data in val_loader:
        batch_users = batched_data[0][0]
        batch_movies = batched_data[0][1]
        batch_ratings = batched_data[1]

        model_output = model(batch_users, batch_movies)

        # Convert each tensor to a Python list once rather than calling
        # .item() per element; no per-row printing, which would emit one
        # output line for every validation rating
        rows = zip(batch_users.tolist(),
                   batch_movies.tolist(),
                   model_output.reshape(-1).tolist(),
                   batch_ratings.tolist())
        for user_id, movie_id, pred_rating, true_rating in rows:
            user_est_true[user_id].append((pred_rating, true_rating, movie_id))


In [None]:

# Pure-Python bookkeeping over already computed predictions: no tensor
# operations happen here, so no torch.no_grad() context is needed.
precisions = dict()
recalls = dict()
recommended_movies = dict()
k = 10

threshold = 3.5

for uid, user_ratings in user_est_true.items():

    # Sort this user's entries by estimated rating, best first (in place)
    user_ratings.sort(key=lambda x: x[0], reverse=True)
    top_k = user_ratings[:k]

    # Titles of the top-k items predicted relevant. The same `est >= threshold`
    # rule is used as for n_rec_k below, so the recommended list and the
    # Precision@K denominator always describe the same set of items.
    recommended_movies[uid] = [movie_titles[movie_id] for (est, _, movie_id) in top_k if est >= threshold]

    # Number of items this user actually rated as relevant
    n_rel = sum((true_r >= threshold) for (_, true_r, _) in user_ratings)

    # Number of top-k items that are predicted relevant
    n_rec_k = sum((est >= threshold) for (est, _, _) in top_k)

    # Number of top-k items that are both predicted and actually relevant
    n_rel_and_rec_k = sum(
        ((true_r >= threshold) and (est >= threshold))
        for (est, true_r, _) in top_k
    )

    # Precision@K: proportion of recommended items that are relevant.
    # When n_rec_k is 0, precision is undefined; it is set to 0 here.
    precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

    # Recall@K: proportion of relevant items that are recommended.
    # When n_rel is 0, recall is undefined; it is set to 0 here.
    recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0



In [None]:
# Precision and recall can then be averaged over all users.
# `k`, `precisions` and `recalls` are the notebook-level values set by the
# Precision@K / Recall@K cell; both dicts must be non-empty for the division.
print(f"precision @ {k}: {sum(prec for prec in precisions.values()) / len(precisions)}")

print(f"recall @ {k} : {sum(rec for rec in recalls.values()) / len(recalls)}")


# Recommend Movies to User

In [None]:
# Show only the first few users: the full mapping holds one entry per user
# seen in the validation data and would flood the cell output
dict(list(recommended_movies.items())[:5])

In [None]:
# Sample User
# NOTE(review): the keys of `recommended_movies` are the user indices served
# by val_loader, i.e. encoded IDs - 478 here is presumably not a raw
# MovieLens user ID; confirm which ID space is intended.
user_id = 478
recommended_movies[user_id]

In [None]:
def recommend_unwatched_movies(model, user_encoder, movie_encoder, user_id, threshold, top_n=10):
    """Recommend top N movies to the given user that they have not yet watched.

    Args:
        model: callable ``model(user_ids, movie_ids)`` returning one predicted
            rating per pair of encoded indices.
        user_encoder: dict mapping raw user IDs to encoded indices.
        movie_encoder: dict mapping raw movie IDs to encoded indices.
        user_id: raw user ID (a key of ``user_encoder``).
        threshold: minimum predicted rating for a movie to be recommended.
        top_n: maximum number of recommendations to return.

    Returns:
        list: the ``movie_titles`` entries of the highest-scoring movies the
        user has not rated, best first. Empty when nothing qualifies.

    Raises:
        KeyError: if ``user_id`` is not a key of ``user_encoder``.

    Reads the notebook-level ``ratings`` frame and ``movie_titles`` dict.
    ``ratings`` is expected to hold *encoded* userId/movieId values, which is
    the state the encoding cells leave it in; the watched-set lookup is
    therefore done entirely in encoded ID space.
    """
    # Get the encoded user ID
    encoded_user_id = user_encoder[user_id]

    # Encoded IDs of the movies this user has already rated
    rated_encoded = set(ratings.loc[ratings['userId'] == encoded_user_id, 'movieId'].tolist())

    # Unwatched candidates, visited in ascending raw-ID order so that equal
    # predicted ratings keep a deterministic order after the stable sort below
    candidates = [encoded_id
                  for _, encoded_id in sorted(movie_encoder.items())
                  if encoded_id not in rated_encoded]
    if not candidates:
        return []

    # Score every candidate in one batched forward pass
    movie_tensor = torch.tensor(candidates, dtype=torch.long)
    user_tensor = torch.full_like(movie_tensor, encoded_user_id)
    with torch.no_grad():
        predicted = model(user_tensor, movie_tensor).reshape(-1).tolist()

    # Keep movies predicted at or above the threshold. Predictions above the
    # top of the rating scale are the strongest candidates and are kept.
    scored = [(score, encoded_id)
              for score, encoded_id in zip(predicted, candidates)
              if score >= threshold]

    # Sort by predicted rating and recommend the top N
    scored.sort(key=lambda pair: pair[0], reverse=True)
    top_movies = [movie_titles[encoded_id] for _, encoded_id in scored[:top_n]]

    return top_movies


In [None]:
# Load fresh copies of the MovieLens tables so the encoders below are built
# from raw IDs
ratings, movies = load_data()

# Encode raw IDs as contiguous 0-based indices (order of first appearance)
# and overwrite the ID columns of `ratings` with the encoded values
user_encoder = {u: i for i, u in enumerate(ratings['userId'].unique())}
movie_encoder = {m: i for i, m in enumerate(ratings['movieId'].unique())}
ratings['userId'] = ratings['userId'].map(user_encoder)
ratings['movieId'] = ratings['movieId'].map(movie_encoder)


# Map the encoded movie IDs to (title, genres). A single raw-ID lookup table
# replaces two full scans of `movies` per rated movie; drop_duplicates keeps
# the first row per movieId, matching the previous `.values[0]` lookups.
info_by_raw_id = {
    row.movieId: (row.title, row.genres)
    for row in movies.drop_duplicates('movieId').itertuples(index=False)
}
movie_titles = {encoded_id: info_by_raw_id[raw_id] for raw_id, encoded_id in movie_encoder.items()}



In [None]:
# recommend_unwatched_movies looks `user_id` up in user_encoder, so 478 is
# passed as a raw (un-encoded) user ID; 3.5 is the recommendation threshold
user_id = 478
print('User ID:', user_id, recommend_unwatched_movies(model, user_encoder, movie_encoder, user_id, 3.5))

In [None]:
user_id = 477
# Each recommendation is a (title, genres) pair. Building the table straight
# from the pairs keeps one row per recommendation; a dict round-trip would
# merge two recommendations that happen to share a title.
rec_movies = recommend_unwatched_movies(model, user_encoder, movie_encoder, user_id, 3.5)
pd.DataFrame(rec_movies, columns=["Movie Title", "Genre"])