In [1]:
import os
import numpy as np
import pandas as pd
import tabulate
import torch
import pytorch_lightning as pl

from sentence_transformers import SentenceTransformer
from collections import defaultdict
from torch.utils.data import DataLoader, TensorDataset, random_split
from src.models import TwoTowerRecommendationModel, TwoTowerRecommendationModel_MLP
from src.utils import generate_random_sample_data
from src.movie_dataset_utils import load_data, gen_user_vecs
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

pd.set_option("display.precision", 1)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


### Content-based filtering with a Two Tower neural network

<figure>
    <center> <img src="images/RecSysNN.png"   style="width:500px;height:280px;" ></center>
</figure>

The Two-Tower model is a neural network architecture used for recommendation systems. It consists of two separate neural networks (towers) that learn user and item representations independently. The user tower processes user features, while the item tower processes item features. The outputs of these towers are then combined, typically using a similarity measure like cosine similarity, to predict the relevance score or rating for a given user-item pair. This model allows for efficient retrieval of recommendations by precomputing item representations.


## Movie ratings dataset 
The data set is derived from the [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/) dataset. 

[F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>]

The original dataset has 9000 movies rated by 600 users with ratings on a scale of 0.5 to 5 in 0.5 step increments. The dataset has been reduced in size to focus on movies from the years since 2000 and popular genres. The reduced dataset has $n_u = 395$ users and $n_m= 694$ movies. For each movie, the dataset provides a movie title, release date, and one or more genres. For example "Toy Story 3" was released in 2010 and has several genres: "Adventure|Animation|Children|Comedy|Fantasy|IMAX".  This dataset contains little information about users other than their ratings. This dataset is used to create training vectors for the neural networks described below. 

In [3]:
# Load Data, set configuration variables
    
cwd = os.getcwd()
path = os.path.join(cwd, "data/gold/movie_dataset/")
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre = load_data(path=path)

add_movie_descriptions_embeddings = False

In [None]:
if add_movie_descriptions_embeddings:
        # Embed movie descriptions using SentenceTransformer with all-MiniLM-L6-v2 transformer model
        # This creates 384 dimensional embeddings for each movie description
        model = SentenceTransformer("all-MiniLM-L6-v2")
        movie_descripions = {movie_id:content['title'] + " " + content['genres'] for movie_id, content in movie_dict.items()}
        movie_descripions_sentences = [movie_descripions[movie_id] for movie_id in movie_dict.keys()]
        movies_embeddings = model.encode(movie_descripions_sentences)
        
        # Use PCA to reduce the embeddings dimensions with 0.95 variance
        pca = PCA(n_components=0.95)    
        movies_embeddings = pca.fit_transform(movies_embeddings)
        movies_embeddings_dict = {movie_id:embeddings for movie_id, embeddings in zip(movie_dict.keys(), movies_embeddings)}
        
        # add embeddings to item_train
        items_train_embeddings_to_add = []
        for i in range(item_train.shape[0]):
            movie_id = int(item_train[i, 0])
            embeddings = movies_embeddings_dict[movie_id]
            items_train_embeddings_to_add.append(embeddings)

        # add embeddings to item_vecs
        items_vecs_to_add = []
        for i in range(item_vecs.shape[0]):
            movie_id = int(item_vecs[i, 0])
            embeddings = movies_embeddings_dict[movie_id]
            items_vecs_to_add.append(embeddings)
        
        # Add embeddings to item_vecs
        item_vecs = np.hstack((item_vecs, np.array(items_vecs_to_add)))
        
        # Add embeddings to item_train
        item_train = np.hstack((item_train, np.array(items_train_embeddings_to_add)))

In [5]:
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
scaledata = True  # applies the standard scalar to data if true
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 58187


In [6]:
use_synthetic_data = False
if use_synthetic_data:
    # Generate synthetic data
    num_samples = 1000
    user_data, product_data, target_data = generate_random_sample_data(num_samples)

In [7]:
scaledata = True
if scaledata:
    item_train_save = item_train
    user_train_save = user_train
    y_train_save = y_train

    scalerItem = StandardScaler()
    scalerItem.fit(item_train)
    item_train = scalerItem.transform(item_train)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train)
    user_train = scalerUser.transform(user_train)
    
    targetScaler = MinMaxScaler((-1, 1))
    targetScaler.fit(y_train.reshape(-1, 1))
    y_train = targetScaler.transform(y_train.reshape(-1, 1))

    print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train)))

True
True


In [8]:
user_data_tensor = torch.tensor(user_train[:, u_s:], dtype=torch.float32)
product_data_tensor = torch.tensor(item_train[:, i_s:], dtype=torch.float32)
target_data_tensor = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32) 
    
print("User Tensor", user_data_tensor.shape, 
      "Product Tensor", product_data_tensor.shape, 
      "Target Rating Tensor", target_data_tensor.shape)

User Tensor torch.Size([58187, 14]) Product Tensor torch.Size([58187, 16]) Target Rating Tensor torch.Size([58187, 1])


In [9]:
 # Hyperparameters
batch_size = 32
epochs = 5
learning_rate = 0.001
use_gpu = True

# Model configurations
user_config = {'input_dim': user_data_tensor.shape[1], 
            'embed_dim': 128,
            'output_dim': 64, 
            'nr_heads': 8, 
            'continuous_feature_indices':[i for i in range(user_data_tensor.shape[1])],
            'categorical_feature_indices': [], 
            'internal_dimension':32 # Internal Dimension for Continuous Features Embedding
            }
# product_config = {'input_dim': product_data_tensor.shape[1], 
#                 'embed_dim': 128, 
#                 'output_dim': 64,
#                 'nr_heads': 8}
product_config = {'input_dim': product_data_tensor.shape[1], 
        'embed_dim': 128,
        'output_dim': 64, 
        'nr_heads': 8, 
        'continuous_feature_indices':[i for i in range(product_data_tensor.shape[1])], # Embeddings start at index 17
        'categorical_feature_indices': [],#[i for i in range(0, 16)], # 16 is the number of genre features
        'internal_dimension': 32 # Internal Dimension for Continuous Features Embedding
        }
        
user_config_mlp = {'input_dim': user_data_tensor.shape[1], 
            'embed_dim': 128,
            'output_dim': 64}
product_config_mlp = {'input_dim': product_data_tensor.shape[1], 
                'embed_dim': 128, 
                'output_dim': 64}

In [10]:
# Create DataLoader
dataset = TensorDataset(user_data_tensor, product_data_tensor, target_data_tensor)
num_samples = target_data_tensor.shape[0]
train_size = int(0.8 * num_samples)
test_size = num_samples - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=4)

# Initialize model
model_type = "multihead" # "multihead" or "mlp"
if model_type == "multihead":
    model = TwoTowerRecommendationModel(user_config, product_config, learning_rate)
elif model_type == "mlp":
    model = TwoTowerRecommendationModel_MLP(user_config_mlp, product_config_mlp, learning_rate)


if not use_gpu:

    # Train the model
    trainer = pl.Trainer(max_epochs=epochs)
    trainer.fit(model, train_loader, test_loader)

else:
    # Move model to GPU if available
    device = torch.device('mps' if torch.cuda.is_available() else 'mps')
    model.to(device)

    # Train the model on GPU
    trainer = pl.Trainer(max_epochs=epochs, accelerator="gpu", devices=1)
    trainer.fit(model, train_loader, test_loader)

# Print the model loss
print(f"Training Loss: {trainer.callback_metrics['train_loss']}")
print(f"Validation Loss: {trainer.callback_metrics['val_loss']}")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/DLLOREN/Library/CloudStorage/OneDrive-Mercedes-Benz(corpdir.onmicrosoft.com)/Desktop/tfgUVA/two_tower_model_ibs/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name          | Type         | Params | Mode 
-------------------------------------------------------
0 | user_tower    | UserTower    | 134 K  | train
1 | product_tower | ProductTower | 142 K  | train
2 | criterion     | MSELoss      | 0    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/DLLOREN/Library/CloudStorage/OneDrive-Mercedes-Benz(corpdir.onmicrosoft.com)/Desktop/tfgUVA/two_tower_model_ibs/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

  return F.mse_loss(input, target, reduction=self.reduction)
/Users/DLLOREN/Library/CloudStorage/OneDrive-Mercedes-Benz(corpdir.onmicrosoft.com)/Desktop/tfgUVA/two_tower_model_ibs/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 1455/1455 [01:25<00:00, 17.00it/s, v_num=0]

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 0: 100%|██████████| 1455/1455 [01:31<00:00, 15.92it/s, v_num=0]

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 4: 100%|██████████| 1455/1455 [01:27<00:00, 16.60it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1455/1455 [01:27<00:00, 16.58it/s, v_num=0]
Training Loss: 0.1006450206041336
Validation Loss: 0.18948660790920258


## Predictions

In [11]:
#Parameters definition

new_user_id = 5000
new_rating_ave = 1.0
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                    new_action, new_adventure, new_animation, new_childrens,
                    new_comedy, new_crime, new_documentary,
                    new_drama, new_fantasy, new_horror, new_mystery,
                    new_romance, new_scifi, new_thriller]])

user_vecs = gen_user_vecs(user_vec,len(item_vecs))
    
if scaledata:
    scaled_user_vecs = scalerUser.transform(user_vecs)
    scaled_item_vecs = scalerItem.transform(item_vecs)
    user_data_tensor = torch.tensor(scaled_user_vecs[:, u_s:], dtype=torch.float32)
    product_data_tensor = torch.tensor(scaled_item_vecs[:, i_s:], dtype=torch.float32)
    y_p = model(user_data_tensor, product_data_tensor).detach().numpy()
    y_p = targetScaler.inverse_transform(y_p.reshape(-1, 1))
else:
    y_p = model(user_vecs[:, u_s:], item_vecs[:, i_s:])
        
if np.any(y_p < 0) : 
    print("Error, expected all positive predictions")
    
print("Prediction Vector Shape", y_p.shape)


Prediction Vector Shape (1883, 1)


In [12]:
sorted_index = np.argsort(-y_p,axis=0).reshape(-1).tolist()  #- to get largest rating first
sorted_ypu   = y_p[sorted_index]
sorted_items = item_vecs[sorted_index]
sorted_user  = user_vecs[sorted_index]
    
y_p, user, item, movie_dict = sorted_ypu, sorted_user, sorted_items, movie_dict

maxcount=10
count = 0
movies_listed = defaultdict(int)
disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

for i in range(0, y_p.shape[0]):
    if count == maxcount:
        break
    count += 1
    movie_id = item[i, 0].astype(int)
    if movie_id in movies_listed:
        continue
    movies_listed[movie_id] = 1
    disp.append([y_p[i, 0], item[i, 0].astype(int), item[i, 2].astype(float),
                movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

df_predictions = pd.DataFrame(disp[1:], columns=disp[0])
 
print(df_predictions)

   y_p  movie id  rating ave  \
0  3.6     55721         4.3   
1  3.6     55442         4.2   
2  3.6     57669         4.2   
3  3.6     64716         4.1   
4  3.6     71899         4.2   
5  3.6     48516         4.3   
6  3.6     55118         4.0   
7  3.6     44555         4.1   
8  3.6     57504         4.1   
9  3.6     68954         4.0   

                                               title  \
0                Elite Squad (Tropa de Elite) (2007)   
1                                  Persepolis (2007)   
2                                   In Bruges (2008)   
3                                Seven Pounds (2008)   
4                                Mary and Max (2009)   
5                               Departed, The (2006)   
6                            Eastern Promises (2007)   
7  Lives of Others, The (Das leben der Anderen) (...   
8  Girl Who Leapt Through Time, The (Toki o kaker...   
9                                          Up (2009)   

                              