In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [2]:
# Load MovieLens dataset
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
df_movies = pd.read_csv('ml-latest-small/movies.csv')

In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preprocess data
df_ratings = df_ratings[['userId', 'movieId', 'rating']]
df_movies = df_movies[['movieId', 'title', 'genres']]


In [6]:

# Merge ratings and movies data
df_merged = pd.merge(df_ratings, df_movies, on='movieId')


In [7]:
df_merged.head(10)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6,19,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
7,21,1,3.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
8,27,1,3.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,31,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
ids = list(range(1,20))
df_merged_t = df_merged.query(f'userId in {ids}')

In [9]:
df_merged_movie_id = df_merged_t['movieId'].drop_duplicates().reset_index(drop=True).reset_index()
df_merged_movie_id.rename(columns={'index':'newmovie_id'},inplace=True)

In [10]:
df_merged_t = pd.merge(df_merged_t,df_merged_movie_id,how='inner',on='movieId')

In [11]:
df_merged_t.shape

(2977, 6)

In [12]:

# Split dataset into train and test
train_data, test_data = train_test_split(df_merged_t, test_size=0.2, random_state=42)

In [13]:
# Create BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
# Tokenize movie titles
train_tokens = tokenizer.batch_encode_plus(
    train_data['title'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='np',
    max_length=128
)

In [15]:
test_tokens = tokenizer.batch_encode_plus(
    test_data['title'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='np',
    max_length=128
)

In [16]:
# Prepare CF input data
train_user_ids = train_data['userId'].values
train_item_ids = train_data['newmovie_id'].values
train_ratings = train_data['rating'].values

test_user_ids = test_data['userId'].values
test_item_ids = test_data['newmovie_id'].values
test_ratings = test_data['rating'].values

# Prepare BERT input data
train_input_ids = train_tokens['input_ids']
train_attention_mask = train_tokens['attention_mask']

test_input_ids = test_tokens['input_ids']
test_attention_mask = test_tokens['attention_mask']

In [17]:
# Convert data types
train_user_ids = train_user_ids.astype(np.int32)
train_item_ids = train_item_ids.astype(np.int32)
train_ratings = train_ratings.astype(np.float32)

test_user_ids = test_user_ids.astype(np.int32)
test_item_ids = test_item_ids.astype(np.int32)
test_ratings = test_ratings.astype(np.float32)

train_input_ids = train_input_ids.astype(np.int32)
train_attention_mask = train_attention_mask.astype(np.int32)

test_input_ids = test_input_ids.astype(np.int32)
test_attention_mask = test_attention_mask.astype(np.int32)

In [18]:
from transformers import TFBertModel

In [19]:
import tensorflow as tf

In [20]:



# Define CF Model
class CFModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim):
        super(CFModel, self).__init__()
        self.user_embeddings = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embeddings = tf.keras.layers.Embedding(num_items, embedding_dim)
        self.fc1 = tf.keras.layers.Dense(embedding_dim)
        self.fc2 = tf.keras.layers.Dense(embedding_dim)
        self.relu = tf.keras.layers.ReLU()

    def call(self, user_ids, item_ids):
        user_embeds = self.user_embeddings(user_ids)
        item_embeds = self.item_embeddings(item_ids)
        user_embeds = self.relu(self.fc1(user_embeds))
        item_embeds = self.relu(self.fc2(item_embeds))
        return user_embeds, item_embeds

# Define CF HybridBERT4Rec model
class CFHybridBERT4Rec(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim, bert_model):
        super(CFHybridBERT4Rec, self).__init__()
        self.cf_model = CFModel(num_users, num_items, embedding_dim)
        self.bert_model = bert_model
        self.fc = tf.keras.layers.Dense(1)

    def call(self, user_ids, item_ids, input_ids, attention_mask):
        user_embeds, item_embeds = self.cf_model(user_ids, item_ids)
        bert_outputs = self.bert_model(input_ids, attention_mask=attention_mask)[1]  # Pooled output
        combined_embeds = tf.concat([user_embeds, item_embeds, bert_outputs], axis=1)
        logits = self.fc(combined_embeds)
        return tf.squeeze(logits)

In [21]:
# Define hyperparameters
learning_rate = 0.001
batch_size = 32
epochs = 10

# Define loss function and optimizer
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [22]:
# num_users = df_ratings['userId'].nunique()

In [23]:

# Create an instance of the CFHybridBERT4Rec model
num_users = df_ratings['userId'].nunique()
num_items = df_ratings['movieId'].nunique()
embedding_dim = 32
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
hybrid_model = CFHybridBERT4Rec(num_users+1, num_items, embedding_dim, bert_model)

# Training loop
num_batches = len(train_user_ids) // batch_size

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [24]:
num_batches

74

In [25]:
for epoch in range(epochs):
    epoch_loss = 0.0

    for batch in range(num_batches):
        start_idx = batch * batch_size
        end_idx = start_idx + batch_size

        batch_user_ids = train_user_ids[start_idx:end_idx]
        batch_item_ids = train_item_ids[start_idx:end_idx]
        batch_input_ids = train_input_ids[start_idx:end_idx]
        batch_attention_mask = train_attention_mask[start_idx:end_idx]
        batch_ratings = train_ratings[start_idx:end_idx]

        with tf.GradientTape() as tape:
            logits = hybrid_model(
                batch_user_ids, batch_item_ids, batch_input_ids, batch_attention_mask
            )
            loss_value = loss_fn(batch_ratings, logits)

        gradients = tape.gradient(loss_value, hybrid_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, hybrid_model.trainable_variables))

        epoch_loss += loss_value

        # Compute average loss for the epoch
        avg_loss = epoch_loss / num_batches
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

Epoch 1/10, Loss: 0.26743969321250916
Epoch 1/10, Loss: 0.3394688665866852
Epoch 1/10, Loss: 0.4371115267276764
Epoch 1/10, Loss: 0.4605918526649475
Epoch 1/10, Loss: 0.5465521812438965
Epoch 1/10, Loss: 0.5607895255088806
Epoch 1/10, Loss: 0.5848242044448853
Epoch 1/10, Loss: 0.661707878112793
Epoch 1/10, Loss: 0.6872411966323853
Epoch 1/10, Loss: 0.7011879086494446
Epoch 1/10, Loss: 0.7299047708511353
Epoch 1/10, Loss: 0.7595760226249695
Epoch 1/10, Loss: 0.7796232104301453
Epoch 1/10, Loss: 0.7930862307548523
Epoch 1/10, Loss: 0.8119447231292725
Epoch 1/10, Loss: 0.8244717121124268
Epoch 1/10, Loss: 0.8392448425292969
Epoch 1/10, Loss: 0.8555927872657776
Epoch 1/10, Loss: 0.8750709295272827
Epoch 1/10, Loss: 0.8987287282943726
Epoch 1/10, Loss: 0.9205418229103088
Epoch 1/10, Loss: 0.9355074167251587
Epoch 1/10, Loss: 0.9557492136955261
Epoch 1/10, Loss: 0.970703125
Epoch 1/10, Loss: 0.9793684482574463
Epoch 1/10, Loss: 0.9949675798416138
Epoch 1/10, Loss: 1.0086506605148315
Epoch 1/

Epoch 4/10, Loss: 0.006985940039157867
Epoch 4/10, Loss: 0.011425546370446682
Epoch 4/10, Loss: 0.01706700213253498
Epoch 4/10, Loss: 0.02666308358311653
Epoch 4/10, Loss: 0.04004381597042084
Epoch 4/10, Loss: 0.04519251361489296
Epoch 4/10, Loss: 0.05560368299484253
Epoch 4/10, Loss: 0.06267324835062027
Epoch 4/10, Loss: 0.0795220136642456
Epoch 4/10, Loss: 0.0855463445186615
Epoch 4/10, Loss: 0.0989343672990799
Epoch 4/10, Loss: 0.10822919756174088
Epoch 4/10, Loss: 0.11460883170366287
Epoch 4/10, Loss: 0.11926297843456268
Epoch 4/10, Loss: 0.1263536661863327
Epoch 4/10, Loss: 0.13357987999916077
Epoch 4/10, Loss: 0.13983219861984253
Epoch 4/10, Loss: 0.14628556370735168
Epoch 4/10, Loss: 0.15488360822200775
Epoch 4/10, Loss: 0.16641803085803986
Epoch 4/10, Loss: 0.17575794458389282
Epoch 4/10, Loss: 0.18405325710773468
Epoch 4/10, Loss: 0.19405801594257355
Epoch 4/10, Loss: 0.20168693363666534
Epoch 4/10, Loss: 0.20411410927772522
Epoch 4/10, Loss: 0.20984117686748505
Epoch 4/10, Lo

KeyboardInterrupt: 

In [26]:
# Evaluate the model
test_logits = hybrid_model(
    test_user_ids, test_item_ids, test_input_ids, test_attention_mask
)
test_loss = loss_fn(test_ratings, test_logits)
print(f"Test Loss: {test_loss}")

Test Loss: 1.1304930448532104


In [32]:
test_logits.numpy()

array([3.277096 , 2.7689426, 4.098003 , 4.3144093, 3.4339588, 2.9514067,
       3.6401398, 3.6433227, 3.511151 , 4.275472 , 3.4982069, 2.7214282,
       3.431699 , 3.1523678, 4.368536 , 2.6521423, 3.471103 , 3.1395957,
       3.0039124, 2.9170804, 3.715742 , 2.588795 , 4.757001 , 3.829666 ,
       2.8495452, 4.786401 , 4.6477737, 2.576473 , 3.1374204, 3.3316638,
       2.733906 , 3.8679535, 3.850675 , 3.2318513, 1.8278787, 3.404947 ,
       3.68436  , 4.1371117, 3.1728039, 2.4653406, 2.8977911, 3.82677  ,
       4.211941 , 3.1268196, 1.6614109, 3.5294545, 2.7360318, 2.7193382,
       3.4139638, 3.7442453, 3.141382 , 4.0983744, 4.1111374, 3.640196 ,
       3.435186 , 4.8202496, 3.3584359, 3.5360508, 2.9413884, 3.8973007,
       3.055059 , 3.497141 , 4.003553 , 2.7753232, 3.6783087, 3.6472235,
       3.0492504, 3.800766 , 2.7955573, 4.1497574, 1.8532273, 3.4746854,
       4.952982 , 3.3740656, 4.714817 , 2.6489935, 3.6558173, 3.0110912,
       2.5753028, 3.911787 , 2.172591 , 3.6603034, 