In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the cleaned grocery transactions
df = pd.read_csv('../data/Groceries_cleaned_dataset2.csv')

# Distinct members and products, in order of first appearance
user_ids = df['Member_number'].unique()
item_ids = df['itemDescription'].unique()

# Raw identifier -> contiguous integer index (used as embedding row)
user2idx = {raw_user: idx for idx, raw_user in enumerate(user_ids)}
item2idx = {raw_item: idx for idx, raw_item in enumerate(item_ids)}

# Attach the integer indices to every transaction row
df['user_idx'] = df['Member_number'].map(user2idx)
df['item_idx'] = df['itemDescription'].map(item2idx)

# Each distinct (user, item) pair counts once as a positive interaction
positive_interactions = (
    df[['user_idx', 'item_idx']]
    .drop_duplicates()
)

# Negative sampling function
def generate_negative_samples(pos_df, num_users, num_items, num_neg=1):
    """Sample unobserved (user, item) pairs to serve as negative examples.

    For every distinct positive pair in ``pos_df``, draws ``num_neg`` item
    indices from ``[0, num_items)`` with ``np.random.randint``, re-drawing
    until the (user, item) pair is not among the positives.

    Users that already have a positive interaction with every item index in
    ``[0, num_items)`` are skipped: no negative exists for them, and the
    re-draw loop would otherwise never terminate.

    Args:
        pos_df: DataFrame with ``user_idx`` and ``item_idx`` columns.
        num_users: Not used by the sampler; kept so existing callers
            keep working.
        num_items: Size of the item index range to sample from.
        num_neg: Number of negatives drawn per distinct positive pair.

    Returns:
        DataFrame with columns ``user_idx``, ``item_idx`` and ``label``
        (always 0). The same negative pair may appear more than once.
    """
    user_item_set = set(zip(pos_df['user_idx'], pos_df['item_idx']))

    # Group observed items per user so fully-covered users can be detected.
    items_by_user = {}
    for u, i in user_item_set:
        items_by_user.setdefault(u, set()).add(i)

    # The cheap length check short-circuits the O(num_items) superset test.
    saturated_users = {
        u for u, seen in items_by_user.items()
        if len(seen) >= num_items and seen.issuperset(range(num_items))
    }

    negatives = []
    for u, _pos_item in user_item_set:
        if u in saturated_users:
            # Every candidate item is a positive; rejection sampling
            # could never succeed for this user.
            continue
        for _ in range(num_neg):
            j = np.random.randint(num_items)
            while (u, j) in user_item_set:
                j = np.random.randint(num_items)
            negatives.append([u, j])

    neg_df = pd.DataFrame(negatives, columns=['user_idx', 'item_idx'])
    neg_df['label'] = 0
    return neg_df

# Sizes of the user and item index spaces
num_users, num_items = len(user_ids), len(item_ids)

# Observed pairs form the positive class
positive_interactions['label'] = 1

# Draw one unobserved item per observed pair as the negative class
negative_interactions = generate_negative_samples(
    positive_interactions,
    num_users,
    num_items,
    num_neg=1,
)

# Stack both classes, then randomise row order and renumber the index
data = pd.concat([positive_interactions, negative_interactions])
data = data.sample(frac=1)
data = data.reset_index(drop=True)

# Hold out 20% of the rows for evaluation (fixed seed for the split itself)
train, test = train_test_split(data, random_state=42, test_size=0.2)

# Prepare TensorFlow datasets
def df_to_dataset(df, batch_size=256, shuffle=True, shuffle_buffer=None):
    """Build a batched ``tf.data.Dataset`` of (features, label) from ``df``.

    Features are a dict with keys ``"user_idx"`` and ``"item_idx"``; the
    target is the ``label`` column.

    Rows are shuffled *before* batching so that batch composition changes
    between passes over the data. Batching first and shuffling afterwards
    would only permute a fixed set of batches.

    Args:
        df: DataFrame with ``user_idx``, ``item_idx`` and ``label`` columns.
        batch_size: Number of rows per batch.
        shuffle: Whether to shuffle rows; pass ``False`` for evaluation
            data where ordering is irrelevant.
        shuffle_buffer: Shuffle buffer size in rows. ``None`` uses the
            number of rows in ``df``, i.e. a full shuffle.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
    """
    features = {
        "user_idx": df['user_idx'].values,
        "item_idx": df['item_idx'].values,
    }
    dataset = tf.data.Dataset.from_tensor_slices((features, df['label'].values))

    if shuffle:
        # The buffer must be positive even for an empty frame.
        buffer_size = shuffle_buffer if shuffle_buffer is not None else max(len(df), 1)
        dataset = dataset.shuffle(buffer_size)

    return dataset.batch(batch_size)

# One dataset per split, built in train-then-test order
train_ds, test_ds = (df_to_dataset(split) for split in (train, test))

# Width of the user/item embedding vectors
embedding_dim = 32

# Build model
class MFModel(tf.keras.Model):
    """Matrix-factorisation scorer.

    Looks up one embedding per user index and one per item index, takes
    their dot product, and squashes it through a sigmoid.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        # User table is created first, then the item table.
        self.user_embedding = self._make_embedding(num_users, embedding_dim)
        self.item_embedding = self._make_embedding(num_items, embedding_dim)

    @staticmethod
    def _make_embedding(vocab_size, embedding_dim):
        """Create an L2-regularised, he_normal-initialised embedding table."""
        return tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            embeddings_initializer='he_normal',
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6),
        )

    def call(self, inputs):
        """Return sigmoid(user . item) for the index tensors in ``inputs``.

        ``inputs`` is a mapping with ``'user_idx'`` and ``'item_idx'`` entries.
        """
        users = self.user_embedding(inputs['user_idx'])
        items = self.item_embedding(inputs['item_idx'])
        # Dot product: elementwise product summed over axis 1
        scores = tf.reduce_sum(users * items, axis=1)
        # Sigmoid maps the raw score into (0, 1)
        return tf.nn.sigmoid(scores)

# Instantiate the factorisation model with the sizes derived from the data
model = MFModel(num_users, num_items, embedding_dim)

# The model's call() already applies a sigmoid, so binary cross-entropy is
# used on probabilities; Adam is the optimiser.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

# Fit for 10 epochs, validating on the held-out split
model.fit(train_ds, validation_data=test_ds, epochs=10)

# Scoring individual user-item pairs after training:
# preds = model({"user_idx": np.array([user_index]), "item_idx": np.array([item_index])})



Epoch 1/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 4s 9ms/step - binary_accuracy: 0.5042 - loss: 0.6933 - val_binary_accuracy: 0.5159 - val_loss: 0.6929
Epoch 2/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - binary_accuracy: 0.7921 - loss: 0.6853 - val_binary_accuracy: 0.6384 - val_loss: 0.6862
Epoch 3/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - binary_accuracy: 0.8831 - loss: 0.6596 - val_binary_accuracy: 0.7311 - val_loss: 0.6484
Epoch 4/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 2s 7ms/step - binary_accuracy: 0.8527 - loss: 0.5880 - val_binary_accuracy: 0.7467 - val_loss: 0.5864
Epoch 5/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 2s 8ms/step - binary_accuracy: 0.8348 - loss: 0.5021 - val_binary_accuracy: 0.7496 - val_loss: 0.5487
Epoch 6/10
218/218 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step - binary_accuracy: 0.8424 - loss: 

<keras.src.callbacks.history.History at 0x1da1008c400>