In [1]:
import os
import numpy as np
import tensorflow as tf

In [2]:
# Load the interaction dictionary stored as a pickled object inside a .npy file;
# `.item()` unwraps the 0-d object array back into the Python dict.
# NOTE: allow_pickle=True runs the pickle machinery on load, which can execute
# arbitrary code -- only load files from a trusted source.
users_dict = np.load('data/ml-1m/user_dict_new.npy', allow_pickle=True).item()

In [3]:
users_dict

{0: [(877, 0),
  (528, 0),
  (2460, 0),
  (1691, 0),
  (880, 0),
  (2046, 0),
  (490, 0),
  (491, 0),
  (2128, 0),
  (774, 0),
  (2034, 0),
  (1945, 0),
  (1452, 0),
  (2266, 0),
  (2040, 0),
  (1661, 0),
  (934, 0),
  (1679, 0),
  (43, 0),
  (814, 0),
  (582, 0),
  (2321, 0),
  (1364, 0),
  (611, 0),
  (763, 0),
  (125, 0),
  (0, 0),
  (1403, 0),
  (1404, 0),
  (214, 0),
  (767, 0),
  (768, 0),
  (1460, 0),
  (437, 0),
  (498, 0),
  (918, 0),
  (484, 1),
  (2273, 1),
  (696, 2),
  (945, 2),
  (698, 2),
  (708, 2),
  (1243, 2),
  (1641, 2),
  (1133, 2),
  (2009, 2),
  (1950, 2)],
 1: [(877, 0),
  (2266, 0),
  (1661, 0),
  (1404, 0),
  (1460, 0),
  (918, 0),
  (2239, 0),
  (1584, 0),
  (819, 0),
  (2126, 0),
  (890, 0),
  (1281, 0),
  (1224, 0),
  (2570, 0),
  (2098, 0),
  (1527, 0),
  (1168, 0),
  (926, 0),
  (2294, 0),
  (1119, 0),
  (950, 0),
  (872, 0),
  (95, 0),
  (1630, 0),
  (1307, 0),
  (2495, 0),
  (485, 0),
  (2619, 0),
  (1336, 0),
  (2078, 0),
  (903, 0),
  (424, 0),
  (361

In [4]:
# Distinct user ids are the dict keys; distinct item ids are gathered from every
# (item_id, rating) entry of every user.
user_ids = set(users_dict)
item_ids = {
    item_id
    for interactions in users_dict.values()
    for item_id, _rating in interactions
}
print('num_user:', len(user_ids))
print('num_item:', len(item_ids))

num_user: 6039
num_item: 2819


In [5]:
def get_user_posItem(users_dict):
    """Flatten per-user interaction lists into (user, item) pairs.

    Every (item_id, rating) entry is kept; the rating value is not used to
    filter anything.

    Args:
        users_dict: {user_id: [(item_id, rating), ...]}

    Returns:
        user_posItem_pairs: ndarray built from [(user_id, item_id), ...], in
            the iteration order of ``users_dict``.
        user_posItems_dict: {user_id: [item_id, ...], ...} with one key per
            user of ``users_dict`` (users without entries map to []).
    """
    user_posItems_dict = {
        user_id: [item_id for item_id, _rating in interactions]
        for user_id, interactions in users_dict.items()
    }
    flat_pairs = [
        (user_id, item_id)
        for user_id, item_ids in user_posItems_dict.items()
        for item_id in item_ids
    ]
    return np.array(flat_pairs), user_posItems_dict

In [6]:
u_i_pairs, u_i_dict = get_user_posItem(users_dict)

In [7]:
u_i_pairs[:10]

array([[   0,  877],
       [   0,  528],
       [   0, 2460],
       [   0, 1691],
       [   0,  880],
       [   0, 2046],
       [   0,  490],
       [   0,  491],
       [   0, 2128],
       [   0,  774]])

In [8]:
u_i_dict[0]

[877,
 528,
 2460,
 1691,
 880,
 2046,
 490,
 491,
 2128,
 774,
 2034,
 1945,
 1452,
 2266,
 2040,
 1661,
 934,
 1679,
 43,
 814,
 582,
 2321,
 1364,
 611,
 763,
 125,
 0,
 1403,
 1404,
 214,
 767,
 768,
 1460,
 437,
 498,
 918,
 484,
 2273,
 696,
 945,
 698,
 708,
 1243,
 1641,
 1133,
 2009,
 1950]

In [9]:
print(len(np.unique(u_i_pairs[:, 0])), len(np.unique(u_i_pairs[:, 1])))

6039 2819


In [10]:
a = np.zeros((3,3))
a[0] = (1,2,3)
a

array([[1., 2., 3.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [11]:
def get_dataloader(user_posItem_pairs, user_posItems_dict, batch_size, negative_ratio=0.5,
                   num_user=None, num_item=None):
    """Endlessly yield shuffled batches of observed and sampled-negative pairs.

    Each batch holds ``batch_size - int(batch_size * negative_ratio)`` observed
    (user, item) pairs labelled 1, drawn with replacement from
    ``user_posItem_pairs``, and the remaining rows are (user, item) pairs
    labelled 0 where the item is NOT in that user's positive list.

    Args:
        user_posItem_pairs: ndarray([(user_id, item_id), ...])
        user_posItems_dict: {user_id: [item_id, ...], ...}
        batch_size: total number of rows per batch.
        negative_ratio: float, fraction of each batch filled with negatives.
        num_user: optional int. When given, negative users are drawn from
            ``range(num_user)``; users missing from ``user_posItems_dict`` are
            treated as having no positives. When None (default), negative users
            are drawn from the keys of ``user_posItems_dict``.
        num_item: optional int. Negative items are drawn from
            ``range(num_item)``. When None (default) it is inferred as
            ``max(item_id) + 1`` over ``user_posItem_pairs``, i.e. item ids are
            assumed to be non-negative integers.

    Yields:
        (user_id_batch, item_id_batch, label_batch): two int arrays and one
        float array, each of length ``batch_size``.

    Raises:
        ValueError: if ``num_item`` must be inferred from empty pairs, or if
            negatives are requested but no user has any item left to sample.
            Because this is a generator, the error surfaces on the first
            ``next()`` call.
    """
    positive_batch_size = batch_size - int(batch_size * negative_ratio)
    negative_batch_size = batch_size - positive_batch_size

    if num_item is None:
        if len(user_posItem_pairs) == 0:
            raise ValueError('num_item cannot be inferred from empty user_posItem_pairs')
        num_item = int(np.max(user_posItem_pairs[:, 1])) + 1

    if num_user is None:
        candidate_users = np.array(list(user_posItems_dict.keys()))
    else:
        candidate_users = np.arange(num_user)

    # Sets make the rejection test below O(1) instead of scanning a list.
    positives = {user: set(items) for user, items in user_posItems_dict.items()}
    no_items = frozenset()

    def can_sample_negative(user):
        seen = positives.get(user, no_items)
        # Fewer distinct positives than items guarantees a free item; only fall
        # back to the exact (slower) scan in the rare saturated case.
        return len(seen) < num_item or any(item not in seen for item in range(num_item))

    # Drop users for whom rejection sampling could never terminate.
    eligible_users = np.array(
        [user for user in candidate_users.tolist() if can_sample_negative(user)],
        dtype=candidate_users.dtype,
    )
    if negative_batch_size > 0 and len(eligible_users) == 0:
        raise ValueError('no user has an un-interacted item to use as a negative sample')

    # Reused buffer; the yielded arrays are copies produced by astype / slicing
    # after the shuffle, so consumers never see later overwrites of the id columns.
    batch = np.zeros((batch_size, 3))

    while True:
        idx = np.random.choice(len(user_posItem_pairs), positive_batch_size)
        batch[:positive_batch_size, :2] = user_posItem_pairs[idx]
        batch[:positive_batch_size, 2] = 1

        if negative_batch_size > 0:
            neg_users = np.random.choice(eligible_users, size=negative_batch_size)
        else:
            neg_users = ()
        for offset, user in enumerate(neg_users):
            seen = positives.get(user, no_items)
            neg_item = np.random.randint(num_item)
            while neg_item in seen:
                neg_item = np.random.randint(num_item)
            batch[positive_batch_size + offset] = (user, neg_item, 0)

        np.random.shuffle(batch)
        yield batch[:, 0].astype(int), batch[:, 1].astype(int), batch[:, 2].copy()

In [12]:
# Paths are resolved against the directory the kernel was started in, so the
# notebook must be run from the project root that contains `data/ml-1m/`.
ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, 'data/ml-1m/')

In [13]:

class UserMovieEmbedding(tf.keras.Model):
    """Score (user, movie) pairs as sigmoid(Dense(dot(movie_emb, user_emb))).

    The user side is a learned embedding table. The movie side is either a
    learned embedding table (when ``modality`` is empty/None) or a projection
    of precomputed per-movie feature arrays loaded from
    ``<cwd>/data/ml-1m/<name>_feat.npy``.

    Args:
        len_users: number of rows of the user embedding table.
        len_movies: number of rows of the movie embedding table (only used
            when no modality is given).
        embedding_dim: size of the user and movie embeddings.
        modality: names of the feature sets to use; 'video' is read from
            ``image_feat.npy``. A falsy value disables multimodal features.
        fusion: 'early' combines the raw features and projects the result with
            one Dense layer; 'late' projects every modality with its own Dense
            layer and combines the projections.
        aggregation: 'concat' or 'mean', how per-modality tensors are combined.
            With 'late' + 'concat' the per-modality widths are chosen so that
            they sum to ``embedding_dim``.

    Raises:
        ValueError: if ``fusion`` or ``aggregation`` is unsupported while
            ``modality`` is set.
    """

    def __init__(self, len_users, len_movies, embedding_dim, modality=('video', 'audio', 'text'), fusion='early', aggregation='concat'):
        super(UserMovieEmbedding, self).__init__()
        self.modality = modality
        self.fusion = fusion
        self.aggregation = aggregation

        # input: (user, movie)
        self.m_u_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))

        # user embedding
        self.u_embedding = tf.keras.layers.Embedding(name='user_embedding', input_dim=len_users, output_dim=embedding_dim)

        # item embedding
        if not modality:
            self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)
        else:
            # Fail fast with a readable message instead of a NameError /
            # UnboundLocalError later on.
            if fusion not in ('early', 'late'):
                raise ValueError(f"fusion must be 'early' or 'late', got {fusion!r}")
            if aggregation not in ('concat', 'mean'):
                raise ValueError(f"aggregation must be 'concat' or 'mean', got {aggregation!r}")

            # load multimodal features (directory resolved once, not per modality)
            data_dir = os.path.join(os.getcwd(), 'data/ml-1m/')
            for mod in modality:
                mod_name = 'image' if mod == 'video' else mod  # rename due to file name
                setattr(self, f'{mod}_feat', np.load(os.path.join(data_dir, f'{mod_name}_feat.npy')))

            if fusion == 'early':
                # NOTE(review): 'early' + 'mean' stacks the raw feature arrays, which
                # presumably requires every modality to have the same width -- confirm.
                self.mm_fc = tf.keras.layers.Dense(embedding_dim, name='mm_fc')
            else:  # late fusion: one projection per modality
                if aggregation == 'concat':
                    embedding_dims = self._split_evenly(embedding_dim, len(modality))
                else:  # mean
                    embedding_dims = [embedding_dim] * len(modality)

                for mod, mod_dim in zip(modality, embedding_dims):
                    setattr(self, f'{mod}_fc', tf.keras.layers.Dense(mod_dim, name=f'{mod}_fc'))

        # dot product
        self.m_u_merge = tf.keras.layers.Dot(name='movie_user_dot', normalize=False, axes=1)
        # output
        self.m_u_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    @staticmethod
    def _split_evenly(n, parts):
        """Split ``n`` into ``parts`` integers that differ by at most one and sum to ``n``."""
        q, r = divmod(n, parts)
        return [q + 1] * r + [q] * (parts - r)

    def get_embedding(self, x):
        """Return ``(user_embedding, movie_embedding)`` for a batch.

        Args:
            x: indexable pair where ``x[0]`` holds user ids and ``x[1]`` holds
                movie ids.
        """
        x = self.m_u_input(x)
        # Cast once up front; the ids are used as lookup / gather indices.
        user_idx = tf.cast(x[0], tf.int32)
        movie_idx = tf.cast(x[1], tf.int32)

        uemb = self.u_embedding(user_idx)

        if not self.modality:
            memb = self.m_embedding(movie_idx)
        else:
            mm_emb = []
            for mod in self.modality:
                mm_feat = tf.gather(getattr(self, f'{mod}_feat'), movie_idx)

                if self.fusion == 'early':
                    mm_emb.append(mm_feat)
                elif self.fusion == 'late':
                    mm_emb.append(getattr(self, f'{mod}_fc')(mm_feat))

            if self.aggregation == 'concat':
                memb = tf.concat(mm_emb, axis=1)
            elif self.aggregation == 'mean':
                memb = tf.reduce_mean(tf.stack(mm_emb), axis=0)

            if self.fusion == 'early':
                memb = self.mm_fc(memb)
        return uemb, memb

    def call(self, x):
        """Return the sigmoid score of every (user, movie) pair in ``x``."""
        uemb, memb = self.get_embedding(x)
        m_u = self.m_u_merge([memb, uemb])
        return self.m_u_fc(m_u)

In [None]:
class UserMovieEmbedding(tf.keras.Model):
    """Score (user, movie) pairs as sigmoid(Dense(dot(movie_emb, user_emb))).

    The user side is a learned embedding table. The movie side is either a
    learned embedding table (when ``modality`` is empty/None) or a projection
    of precomputed per-movie feature arrays loaded from
    ``DATA_DIR/<modality>_feat.npy`` (``DATA_DIR`` is a notebook-level global).

    Args:
        len_users: number of rows of the user embedding table.
        len_movies: number of rows of the movie embedding table (only used
            when no modality is given).
        embedding_dim: size of the user and movie embeddings.
        modality: names of the feature files to use. A falsy value (including
            None) disables multimodal features.
        fusion: 'early' combines the raw features and projects the result with
            one Dense layer; 'late' projects every modality with its own Dense
            layer and combines the projections.
        aggregation: 'concat' or 'mean', how per-modality tensors are combined.
            With 'late' + 'concat' the per-modality widths are chosen so that
            they sum to ``embedding_dim``.

    Raises:
        ValueError: if ``fusion`` or ``aggregation`` is unsupported while
            ``modality`` is set.
    """

    def __init__(self, len_users, len_movies, embedding_dim, modality=('image', 'audio', 'text'), fusion='early', aggregation='concat'):
        super(UserMovieEmbedding, self).__init__()
        self.modality = modality
        self.fusion = fusion
        self.aggregation = aggregation

        # input: (user, movie); kept as an attribute, not used by `call`
        self.m_u_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))

        # embedding
        self.u_embedding = tf.keras.layers.Embedding(name='user_embedding', input_dim=len_users, output_dim=embedding_dim)

        if not modality:
            self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)
        else:
            # Fail fast with a readable message instead of a NameError /
            # UnboundLocalError later on.
            if fusion not in ('early', 'late'):
                raise ValueError(f"fusion must be 'early' or 'late', got {fusion!r}")
            if aggregation not in ('concat', 'mean'):
                raise ValueError(f"aggregation must be 'concat' or 'mean', got {aggregation!r}")

            # load multimodal features (only here, so modality=None no longer
            # crashes while iterating)
            for mod in modality:
                setattr(self, f'{mod}_feat', np.load(os.path.join(DATA_DIR, f'{mod}_feat.npy')))

            if fusion == 'early':
                # NOTE(review): 'early' + 'mean' stacks the raw feature arrays, which
                # presumably requires every modality to have the same width -- confirm.
                self.mm_fc = tf.keras.layers.Dense(embedding_dim, name='mm_fc')
            else:  # late fusion: one projection per modality
                if aggregation == 'concat':
                    # widths differ by at most one and sum to embedding_dim
                    q, r = divmod(embedding_dim, len(modality))
                    embedding_dims = [q + 1] * r + [q] * (len(modality) - r)
                else:  # mean
                    embedding_dims = [embedding_dim] * len(modality)

                for mod, mod_dim in zip(modality, embedding_dims):
                    setattr(self, f'{mod}_fc', tf.keras.layers.Dense(mod_dim, name=f'{mod}_fc'))

        # dot product
        self.m_u_merge = tf.keras.layers.Dot(name='movie_user_dot', normalize=False, axes=1)
        # output
        self.m_u_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Return the sigmoid score of every (user, movie) pair in ``x``.

        Args:
            x: either a 2-D tensor whose columns are (user id, movie id), or a
                list/tuple ``(user_ids, movie_ids)`` of two 1-D id batches.
        """
        # Split input into user and movie indices
        if isinstance(x, (list, tuple)):
            user_indices, movie_indices = x
        else:
            user_indices = x[:, 0]
            movie_indices = x[:, 1]
        # Ids are used as lookup / gather indices, which must be integers.
        user_indices = tf.cast(user_indices, tf.int32)
        movie_indices = tf.cast(movie_indices, tf.int32)

        # User embedding lookup
        uemb = self.u_embedding(user_indices)

        if not self.modality:
            memb = self.m_embedding(movie_indices)
        else:
            mm_emb = []
            for mod in self.modality:
                mm_feat = getattr(self, f'{mod}_feat')
                # Use tf.gather for fancy indexing
                selected_mm_feat = tf.gather(mm_feat, movie_indices)
                if self.fusion == 'early':
                    mm_emb.append(selected_mm_feat)
                elif self.fusion == 'late':
                    mm_emb.append(getattr(self, f'{mod}_fc')(selected_mm_feat))

            if self.aggregation == 'concat':
                memb = tf.concat(mm_emb, axis=1)
            elif self.aggregation == 'mean':
                memb = tf.reduce_mean(tf.stack(mm_emb), axis=0)

            if self.fusion == 'early':
                memb = self.mm_fc(memb)

        # Compute dot product and output
        m_u = self.m_u_merge([memb, uemb])
        return self.m_u_fc(m_u)

In [128]:
class UserMovieEmbedding(tf.keras.Model):
    """Score (user, movie) pairs as sigmoid(Dense(dot(movie_emb, user_emb))).

    The user side is a learned embedding table. The movie side is either a
    learned embedding table (when ``modality`` is empty/None) or a projection
    of precomputed per-movie feature arrays loaded from
    ``ROOT_DIR/data/ml-1m/<modality>_feat.npy`` (``ROOT_DIR`` is a
    notebook-level global).

    Args:
        len_users: number of rows of the user embedding table.
        len_movies: number of rows of the movie embedding table (only used
            when no modality is given).
        embedding_dim: size of the user and movie embeddings.
        modality: names of the feature files to use. A falsy value (including
            None) disables multimodal features.
        fusion: 'early' combines the raw features and projects the result with
            one Dense layer; 'late' projects every modality with its own Dense
            layer and combines the projections.
        aggregation: 'concat' or 'mean', how per-modality tensors are combined.
            With 'late' + 'concat' the per-modality widths are chosen so that
            they sum to ``embedding_dim``.

    Raises:
        ValueError: if ``fusion`` or ``aggregation`` is unsupported while
            ``modality`` is set.
    """

    def __init__(self, len_users, len_movies, embedding_dim, modality=('image', 'audio', 'text'), fusion='early', aggregation='concat'):
        super(UserMovieEmbedding, self).__init__()
        self.modality = modality
        self.fusion = fusion
        self.aggregation = aggregation

        # input: (user, movie)
        self.m_u_input = tf.keras.layers.InputLayer(name='input_layer', input_shape=(2,))

        # embedding
        self.u_embedding = tf.keras.layers.Embedding(name='user_embedding', input_dim=len_users, output_dim=embedding_dim)

        if not modality:
            self.m_embedding = tf.keras.layers.Embedding(name='movie_embedding', input_dim=len_movies, output_dim=embedding_dim)

        else:
            # Fail fast with a readable message instead of a NameError /
            # UnboundLocalError later on.
            if fusion not in ('early', 'late'):
                raise ValueError(f"fusion must be 'early' or 'late', got {fusion!r}")
            if aggregation not in ('concat', 'mean'):
                raise ValueError(f"aggregation must be 'concat' or 'mean', got {aggregation!r}")

            # load multimodal features (only here, so modality=None no longer
            # crashes while iterating)
            mm_dir = os.path.join(ROOT_DIR, 'data/ml-1m')
            for mod in modality:
                setattr(self, f'{mod}_feat', np.load(os.path.join(mm_dir, f'{mod}_feat.npy')))

            if fusion == 'early':
                # NOTE(review): 'early' + 'mean' stacks the raw feature arrays, which
                # presumably requires every modality to have the same width -- confirm.
                self.mm_fc = tf.keras.layers.Dense(embedding_dim, name='mm_fc')
            else:  # late fusion: one projection per modality
                if aggregation == 'concat':
                    # widths differ by at most one and sum to embedding_dim
                    q, r = divmod(embedding_dim, len(modality))
                    embedding_dims = [q + 1] * r + [q] * (len(modality) - r)
                else:  # mean
                    embedding_dims = [embedding_dim] * len(modality)

                for mod, mod_dim in zip(modality, embedding_dims):
                    setattr(self, f'{mod}_fc', tf.keras.layers.Dense(mod_dim, name=f'{mod}_fc'))

        # dot product
        self.m_u_merge = tf.keras.layers.Dot(name='movie_user_dot', normalize=False, axes=1)
        # output
        self.m_u_fc = tf.keras.layers.Dense(1, activation='sigmoid')

    def get_embedding(self, x):
        """Return ``(user_embedding, movie_embedding)`` for a batch.

        Exposed separately from ``call`` so the notebook can inspect the
        embeddings directly (``model.get_embedding([u_batch, m_batch])``).

        Args:
            x: indexable pair where ``x[0]`` holds user ids and ``x[1]`` holds
                movie ids.
        """
        x = self.m_u_input(x)
        # Ids are used as lookup / gather indices, which must be integers.
        user_idx = tf.cast(x[0], tf.int32)
        movie_idx = tf.cast(x[1], tf.int32)

        uemb = self.u_embedding(user_idx)

        if not self.modality:
            memb = self.m_embedding(movie_idx)
        else:
            mm_emb = []
            for mod in self.modality:
                mm_feat = tf.gather(getattr(self, f'{mod}_feat'), movie_idx)
                if self.fusion == 'early':
                    mm_emb.append(mm_feat)
                elif self.fusion == 'late':
                    mm_emb.append(getattr(self, f'{mod}_fc')(mm_feat))

            if self.aggregation == 'concat':
                memb = tf.concat(mm_emb, axis=1)
            elif self.aggregation == 'mean':
                memb = tf.reduce_mean(tf.stack(mm_emb), axis=0)

            if self.fusion == 'early':
                memb = self.mm_fc(memb)
        return uemb, memb

    def call(self, x):
        """Return the sigmoid score of every (user, movie) pair in ``x``."""
        uemb, memb = self.get_embedding(x)
        m_u = self.m_u_merge([memb, uemb])
        return self.m_u_fc(m_u)

In [14]:
# Smoke-test objects: a batch generator (batch size 32, half negatives) and a
# model with 64-d embeddings using late fusion with concatenated modalities.
dl = get_dataloader(u_i_pairs, u_i_dict, 32, 0.5)
model = UserMovieEmbedding(len(user_ids), len(item_ids), 64, modality=('image', 'audio', 'text'), fusion='late', aggregation='concat')

2024-06-11 21:09:11.505974: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-11 21:09:11.526393: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-11 21:09:11.526554: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-11 21:09:11.526946: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the 

In [15]:
u_batch, m_batch, u_m_label_batch = next(dl)

In [17]:
m_batch.shape

(32,)

In [19]:
uemb, memb = model.get_embedding([u_batch, m_batch])

In [20]:
uemb.shape

TensorShape([32, 64])

In [21]:
memb.shape

TensorShape([32, 64])

In [None]:
model([u_batch, m_batch])

In [None]:
model.summary()

In [None]:
# Training schedule: the training loop uses a batch size of
# INIT_USER_BATCH_SIZE * (epoch + 1), capped at FINAL_USER_BATCH_SIZE.
MAX_EPOCH = 150
INIT_USER_BATCH_SIZE = 64
FINAL_USER_BATCH_SIZE = 1024

In [None]:
# Model to train (50-d embeddings, late fusion, concatenated modalities), with
# an Adam optimizer at its default settings and binary cross-entropy on the
# model's sigmoid output.
u_m_model = UserMovieEmbedding(len(user_ids), len(item_ids), 50, modality=('image', 'audio', 'text'), fusion='late', aggregation='concat')
optimizer = tf.keras.optimizers.Adam()
bce = tf.keras.losses.BinaryCrossentropy()

In [None]:
# Running aggregates updated by every call to `u_m_train_step`.
u_m_train_loss = tf.keras.metrics.Mean(name='train_loss')
u_m_train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')


@tf.function
def u_m_train_step(u_m_inputs, labels):
    """Run one optimisation step of `u_m_model` and update the running metrics.

    Args:
        u_m_inputs: model input, passed straight to `u_m_model`.
        labels: target labels compared against the model's predictions.
    """
    with tf.GradientTape() as grad_tape:
        # training=True only matters for layers that behave differently
        # between training and inference (e.g. Dropout).
        batch_predictions = u_m_model(u_m_inputs, training=True)
        batch_loss = bce(labels, batch_predictions)

    # Read the variable list only after the forward pass above has run.
    model_variables = u_m_model.trainable_variables
    grads = grad_tape.gradient(batch_loss, model_variables)
    optimizer.apply_gradients(zip(grads, model_variables))

    u_m_train_loss(batch_loss)
    u_m_train_accuracy(labels, batch_predictions)

In [None]:
# for log: one entry per epoch holding that epoch's mean training loss
u_m_losses = []

for epoch in range(MAX_EPOCH):
    # The metric objects keep accumulating across calls; clear them at the
    # start of every epoch so the printed values and the logged loss describe
    # the current epoch only instead of a running mean over all epochs so far.
    u_m_train_loss.reset_states()
    u_m_train_accuracy.reset_states()

    # Batch size grows linearly with the epoch index until it hits the cap.
    batch_size = min(INIT_USER_BATCH_SIZE * (epoch + 1), FINAL_USER_BATCH_SIZE)
    u_m_generator = get_dataloader(u_i_pairs, u_i_dict, batch_size, 0.5)

    for step in range(len(u_i_pairs)//batch_size):
        # embedding layer update
        u_batch, m_batch, u_m_label_batch = next(u_m_generator)
        u_m_train_step([u_batch, m_batch], u_m_label_batch)

        # end='\r' rewrites the same console line instead of flooding the output
        print(f'{epoch} epoch, Batch size : {batch_size}, {step} steps, Loss: {u_m_train_loss.result():0.4f}, Accuracy: {u_m_train_accuracy.result() * 100:0.1f}', end='\r')

    u_m_losses.append(u_m_train_loss.result())