In [1]:
import os

import numpy as np

import os
import sys
sys.path.append('..')
#from layers import LSTMLayer, WaveNet, TimeDistributedDense, TemporalConvolution

In [2]:
import pandas as pd
from pandas import DataFrame

In [3]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Colab-only: mount Google Drive so the dataset archive and checkpoint
# directory under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Sanity check: list the Drive folder that the unzip cell below reads data.zip from.
!ls -a /content/drive/MyDrive/recsys_data/rnn_product_data

In [7]:
!mkdir data

In [None]:
!unzip /content/drive/MyDrive/recsys_data/rnn_product_data/data.zip -d data

In [None]:
# Detect the TPU runtime, connect to it and build a TPUStrategy.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print(f'Running on a TPU w/{tpu.num_accelerators()["TPU"]} cores')
except ValueError as err:
  # Raise RuntimeError rather than bare BaseException: BaseException escapes
  # `except Exception` handlers and is reserved for things like KeyboardInterrupt.
  # `from err` keeps the original ValueError in the traceback.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

In [16]:
class TFDataReader2:
    """Builds ``tf.data`` pipelines from per-column ``.npy`` files in one directory.

    Every name in ``feature_cols`` and ``label_cols`` is loaded from
    ``<data_dir>/<name>.npy``. The first 90% of the rows form the training
    split, the remaining rows the validation split, and all rows the test split.

    Args:
        data_dir: Directory containing one ``.npy`` file per column.
        max_rows: Keep only the first ``max_rows`` rows of every array
            (default 100000, the previously hard-coded cap). Pass ``None``
            to use every row.

    Raises:
        ValueError: If the feature arrays do not all have the same number of rows.
    """

    def __init__(self, data_dir, max_rows=100000):
        # Define feature columns and label columns
        self.feature_cols = [
            'user_id', 'product_id', 'aisle_id', 'department_id',
            'is_ordered_history', 'index_in_order_history',
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_size_history',
            'reorder_size_history', 'order_number_history',
            'history_length', 'product_name', 'product_name_length',
            'product_embedding'
        ]
        # Not read anywhere in this class; kept so existing users of the attribute still work.
        self.expand_cols = ['user_id', 'product_id', 'aisle_id', 'department_id',
                            'history_length', 'product_name_length', 'label']
        self.label_cols = ['label']

        # Memory-map each array and keep at most `max_rows` leading rows.
        self.data = {}
        for col in self.feature_cols + self.label_cols:
            path = os.path.join(data_dir, f'{col}.npy')
            self.data[col] = np.load(path, mmap_mode='r')[:max_rows]

        # All feature columns are indexed with the same row indices, so they
        # must agree on the number of rows.
        row_counts = {col: len(self.data[col]) for col in self.feature_cols}
        if len(set(row_counts.values())) > 1:
            raise ValueError(
                f'Feature arrays have inconsistent row counts: {row_counts}')

        # Create train/val split: first 90% of rows train, remainder validation.
        total_size = len(self.data[self.feature_cols[0]])
        train_size = int(0.9 * total_size)

        self.train_indices = np.arange(train_size)
        self.val_indices = np.arange(train_size, total_size)
        self.all_indices = np.arange(total_size)

    def _process_features(self, original_features, is_test):
        """Turn one un-batched example into ``(features, targets)``.

        The four order-context histories are rolled one step to the left along
        axis 0, an ``is_none`` flag (1 when ``product_id == 0``) is added, and
        ``history_length`` is reduced by one unless ``is_test`` is set. The
        target ``in_next_order`` is ``is_ordered_history`` gathered at index
        ``history_length - 1`` (using the adjusted length) and cast to float32.

        NOTE(review): the gather index is negative when the adjusted
        history_length is 0 — presumably the data never contains such rows;
        verify against the preprocessing step.
        """
        rolled_cols = (
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_number_history',
        )

        # Start from a copy of the original features, then overwrite/add.
        features = dict(original_features)
        for col in rolled_cols:
            features[col] = tf.roll(original_features[col], -1, axis=0)
        features['is_none'] = tf.cast(
            tf.equal(original_features['product_id'],
                     tf.constant(0, dtype=tf.int32)),
            tf.int32)

        # Adjust history length for non-test data
        history_length = original_features['history_length']
        if not is_test:
            history_length = history_length - 1
        features['history_length'] = history_length

        target = tf.cast(
            tf.gather(features['is_ordered_history'], history_length - 1),
            dtype=tf.float32)
        return features, {'in_next_order': target}

    def _create_dataset(self, indices, shuffle=True, is_test=False):
        """Build an un-batched dataset of processed examples for the given rows."""
        # Every feature column is cast to int32.
        # NOTE(review): this also applies to 'product_embedding'; if that array
        # holds floats they are truncated here — confirm this is intended.
        features_dict = {col: tf.cast(
            self.data[col][indices], tf.int32) for col in self.feature_cols}

        dataset = tf.data.Dataset.from_tensor_slices(features_dict)
        # Apply processing before batching
        dataset = dataset.map(
            lambda x: self._process_features(x, is_test),
            num_parallel_calls=tf.data.AUTOTUNE
        )
        if shuffle:
            dataset = dataset.shuffle(buffer_size=10000)

        # Enable prefetching
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

    def get_train_dataset(self, batch_size):
        """Shuffled training batches; the final partial batch is dropped."""
        dataset = self._create_dataset(self.train_indices, shuffle=True)
        return dataset.batch(batch_size, drop_remainder=True)

    def get_val_dataset(self, batch_size):
        """Validation batches in file order; the final partial batch is dropped.

        Not shuffled: order does not affect aggregated metrics, and a fixed
        order keeps evaluation (and which rows the dropped remainder contains)
        reproducible.
        """
        dataset = self._create_dataset(self.val_indices, shuffle=False)
        return dataset.batch(batch_size, drop_remainder=True)

    def get_test_dataset(self, batch_size):
        """All rows, in file order, with ``is_test=True`` processing; keeps the partial batch."""
        dataset = self._create_dataset(
            self.all_indices, shuffle=False, is_test=True)
        return dataset.batch(batch_size, drop_remainder=False)

In [17]:
class TransformerEncoderBlock(layers.Layer):
    """Self-attention encoder block whose feed-forward part is two Conv1D layers.

    The block applies multi-head self-attention, dropout and a residual
    LayerNormalization, followed by two ReLU ``Conv1D`` layers
    (``ff_dim`` then ``embed_dim`` filters, ``padding='same'``), dropout and a
    second residual LayerNormalization.

    Args:
        embed_dim: Channel size of the block output; also used as ``key_dim``
            of every attention head.
        num_heads: Number of attention heads.
        ff_dim: Number of filters of the first convolution.
        conv_kernel_size: Kernel size of both convolutions.
        rate: Dropout rate used after attention and after the convolutions.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, conv_kernel_size=3, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.conv_kernel_size = conv_kernel_size
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = layers.Dropout(rate)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)

        # Position-wise feed-forward network with convolution
        self.conv1 = layers.Conv1D(filters=ff_dim, kernel_size=conv_kernel_size, activation='relu', padding='same')
        self.conv2 = layers.Conv1D(filters=embed_dim, kernel_size=conv_kernel_size, activation='relu', padding='same')
        self.dropout2 = layers.Dropout(rate)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None, mask=None):
        """Run the block.

        Args:
            inputs: Sequence tensor; the residual additions require its last
                dimension to equal ``embed_dim``.
            training: Forwarded to the attention and dropout layers. Defaults to
                ``None`` so the block can be called without it, in which case
                Keras decides the mode.
            mask: Optional tensor passed as ``attention_mask`` to the attention
                layer. NOTE(review): MultiHeadAttention expects a
                (batch, target_len, source_len) mask — confirm callers supply that shape.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention layer
        attn_output = self.att(inputs, inputs, inputs, attention_mask=mask, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm1(inputs + attn_output)

        # Feed-forward network with convolutional layers
        ff_output = self.conv1(out1)
        ff_output = self.conv2(ff_output)
        ff_output = self.dropout2(ff_output, training=training)
        out2 = self.norm2(out1 + ff_output)
        return out2

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'conv_kernel_size': self.conv_kernel_size,
            'rate': self.rate,
        })
        return config

In [21]:
class CustomModel(tf.keras.Model):
    """Transformer-encoder binary classifier over a per-example history sequence.

    ``call`` consumes a dict of batched features (see ``call`` for the keys),
    builds one feature vector per history step from one-hot/scaled history
    values plus tiled product and user embeddings, projects it to
    ``embed_dim``, adds a masked sinusoidal positional encoding, runs the
    transformer blocks, mean-pools the valid steps and emits a sigmoid
    probability under the key ``'in_next_order'``.

    Args:
        num_transformer_blocks: Number of ``TransformerEncoderBlock`` layers.
        embed_dim: Width of the projected sequence and of the product/user embeddings.
        num_heads: Attention heads per block.
        ff_dim: Filters of the first convolution in each block.
        max_len: Length of the history axis (default 100, the previously
            hard-coded value). The history inputs must have this many steps.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, num_transformer_blocks, embed_dim=256, num_heads=6, ff_dim=512, max_len=100, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_transformer_blocks = num_transformer_blocks
        self.max_len = max_len

        self.project_dense = layers.Dense(embed_dim, activation='relu')
        # Embedding layers
        self.product_embeddings = layers.Embedding(50000, embed_dim, name='product_embeddings')
        self.user_embeddings = layers.Embedding(207000, embed_dim, name='user_embeddings')
        self.aisle_embeddings = layers.Embedding(250, 50, name='aisle_embeddings')
        self.department_embeddings = layers.Embedding(50, 10, name='department_embeddings')

        # Dense layers for non-embedding features
        self.product_name_dense = layers.Dense(100, activation='relu')

        # Positional encoding, shape (1, max_len, embed_dim)
        self.positional_encoding = self._get_positional_encoding(max_len, embed_dim)

        # Transformer encoder blocks
        self.transformer_blocks = tf.keras.Sequential([
            TransformerEncoderBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_transformer_blocks)
        ])

        # Output layers
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(1, activation='sigmoid')
        self.bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

        # Running mean of the batch losses. Without it, the 'loss' / 'val_loss'
        # that fit() reports (and that callbacks monitor) would be the loss of
        # whichever batch happened to be last in the epoch.
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    def _get_positional_encoding(self, maxlen, embed_dim):
        """Return a float32 sinusoidal encoding of shape (1, maxlen, embed_dim).

        Sines of the even-indexed angle columns are concatenated with cosines of
        the odd-indexed columns (all sines first, then all cosines — not interleaved).
        """
        pos = tf.range(maxlen, dtype=tf.float32)[:, tf.newaxis]  # (maxlen, 1)
        i = tf.range(embed_dim, dtype=tf.float32)[tf.newaxis, :]  # (1, embed_dim)
        angle_rates = 1 / tf.pow(10000.0, (2 * (i//2)) / tf.cast(embed_dim, tf.float32))
        angle_rads = pos * angle_rates  # (maxlen, embed_dim)

        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)  # (maxlen, embed_dim)
        pos_encoding = pos_encoding[tf.newaxis, ...]  # (1, maxlen, embed_dim)
        return tf.cast(pos_encoding, dtype=tf.float32)

    @tf.function
    def masked_mean_pooling(self, encoder_outputs, mask):
        """
        Computes the mean of encoder outputs considering the mask.

        Args:
            encoder_outputs: Tensor of shape (batch_size, max_len, embedding_dim)
            mask: Tensor of shape (batch_size, max_len), where 1 indicates valid tokens and 0 indicates padding
        Returns:
            context_vector: Tensor of shape (batch_size, embedding_dim)
        """
        mask = tf.cast(mask, dtype=tf.float32)  # Convert mask to float
        mask = tf.expand_dims(mask, axis=-1)   # Shape: (batch_size, max_len, 1)
        masked_outputs = encoder_outputs * mask  # Zero-out padding embeddings

        summed = tf.reduce_sum(masked_outputs, axis=1)  # Sum over the time steps
        lengths = tf.reduce_sum(mask, axis=1)           # Number of valid tokens per sample

        # Avoid division by zero for examples with no valid step
        lengths = tf.maximum(lengths, tf.ones_like(lengths))
        context_vector = summed / lengths  # Shape: (batch_size, embedding_dim)

        return context_vector

    def call(self, inputs, training=None):
        """Forward pass.

        Args:
            inputs: Dict with the keys ``user_id``, ``product_id``, ``aisle_id``,
                ``department_id``, ``is_none``, ``history_length``,
                ``is_ordered_history``, ``index_in_order_history``,
                ``order_dow_history``, ``order_hour_history``,
                ``days_since_prior_order_history``, ``order_size_history``,
                ``reorder_size_history``, ``order_number_history`` and
                ``product_name``. Presumably integer tensors with the id-like
                entries shaped (batch,) and the ``*_history`` entries
                (batch, max_len) — verify against the data pipeline.
            training: Forwarded to the transformer blocks (controls dropout).

        Returns:
            Dict with ``'final_states'`` (output of the 128-unit dense layer) and
            ``'in_next_order'`` (sigmoid output of the final 1-unit dense layer).
        """
        user_id = inputs['user_id']
        product_id = inputs['product_id']
        aisle_id = inputs['aisle_id']
        department_id = inputs['department_id']
        is_none = inputs['is_none']
        # Number of leading history steps the encoder may look at. Kept as a
        # local: storing a per-batch (possibly symbolic) tensor on `self`
        # leaks graph tensors out of the traced function.
        history_length = inputs['history_length'] - 1
        is_ordered_history = inputs['is_ordered_history']
        index_in_order_history = inputs['index_in_order_history']
        order_dow_history = inputs['order_dow_history']
        order_hour_history = inputs['order_hour_history']
        days_since_prior_order_history = inputs['days_since_prior_order_history']
        order_size_history = inputs['order_size_history']
        reorder_size_history = inputs['reorder_size_history']
        order_number_history = inputs['order_number_history']
        product_name = inputs['product_name']

        # Multi-hot bag of product-name token ids, then a dense projection.
        product_names = tf.one_hot(product_name, 2532)
        product_names = tf.reduce_max(product_names, axis=1)
        product_names = self.product_name_dense(product_names)

        is_none_float = tf.cast(tf.expand_dims(is_none, 1), tf.float32)

        product_embeddings = self.product_embeddings(product_id)
        aisle_embeddings = self.aisle_embeddings(aisle_id)
        department_embeddings = self.department_embeddings(department_id)

        # Product data, repeated for every history step
        x_product = tf.concat([
            product_embeddings,
            aisle_embeddings,
            department_embeddings,
            is_none_float,
            product_names
        ], axis=1)
        x_product = tf.tile(tf.expand_dims(x_product, 1), (1, self.max_len, 1))

        # User data, repeated for every history step
        user_embeddings = self.user_embeddings(user_id)
        x_user = tf.tile(tf.expand_dims(user_embeddings, 1), (1, self.max_len, 1))

        # Sequence data: each history feature enters both one-hot encoded and as
        # a scaled scalar. NOTE(review): the one-hot depths presumably cover the
        # value range of each column — values outside the depth encode as all zeros.
        is_ordered_history_onehot = tf.one_hot(is_ordered_history, 2)
        index_in_order_history_onehot = tf.one_hot(index_in_order_history, 20)
        order_dow_history_onehot = tf.one_hot(order_dow_history, 8)
        order_hour_history_onehot = tf.one_hot(order_hour_history, 25)
        days_since_prior_order_history_onehot = tf.one_hot(days_since_prior_order_history, 31)
        order_size_history_onehot = tf.one_hot(order_size_history, 60)
        reorder_size_history_onehot = tf.one_hot(reorder_size_history, 50)
        order_number_history_onehot = tf.one_hot(order_number_history, 101)

        index_in_order_history_scalar = tf.expand_dims(tf.cast(index_in_order_history, tf.float32) / 20.0, 2)
        order_dow_history_scalar = tf.expand_dims(tf.cast(order_dow_history, tf.float32) / 8.0, 2)
        order_hour_history_scalar = tf.expand_dims(tf.cast(order_hour_history, tf.float32) / 25.0, 2)
        days_since_prior_order_history_scalar = tf.expand_dims(tf.cast(days_since_prior_order_history, tf.float32) / 31.0, 2)
        order_size_history_scalar = tf.expand_dims(tf.cast(order_size_history, tf.float32) / 60.0, 2)
        reorder_size_history_scalar = tf.expand_dims(tf.cast(reorder_size_history, tf.float32) / 50.0, 2)
        order_number_history_scalar = tf.expand_dims(tf.cast(order_number_history, tf.float32) / 100.0, 2)

        x_history = tf.concat([
            is_ordered_history_onehot,
            index_in_order_history_onehot,
            order_dow_history_onehot,
            order_hour_history_onehot,
            days_since_prior_order_history_onehot,
            order_size_history_onehot,
            reorder_size_history_onehot,
            order_number_history_onehot,
            index_in_order_history_scalar,
            order_dow_history_scalar,
            order_hour_history_scalar,
            days_since_prior_order_history_scalar,
            order_size_history_scalar,
            reorder_size_history_scalar,
            order_number_history_scalar,
        ], axis=2)

        outputs = tf.concat([x_history, x_product, x_user], axis=2)
        outputs = self.project_dense(outputs)

        # 1.0 for the first `history_length` steps of each example, 0.0 afterwards.
        # (No `.numpy()` debug prints here: they raise on symbolic tensors once
        # fit() traces train_step/test_step.)
        mask = tf.cast(tf.sequence_mask(history_length, maxlen=self.max_len), tf.float32)
        expanded_mask = tf.expand_dims(mask, -1)  # (batch, max_len, 1)

        # Add the positional encoding and zero every padded step. The mask is
        # 0/1, so this equals masking the two terms separately; broadcasting
        # replaces the explicit tile/broadcast_to.
        outputs = (outputs + self.positional_encoding) * expanded_mask

        # NOTE(review): no attention mask is passed to the blocks, so padded
        # (zeroed) steps still take part in attention and the convolutions.
        outputs = self.transformer_blocks(outputs, training=training)

        h = self.masked_mean_pooling(outputs, mask)  # (batch, embed_dim)
        h1 = self.dense1(h)
        final_predictions = self.dense2(h1)

        return {'final_states': h1, 'in_next_order': final_predictions}

    @tf.function
    def train_step(self, data):
        """One optimization step on a ``(features, targets)`` batch.

        Returns the compiled metrics plus ``'loss'``, the running mean of the
        batch losses since the metrics were last reset.
        """
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.bce_loss(y['in_next_order'], y_pred['in_next_order'])

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y['in_next_order'], y_pred['in_next_order'])
        results = {m.name: m.result() for m in self.metrics}
        results['loss'] = self.loss_tracker.result()
        return results

    @tf.function
    def test_step(self, data):
        """Evaluate one ``(features, targets)`` batch.

        Returns the compiled metrics plus ``'loss'``, the running mean of the
        batch losses (reported by fit() as ``val_loss``).
        """
        x, y = data

        y_pred = self(x, training=False)
        loss = self.bce_loss(y['in_next_order'], y_pred['in_next_order'])

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y['in_next_order'], y_pred['in_next_order'])

        results = {m.name: m.result() for m in self.metrics}
        results['loss'] = self.loss_tracker.result()
        return results




In [None]:
# Input pipelines: training in batches of 128, validation one example per batch.
DATA_DIR = 'data'
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 1

reader = TFDataReader2(DATA_DIR)
train_dataset = reader.get_train_dataset(TRAIN_BATCH_SIZE)
val_dataset = reader.get_val_dataset(VAL_BATCH_SIZE)

In [8]:
# Per-epoch weight checkpoints go to Drive; `{epoch:04d}` is left unformatted
# so it can be filled in with the epoch number when a checkpoint is written.
checkpoint_dir = "/content/drive/MyDrive/recsys_data/checkpoints/rnn_products"
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"


In [9]:
# Early stopping to prevent overfitting
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Keep only the weights of the epoch with the best val_loss
best_weights_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='models/best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Halve the learning rate when val_loss plateaus
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# Per-epoch metrics as CSV (file is overwritten on every run)
csv_logger_cb = tf.keras.callbacks.CSVLogger(
    'training_log.csv',
    separator=',',
    append=False,
)

# Unconditional weight checkpoint after every epoch, written to `checkpoint_path`
epoch_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1,
)

# Same order as before: callbacks run in list order.
callbacks = [
    early_stopping_cb,
    best_weights_cb,
    reduce_lr_cb,
    csv_logger_cb,
    epoch_checkpoint_cb,
]


In [32]:
# Pull a single (features, targets) batch from the validation set for
# inspection and for building the model; stays None if the dataset is empty.
element = next(iter(val_dataset.take(1)), None)

In [None]:
# Inspect the sampled batch: masked history length, the ordered-history
# sequence, and the derived target.
sample_features, sample_targets = element
print(sample_features['history_length'])
print(sample_features['is_ordered_history'])
print(sample_targets['in_next_order'].numpy())

In [None]:


# Optimizer and metrics; no `loss=` is given to compile() because the model's
# own train_step/test_step compute the loss.
adam = tf.keras.optimizers.Adam(0.001)
tracked_metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
]

model = CustomModel(num_transformer_blocks=3)
model.compile(optimizer=adam, metrics=tracked_metrics)

# One eager forward pass on the sampled batch so every layer creates its weights.
# NOTE(review): `tpu_strategy` from the TPU cell is not referenced here, so the
# model is not created under a strategy scope — confirm whether that is intended.
model(element[0])


In [None]:
# Train for up to 10 epochs (EarlyStopping in `callbacks` may stop sooner).
# Arguments dropped because they have no effect with tf.data input:
#   - shuffle: ignored for Dataset input (shuffling is done in the input pipeline),
#   - max_queue_size: only used for generator / keras.utils.Sequence input,
#   - steps_per_epoch / validation_steps = None: already the defaults.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=callbacks,
    verbose=1,  # 0: silent, 1: progress bar, 2: one line per epoch
)

In [32]:
# Save the current weights. The directory is created first so this cell also
# works when no checkpoint callback has created `models/` yet.
os.makedirs('models', exist_ok=True)
model.save_weights('models/epoch_1.h5')

In [None]:
# Evaluate on the validation split; the result holds the values returned by
# the model's test_step (loss plus the compiled metrics).
eval_results = model.evaluate(val_dataset, verbose=1)
