In [12]:
import os

import numpy as np

import os
import sys
sys.path.append('..')
#from layers import LSTMLayer, WaveNet, TimeDistributedDense, TemporalConvolution

In [13]:
# Raw per-user history lengths; requires the extracted `data/` directory to exist already.
history_lengths = np.load(os.path.join("data", "history_length.npy"))

In [14]:
import pandas as pd
from pandas import DataFrame

In [15]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Colab only: mount Google Drive at /content/drive so the archive referenced by the
# following shell cells is reachable from the runtime's file system.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List (including hidden entries) the Drive folder that the unzip cell below reads data.zip from.
!ls -a /content/drive/MyDrive/recsys_data/rnn_product_data

In [7]:
!mkdir data

In [None]:
!unzip /content/drive/MyDrive/recsys_data/rnn_product_data/data.zip -d data

In [None]:
# Detect the TPU runtime. TPUClusterResolver() raises ValueError when no TPU is attached;
# that is re-raised as a RuntimeError (chained to the original error) rather than a bare
# BaseException, which generic `except Exception` handlers would not catch.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print(f'Running on a TPU w/{tpu.num_accelerators()["TPU"]} cores')
except ValueError as err:
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

# Connect to the TPU cluster, initialise it, and build the distribution strategy.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# NOTE(review): `tpu_strategy` is not referenced by any later cell shown in this notebook
# (the model is not created inside `tpu_strategy.scope()`), so it may be unused — confirm.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

In [46]:

class TFDataReader2:
    """Load per-user order-history arrays from ``.npy`` files and expose them as ``tf.data`` datasets.

    One ``<column>.npy`` file per entry of ``feature_cols`` is read from ``data_dir``.
    Every array is zero-padded along axis 0 so that the row count is a multiple of
    ``PAD_MULTIPLE``; the padded rows are only reachable through ``all_indices``
    (used by the test dataset).

    Attributes:
        feature_cols: names of the columns / files that are loaded.
        data: dict mapping column name -> zero-padded numpy array.
        total_size: number of real (unpadded) rows.
        train_indices: row indices of the first 95% of the real rows.
        val_indices: row indices of the remaining real rows.
        all_indices: indices of every row, padding included.
    """

    # Row count is padded up to a multiple of this value (the notebook batches the
    # test dataset with batch_size=512).
    PAD_MULTIPLE = 512

    def __init__(self, data_dir):
        """Read all feature arrays from ``data_dir``, pad them, and build the index splits.

        Args:
            data_dir: directory containing one ``<column>.npy`` file per feature column.
        """
        self.feature_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]

        # Memory-map on load; the concatenation below materialises each array in RAM.
        self.data = {
            col: np.load(os.path.join(data_dir, f'{col}.npy'), mmap_mode='r')
            for col in self.feature_cols
        }

        total_size = len(next(iter(self.data.values())))
        self.total_size = total_size

        # Rows needed to reach the next multiple of PAD_MULTIPLE. This is 0 when the
        # size is already aligned (the previous `512 - remainder` appended a whole
        # extra block of zero rows in that case).
        pad_rows = (-total_size) % self.PAD_MULTIPLE
        for col in self.feature_cols:
            arr = self.data[col]
            padding = np.zeros((pad_rows, *arr.shape[1:]), dtype=arr.dtype)
            self.data[col] = np.concatenate([arr, padding], axis=0)

        # Train/validation split over the real rows only.
        train_size = int(0.95 * total_size)
        self.train_indices = np.arange(train_size)
        self.val_indices = np.arange(train_size, total_size)
        self.all_indices = np.arange(total_size + pad_rows)

    def _process_features(self, original_features, is_test):
        """Derive the "next order" features (and, when training, the labels) for one example.

        For train/validation data the last recorded order (index ``history_length - 1``)
        is treated as the order to predict, and ``history_length`` is reduced by one.
        For test data the entry at index ``history_length`` is read instead and
        ``history_length`` is left unchanged.

        Args:
            original_features: dict of per-example tensors keyed by ``feature_cols``.
            is_test: whether the example belongs to the test (inference) dataset.

        Returns:
            ``(features, labels)``; ``labels`` is an empty dict when ``is_test`` is true.
        """
        if is_test:
            # NOTE(review): this indexes one position past the recorded history; it
            # presumably relies on the history arrays having a spare slot — confirm.
            target_index = original_features['history_length']
        else:
            target_index = original_features['history_length'] - 1

        def pick(column):
            # Value of `column` at the position of the order being predicted.
            return tf.gather(original_features[column], target_index)

        features = {
            # Copy original features
            **original_features,

            # Add augmented features
            'next_order_dow': pick('order_dow_history'),
            'next_order_hour': pick('order_hour_history'),
            'days_since_prior_order': pick('days_since_prior_order_history'),
            'next_order_number': pick('order_number_history'),
            # Overrides the copied value with the (possibly reduced) history length.
            'history_length': target_index,
        }

        if is_test:
            return features, {}

        labels = {
            'next_reorder_size': tf.cast(pick('reorder_size_history'), dtype=tf.float32),
            'next_order_size': tf.cast(pick('order_size_history'), dtype=tf.float32),
        }
        return features, labels

    def _create_dataset(self, indices, shuffle=True, is_test=False):
        """Build an unbatched dataset of ``(features, labels)`` pairs for the given rows.

        Args:
            indices: row indices to include.
            shuffle: whether to shuffle examples (buffer of 10000).
            is_test: forwarded to ``_process_features``.
        """
        # All columns are cast to int32 before slicing into per-example tensors.
        features_dict = {
            col: tf.cast(self.data[col][indices], tf.int32)
            for col in self.feature_cols
        }

        dataset = tf.data.Dataset.from_tensor_slices(features_dict)
        # Per-example processing happens before batching.
        dataset = dataset.map(
            lambda x: self._process_features(x, is_test),
            num_parallel_calls=tf.data.AUTOTUNE
        )
        if shuffle:
            dataset = dataset.shuffle(buffer_size=10000)

        # Enable prefetching
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

    def get_train_dataset(self, batch_size):
        """Shuffled training dataset in batches of ``batch_size`` (incomplete last batch dropped)."""
        dataset = self._create_dataset(self.train_indices, shuffle=True)
        return dataset.batch(batch_size, drop_remainder=True)

    def get_val_dataset(self, batch_size):
        """Shuffled validation dataset in batches of ``batch_size`` (incomplete last batch dropped)."""
        dataset = self._create_dataset(self.val_indices, shuffle=True)
        return dataset.batch(batch_size, drop_remainder=True)

    def get_test_dataset(self, batch_size):
        """Unshuffled dataset over every row (padding included), with empty label dicts."""
        dataset = self._create_dataset(
            self.all_indices, shuffle=False, is_test=True)
        return dataset.batch(batch_size, drop_remainder=False)

In [18]:
class TransformerEncoderBlock(layers.Layer):
    """Encoder block: multi-head self-attention followed by a two-layer Conv1D feed-forward stage.

    Each stage is followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: channel size of the block's input/output; also used as the
            attention ``key_dim`` and as the filter count of the second convolution.
        num_heads: number of attention heads.
        ff_dim: filter count of the first convolution.
        conv_kernel_size: kernel size of both convolutions.
        rate: dropout rate applied after attention and after the feed-forward stage.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, conv_kernel_size=3, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # Attention sub-layer.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = layers.Dropout(rate)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward sub-layer, built from two 'same'-padded ReLU convolutions.
        conv_options = dict(kernel_size=conv_kernel_size, activation='relu', padding='same')
        self.conv1 = layers.Conv1D(filters=ff_dim, **conv_options)
        self.conv2 = layers.Conv1D(filters=embed_dim, **conv_options)
        self.dropout2 = layers.Dropout(rate)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training, mask=None):
        """Run the block.

        Args:
            inputs: input sequence tensor; used as query, value and key.
            training: forwarded to both dropout layers.
            mask: optional attention mask forwarded to the attention layer.
        """
        # Residual branch 1: self-attention.
        attended = self.dropout1(
            self.att(inputs, inputs, inputs, attention_mask=mask),
            training=training,
        )
        normed_attention = self.norm1(inputs + attended)

        # Residual branch 2: convolutional feed-forward.
        transformed = self.conv2(self.conv1(normed_attention))
        transformed = self.dropout2(transformed, training=training)
        return self.norm2(normed_attention + transformed)

In [None]:
# Scratch check of tf.sequence_mask: a length of 4 with maxlen=100.
tf.sequence_mask(4, maxlen=100)

In [94]:
class CustomModel(tf.keras.Model):
    """Transformer encoder over a user's order history that predicts the next order size.

    Each history column is fed both one-hot encoded and as a scaled scalar. The
    concatenation is projected to ``embed_dim``, combined with a sinusoidal
    positional encoding, masked by ``history_length``, passed through the
    transformer blocks and mean-pooled over the valid steps. The pooled vector is
    concatenated with the encoded "next order" features and passed through two
    dense layers.

    Args:
        num_transformer_blocks: number of ``TransformerEncoderBlock`` layers.
        embed_dim: width of the projected sequence representation.
        num_heads: attention heads per block.
        ff_dim: feed-forward width per block.
        max_len: length of the history sequences; sizes the positional encoding
            and the sequence mask. Must equal the time dimension of the inputs.
    """

    # (input key, one-hot depth, divisor for the scaled scalar copy).
    # The order fixes the column layout seen by `project_dense`; do not reorder,
    # or previously saved weights will no longer match.
    _HISTORY_FEATURES = (
        ('order_dow_history', 8, 8.0),
        ('order_hour_history', 25, 25.0),
        ('days_since_prior_order_history', 31, 31.0),
        ('order_size_history', 60, 60.0),
        ('reorder_size_history', 50, 50.0),
        ('order_number_history', 101, 100.0),
    )
    # Same convention for the features of the order being predicted (input to `dense1`).
    _NEXT_ORDER_FEATURES = (
        ('next_order_dow', 8, 8.0),
        ('next_order_hour', 25, 25.0),
        ('days_since_prior_order', 31, 31.0),
        ('next_order_number', 101, 100.0),
    )

    def __init__(self, num_transformer_blocks, embed_dim=128, num_heads=4, ff_dim=256,
                 max_len=100, **kwargs):
        super(CustomModel, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_transformer_blocks = num_transformer_blocks
        self.max_len = max_len

        # Projects the concatenated per-step features to `embed_dim`.
        self.project_dense = layers.Dense(embed_dim, activation='relu')

        # Constant positional encoding of shape (1, max_len, embed_dim).
        self.positional_encoding = self._get_positional_encoding(
            max_len, embed_dim=embed_dim)

        # Transformer encoder blocks
        self.transformer_blocks = tf.keras.Sequential([
            TransformerEncoderBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_transformer_blocks)
        ])

        # Output layers
        self.dense1 = layers.Dense(64, activation='relu')
        self.dense2 = layers.Dense(1, activation='relu')
        # Not used by `call`; kept so the set of layer attributes is unchanged.
        self.dense3 = layers.Dense(1, activation='relu')

        # Despite the attribute name this is a Huber loss.
        self.mse_loss = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)
        # Running mean of the loss, so the reported `loss` / `val_loss` are averages
        # over the batches seen so far rather than the value of the last batch.
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    def _get_positional_encoding(self, maxlen, embed_dim):
        """Build a sinusoidal positional encoding of shape ``(1, maxlen, embed_dim)``.

        Sines of the even-indexed angle columns and cosines of the odd-indexed ones
        are concatenated (all sines first, then all cosines), not interleaved.
        """
        pos = tf.range(maxlen, dtype=tf.float32)[:, tf.newaxis]  # (maxlen, 1)
        i = tf.range(embed_dim, dtype=tf.float32)[
            tf.newaxis, :]  # (1, embed_dim)
        angle_rates = 1 / tf.pow(10000.0, (2 * (i//2)) /
                                 tf.cast(embed_dim, tf.float32))
        angle_rads = pos * angle_rates  # (maxlen, embed_dim)

        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat(
            [sines, cosines], axis=-1)  # (maxlen, embed_dim)
        pos_encoding = pos_encoding[tf.newaxis, ...]  # (1, maxlen, embed_dim)
        return tf.cast(pos_encoding, dtype=tf.float32)

    @staticmethod
    def _encode_features(inputs, specs):
        """Concatenate one-hot and scaled-scalar encodings of the listed inputs.

        Args:
            inputs: dict of integer tensors.
            specs: iterable of ``(key, one_hot_depth, scalar_divisor)`` tuples.

        Returns:
            Float tensor whose last axis holds all one-hot blocks (in ``specs``
            order) followed by all scaled scalars (in ``specs`` order).
        """
        one_hots = [tf.one_hot(inputs[key], depth) for key, depth, _ in specs]
        scalars = [
            tf.expand_dims(tf.cast(inputs[key], tf.float32) / divisor, -1)
            for key, _, divisor in specs
        ]
        return tf.concat(one_hots + scalars, axis=-1)

    @tf.function
    def masked_mean_pooling(self, encoder_outputs, mask):
        """
        Computes the mean of encoder outputs considering the mask.

        Args:
            encoder_outputs: Tensor of shape (batch_size, max_len, embedding_dim)
            mask: Tensor of shape (batch_size, max_len), where 1 indicates valid tokens and 0 indicates padding
        Returns:
            context_vector: Tensor of shape (batch_size, embedding_dim)
        """
        mask = tf.cast(mask, dtype=tf.float32)  # Convert mask to float
        # Shape: (batch_size, max_len, 1)
        mask = tf.expand_dims(mask, axis=-1)
        masked_outputs = encoder_outputs * mask  # Zero-out padding embeddings

        # Sum over the time steps
        summed = tf.reduce_sum(masked_outputs, axis=1)
        # Number of valid tokens per sample
        lengths = tf.reduce_sum(mask, axis=1)

        # Avoid division by zero for samples with no valid step
        lengths = tf.maximum(lengths, tf.ones_like(lengths))
        context_vector = summed / lengths  # Shape: (batch_size, embedding_dim)

        return context_vector

    def call(self, inputs, training=False):
        """Forward pass.

        Args:
            inputs: dict holding ``history_length``, the history columns listed in
                ``_HISTORY_FEATURES`` and the next-order columns listed in
                ``_NEXT_ORDER_FEATURES``.
            training: forwarded to the transformer blocks (controls dropout).

        Returns:
            dict with ``final_states`` (output of ``dense1``) and
            ``next_order_size`` (output of ``dense2``).
        """
        # Kept as a local: storing a symbolic tensor on `self` inside `call`
        # leaks graph tensors out of the traced function.
        history_length = inputs['history_length']

        # Per-step features -> (batch, max_len, embed_dim)
        sequence = self.project_dense(
            self._encode_features(inputs, self._HISTORY_FEATURES))

        # 1.0 for the first `history_length` steps of each sample, 0.0 afterwards.
        mask = tf.cast(tf.sequence_mask(
            history_length, maxlen=self.max_len), tf.float32)
        step_mask = tf.expand_dims(mask, -1)  # (batch, max_len, 1)

        # Add the positional encoding and zero out every padded step. The
        # (1, max_len, embed_dim) encoding broadcasts over the batch axis.
        sequence = (sequence + self.positional_encoding) * step_mask

        # NOTE(review): the mask is not forwarded to the attention layers, so the
        # zeroed padded steps still take part in attention — confirm this is intended.
        sequence = self.transformer_blocks(sequence, training=training)

        pooled = self.masked_mean_pooling(sequence, mask)
        h = tf.concat([
            pooled,
            self._encode_features(inputs, self._NEXT_ORDER_FEATURES),
        ], axis=1)

        h1 = self.dense1(h)
        next_order_size = self.dense2(h1)
        return {'final_states': h1, 'next_order_size': next_order_size}

    @tf.function
    def train_step(self, data):
        """One optimisation step on the Huber loss of ``next_order_size``.

        Returns:
            dict of metric name -> current value, including the running-mean ``loss``.
        """
        x, y = data
        target = y['next_order_size']

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.mse_loss(target, y_pred['next_order_size'])

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(target, y_pred['next_order_size'])
        # `self.metrics` includes `loss_tracker` (Keras tracks Metric attributes),
        # so all of them are reset together at the start of each epoch/evaluation.
        return {m.name: m.result() for m in self.metrics}

    @tf.function
    def test_step(self, data):
        """One evaluation step; mirrors ``train_step`` without the weight update.

        Returns:
            dict of metric name -> current value, including the running-mean ``loss``
            (reported by Keras as ``val_loss`` during ``fit``).
        """
        x, y = data
        target = y['next_order_size']

        y_pred = self(x, training=False)
        loss = self.mse_loss(target, y_pred['next_order_size'])

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(target, y_pred['next_order_size'])
        return {m.name: m.result() for m in self.metrics}

In [88]:
# Reader over the extracted arrays, plus the batched training and validation pipelines.
reader = TFDataReader2(data_dir='data')
train_dataset = reader.get_train_dataset(batch_size=128)
val_dataset = reader.get_val_dataset(batch_size=128)

In [95]:
# Filepath template for the per-epoch weight checkpoints; it is passed as `filepath`
# to the last ModelCheckpoint callback below, which fills in `{epoch:04d}`.
checkpoint_path = "checkpoints/conformer_ordersize/cp-{epoch:04d}.ckpt"


In [96]:
# Training callbacks, in the order Keras will invoke them.
callbacks = [
    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),

    # Keep only the weights of the epoch with the best val_loss.
    tf.keras.callbacks.ModelCheckpoint(
        filepath='models/best_model.h5',
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        verbose=1),

    # Halve the learning rate after 3 epochs without val_loss improvement (floor 1e-6).
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1),

    # Per-epoch metrics written to a fresh CSV file.
    tf.keras.callbacks.CSVLogger('training_log.csv', separator=',', append=False),

    # Weights snapshot at the end of every epoch.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_freq='epoch',
        save_weights_only=True,
        verbose=1),
]


In [97]:
# One (features, labels) validation batch for manual inspection; None if the dataset is empty.
element = next(iter(val_dataset.take(1)), None)

In [98]:


model = CustomModel(num_transformer_blocks=3)

# The target (next order size) is an unbounded regression value, so regression metrics
# are tracked. AUC is a binary-classification metric that asserts predictions lie in
# [0, 1], which this model's ReLU output does not guarantee.
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(0.001),
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(name='mae'),
        tf.keras.metrics.RootMeanSquaredError(name='rmse'),
    ],
)

# To resume from saved weights instead of training from scratch, first build the model
# by calling it on one batch -- model(element[0]) -- and then run
# model.load_weights("models/best_model.h5").

In [None]:
# Train for up to 40 "epochs" of 500 steps each, drawn from the endlessly repeated training
# dataset; the whole validation dataset is evaluated after every epoch.
# `shuffle` and `max_queue_size` were dropped: Keras ignores both for tf.data.Dataset
# input (shuffling is done inside the dataset pipeline), so passing them was misleading.
history = model.fit(
    train_dataset.repeat(),
    validation_data=val_dataset,
    epochs=40,
    steps_per_epoch=500,
    callbacks=callbacks,
    verbose=1,  # 0: silent, 1: progress bar, 2: one line per epoch
)

In [100]:
# Restore the weights written by the best-val_loss ModelCheckpoint callback.
model.load_weights('models/best_model.h5')

In [101]:
# Inference pipeline over every row held by the reader (zero padding included).
test_dataset = reader.get_test_dataset(batch_size=512)

In [None]:
# Run inference over the test dataset. The result is indexed below by the model's
# output keys ('final_states' and 'next_order_size').
outputs = model.predict(test_dataset)

In [103]:
# Number of real rows. `reader.data[...]` is already zero-padded to a multiple of 512,
# so its length would include the padding; the train and validation index ranges
# together cover exactly the unpadded rows.
true_len = len(reader.train_indices) + len(reader.val_indices)

In [105]:
# np.save does not create missing directories, so make sure the output folder exists.
os.makedirs('pred_data', exist_ok=True)

# Persist the predictions for the real rows only (rows past `true_len` are padding).
np.save('pred_data/final_states.npy', outputs['final_states'][:true_len])
np.save('pred_data/pred_order_size.npy', outputs['next_order_size'][:true_len])

In [None]:
# Preview the first few predictions; printing every row floods the notebook output.
outputs['next_order_size'][:10]