In [301]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import keras_tuner as kt

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("../raw_data/data.xlsx")

In [302]:
# filtering down to essential features & transform datetime feature

df = data[['Account: Account ID', 'Product Category (6D)', 'Model  Name', 'Purchase  Date', 'Purchase Month', 'Purchase Year']]
df['Purchase  Date'] = pd.to_datetime(df['Purchase  Date'])
df = df[df['Purchase  Date']>'2017-04-01']
df['Today'] = datetime.today()
df['Recency'] = round((df['Today'] - df['Purchase  Date']).dt.days/30,0)
df['Recency'] = df['Recency'].astype('int')
df.drop(columns=['Purchase  Date', 'Purchase Month', 'Purchase Year', 'Today'], inplace=True)

In [303]:
# filtering down to customers with purchase history between 3~15
df_count = df.groupby('Account: Account ID').count()[['Model  Name']]
df_count = df_count[df_count['Model  Name']>=3]
df_count = df_count[df_count['Model  Name']<=15]
customer_id = df_count.index
input_data = df[df['Account: Account ID'].isin(customer_id)]
input_data = input_data.rename(columns={'Account: Account ID':'account_id', 'Model  Name':'model', 'Recency':'recency'})[['account_id','model', 'recency']]

In [304]:
# LabelEncoder model into integers
encoder = LabelEncoder()
input_data['model'] = encoder.fit_transform(input_data['model'])
input_data['model'] = input_data['model'].apply(lambda x: x+1)

In [305]:
# groupby customer, aggregate model and recency into arrays
input_data = input_data.groupby('account_id').agg(list)
input_data['model'] = input_data['model'].apply(lambda x: np.array(x))
input_data['recency'] = input_data['recency'].apply(lambda x: np.array(x))

In [306]:
# creating training_sequence & target_sequence
input_data['training_sequence'] = input_data['model'].apply(lambda x: x[:-1])
input_data['target_sequence'] = input_data['model'].apply(lambda x: x[1:])

In [307]:
input_data

Unnamed: 0_level_0,model,recency,training_sequence,target_sequence
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900000215889,"[131, 30, 107]","[50, 50, 48]","[131, 30]","[30, 107]"
0010o00002AGLLq,"[47, 121, 85]","[41, 40, 37]","[47, 121]","[121, 85]"
0010o00002AGU1E,"[116, 41, 98]","[40, 40, 4]","[116, 41]","[41, 98]"
0010o00002AGUKz,"[116, 41, 98]","[41, 41, 41]","[116, 41]","[41, 98]"
0010o00002AGYLD,"[41, 53, 116, 116]","[40, 18, 18, 10]","[41, 53, 116]","[53, 116, 116]"
...,...,...,...,...
0019000002ACnHZ,"[106, 99, 25]","[47, 41, 16]","[106, 99]","[99, 25]"
0019000002ACnby,"[12, 107, 131]","[41, 41, 36]","[12, 107]","[107, 131]"
0019000002ACsYC,"[41, 53, 116, 41, 41, 41, 41]","[41, 16, 11, 1, 1, 1, 1]","[41, 53, 116, 41, 41, 41]","[53, 116, 41, 41, 41, 41]"
0019000002B5m8N,"[41, 41, 97, 84, 54, 91, 54, 99, 3, 3]","[47, 46, 46, 36, 36, 32, 32, 28, 12, 11]","[41, 41, 97, 84, 54, 91, 54, 99, 3]","[41, 97, 84, 54, 91, 54, 99, 3, 3]"


In [309]:
# padding sequences to maximum length
maxlen = input_data['model'].apply(lambda x: len(x)).sort_values(ascending=False).iloc[0]

train_feat_dict = {'training_sequence':pad_sequences(input_data.training_sequence, maxlen=maxlen, padding='pre', value=0),
                    'recency':pad_sequences(input_data.recency, maxlen=maxlen, padding='pre', value=0)}
train_target_tensor = pad_sequences(input_data.target_sequence, maxlen=maxlen, padding='pre', value=0)

In [332]:
def create_train_tfdata(train_feat_dict, train_target_tensor,
                        batch_size, buffer_size=None):
    """
    Create train tf dataset for model train input
    :param train_feat_dict: dict, containing the features tensors for train data
    :param train_target_tensor: np.array(), the training TARGET tensor
    :param batch_size: (int) size of the batch to work with
    :param buffer_size: (int) Optional. Default is None. Size of the buffer
    :return: (tuple) 1st element is the training dataset,
                     2nd is the number of steps per epoch (based on batch size)
    """
    if buffer_size is None:
        buffer_size = batch_size*50

    train_steps_per_epoch = len(train_target_tensor) // batch_size

    train_dataset = tf.data.Dataset.from_tensor_slices((train_feat_dict,
                                                        train_target_tensor)).cache()
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
    train_dataset = train_dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)
    
    return train_dataset, train_steps_per_epoch


In [333]:
train_dataset, train_steps_per_epoch = create_train_tfdata(train_feat_dict,
                                                         train_target_tensor,
                                                         batch_size=64)

In [385]:
max_model = max([max(i) for i in input_data['model']])+1
max_recency = max([max(i) for i in input_data['recency']])+1

In [387]:
def build_model(maxlen=maxlen, max_model=max_model, max_recency=max_recency):
    """
    Build a model given the hyper-parameters with item and nb_days input features
    :param hp: (kt.HyperParameters) hyper-parameters to use when building this model
    :return: built and compiled tensorflow model 
    """
    inputs = {}
    inputs['training_sequence'] = tf.keras.Input(batch_input_shape=[None, maxlen],
                                       name='training_sequence', dtype=tf.int32)
    # create encoding padding mask
    encoding_padding_mask = tf.math.logical_not(tf.math.equal(inputs['training_sequence'], 0))

    # nb_days bucketized
    inputs['recency'] = tf.keras.Input(batch_input_shape=[None, maxlen],
                                       name='recency', dtype=tf.int32)

    # Pass categorical input through embedding layer
    # with size equals to tokenizer vocabulary size
    # Remember that vocab_size is len of item tokenizer + 1
    # (for the padding '0' value)
    
    embedding_training_sequence = tf.keras.layers.Embedding(input_dim=max_model,
                                               output_dim=10,
                                               name='embedding_item'
                                              )(inputs['training_sequence'])
    # nbins=100, +1 for zero padding
    embedding_recency = tf.keras.layers.Embedding(input_dim=max_recency,
                                                  output_dim=10,
                                                  name='embedding_recency'
                                                 )(inputs['recency'])

    #  Concatenate embedding layers
    concat_embedding_input = tf.keras.layers.Concatenate(
     name='concat_embedding_input')([embedding_training_sequence, embedding_recency])

    concat_embedding_input = tf.keras.layers.BatchNormalization(
     name='batchnorm_inputs')(concat_embedding_input)
    
    # LSTM layer
    rnn = tf.keras.layers.LSTM(units=32,
                                   return_sequences=True,
                                   stateful=False,
                                   recurrent_initializer='glorot_normal',
                                   name='LSTM_cat'
                                   )(concat_embedding_input)

    rnn = tf.keras.layers.BatchNormalization(name='batchnorm_lstm')(rnn)

    # Self attention so key=value in inputs
    att = tf.keras.layers.Attention(use_scale=False, causal=True,
                                    name='attention')(inputs=[rnn, rnn],
                                                      mask=[encoding_padding_mask,
                                                            encoding_padding_mask])

    # Last layer is a fully connected one
    output = tf.keras.layers.Dense(max_model, name='output')(att)

    model = tf.keras.Model(inputs, output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=loss_function,
        metrics=['sparse_categorical_accuracy'])
    
    return model

In [388]:
def loss_function(real, pred):
    """
    We redefine our own loss function in order to get rid of the '0' value
    which is the one used for padding. This to avoid that the model optimize itself
    by predicting this value because it is the padding one.
    
    :param real: the truth
    :param pred: predictions
    :return: a masked loss where '0' in real (due to padding)
                are not taken into account for the evaluation
    """

    # to check that pred is numric and not nan
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_object_ = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                 reduction='none')
    loss_ = loss_object_(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [389]:
train_dataset

<PrefetchDataset shapes: ({training_sequence: (None, 15), recency: (None, 15)}, (None, 15)), types: ({training_sequence: tf.int32, recency: tf.int32}, tf.int32)>

In [390]:
model = build_model()

In [391]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
training_sequence (InputLayer)  [(None, 15)]         0                                            
__________________________________________________________________________________________________
recency (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_item (Embedding)      (None, 15, 10)       1400        training_sequence[0][0]          
__________________________________________________________________________________________________
embedding_recency (Embedding)   (None, 15, 10)       620         recency[0][0]                    
____________________________________________________________________________________________

In [488]:
history = model.fit(train_dataset, steps_per_epoch=train_steps_per_epoch, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [516]:
index = 500

test_feat_dict = {'training_sequence': train_feat_dict['training_sequence'][index:index+1],
                'recency': train_feat_dict['recency'][index:index+1]}
test_dataset = tf.data.Dataset.from_tensor_slices((test_feat_dict)).cache()

y_pred = model.predict(test_dataset)
output = []
for i in range(15):
    maxElement = np.amax(y_pred[i][0])
    result = np.where(y_pred[i][0] == np.amax(y_pred[i][0]))
    output.append(result[0][0])

print(f'predicted: {output}')
print(f'actual   : {train_target_tensor[index:index+1][0]}')

predicted: [130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 49, 43]
actual   : [ 0  0  0  0  0  0  0  0  0  0  0  0  0 41 43]
