In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import keras_tuner as kt


import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("../raw_data/data.xlsx")

In [3]:
# filtering down to essential features & transform datetime feature

df = data[['Account: Account ID', 'Product Category (6D)', 'Model  Name', 'Purchase  Date', 'Purchase Month', 'Purchase Year']]
df['Purchase  Date'] = pd.to_datetime(df['Purchase  Date'])
df = df[df['Purchase  Date']>'2017-04-01']
df['Today'] = datetime.today()
df['Recency'] = round((df['Today'] - df['Purchase  Date']).dt.days/30,0)
df['Recency'] = df['Recency'].astype('int')
df.drop(columns=['Purchase  Date', 'Purchase Month', 'Purchase Year', 'Today'], inplace=True)

In [4]:
# filtering down to customers with purchase history between 3~15
df_count = df.groupby('Account: Account ID').count()[['Model  Name']]
df_count = df_count[df_count['Model  Name']>=3]
df_count = df_count[df_count['Model  Name']<=15]
customer_id = df_count.index
input_data = df[df['Account: Account ID'].isin(customer_id)]
input_data = input_data.rename(columns={'Account: Account ID':'account_id', 'Model  Name':'model', 'Recency':'recency'})[['account_id','model', 'recency']]

In [5]:
# LabelEncoder model into integers
encoder = LabelEncoder()
input_data['model'] = encoder.fit_transform(input_data['model'])
input_data['model'] = input_data['model'].apply(lambda x: x+1)

In [6]:
# groupby customer, aggregate model and recency into arrays
input_data = input_data.groupby('account_id').agg(list)
input_data['model'] = input_data['model'].apply(lambda x: np.array(x))
input_data['recency'] = input_data['recency'].apply(lambda x: np.array(x))

In [7]:
# creating training_sequence & target_sequence
input_data['training_sequence'] = input_data['model'].apply(lambda x: x[:-1])
input_data['target_sequence'] = input_data['model'].apply(lambda x: x[1:])

In [8]:
input_data

Unnamed: 0_level_0,model,recency,training_sequence,target_sequence
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900000215889,"[131, 30, 107]","[50, 50, 48]","[131, 30]","[30, 107]"
0010o00002AGLLq,"[47, 121, 85]","[41, 40, 37]","[47, 121]","[121, 85]"
0010o00002AGU1E,"[116, 41, 98]","[40, 40, 4]","[116, 41]","[41, 98]"
0010o00002AGUKz,"[116, 41, 98]","[41, 41, 41]","[116, 41]","[41, 98]"
0010o00002AGYLD,"[41, 53, 116, 116]","[40, 18, 18, 10]","[41, 53, 116]","[53, 116, 116]"
...,...,...,...,...
0019000002ACnHZ,"[106, 99, 25]","[47, 41, 16]","[106, 99]","[99, 25]"
0019000002ACnby,"[12, 107, 131]","[41, 41, 36]","[12, 107]","[107, 131]"
0019000002ACsYC,"[41, 53, 116, 41, 41, 41, 41]","[41, 16, 11, 1, 1, 1, 1]","[41, 53, 116, 41, 41, 41]","[53, 116, 41, 41, 41, 41]"
0019000002B5m8N,"[41, 41, 97, 84, 54, 91, 54, 99, 3, 3]","[47, 46, 46, 36, 36, 32, 32, 28, 13, 11]","[41, 41, 97, 84, 54, 91, 54, 99, 3]","[41, 97, 84, 54, 91, 54, 99, 3, 3]"


In [74]:
train, test = train_test_split(input_data, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [75]:
# padding sequences to maximum length
maxlen = input_data['model'].apply(lambda x: len(x)).sort_values(ascending=False).iloc[0]

train_feat_dict = {'training_sequence':pad_sequences(train.training_sequence, maxlen=maxlen, padding='pre', value=0),
                    'recency':pad_sequences(train.recency, maxlen=maxlen, padding='pre', value=0)}
train_target_tensor = pad_sequences(train.target_sequence, maxlen=maxlen, padding='pre', value=0)


val_feat_dict = {'training_sequence':pad_sequences(val.training_sequence, maxlen=maxlen, padding='pre', value=0),
                    'recency':pad_sequences(val.recency, maxlen=maxlen, padding='pre', value=0)}
val_target_tensor = pad_sequences(val.target_sequence, maxlen=maxlen, padding='pre', value=0)


test_feat_dict = {'training_sequence':pad_sequences(test.training_sequence, maxlen=maxlen, padding='pre', value=0),
                    'recency':pad_sequences(test.recency, maxlen=maxlen, padding='pre', value=0)}
test_target_tensor = pad_sequences(test.target_sequence, maxlen=maxlen, padding='pre', value=0)

In [35]:
def create_tfdata(feat_dict, target_tensor, batch_size, buffer_size=None):
    """
    Create train tf dataset for model train input
    :param train_feat_dict: dict, containing the features tensors for train data
    :param train_target_tensor: np.array(), the training TARGET tensor
    :param batch_size: (int) size of the batch to work with
    :param buffer_size: (int) Optional. Default is None. Size of the buffer
    :return: (tuple) 1st element is the training dataset,
                     2nd is the number of steps per epoch (based on batch size)
    """
    if buffer_size is None:
        buffer_size = batch_size*50

    steps_per_epoch = len(target_tensor) // batch_size

    dataset = tf.data.Dataset.from_tensor_slices((feat_dict, target_tensor)).cache()
    dataset = dataset.shuffle(buffer_size).batch(batch_size)
    dataset = dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)
    
    return dataset, steps_per_epoch

In [37]:
train_dataset, train_steps_per_epoch = create_tfdata(train_feat_dict,
                                                         train_target_tensor,
                                                         batch_size=64)

val_dataset, val_steps_per_epoch = create_tfdata(val_feat_dict,
                                                         val_target_tensor,
                                                         batch_size=64)

test_dataset, test_steps_per_epoch = create_tfdata(test_feat_dict,
                                                         test_target_tensor,
                                                         batch_size=64)

In [38]:
max_model = max([max(i) for i in input_data['model']])+1
max_recency = max([max(i) for i in input_data['recency']])+1

In [122]:
def build_model(maxlen=maxlen, max_model=max_model, max_recency=max_recency):
    """
    Build a model given the hyper-parameters with item and nb_days input features
    :param hp: (kt.HyperParameters) hyper-parameters to use when building this model
    :return: built and compiled tensorflow model 
    """
    inputs = {}
    inputs['training_sequence'] = tf.keras.Input(batch_input_shape=[None, maxlen],
                                       name='training_sequence', dtype=tf.int32)
    # create encoding padding mask
    encoding_padding_mask = tf.math.logical_not(tf.math.equal(inputs['training_sequence'], 0))

    # nb_days bucketized
    inputs['recency'] = tf.keras.Input(batch_input_shape=[None, maxlen],
                                       name='recency', dtype=tf.int32)

    # Pass categorical input through embedding layer
    # with size equals to tokenizer vocabulary size
    # Remember that vocab_size is len of item tokenizer + 1
    # (for the padding '0' value)
    
    embedding_training_sequence = tf.keras.layers.Embedding(input_dim=max_model,
                                               output_dim=32,
                                               name='embedding_item'
                                              )(inputs['training_sequence'])
    # nbins=100, +1 for zero padding
    embedding_recency = tf.keras.layers.Embedding(input_dim=max_recency,
                                                  output_dim=32,
                                                  name='embedding_recency'
                                                 )(inputs['recency'])

    #  Concatenate embedding layers
    concat_embedding_input = tf.keras.layers.Concatenate(
     name='concat_embedding_input')([embedding_training_sequence, embedding_recency])

    concat_embedding_input = tf.keras.layers.BatchNormalization(
     name='batchnorm_inputs')(concat_embedding_input)
    
    # LSTM layer
    rnn = tf.keras.layers.LSTM(units=128,
                                   return_sequences=True,
                                   stateful=False,
                                   recurrent_initializer='glorot_normal',
                                   name='LSTM_cat'
                                   )(concat_embedding_input)

    rnn = tf.keras.layers.BatchNormalization(name='batchnorm_lstm')(rnn)

    # Self attention so key=value in inputs
    att = tf.keras.layers.Attention(use_scale=False, causal=True,
                                    name='attention')(inputs=[rnn, rnn],
                                                      mask=[encoding_padding_mask,
                                                            encoding_padding_mask])

    # Last layer is a fully connected one
    output = tf.keras.layers.Dense(max_model, name='output', activation='softmax')(att)

    model = tf.keras.Model(inputs, output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=loss_function,
        metrics=['sparse_categorical_accuracy'])
    
    return model

In [123]:
def loss_function(real, pred):
    """
    We redefine our own loss function in order to get rid of the '0' value
    which is the one used for padding. This to avoid that the model optimize itself
    by predicting this value because it is the padding one.
    
    :param real: the truth
    :param pred: predictions
    :return: a masked loss where '0' in real (due to padding)
                are not taken into account for the evaluation
    """

    # to check that pred is numric and not nan
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_object_ = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                 reduction='none')
    loss_ = loss_object_(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [124]:
model = build_model()

In [125]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
training_sequence (InputLayer)  [(None, 15)]         0                                            
__________________________________________________________________________________________________
recency (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_item (Embedding)      (None, 15, 32)       4480        training_sequence[0][0]          
__________________________________________________________________________________________________
embedding_recency (Embedding)   (None, 15, 32)       1984        recency[0][0]                    
____________________________________________________________________________________________

In [126]:
es = EarlyStopping(patience=50, restore_best_weights=True)
history = model.fit(train_dataset, validation_data=val_dataset, epochs=1000, steps_per_epoch=40, validation_steps=10, callbacks=es, verbose=1)

Epoch 1/1000


2022-04-03 15:05:24.279665: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2591 num_cores: 12 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 12582912 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2022-04-03 15:05:26.051155: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2591 num_cores: 12 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 12582912 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000


Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000


In [128]:
# index = 35

# test_feat_dict = {'training_sequence': train_feat_dict['training_sequence'][index:index+1],
#                 'recency': train_feat_dict['recency'][index:index+1]}
# test_dataset = tf.data.Dataset.from_tensor_slices((test_feat_dict)).cache()

# y_pred = model.predict(test_dataset)
# output = []
# for i in range(15):
#     maxElement = np.amax(y_pred[i][0])
#     result = np.where(y_pred[i][0] == np.amax(y_pred[i][0]))
#     output.append(result[0][0])

# print(f'predicted: {output}')
# print(f'actual   : {train_target_tensor[index:index+1][0]}')

In [130]:
index =40

y_pred = model.predict(test_dataset)

output = []
for i in range(15):
    maxElement = np.amax(y_pred[i][0])
    result = np.where(y_pred[i][0] == np.amax(y_pred[i][0]))
    output.append(result[0][0])

print(f'predicted: {output}')
print(f'actual   : {test_target_tensor[index:index+1][0]}')

predicted: [121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 75, 41]
actual   : [  0   0   0   0   0   0   0   0   0   0   0 130  97  79  49]
