In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Sequential, layers
import tensorflow as tf
import keras_tuner as kt


import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("../raw_data/data.xlsx")

In [13]:
# filtering down to essential features & transform datetime feature

df = data[['Account: Account ID', 'Product Category (6D)', 'Model  Name', 'Purchase  Date', 'Purchase Month', 'Purchase Year']]
df['Purchase  Date'] = pd.to_datetime(df['Purchase  Date'])
df = df[df['Purchase  Date']>'2017-04-01']
df['Today'] = datetime.today()
df['Recency'] = round((df['Today'] - df['Purchase  Date']).dt.days/30,0)
df['Recency'] = df['Recency'].astype('int')
df.drop(columns=['Purchase  Date', 'Purchase Month', 'Purchase Year', 'Today'], inplace=True)

In [14]:
# filtering down to customers with purchase history between 3~15
df_count = df.groupby('Account: Account ID').count()[['Model  Name']]
df_count = df_count[df_count['Model  Name']>=3]
df_count = df_count[df_count['Model  Name']<=15]
customer_id = df_count.index
input_data = df[df['Account: Account ID'].isin(customer_id)]
input_data = input_data.rename(columns={'Account: Account ID':'account_id', 'Model  Name':'model'})[['account_id','model']]

In [15]:
# LabelEncoder model into integers
encoder = LabelEncoder()
input_data['model'] = encoder.fit_transform(input_data['model'])
input_data['model'] = input_data['model'].apply(lambda x: x+1)

In [16]:
# groupby customer, aggregate model into arrays
input_data = input_data.groupby('account_id').agg(list)
input_data['model'] = input_data['model'].apply(lambda x: np.array(x))

In [246]:
# creating training_sequence & target_sequence
input_data['training_sequence'] = input_data['model'].apply(lambda x: x[:-1])
input_data['target_sequence'] = input_data['model'].apply(lambda x: x[-1])

In [247]:
input_data

Unnamed: 0_level_0,model,training_sequence,target_sequence
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900000215889,"[131, 30, 107]","[131, 30]",107
0010o00002AGLLq,"[47, 121, 85]","[47, 121]",85
0010o00002AGU1E,"[116, 41, 98]","[116, 41]",98
0010o00002AGUKz,"[116, 41, 98]","[116, 41]",98
0010o00002AGYLD,"[41, 53, 116, 116]","[41, 53, 116]",116
...,...,...,...
0019000002ACnHZ,"[106, 99, 25]","[106, 99]",25
0019000002ACnby,"[12, 107, 131]","[12, 107]",131
0019000002ACsYC,"[41, 53, 116, 41, 41, 41, 41]","[41, 53, 116, 41, 41, 41]",41
0019000002B5m8N,"[41, 41, 97, 84, 54, 91, 54, 99, 3, 3]","[41, 41, 97, 84, 54, 91, 54, 99, 3]",3


In [248]:
maxlen = input_data['model'].apply(lambda x: len(x)).sort_values(ascending=False).iloc[0]

In [252]:
X_data = pad_sequences(input_data['training_sequence'], maxlen=maxlen, padding='pre', value=0)
y_data = input_data['target_sequence']
# y_data = pad_sequences(input_data['target_sequence'], maxlen=maxlen, padding='pre', value=0)

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42)

In [255]:
def loss_function(real, pred):
    """
    We redefine our own loss function in order to get rid of the '0' value
    which is the one used for padding. This to avoid that the model optimize itself
    by predicting this value because it is the padding one.
    
    :param real: the truth
    :param pred: predictions
    :return: a masked loss where '0' in real (due to padding)
                are not taken into account for the evaluation
    """

    # to check that pred is numric and not nan
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_object_ = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                 reduction='none')
    loss_ = loss_object_(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [265]:
model = Sequential()
model.add(layers.Dense(64, input_shape=(15,) ,activation='relu'))
model.add(layers.LSTM(128))
model.add(layers.Normalization())
model.add(layers.Dense(15))

model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss='mse',
        metrics=['sparse_categorical_accuracy'])

ValueError: Input 0 of layer lstm_3 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 64)

In [266]:
es = EarlyStopping(patience=30, restore_best_weights=True)

In [258]:
history = model.fit(X_train, y_train,
                    epochs = 20,
                    batch_size = 32,
                   callbacks=[es],
                   verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [259]:
y_pred = model.predict(X_test)

In [261]:
y_pred.shape

(313, 15)