In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [2]:
data = pd.read_excel("../raw_data/ILC & Lens MySony RAW (01042022).xlsx")

In [76]:
df = data[['Account: Account ID', 'Product Category (6D)', 'Model  Name', 'Purchase  Date', 'Purchase Month', 'Purchase Year']]

In [77]:
df['Purchase  Date'] = pd.to_datetime(df['Purchase  Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Purchase  Date'] = pd.to_datetime(df['Purchase  Date'])


In [78]:
df = df[df['Purchase  Date']>'2017-04-01']

In [79]:
df['Today'] = datetime.today()

In [80]:
df['Recency'] = round((df['Today'] - df['Purchase  Date']).dt.days/30,0)

In [81]:
df['Recency'] = df['Recency'].astype('int')

In [82]:
df.drop(columns=['Purchase  Date', 'Purchase Month', 'Purchase Year', 'Today'], inplace=True)

In [83]:
df_count = df.groupby('Account: Account ID').count()[['Model  Name']]

In [84]:
df_count = df_count[df_count['Model  Name']>2]
df_count = df_count[df_count['Model  Name']<=15]

In [85]:
customer_id = df_count.index

In [284]:
input_data = df[df['Account: Account ID'].isin(customer_id)]

In [286]:
input_data.columns

Index(['Account: Account ID', 'Product Category (6D)', 'Model  Name',
       'Recency'],
      dtype='object')

In [287]:
input_data = input_data.rename(columns={'Account: Account ID':'account_id', 'Model  Name':'model', 'Recency':'recency'})[['account_id','model', 'recency']]

In [288]:
encoder = LabelEncoder()
input_data['model'] = encoder.fit_transform(input_data['model'])

In [293]:
input_data = input_data.groupby('account_id').agg(list)

In [309]:
input_data['model'] = input_data['model'].apply(lambda x: np.array(x))
input_data['recency'] = input_data['recency'].apply(lambda x: np.array(x))

In [311]:
input_data['training_sequence'] = input_data['model'].apply(lambda x: x[:-1])
input_data['target_sequence'] = input_data['model'].apply(lambda x: x[1:])

In [312]:
input_data

Unnamed: 0_level_0,model,recency,training_sequence,target_sequence
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900000215889,"[130, 29, 106]","[50, 50, 48]","[130, 29]","[29, 106]"
0010o00002AGLLq,"[46, 120, 84]","[41, 40, 37]","[46, 120]","[120, 84]"
0010o00002AGU1E,"[115, 40, 97]","[40, 40, 4]","[115, 40]","[40, 97]"
0010o00002AGUKz,"[115, 40, 97]","[41, 41, 41]","[115, 40]","[40, 97]"
0010o00002AGYLD,"[40, 52, 115, 115]","[40, 18, 18, 10]","[40, 52, 115]","[52, 115, 115]"
...,...,...,...,...
0019000002ACnHZ,"[105, 98, 24]","[47, 41, 16]","[105, 98]","[98, 24]"
0019000002ACnby,"[11, 106, 130]","[41, 41, 36]","[11, 106]","[106, 130]"
0019000002ACsYC,"[40, 52, 115, 40, 40, 40, 40]","[41, 16, 11, 1, 1, 1, 1]","[40, 52, 115, 40, 40, 40]","[52, 115, 40, 40, 40, 40]"
0019000002B5m8N,"[40, 40, 96, 83, 53, 90, 53, 98, 2, 2]","[47, 46, 46, 36, 36, 32, 32, 28, 12, 11]","[40, 40, 96, 83, 53, 90, 53, 98, 2]","[40, 96, 83, 53, 90, 53, 98, 2, 2]"


In [313]:
maxlen = input_data['model'].apply(lambda x: len(x)).sort_values(ascending=False).iloc[0]

In [335]:
input_data['training_sequence'] = pad_sequences(input_data.training_sequence, maxlen=maxlen, padding='pre', value=0).tolist()
input_data['recency'] = pad_sequences(input_data.recency, maxlen=maxlen, padding='pre', value=0).tolist()
input_data['target_sequence'] = pad_sequences(input_data.target_sequence, maxlen=maxlen, padding='pre', value=0).tolist()

In [339]:
input_data['training_sequence'] = input_data['training_sequence'].apply(lambda x: np.array(x))
input_data['recency'] = input_data['recency'].apply(lambda x: np.array(x))
input_data['target_sequence'] = input_data['target_sequence'].apply(lambda x: np.array(x))

In [343]:
print(input_data['training_sequence'].iloc[0])
print(input_data['recency'].iloc[0])
print(input_data['target_sequence'].iloc[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0 130  29]
[ 0  0  0  0  0  0  0  0  0  0  0  0 50 50 48]
[  0   0   0   0   0   0   0   0   0   0   0   0   0  29 106]


In [348]:
train_feat_dict = {'training_sequence': input_data.training_sequence.values,
                     'recency': input_data.recency.values}
train_target_tensor = input_data['target_sequence'].values

In [369]:
train_feat_dict

{'training_sequence': array([array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        130,  29]),
        array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         46, 120]),
        array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        115,  40]),
        ...,
        array([  0,   0,   0,   0,   0,   0,   0,   0,   0,  40,  52, 115,  40,
         40,  40]),
        array([ 0,  0,  0,  0,  0,  0, 40, 40, 96, 83, 53, 90, 53, 98,  2]),
        array([  0,   0,   0,   0,  40,  97, 128,  84, 122, 120,  94, 107,  31,
         83,  90])], dtype=object),
 'recency': array([array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 50, 50, 48]),
        array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 41, 40, 37]),
        array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 40, 40,  4]),
        ...,
        array([ 0,  0,  0,  0,  0,  0,  0,  0, 41, 16, 11,  1,  1,  1,  1]),
        array([ 0,  0,  0,  0,  0, 47, 46

In [346]:
def create_train_tfdata(train_feat_dict, train_target_tensor,
                        batch_size, buffer_size=None):
    """
    Create train tf dataset for model train input
    :param train_feat_dict: dict, containing the features tensors for train data
    :param train_target_tensor: np.array(), the training TARGET tensor
    :param batch_size: (int) size of the batch to work with
    :param buffer_size: (int) Optional. Default is None. Size of the buffer
    :return: (tuple) 1st element is the training dataset,
                     2nd is the number of steps per epoch (based on batch size)
    """
    if buffer_size is None:
        buffer_size = batch_size*50

    train_steps_per_epoch = len(train_target_tensor) // batch_size

    train_dataset = tf.data.Dataset.from_tensor_slices((train_feat_dict,
                                                        train_target_tensor)).cache()
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
    train_dataset = train_dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)
    
    return train_dataset, train_steps_per_epoch

In [347]:
tf.data.Dataset.from_tensor_slices((train_feat_dict,train_target_tensor))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [240]:
tf.data.Dataset.from_tensor_slices((train_feat_dict, train_target_tensor))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [238]:
train_feat_dict['recency'].shape

(3127,)

In [236]:
train_target_tensor.shape

(3127,)

In [212]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_feat_dict,
                                                        train_target_tensor)).cache()

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).