In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import random
%matplotlib inline

from keras.models import Model
from keras.layers import Input, GRU, LSTM, Dense, Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load every raw input table from disk

data_folder = "input_data/"
proc_data_folder = "processed_data/"


# Small lookup tables
aisles, departments, products = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ("aisles.csv", "departments.csv", "products.csv")
)


# Large transactional tables. They are read in full here; the per-user
# sub-sampling happens in a later cell.
order_prod_prior, order_prod_train, orders = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in (
        "order_products__prior.csv",
        "order_products__train.csv",
        "orders.csv",
    )
)


In [3]:
# Preview the first rows of the orders table.
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
# Preview the first rows of the order_prod_prior table.
order_prod_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


#### Build a small sample of users and their order histories to try the model

In [22]:
# Draw a reproducible random sample of users to prototype the model on.
random.seed(2018)

samples = 1200
all_user_ids = set(orders["user_id"].unique())
# random.sample() needs a sequence: passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11. Sorting the ids first also makes
# the population order explicit, so the seeded draw does not depend on set
# iteration order.
sampled_user_ids = sorted(random.sample(sorted(all_user_ids), samples))
print(sampled_user_ids[:20])

[295, 461, 791, 1026, 1318, 1362, 1404, 1568, 2020, 2206, 2985, 3214, 3283, 3322, 3402, 3513, 3790, 3809, 3930, 4053]


In [23]:
# Store product ids as strings in both product-level tables
for line_items in (order_prod_prior, order_prod_train):
    line_items["product_id"] = line_items["product_id"].astype(str)

# Restrict the orders to the sampled users, then attach them to the product
# rows (merge defaults to an inner join, keyed on order_id)
order_samp = orders.loc[orders["user_id"].isin(sampled_user_ids)]
train_subset = pd.merge(order_prod_prior, order_samp, on="order_id")
target_subset = pd.merge(order_prod_train, order_samp, on="order_id")

In [24]:
# Row order matters for the sequences built later: order both frames by user,
# then by order number, then by position in the cart.
train_subset, target_subset = (
    subset.sort_values(["user_id", "order_number", "add_to_cart_order"])
    for subset in (train_subset, target_subset)
)

In [25]:
# Find the sampled users that have no row in target_subset and build one
# placeholder row per user, so that the target frame can be extended to cover
# the same users as the training frame.
# NOTE(review): these were described as "users who did not reorder"; the code
# only establishes that they have no row in target_subset - confirm that this
# is the intended meaning.
users_with_target = set(target_subset["user_id"].unique())  # built once for O(1) lookups
no_reorder_users = [
    uid for uid in sampled_user_ids if uid not in users_with_target]
target_none_rows = pd.DataFrame(no_reorder_users, columns=["user_id"])
target_none_rows.head()

Unnamed: 0,user_id
0,461
1,791
2,1404
3,3790
4,3930


In [26]:
# Add the placeholder rows so that every sampled user appears in the target
# frame. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
target_subset = pd.concat([target_subset, target_none_rows], ignore_index=True)
# Placeholder rows only carry a user_id: zero-fill every other column, then
# relabel their product with the explicit "None" token.
target_subset = target_subset.fillna(0)
target_subset.loc[target_subset["product_id"]==0,"product_id"]="None"
# De-duplicate after filling so that re-running this cell does not leave a
# second copy of the placeholder rows behind.
target_subset = target_subset.drop_duplicates()

target_subset = target_subset.sort_values(["user_id","order_number","add_to_cart_order"])
target_subset.head()

Unnamed: 0,add_to_cart_order,days_since_prior_order,eval_set,order_dow,order_hour_of_day,order_id,order_number,product_id,reordered,user_id
0,1.0,8.0,train,2.0,14.0,405264.0,24.0,3957,1.0,295
1,2.0,8.0,train,2.0,14.0,405264.0,24.0,34270,1.0,295
2,3.0,8.0,train,2.0,14.0,405264.0,24.0,9623,1.0,295
3,4.0,8.0,train,2.0,14.0,405264.0,24.0,8174,1.0,295
4,5.0,8.0,train,2.0,14.0,405264.0,24.0,5212,1.0,295


In [27]:
# Sanity check: (rows, columns) of the training and target frames.
print(train_subset.shape, target_subset.shape)

(194768, 10) (8631, 10)


In [28]:
# Number of distinct product ids that occur in the training rows
vocab_size = train_subset["product_id"].nunique(dropna=False)
vocab_size

16790

In [29]:
# Inspect the ordered purchase rows of the first sampled user
(train_subset
 .sort_values(["user_id","order_number","add_to_cart_order"])
 .loc[lambda frame: frame["user_id"] == sampled_user_ids[0]])

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
183809,3214808,46802,1,0,295,prior,1,4,12,
183810,3214808,39275,2,0,295,prior,1,4,12,
183811,3214808,32578,3,0,295,prior,1,4,12,
183812,3214808,15804,4,0,295,prior,1,4,12,
88967,1558179,13292,1,0,295,prior,2,4,16,7.0
88968,1558179,20084,2,0,295,prior,2,4,16,7.0
88969,1558179,9862,3,0,295,prior,2,4,16,7.0
88970,1558179,32578,4,1,295,prior,2,4,16,7.0
88971,1558179,34270,5,0,295,prior,2,4,16,7.0
88972,1558179,3957,6,0,295,prior,2,4,16,7.0


In [30]:
# Turn each frame into one product-id sequence per user (all other features
# are ignored for now).
# The same could later be done per feature to build a matrix for each user.

def _product_sequences(frame):
    """Return an array holding, for each user_id in `frame`, the list of its product ids."""
    ordered = frame.sort_values(["user_id","order_number","add_to_cart_order"])
    per_user = ordered.groupby("user_id")["product_id"].apply(list)
    return np.array(per_user)


train_seqs = _product_sequences(train_subset)
target_seqs = _product_sequences(target_subset)

print(train_seqs[0], target_seqs[0])

['46802', '39275', '32578', '15804', '13292', '20084', '9862', '32578', '34270', '3957', '39275', '21137', '46802', '19057', '8174', '19895', '30678', '12572', '31766', '18980', '19244', '29836', '10121', '13292', '20084', '34270', '34270', '3957', '16797', '18980', '3756', '2830', '13292', '22281', '23719', '34270', '3957', '13292', '18980', '20084', '19057', '27966', '8174', '5438', '16797', '34270', '3957', '9862', '18048', '34270', '32578', '27966', '9623', '4493', '10', '5438', '34270', '13292', '3957', '32578', '18980', '23986', '21616', '21903', '40706', '47209', '30391', '12572', '9623', '25466', '5438', '22281', '43218', '34270', '13292', '3957', '32578', '18980', '9862', '20084', '22281', '39275', '9623', '8053', '12572', '16797', '47209', '23986', '30391', '43218', '29836', '19895', '8174', '5212', '8277', '23719', '23986', '8053', '5212', '21137', '30489', '13292', '34270', '3957', '20084', '9623', '23986', '32578', '9862', '12572', '8174', '10749', '5438', '30489', '21903'

In [34]:
# Vocabularies and sequence statistics for the encoder (inputs) and the
# decoder (targets).
input_texts, target_texts = train_seqs, target_seqs

input_characters = sorted(train_subset["product_id"].unique())
target_characters = sorted(target_subset["product_id"].unique())

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(seq) for seq in input_texts)
max_decoder_seq_length = max(len(seq) for seq in target_texts)

# Unused experiment, kept for reference:
# def get_one_hot(targets, nb_classes):
#     res = np.eye(nb_classes)[np.array(targets).reshape(-1)]
#     return res.reshape(list(targets.shape)+[nb_classes])

# oh_target_characters = get_one_hot([int(item) for item in target_characters],num_decoder_tokens)

samples = len(input_texts)
for label, value in (
        ('Number of samples:', samples),
        ('Number of unique input tokens:', num_encoder_tokens),
        ('Number of unique output tokens:', num_decoder_tokens),
        ('Max sequence length for inputs:', max_encoder_seq_length),
        ('Max sequence length for outputs:', max_decoder_seq_length)):
    print(label, value)

Number of samples: 1200
Number of unique input tokens: 16790
Number of unique output tokens: 4085
Max sequence length for inputs: 1968
Max sequence length for outputs: 49


### Try a seq2seq
  
Source: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [35]:
# Token -> integer index lookups for the encoder and decoder vocabularies.
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

# Dense one-hot tensors indexed as (sample, timestep, token). Timesteps past
# the end of a sequence stay all-zero and act as padding.
# NOTE: the encoder tensor holds
# len(input_texts) * max_encoder_seq_length * num_encoder_tokens float32
# values, so its size grows quickly with the sample and the vocabulary.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')


for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        token = target_token_index[char]
        # Teacher forcing: at step t the decoder has to predict product t ...
        decoder_target_data[i, t, token] = 1.
        # ... and product t is fed back in as the decoder input of step t + 1.
        # The target sequences carry no start symbol, so step 0 keeps its
        # all-zero input row as an implicit start token; this is the same
        # vector decode_sequence() feeds the decoder on its first step.
        # (Previously the input held product t and the target product t + 1,
        # so the first product was never a training target.)
        if t + 1 < max_decoder_seq_length:
            decoder_input_data[i, t + 1, token] = 1.



In [36]:
encoder_input_data.shape

(1200, 1968, 16790)

### Play with these parameters

In [37]:
# Training hyperparameters used by the model definition and the fit call.
batch_size = 64  # Batch size for training.
epochs = 12  # Number of epochs to train for.
latent_dim = 200  # Latent dimensionality of the encoding space (original 256)
# num_samples = 10  # Number of samples to train on.

#### GRU Model

In [19]:
# encoder_inputs = Input(shape=(None, num_encoder_tokens))
# encoder = GRU(latent_dim, return_state=True)
# encoder_outputs, state_h = encoder(encoder_inputs)

# decoder_inputs = Input(shape=(None, num_decoder_tokens))
# decoder_gru = GRU(latent_dim, return_sequences=True)
# decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
# decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)
# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#### LSTM Model

In [38]:
# Training model: an LSTM encoder whose final states initialise an LSTM
# decoder, followed by a softmax layer over the decoder vocabulary.

# Define an input sequence and process it.
# (None leaves the number of timesteps unspecified; each timestep is a vector
# of num_encoder_tokens values.)
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# The dense layer maps every decoder timestep to a softmax distribution over
# the num_decoder_tokens target tokens.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Compile and train the model, holding out the last 10% of samples for validation
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)
# Persist the trained model; the file name records sample count and epochs
model.save('s2s_{}_samp_{}_eps.h5'.format(samples, epochs))

Train on 1080 samples, validate on 120 samples
Epoch 1/12

In [None]:
# Inference (sampling) setup. Decoding works as follows:
#   1) encode the input and take the resulting states as the initial
#      decoder state;
#   2) run the decoder for one step from that state; its output is the
#      next target token;
#   3) repeat with the token just produced and the updated states.

# Encoder used on its own: input sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# The stand-alone decoder receives its LSTM states through explicit inputs
# and returns the updated states next to its predictions, so that it can be
# advanced one step at a time.
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states])

# Index -> token lookups, used to turn predicted indices back into
# product ids.
reverse_input_char_index = {
    idx: token for token, idx in input_token_index.items()}
reverse_target_char_index = {
    idx: token for token, idx in target_token_index.items()}

In [None]:
def decode_sequence(input_seq, max_length=None):
    """Greedily decode a sequence of target tokens for one encoded input.

    Args:
        input_seq: Encoder input, passed unchanged to
            ``encoder_model.predict``. The loop assumes a batch of size 1.
        max_length: Number of tokens to generate. Defaults to
            ``max_decoder_seq_length``.

    Returns:
        list: The decoded tokens (values of ``reverse_target_char_index``),
        one per decoding step, so at most ``max_length`` of them.
    """
    if max_length is None:
        max_length = max_decoder_seq_length

    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq))

    # No start character is set here, so the first decoder input is an
    # all-zero vector of length num_decoder_tokens.
    target_seq = np.zeros((1, 1, num_decoder_tokens))

    decoded_sentence = []
    # No stop character is checked for either: decoding runs for exactly
    # max_length steps (the previous `>` test produced one token too many).
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: take the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_sentence.append(reverse_target_char_index[sampled_token_index])

        # Feed the chosen token back in as the next one-step decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder states over to the next step.
        states_value = [h, c]

    return decoded_sentence


For each user, we take from the model as many predicted products as there are products in that user's actual order in the training set.
We then join these predictions to the set of product ids that were in the actual order to compare the labels.
There are 4 possibilities:

1. True reorder - predicted: correctly identified reorder
2. True reorder - null: missed reorder
3. True first-time product in the validation set - predicted: correctly identified first-time order
4. True first-time product in the validation set - null: missed first-time order

In [None]:
# Decode predictions for the first few sampled users and collect one row per
# predicted product.
n_pred_users = 10  # number of users (taken in sampled_user_ids order) to decode

pred_rows = []
for seq_index in range(n_pred_users):

    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)

    u_id = sampled_user_ids[seq_index]
    # Keep as many predictions as this user has rows in the target frame.
    target_order_leng = int((target_subset.user_id == u_id).sum())
    # Iterating over the slice (instead of repeating the user id a fixed
    # number of times) keeps the columns the same length even if fewer tokens
    # were decoded than requested.
    for predicted_product in decoded_sentence[:target_order_leng]:
        pred_rows.append({"user_id": u_id,
                          "pred_reordered": 1,
                          "product_id": predicted_product})

# Build the frame once from all collected rows: DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop re-copies it every time.
pred_output = pd.DataFrame(
    pred_rows, columns=["user_id", "pred_reordered", "product_id"])

In [None]:
# Join the predictions back onto the actual target rows and label every row.
# The decoder can emit the same product more than once for a user; such
# duplicates would multiply the matching target rows in the join, so they are
# removed first.
unique_preds = pred_output.drop_duplicates(["user_id", "product_id"])
# Only users that were actually decoded are evaluated: keeping the others
# would count every one of their products as a missed prediction.
evaluated_targets = target_subset[
    target_subset["user_id"].isin(unique_preds["user_id"])]
train_results = evaluated_targets.merge(
    unique_preds, on=["user_id", "product_id"], how="left")
# Drop the "None" placeholder rows; .copy() makes the result an independent
# frame so the column assignment below does not act on a slice.
train_results = train_results[train_results.product_id!="None"].copy()
# Target products without a matching prediction get the label 0.
train_results["pred_reordered"] = train_results["pred_reordered"].fillna(0.0)
train_results[["user_id","product_id","reordered","pred_reordered"]].head()

In [None]:
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

# Score the predicted labels against the actual reorder flags
y_true, y_pred = train_results["reordered"], train_results["pred_reordered"]

cnf_matrix = confusion_matrix(y_true, y_pred)
print(cnf_matrix)
for metric_label, metric_fn in (("accuracy score:", accuracy_score),
                                ("f1 score:", f1_score)):
    print(metric_label, str(metric_fn(y_true, y_pred)))

In [None]:
# Count the non-null values of every column for each value of `reordered`.
train_results.groupby(["reordered"]).count()

In [None]:
# Attach product names to the predicted and to the actual products and
# collect them into one list of names per user.

def _with_product_names(frame):
    """Return a copy of `frame` joined to `products` on an integer product_id.

    The input frame is left untouched, so the cells that merge on the string
    ids can still be re-run afterwards.
    """
    # "None" is the placeholder token for "no product": it cannot be cast to
    # int, so rows carrying it are dropped before the conversion.
    real_rows = frame[frame["product_id"] != "None"]
    int_ids = real_rows.assign(product_id=real_rows["product_id"].astype("int"))
    return int_ids.merge(products, on=["product_id"])


def _names_per_user(named_frame):
    """Return one row per user_id holding the list of its product names."""
    return named_frame.groupby("user_id")["product_name"].apply(list).reset_index()


pred_output_products = _with_product_names(pred_output)
pred_output_products_list = _names_per_user(pred_output_products)

train_results_products = _with_product_names(train_results)
train_results_products_list = _names_per_user(train_results_products)

pred_output_products_list

In [None]:
# Show predicted vs. actual product names for the first user in the predictions
user_id = pred_output_products_list["user_id"].iloc[0]
predicted_names = pred_output_products_list["product_name"].iloc[0]
actual_names = (
    train_results_products_list
    .loc[train_results_products_list.user_id==user_id, "product_name"]
    .iloc[0]
)

print("user id:",user_id)
print("predicted:")
print(predicted_names)
print("actual:")
print(actual_names)

## TODO:
1) One-hot encode the outputs  
2) Embed the inputs and add the one-hot outputs (maybe in a clone of this notebook)  
3) How to run it on a bigger sample? (Jupyter on AWS?)  
