In [35]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import random
%matplotlib inline

from keras.models import Model
from keras.layers import Input, GRU, LSTM, Dense, Embedding

In [36]:
# Read all the files

data_folder = "input_data/"
proc_data_folder = "processed_data/"


aisles = pd.read_csv(os.path.join(data_folder, "aisles.csv"))
departments = pd.read_csv(os.path.join(data_folder, "departments.csv"))
products = pd.read_csv(os.path.join(data_folder, "products.csv"))


# Sample from the big files for now
order_prod_prior = pd.read_csv(os.path.join(data_folder, "order_products__prior.csv"))
order_prod_train = pd.read_csv(os.path.join(data_folder, "order_products__train.csv"))
orders = pd.read_csv(os.path.join(data_folder, "orders.csv"))


In [37]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [38]:
order_prod_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


#### Build a small sample of users and their order histories to try the model

In [112]:
random.seed(2018)

samples = 200
all_user_ids = set(orders["user_id"].unique())
sampled_user_ids = sorted(random.sample(all_user_ids,samples))
print(sampled_user_ids)

[3322, 3790, 4053, 4564, 4654, 5963, 7234, 7259, 8295, 8935, 9162, 10727, 10769, 10844, 10921, 11505, 13843, 16639, 16788, 17325, 17632, 20481, 20704, 23062, 26145, 27510, 30828, 31094, 31130, 31572, 33248, 33403, 35988, 37364, 37601, 37798, 38818, 41457, 41958, 42015, 43165, 43911, 46169, 46601, 49041, 49225, 49924, 50516, 50616, 50762, 51695, 52963, 54099, 54220, 57300, 57455, 57768, 58228, 59338, 59390, 61704, 61775, 61828, 62238, 62530, 62787, 63495, 65472, 65959, 66656, 66791, 68112, 68773, 69111, 72948, 73523, 74835, 75876, 76491, 76887, 77736, 78134, 78226, 78838, 79323, 80469, 83661, 85125, 85333, 85362, 85603, 86324, 87982, 88564, 88890, 89312, 90031, 90444, 91064, 92250, 93128, 93378, 94748, 95244, 95956, 97069, 98777, 100770, 101095, 101230, 102259, 104853, 104887, 105844, 106390, 106589, 107531, 111910, 112464, 112718, 113864, 113886, 114396, 117530, 118545, 119239, 119248, 120295, 120588, 120651, 123347, 123680, 124149, 124723, 127616, 127644, 131577, 132960, 133782, 13498

In [113]:
# Convert product ids to strings
order_prod_prior["product_id"] = order_prod_prior["product_id"].astype("str")
order_prod_train["product_id"] = order_prod_train["product_id"].astype("str")

# Sample n users' orders, inner join on order_id with order_prod_prior
order_samp = orders[orders["user_id"].isin(sampled_user_ids)]
train_subset = order_prod_prior.merge(order_samp, on="order_id")
target_subset = order_prod_train.merge(order_samp, on="order_id")

In [114]:
# It is important to sort on user ids, then on order ids and add-to-cart sequences!
train_subset = train_subset.sort_values(
    ["user_id", "order_number", "add_to_cart_order"])
target_subset = target_subset.sort_values(
    ["user_id", "order_number", "add_to_cart_order"])

In [115]:
# Identify users who did not reorder and append rows with no product into the target subset before
# reordering it so that it matches the train in terms of the users
no_reorder_users = [
    item for item in sampled_user_ids if item not in target_subset.user_id.unique()]
target_none_rows = pd.DataFrame(no_reorder_users, columns=["user_id"])
target_none_rows.head()

Unnamed: 0,user_id
0,3790
1,4053
2,4564
3,5963
4,7234


In [116]:
target_subset = target_subset.append(target_none_rows, ignore_index=True).drop_duplicates()
target_subset = target_subset.fillna(0)
target_subset.loc[target_subset["product_id"]==0,"product_id"]="None"

target_subset = target_subset.sort_values(["user_id","order_number","add_to_cart_order"])
target_subset.head()

Unnamed: 0,add_to_cart_order,days_since_prior_order,eval_set,order_dow,order_hour_of_day,order_id,order_number,product_id,reordered,user_id
0,1.0,29.0,train,5.0,18.0,2502397.0,9.0,21783,1.0,3322
1,2.0,29.0,train,5.0,18.0,2502397.0,9.0,33575,1.0,3322
2,3.0,29.0,train,5.0,18.0,2502397.0,9.0,38277,1.0,3322
3,4.0,29.0,train,5.0,18.0,2502397.0,9.0,26915,1.0,3322
4,5.0,29.0,train,5.0,18.0,2502397.0,9.0,16101,1.0,3322


In [117]:
print(train_subset.shape, target_subset.shape)

(35164, 10) (1539, 10)


In [118]:
vocab_size = len(train_subset.product_id.unique())
vocab_size

6439

In [119]:
# Generate sequences of products (discard other features for now)
(train_subset.sort_values(["user_id","order_number","add_to_cart_order"])
 .loc[train_subset["user_id"]==sampled_user_ids[0]])

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26104,2567960,41593,1,0,3322,prior,1,0,16,
26105,2567960,33575,2,0,3322,prior,1,0,16,
26106,2567960,26915,3,0,3322,prior,1,0,16,
26107,2567960,10768,4,0,3322,prior,1,0,16,
26108,2567960,38277,5,0,3322,prior,1,0,16,
26109,2567960,21783,6,0,3322,prior,1,0,16,
26110,2567960,14036,7,0,3322,prior,1,0,16,
26111,2567960,6748,8,0,3322,prior,1,0,16,
26112,2567960,40180,9,0,3322,prior,1,0,16,
26113,2567960,33731,10,0,3322,prior,1,0,16,


In [120]:
# Generate sequences of products (discard other features for now)
# Can do this for each feature and build a matrix for each user (next iteration?)

train_seqs = np.array(train_subset
             .sort_values(["user_id","order_number","add_to_cart_order"])
             .groupby("user_id")["product_id"]
             .apply(list)
             )

target_seqs = np.array(target_subset
             .sort_values(["user_id","order_number","add_to_cart_order"])
             .groupby("user_id")["product_id"]
             .apply(list)
             )

print(train_seqs[0], target_seqs[0])

['41593', '33575', '26915', '10768', '38277', '21783', '14036', '6748', '40180', '33731', '11777', '43772', '7916', '33575', '26915', '10768', '21783', '14036', '40180', '14036', '41593', '26915', '40180', '33575', '21783', '10768', '33731', '38277', '2086', '16101', '26915', '33575', '10768', '14036', '40180', '41593', '16101', '11777', '5537', '34969', '17795', '33575', '26915', '10768', '14036', '40180', '41593', '16101', '38277', '17795', '5537', '38662', '29406', '7916', '6748', '7916', '38662', '16101', '29406', '40180', '5537', '41593', '14036', '33575', '10768', '26915', '17795', '11777', '34969', '2086', '16282', '40180', '5537', '38662', '16282', '29406', '16101', '6748', '33575', '41593', '26915', '14036', '10768', '44471', '22281', '9477', '46175', '14036', '41593', '33575', '26915', '16101', '16282', '5537', '38277', '43772', '574'] ['21783', '33575', '38277', '26915', '16101', '41593', '16282', '43772', '11777', '17795']


In [121]:
input_characters = train_subset.product_id.unique()
target_characters = target_subset.product_id.unique()
input_texts = train_seqs
target_texts = target_seqs

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

samples = len(input_texts)
print('Number of samples:', samples)
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 200
Number of unique input tokens: 6439
Number of unique output tokens: 1100
Max sequence length for inputs: 1218
Max sequence length for outputs: 43


### Try a seq2seq
  
Source: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [122]:
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

# Setting up the matrices for inputs/outputs
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')


for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.



In [123]:
encoder_input_data.shape

(200, 1218, 6439)

### Play with these parameters

In [124]:
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 200  # Latent dimensionality of the encoding space (original 256)
# num_samples = 10  # Number of samples to train on.

In [125]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [126]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('s2s_'+str(samples)+'_samp_'+str(epochs)+'_eps.h5')

Train on 160 samples, validate on 40 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  '. They will not be included '


In [127]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [128]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
#     target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence.append(sampled_char) 

        # Exit condition: either hit max length
        # or find stop character.
        if (len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


From the model, we generate for each visitor the number of predicted reordered products that corresponds to the size of their order in the training set.
We then join it to the set of product ids that were in the actual order to compare the labels. 
There are 4 possibilities:
1) True reorder - predicted order: correctly identified reorder
2) True reorder - null: missed reorder 
3) True first time product in the validation set  - predicted order: correctly identified first-time order
4) True first time product in the validation set - null: missed first-time order

In [129]:
pred_output = pd.DataFrame()

for seq_index in range(10):

    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)

    u_id = sampled_user_ids[seq_index]
    # print('Input sentence:', input_texts[seq_index])
    # print('Decoded sentence:', decoded_sentence)
    target_order_leng = target_subset[target_subset.user_id==u_id]["user_id"].shape[0]
    tmp = pd.DataFrame({"user_id":np.repeat(u_id,target_order_leng),
                        "pred_reordered":np.repeat(1,target_order_leng),
                 "product_id":decoded_sentence[:target_order_leng]})
    
    pred_output = pred_output.append(tmp, ignore_index=True)

In [130]:
# Join back to the training data, drop the non-reordering users, fill nans
train_results = target_subset.merge(pred_output,on=["user_id","product_id"], how="left")
train_results = train_results[train_results.product_id!="None"]
train_results["pred_reordered"] = train_results["pred_reordered"].fillna(0.0)
train_results[["user_id","product_id","reordered","pred_reordered"]].head()

Unnamed: 0,user_id,product_id,reordered,pred_reordered
0,3322,21783,1.0,0.0
1,3322,33575,1.0,0.0
2,3322,38277,1.0,0.0
3,3322,26915,1.0,0.0
4,3322,16101,1.0,0.0


In [131]:
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

y_true = train_results["reordered"]
y_pred = train_results["pred_reordered"]

cnf_matrix = confusion_matrix(y_true, y_pred)
print(cnf_matrix)
print("accuracy score:",str(accuracy_score(y_true, y_pred)))
print("f1 score:",str(f1_score(y_true, y_pred)))

[[541   0]
 [927   6]]
accuracy score: 0.3710990502035278
f1 score: 0.012779552715654953


In [132]:
train_results.groupby(["reordered"]).count()

Unnamed: 0_level_0,add_to_cart_order,days_since_prior_order,eval_set,order_dow,order_hour_of_day,order_id,order_number,product_id,user_id,pred_reordered
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,541,541,541,541,541,541,541,541,541,541
1.0,933,933,933,933,933,933,933,933,933,933


In [133]:
products.head()
pred_output.head()
pred_output["product_id"]=pred_output["product_id"].astype("int")
pred_output_products = pred_output.merge(products,on=["product_id"])
pred_output_products_list = pred_output_products.groupby("user_id")["product_name"].apply(list).reset_index()

train_results["product_id"]=train_results["product_id"].astype("int")
train_results_products = train_results.merge(products,on=["product_id"])
train_results_products_list = train_results_products.groupby("user_id")["product_name"].apply(list).reset_index()

pred_output_products_list

Unnamed: 0,user_id,product_name
0,3322,"[Banana, Banana, Banana, Banana, Banana, Banan..."
1,3790,[Banana]
2,4053,[Banana]
3,4564,[Banana]
4,4654,"[Banana, Banana, Banana, Banana, Banana, Banan..."
5,5963,[Banana]
6,7234,[Banana]
7,7259,"[Banana, Banana, Banana, Banana, Banana, Banan..."
8,8295,[Banana]
9,8935,"[Banana, Banana, Banana, Banana, Banana]"


In [134]:
user_id = pred_output_products_list.iloc[0,0]
print("user id:",user_id)
print("predicted:")
print(pred_output_products_list.iloc[0,1])
print("actual:")
print(list(train_results_products_list.loc[train_results_products_list.user_id==user_id,"product_name"])[0])

user id: 3322
predicted:
['Banana', 'Banana', 'Banana', 'Banana', 'Banana', 'Banana', 'Banana', 'Organic Strawberries', 'Organic Strawberries', 'Organic Baby Spinach']
actual:
['Coconut Milk Vanilla Mini Sandwiches', 'Dairy Free French Vanilla Coconut Milk Creamer', 'Dairy Free Vanilla Coconut Milk', 'Organic Lowfat Yogurt Banana Vanilla', 'Plain Brownies', 'Blue Cheese Crumbles', 'Chocolate Chip Cookie DO Bites', 'Cherubs Heavenly Salad Tomatoes', 'Red Raspberries', 'Organic Green Leaf Lettuce']


## TODO:
1) One-hot encode the outputs  
2) embedding of the inputs + add one-hot outputs (maybe make a clone of this notebook)  
3) how to run it on a bigger sample?(jupyter on AWS?)  
