# Master's thesis code

Code for the Master's thesis in <a href="https://www.tilburguniversity.edu/education/masters-programmes/data-science-and-society" target="_blank">Data Science & Society</a> at Tilburg University (The Netherlands), inspired by the paper <a href="https://arxiv.org/abs/1907.00400" target="_blank">"Prediction is very hard, especially about conversion. Predicting user purchases from clickstream data in fashion e-commerce" (Bigon et al., 2019)</a>, and only possible thanks to it. Work supervised by Dr. Giovanni Cassani.

Info about the thesis will be available when it is publicly accessible through the university archive.


### Copyright and License

&copy; 2020 Guillermo Villar. All rights reserved.

(Copyright 2020 Guillermo Villar. All rights reserved)

### Importing the libraries

In [None]:
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(2)

import csv
import string
import itertools
import numpy as np
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from collections import Counter

from tensorflow import keras
from tensorflow.keras import layers, regularizers, optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, GlobalMaxPooling1D

## First steps for refining the data

### Separate datasets

In [None]:
# Hypothetical file names: "ev.csv" stands for the table of events and
# "dwe.csv" for the table of dwell times.
session_cl = pd.read_csv("ev.csv")
session_dw = pd.read_csv("dwe.csv")

In [None]:
# Sanity check: print every cell of the dwell-time table that is not a float
# (nothing is printed when all values are floats).
for row in np.asarray(session_dw):
    for cell in row:
        if not isinstance(cell, float):
            print(cell)

### Merged dataset

In [None]:
# Both tables are about to be merged, so the label column is only needed once:
# drop the events' copy and keep "label_dw".
del session_cl["label_cl"]

# place the two tables side by side
session_merged = pd.concat([session_cl, session_dw], axis=1)
# every column name except the label becomes an integer so it can be sorted numerically
session_merged = session_merged.rename(
    columns={name: int(name) for name in session_merged.columns if name != "label_dw"}
)
session_merged.head(5)

In [None]:
# The numeric column names are ordered so that the sequence reads
# "event - dwell time - event ..."; the label stays as the last column.
sorted_session_col = list(session_merged.columns[:-1])
sorted_session_col.sort(key=int)
sorted_session_col = sorted_session_col + ["label_dw"]
print(sorted_session_col)

# rearrange the columns following that order
session_merged = session_merged.reindex(columns=sorted_session_col)
session_merged.head(5)

In [None]:
# split the sessions by label: 1 = buy, 0 = no buy
label_column = session_merged["label_dw"]
buy_df = session_merged.loc[label_column == 1]
nobuy_df = session_merged.loc[label_column == 0]
nobuy_df.head(5)

## Final steps for refining the data

*These steps have been omitted since they depend on the specifics of the datasets. Only the step that combines both datasets is shown (following piece of code).*

In [None]:
"""Combining datasets"""
# Outer merge of the buy sessions with `nobuy_down`.
# NOTE(review): `nobuy_down` is created in the omitted steps - presumably the
# down-sampled no-buy sessions; confirm against the full notebook.
final_df = pd.merge(buy_df, nobuy_down, how='outer')

# checking the merged dataset corresponds to twice the rows and the same columns
# (both lines print True when the check holds)
print(final_df.shape[0] == nobuy_down.shape[0] * 2)
print(final_df.shape[1] == nobuy_down.shape[1])

In [None]:
# visual inspection: columns at positions 30-39 of the rows labelled 0 (no buy)
final_df[final_df["label_dw"] == 0].iloc[:,30:40]

In [None]:
# The quantiles must be computed on the training set only, not on the whole
# dataset, so the data is split before any binning happens.
x, y = final_df.iloc[:, :-1], final_df.iloc[:, -1]
# 70 % goes to training; the remaining 30 % is kept aside for validation and test
x_train, x_testval, y_train, y_testval = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=333
)
x_train.head(5)

In [None]:
# training-set columns labelled 1, 3, 5, ... (the ones later used as dwell times)
x_train[range(1, x_train.shape[1], 2)]

In [None]:
# Flat list with every dwell time of the training set; the customised
# quantiles are computed from it.
dwell_columns = range(1, x_train.shape[1], 2)
nested_dw = list(x_train[dwell_columns].to_numpy())
# -1000000 marks the cells after which no more information is provided
list_dw = []
for session_dw_row in nested_dw:
    list_dw.extend(cell for cell in session_dw_row if cell != -1000000)
print(list_dw[:30])

## Lists of dwelltimes for each sort of sequences to compute quantiles

### Computing and saving dataframe with quantiles

In [None]:
# For every number of bins k (1 to 4):
#   1. quantile edges are computed with Series.quantile on the TRAINING dwell
#      times (list_dw),
#   2. every dwell-time column of the full dataset is binned with pd.cut using
#      those edges,
#   3. the six dataset variants used by the experiments are saved as .npz files.
# The labels are written to y.csv once the loop has finished.

# starting from 1 to compute the 'division' variable
for k in range(1,5):
    # quantile levels 0, 1/k, ..., (k-1)/k; np.arange can return one element
    # too many because of float rounding, hence the length guard
    division = list(np.arange(0,1,1/k))
    if len(division) > k:
        division = division[:-1]
    # dwell times that sit at each quantile level
    part = list(pd.Series(list_dw).quantile(division))
    # artificial outer edges for unexpected values: a lower edge of 0 becomes
    # -1 and 9999 is appended as the upper edge
    part.append(9999)
    partition = [edge if edge != 0 else -1 for edge in part]
    print(partition)

    # FIX: take an explicit copy. The plain slice `final_df.iloc[:,:-1]` may
    # share memory with final_df, so writing the bins into it could overwrite
    # the raw dwell times that the next k (and later cells) still need.
    x_k = final_df.iloc[:,:-1].copy()
    # bin labels are strings ("1" .. "k") so they behave as symbols, not numbers
    bin_labels = [str(b + 1) for b in range(k)]
    # odd positions hold the dwell times
    for i in range(1, x_k.shape[1], 2):
        x_k.iloc[:,i] = pd.cut(x_k.iloc[:,i], bins = partition, labels = bin_labels)
    print(range(k), "partition", partition, "k", k)

    # EXPERIMENT 1. Dataset 1. Events + dwell times
    # values outside every bin become NaN after pd.cut; NaN and "_" are both
    # rewritten as 0
    x_k = x_k.replace(np.nan, 0)
    x_k = x_k.replace("_", 0)
    np.savez_compressed("x_" + str(k) + "_dwell_times_and_events", np.array(x_k))

    # EXPERIMENT 1. Dataset 2. Only events (even positions)
    x_k_nodw = x_k.iloc[:, np.arange(0,x_k.shape[1], 2)]
    np.savez_compressed("x_" + str(k) + "_only_events", np.array(x_k_nodw))

    # EXPERIMENT 1. Dataset 3. Only dwell times (odd positions)
    x_k_onlydw = x_k.iloc[:, np.arange(1,x_k.shape[1], 2)]
    np.savez_compressed("x_" + str(k) + "_only_dwell_times", np.array(x_k_onlydw))

    # EXPERIMENT 2. Dataset 4. Mixed features: each event is glued to the bin
    # that follows it; any pair containing the character '0' collapses to '0'
    x_k_mixed = []
    for arr in np.array(x_k):
        mixed_arr = np.array([str(arr[i]) + str(arr[i+1]) for i in range(0, len(arr), 2)])
        refined_arr = np.array([mixed if '0' not in mixed else '0' for mixed in mixed_arr])
        x_k_mixed.append(refined_arr)
    x_k_mixed = pd.DataFrame(np.array(x_k_mixed))
    mixed_features = np.unique(np.array(x_k_mixed))
    # -1 because 0 will not be converted to a new symbol
    # NOTE(review): the mapping relies on '0' being the first element returned
    # by np.unique and on at most 26 other features (zip silently truncates
    # beyond the lowercase alphabet) - confirm for the real event symbols
    names_mixed = string.ascii_lowercase[:len(mixed_features)-1]
    names_mixed = "0" + names_mixed
    dict_features = dict(zip(mixed_features, names_mixed))
    x_k_mixed = x_k_mixed.replace(dict_features)
    np.savez_compressed("x_" + str(k) + "_mixed_features", np.array(x_k_mixed))

    # EXPERIMENT 2. Dataset 5. Event unrolling: each event symbol is repeated
    # as many times as its bin number, then the row is right-padded with "0"
    # up to k * (number of event columns)
    x_k_unrolled = []
    for arr in np.array(x_k):
        unrolled_arr = np.array([arr[i] * int(arr[i+1]) for i in range(0, len(arr), 2) if arr[i+1] != 0])
        split_arr = np.array(list("".join(unrolled_arr)))
        padded_arr = np.pad(split_arr, (0, k*(x_k_nodw.shape[1]) - len(split_arr)), "constant", constant_values="0")
        x_k_unrolled.append(padded_arr)
    x_k_unrolled = pd.DataFrame(np.array(x_k_unrolled))
    np.savez_compressed("x_" + str(k) + "_unrolled_events", np.array(x_k_unrolled))

    # EXPERIMENT 3. Dataset 6. Shuffled dwell times: every cell whose text is
    # all digits (and is not the padding 0) is replaced by a random bin in 1..k
    x_k_shuffled = []
    rd.seed(500)
    for arr in np.array(x_k):
        shuffled_arr = np.array([arr[i] if (arr[i] == 0 or not str(arr[i]).isdigit()) else str(rd.randint(1,k)) for i in range(len(arr))])
        x_k_shuffled.append(shuffled_arr)
    x_k_shuffled = pd.DataFrame(np.array(x_k_shuffled))
    np.savez_compressed("x_" + str(k) + "_shuffled_dwell_times_and_events", np.array(x_k_shuffled))


# y saved as a variable
y.to_csv('y.csv', index = False, header=True)

## Checking the created datasets for each experiment

In [None]:
# preview of x_k as left by the last pass of the binning loop (k = 4)
print("Dataset with dwell times between events")
x_k.head(10)

In [None]:
# preview of the even-position columns (events) taken from x_k
print("Dataset with only events")
x_k_nodw.head(10)

In [None]:
# preview of the odd-position columns (binned dwell times) taken from x_k
print("Dataset with only dwell times")
x_k_onlydw.head(10)

In [None]:
# preview of the event + bin pairs after they were mapped to single symbols
print("Dataset with mixed features")
x_k_mixed.head(10)

In [None]:
# preview of the sequences where each event is repeated according to its bin
print("Dataset with unrolled events")
x_k_unrolled.head(10)

In [None]:
# preview of the sequences whose bins were replaced by random ones
print("Dataset with shuffled dwell times between events")
x_k_shuffled.head(10)

In [None]:
# Relative-frequency histograms of the dwell times for buy and no-buy sessions.
# Every cell that is not a string and differs from -1000000 is counted as a
# dwell time.
counts_dw_buy = np.array([
    value
    for session in buy_df.iloc[:, :-1].to_numpy()
    for value in session
    if not isinstance(value, str) and value != -1000000
])
counts_dw_nobuy = np.array([
    value
    for session in nobuy_down.iloc[:, :-1].to_numpy()
    for value in session
    if not isinstance(value, str) and value != -1000000
])
# each observation weighs 1 / N so that the bars of one histogram sum to 1
weights_buy = np.zeros_like(counts_dw_buy) + 1 / len(counts_dw_buy)
weights_nobuy = np.zeros_like(counts_dw_nobuy) + 1 / len(counts_dw_nobuy)
# plotting
counts_b, bins_b, bars_b = plt.hist(counts_dw_buy, 100, alpha=0.5, weights=weights_buy, label="buy")
counts_nb, bins_nb, bars_nb = plt.hist(counts_dw_nobuy, 100, alpha=0.5, weights=weights_nobuy, label="no buy")
plt.legend(loc="upper right")
plt.title("Relative frequencies of dwell times")
plt.show()

In [None]:
# Histogram of the per-session median dwell time (median truncated to an int).
medians = []
for session in final_df.iloc[:, :-1].to_numpy():
    # keep the non-string cells, leaving out the -1000000 filler
    session_dwell_times = [
        value for value in session
        if not isinstance(value, str) and value != -1000000
    ]
    median_session = np.median(session_dwell_times)
    medians.append(int(median_session))

# plotting
plt.hist(medians, 100, alpha=0.5)
plt.title("Medians of dwell times from every sequence")
plt.show()

In [None]:
# Stacked horizontal bars with the share of each dwell-time bin, separately
# for buy and no-buy sessions (uses the x_k left by the binning loop).
xy_k = pd.concat([x_k, y], axis = 1)
xy_buy = xy_k[xy_k["label_dw"] == 1].iloc[:,:-1]
xy_nobuy = xy_k[xy_k["label_dw"] == 0].iloc[:,:-1]
# counting the bins found at the odd positions; 0 is padding and is skipped
xy_buy_counts = Counter(np.array([session[i] for session in np.array(xy_buy) for i in range(1,len(session),2) if session[i] != 0]))
xy_nobuy_counts = Counter(np.array([session[i] for session in np.array(xy_nobuy) for i in range(1,len(session),2) if session[i] != 0]))
# share of every bin within its group (totals computed once, not per key)
total_buy = sum(xy_buy_counts.values())
total_nobuy = sum(xy_nobuy_counts.values())
xy_buy_perc = {key : xy_buy_counts[key] / total_buy for key in sorted(xy_buy_counts.keys())}
xy_nobuy_perc = {key : xy_nobuy_counts[key] / total_nobuy for key in sorted(xy_nobuy_counts.keys())}
xy_perc = {"Buy": xy_buy_perc, "No buy": xy_nobuy_perc}
# plotting
# FIX: the keyword is `color` (not `colors`); the axes returned by the plot
# call are reused instead of asking pyplot for a subplot afterwards
ax = pd.DataFrame(xy_perc).T.plot.barh(stacked=True, align='center', color = ['#00ce00','#009a00','#006700','#003400'])
print(pd.DataFrame(xy_perc))
# shrink the axes vertically to leave room for the legend underneath
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
                 box.width, box.height * 0.9])

ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
          fancybox=True, shadow=False, ncol=4)
plt.show()

In [None]:
# FINAL PREPROCESSING FOR NAIVE BAYES, MARKOV CHAINS AND LSTM
# Every saved dataset (6 variants x 4 values of k) is reloaded, its "0" cells
# are rewritten as "_", each row is joined into one string and, for n = 1..5,
# split into n-grams that are saved to "<n>gram_<k>bins_<variant>.npz".
for type_dataframe in ["dwell_times_and_events", "only_events", "only_dwell_times",
                       "mixed_features","unrolled_events","shuffled_dwell_times_and_events"]:
    for k in range(1,5):
        saved_arr = np.load(f"x_{k}_{type_dataframe}.npz")["arr_0"].astype("str")
        array_k = np.array([
            np.array(["_" if value == "0" else value for value in arr])
            for arr in saved_arr
        ])
        print(array_k)
        corpus_k = ["".join(seq) for seq in array_k]
        for n in range(1,6):
            # n-1 beginning-of-sequence symbols are prepended to every sequence
            bos_symbol = "#"
            bos_corpus = [bos_symbol * (n-1) + seq for seq in corpus_k]
            # NOTE(review): range(len(seq) - n) stops one window early, so the
            # n-gram ending on the last character is never produced - confirm
            # whether this is intended before changing it (it alters every file)
            ngram_corpus = [
                [seq[start:start + n] for start in range(len(seq) - n)]
                for seq in bos_corpus
            ]
            np.savez_compressed(f"{n}gram_{k}bins_{type_dataframe}", ngram_corpus)

# Naive Bayes

In [None]:
# Bernoulli Naive Bayes over bags of n-grams, for every dataset variant,
# every number of bins k (1-4) and every n-gram order n (2-5).
# Accuracies are collected in three nested levels: n inside k inside variant.

# giving y_nb the same format than the xbags
y_nb = np.array(list(pd.read_csv('y.csv').iloc[:,0]))

# results of every dataset variant
all_nb_train, all_nb_val, all_nb_test = [], [], []

for type_dataframe in ["dwell_times_and_events", "only_events", "only_dwell_times",
                       "mixed_features","unrolled_events","shuffled_dwell_times_and_events"]:

    # results of every k for this dataset variant
    df_nb_train, df_nb_val, df_nb_test = [], [], []

    for k in range(1,5):

        # results of every n for this k
        nb_train, nb_val, nb_test = [], [], []

        for n in range(2,6):

            # reading the array with ngrams and removing ngrams made only of "_"
            xnb = np.array(pd.DataFrame(np.load(str(n) + "gram_" + str(k) + "bins" + "_" + type_dataframe + ".npz")["arr_0"]))
            x_nb = []
            for arr in xnb:
                new_arr = np.array([value for value in arr if set(value) != {"_"}])
                x_nb.append(new_arr)
            # FIX: after the filtering the rows have different lengths; recent
            # NumPy versions refuse to build such a ragged array unless
            # dtype=object is requested explicitly
            x_nb = np.array(x_nb, dtype=object)

            # train and the rest
            x_nb_train, x_nb_testval, y_nb_train, y_nb_testval = train_test_split(x_nb, y_nb, stratify = y_nb, test_size = 0.3, random_state= 333)
            # train, val, test (0.3 * 0.5 is 0.15)
            x_nb_val, x_nb_test, y_nb_val, y_nb_test = train_test_split(x_nb_testval, y_nb_testval, stratify = y_nb_testval, test_size = 0.5, random_state=400)

            # FINAL PREPROCESSING FOR NB
            x_nb_train, x_nb_val, x_nb_test = list(x_nb_train), list(x_nb_val), list(x_nb_test)

            # one n-gram -> count mapping per session (bag of n-grams)
            train_dict = [Counter(session) for session in x_nb_train]
            val_dict = [Counter(session) for session in x_nb_val]
            test_dict = [Counter(session) for session in x_nb_test]

            # the vocabulary is learnt from the training sessions only
            vec = DictVectorizer()
            xbagtrain = vec.fit_transform(train_dict).toarray()
            xbagval = vec.transform(val_dict).toarray()
            xbagtest = vec.transform(test_dict).toarray()
            # counts are reduced to presence (1) / absence (0)
            xbag_train = (xbagtrain != 0).astype(int)
            xbag_val = (xbagval != 0).astype(int)
            xbag_test = (xbagtest != 0).astype(int)

            # setting the Bernoulli Naive Bayes
            nb = BernoulliNB()
            nb.fit(xbag_train, y_nb_train)

            # adding accuracy to the innermost lists
            ybag_pred_tr = nb.predict(xbag_train)
            nb_train.append(accuracy_score(y_nb_train, ybag_pred_tr))
            ybag_pred_val = nb.predict(xbag_val)
            nb_val.append(accuracy_score(y_nb_val, ybag_pred_val))
            ybag_pred_test = nb.predict(xbag_test)
            nb_test.append(accuracy_score(y_nb_test, ybag_pred_test))

        # adding these lists to the per-variant lists
        df_nb_train.append(nb_train)
        df_nb_val.append(nb_val)
        df_nb_test.append(nb_test)

    # heatmap of the test accuracies of this variant (rows: k, columns: n)
    nb_viz = pd.DataFrame(df_nb_test, columns = ["2grams", "3grams", "4grams", "5grams"], index = ["1 bin", "2 bins", "3 bins", "4 bins"])
    ax = plt.axes()
    heatmap = sns.heatmap(nb_viz, annot=True, cmap="Blues", vmin=0.50, vmax=1, ax = ax)
    plt.title("Dataset with " + type_dataframe.replace("_"," "))
    plt.yticks(rotation=0)
    plt.show()

    # adding these lists to the global lists
    all_nb_train.append(df_nb_train)
    all_nb_val.append(df_nb_val)
    all_nb_test.append(df_nb_test)

In [None]:
# Final visualization: one row per dataset variant, one dot per number of
# bins, using the test accuracy stored at index 3 of the n-gram axis.
types_dataframes = ["Dwell times between events", "Only events", "Only dwell times",
                       "Mixed features","Unrolled events","Shuffled dwell times between events"]
bin_columns = ["1 bin", "2 bins", "3 bins", "4 bins"]
bin_colours = {"4 bins": '#00ce00', "3 bins": '#009a00', "2 bins": '#006700', "1 bin": '#003400'}

lollipop_nb = pd.DataFrame(
    np.array(all_nb_test)[:,:,3],
    columns=bin_columns,
    index=["dataset 1", "dataset 2", "dataset 3", "dataset 4", "dataset5", "dataset 6"],
)
# grey segment between the 1-bin and the 4-bin accuracy of each variant
plt.hlines(y=range(6), xmin=lollipop_nb["1 bin"], xmax=lollipop_nb["4 bins"], color='grey', alpha=0.5)
# invisible markers that put the variant names on the y axis
plt.plot(lollipop_nb['1 bin'], types_dataframes, "o", alpha=0)
for bin_name, bin_colour in bin_colours.items():
    plt.scatter(lollipop_nb[bin_name], types_dataframes, color=bin_colour, label=bin_name)
plt.legend()
plt.xlim(0.5, 1)

# adding title and axis names
plt.title("Accuracy with Naive Bayes", loc='left')

## Checking the essential parts and the results of the Naive Bayes

In [None]:
# Inspection of the objects left by the last pass of the Naive Bayes loop
# (last dataset variant, last k, last n).
# all the bag
# FIX: the message was missing the space after "with"
print("bag of ngrams of dataset with " + "shuffled" + " and " + str(n) + "grams", xbag_test)
# bag shape
print("shape", xbag_test.shape)
# encoding of the first session
print("first session encoded", xbag_test[0])
# 100 first examples of feature names
# FIX: get_feature_names() no longer exists in recent scikit-learn releases;
# get_feature_names_out() is used when available, the old name otherwise
if hasattr(vec, "get_feature_names_out"):
    nb_feats = list(vec.get_feature_names_out())
else:
    nb_feats = list(vec.get_feature_names())
print("100 examples of features", nb_feats[:100])

all_feats = list(np.unique([ngram for session in x_nb for ngram in session]))
train_feats = list(np.unique([ngram for session in x_nb_train for ngram in session]))

# expected to print "False": the vectorizer was fitted on the training
# sessions, so its vocabulary is not the one of the whole dataset
print(list(all_feats) == nb_feats)

# expected to print "True": the vocabulary of the bag of ngrams is the one
# found in the training set, as it must be
print(list(train_feats) == nb_feats)

In [None]:
# store the Naive Bayes test accuracies and print them variant by variant
np.savez_compressed("results_nb", np.array(all_nb_test))
nb_result_titles = [
    "dwell times between events",
    "only events",
    "only dwell times",
    "mixed features",
    "unrolled events",
    "shuffled dwell times between events",
]
for position, title in enumerate(nb_result_titles):
    print("raw results with " + title + "\n", np.array(all_nb_test[position]), "\n")

# Markov Chains

In [None]:
# N-order Markov-chain classifier, for every dataset variant, every number of
# bins k (1-4) and every n-gram order n (2-5). One transition-probability
# table is estimated from the no-buy training sessions and one from the buy
# training sessions; a session gets the class whose table gives it the higher
# log-likelihood. Accuracies are collected in three nested levels.
ymc = np.array(list(pd.read_csv('y.csv').iloc[:,0]))

# results of every dataset variant
all_mc_train, all_mc_val, all_mc_test = [], [], []

for type_dataframe in ["dwell_times_and_events", "only_events", "only_dwell_times",
                       "mixed_features","unrolled_events","shuffled_dwell_times_and_events"]:

    # results of every k for this dataset variant
    df_mc_train, df_mc_val, df_mc_test = [], [], []

    for k in range(1,5):

        # results of every n for this k
        mc_train, mc_val, mc_test = [], [], []

        # transition probabilities don't work on unigrams
        for n in range(2,6):

            # PART 1. DATA USED TO BUILD THE TRANSITION TABLES AND TO TEST THEM
            # ngrams (n from 2 to 5) sequences created before
            xmc = pd.DataFrame(np.load(str(n) + "gram_" + str(k) + "bins" + "_" + type_dataframe + ".npz")["arr_0"])
            # train, val, test
            xmc_train, xmc_valte, ymc_train, ymc_valte = train_test_split(xmc, ymc, stratify = ymc, test_size = 0.3, random_state = 333)
            xmc_val, xmc_test, ymc_val, ymc_test = train_test_split(xmc_valte, ymc_valte, stratify = ymc_valte, test_size = 0.5, random_state = 400)
            # training sessions with their label appended as the last column
            xmc_all = pd.concat([pd.DataFrame(np.array(xmc_train)), pd.DataFrame(np.array(ymc_train))], axis = 1)
            # training sessions labelled 1 (buy)
            xmc_buy = np.array(xmc_all[xmc_all.iloc[:,-1] == 1].iloc[:,:-1])
            # training sessions labelled 0 (no buy)
            xmc_nobuy = np.array(xmc_all[xmc_all.iloc[:,-1] == 0].iloc[:,:-1])

            # PART 2. VOCABULARY: ALL COMBINATIONS DERIVED FROM THE TRAINING BIGRAMS
            # computed once per k (when n == 2) and reused for the higher orders
            if n == 2:
                unique_bigrams = list(np.unique(np.array(xmc_train)))
                # adding the following because bigger ngrams have also two or more bos and eos
                unique_bigrams.extend(["##", "__"])
                # unique unigrams to make all the possible combinations
                unique_features = list(np.unique([symbol for bigram in unique_bigrams for symbol in bigram]))
                # all the possible combinations from unigrams up to 5 symbols together
                all_combinations = list(itertools.product(unique_features, unique_features, unique_features, unique_features, unique_features))
                all_combinations = ["".join(combination) for combination in all_combinations]

            # cutting these combinations to the size of the ngram
            all_ngrams = np.unique([combination[:n] for combination in all_combinations])
            initialization_transitions = []
            for combination in all_ngrams:
                # number of adjacent pairs that never appear as a training bigram
                counter = 0
                for i in range(1,len(combination)):
                    if combination[i-1] + combination[i] not in unique_bigrams:
                        counter += 1
                # keep the combination when every pair is known, it is not made
                # only of "_" or only of "#", and it does not both start with
                # "#" and end with "_"
                if counter == 0 and (set(combination) not in [{"_"},{"#"}]):
                    if (combination.startswith("#") and combination.endswith("_")) == False:
                        initialization_transitions.append(combination)
            # wrapped in a list so it can be iterated like a set of sessions
            initialization_transitions = [np.array(initialization_transitions)]

            # PART 3. TRANSITION PROBABILITIES OF N-ORDER MARKOV CHAIN
            # counts = one occurrence for every kept combination + real counts;
            # index 0 is the no-buy table, index 1 the buy table
            all_count_dicts = []
            for count in [[initialization_transitions, xmc_nobuy], [initialization_transitions, xmc_buy]]:
                count_dict = dict()
                for type_count in count:
                    for session in type_count:
                        for key in session:
                            # context = all symbols but the last; target = last symbol
                            i = key[:-1]
                            j = key[-1]
                            if i in count_dict:
                                if j in count_dict[i]:
                                    count_dict[i][j] += 1
                                else:
                                    count_dict[i][j] = 1
                            else:
                                count_dict[i] = {j : 1}
                all_count_dicts.append(count_dict)

            # counts -> probabilities (each context sums to 1)
            for type_count in all_count_dicts:
                for key in type_count:
                    key_sum = sum(type_count[key].values())
                    for subkey in type_count[key]:
                        type_count[key][subkey] = type_count[key][subkey]/key_sum

            # PART 4. SCORE THE SEQUENCES WITH BOTH TABLES

            prior = np.log(0.5)
            all_mc_pred = []
            for subset in [xmc_train, xmc_val, xmc_test]:
                pred_mc = []
                for session in np.array(subset):
                    prod_buy = []
                    prod_nobuy = []
                    for ng in session:
                        key = ng[:-1]
                        subkey = ng[-1]
                        # FIX: an unknown transition must be neutral. The lists
                        # hold log-probabilities, so the neutral value is
                        # log(1) = 0; the former value 1 multiplied the
                        # likelihood by e for every unknown transition.
                        try:
                            prod_nobuy.append(np.log(all_count_dicts[0][key][subkey]))
                        except KeyError:
                            prod_nobuy.append(0.0)
                        try:
                            prod_buy.append(np.log(all_count_dicts[1][key][subkey]))
                        except KeyError:
                            prod_buy.append(0.0)

                    likelihood_nobuy = np.sum(np.array(prod_nobuy))
                    likelihood_buy = np.sum(np.array(prod_buy))
                    # FIX: the classes are compared in log space. Exponentiating
                    # the summed logs can underflow to 0 for long sessions, which
                    # made the normalised posteriors 0/0 = nan. Normalising by a
                    # common positive denominator does not change which class
                    # has the larger score, so the argmax is taken directly.
                    score_nobuy = likelihood_nobuy + prior
                    score_buy = likelihood_buy + prior
                    # index 0 = no buy, index 1 = buy (ties go to no buy)
                    guess_mc = np.argmax([score_nobuy, score_buy])
                    pred_mc.append(guess_mc)
                pred_mc = np.array(pred_mc)
                all_mc_pred.append(pred_mc)

            # adding accuracy to the innermost lists
            mc_train.append(accuracy_score(ymc_train, all_mc_pred[0]))
            mc_val.append(accuracy_score(ymc_val, all_mc_pred[1]))
            mc_test.append(accuracy_score(ymc_test, all_mc_pred[2]))

        # adding these lists to the per-variant lists
        df_mc_train.append(mc_train)
        df_mc_val.append(mc_val)
        df_mc_test.append(mc_test)

    # heatmap of the test accuracies of this variant (rows: k, columns: n)
    mc_viz = pd.DataFrame(df_mc_test, columns = ["2grams", "3grams", "4grams", "5grams"], index = ["1 bin", "2 bins", "3 bins", "4 bins"])
    ax = plt.axes()
    heatmap = sns.heatmap(mc_viz, annot=True, cmap="Blues", vmin=0.50, vmax=1, ax = ax)
    plt.title("Dataset with " + type_dataframe.replace("_"," "))
    plt.yticks(rotation=0)
    plt.show()

    # adding these lists to the global lists
    all_mc_train.append(df_mc_train)
    all_mc_val.append(df_mc_val)
    all_mc_test.append(df_mc_test)

In [None]:
# Final visualization: one row per dataset variant, one dot per number of
# bins, using the test accuracy stored at index 3 of the n-gram axis.
types_dataframes = ["Dwell times between events", "Only events", "Only dwell times",
                       "Mixed features","Unrolled events","Shuffled dwell times between events"]
bin_columns = ["1 bin", "2 bins", "3 bins", "4 bins"]
bin_colours = {"4 bins": '#00ce00', "3 bins": '#009a00', "2 bins": '#006700', "1 bin": '#003400'}

lollipop_mc = pd.DataFrame(
    np.array(all_mc_test)[:,:,3],
    columns=bin_columns,
    index=["dataset 1", "dataset 2", "dataset 3", "dataset 4", "dataset5", "dataset 6"],
)
# grey segment between the 1-bin and the 4-bin accuracy of each variant
plt.hlines(y=range(6), xmin=lollipop_mc["1 bin"], xmax=lollipop_mc["4 bins"], color='grey', alpha=0.5)
# invisible markers that put the variant names on the y axis
plt.plot(lollipop_mc['1 bin'], types_dataframes, "o", alpha=0)
for bin_name, bin_colour in bin_colours.items():
    plt.scatter(lollipop_mc[bin_name], types_dataframes, color=bin_colour, label=bin_name)
plt.legend()
plt.xlim(0.5, 1)

# adding title and axis names
plt.title("Accuracy with Markov Chains", loc='left')

## Checking the essential parts and the results of the Markov Chains

In [None]:
# Shows the no-buy transition-probability dictionary (index 0) left by the
# last pass of the Markov-chain loop, i.e. the one built for the shuffled
# dwell times with the events.
print("no buy sequences, transition probability matrix")
all_count_dicts[0]

In [None]:
# Shows the buy transition-probability dictionary (index 1) left by the last
# pass of the Markov-chain loop, i.e. the one built for the shuffled dwell
# times with the events.
print("buy sequences, transition probability matrix")
all_count_dicts[1]

In [None]:
# the probabilities of every context should sum to 1 (or a value very close
# to it); the set lists the distinct sums that were found
{
    sum(transitions.values())
    for type_dict in all_count_dicts
    for transitions in type_dict.values()
}

In [None]:
# store the Markov-chain test accuracies and print them variant by variant
np.savez_compressed("results_mc", np.array(all_mc_test))
mc_result_titles = [
    "dwell times between events",
    "only events",
    "only dwell times",
    "mixed features",
    "unrolled_events",
    "shuffled_dwell_times_and_events",
]
for position, title in enumerate(mc_result_titles):
    print("raw results with " + title + "\n", np.array(all_mc_test[position]), "\n")

# LSTM

In [None]:
# LSTM classifier on one-hot encoded sequences, for every dataset variant and
# every number of bins k (1-4). The "only_events" variant is trained once
# (k == 1) and its test score is repeated four times so that every variant
# contributes four entries.
y_lstm = np.array(list(pd.read_csv('y.csv').iloc[:,0]))

all_lstm_test = []
for type_dataframe in ["dwell_times_and_events", "only_events", "only_dwell_times",
                       "mixed_features","unrolled_events","shuffled_dwell_times_and_events"]:
    lstm_scores_test = []
    for k in range(1,5):
        # same condition as before, without the redundant second test
        if type_dataframe != "only_events" or k == 1:
            x_lstm = np.load("x_" + str(k) + "_" + type_dataframe + ".npz")["arr_0"].astype("str")
            # one one-hot vector per distinct symbol
            dict_keys = np.unique([x for session in x_lstm for x in session])
            print("features", dict_keys)
            dict_values = np.array(pd.get_dummies(dict_keys))
            lstm_dict = dict(zip(dict_keys, dict_values))
            # the padding symbol "0" gets a vector filled with -100000
            # NOTE(review): the model below has no Masking layer, so these
            # values reach the LSTM as ordinary inputs - confirm this is intended
            lstm_dict["0"] = np.full(len(dict_keys), -100000)
            ohe_x = []
            for arr in np.array(x_lstm):
                # [1:] drops the first column of every encoded vector
                ohe_arr = np.array([lstm_dict[value][1:] for value in arr])
                ohe_x.append(ohe_arr)
            ohe_x = np.array(ohe_x)
            print(ohe_x)

            # train and the rest
            x_lstm_train, x_lstm_testval, y_lstm_train, y_lstm_testval = train_test_split(ohe_x, y_lstm, stratify = y_lstm, test_size = 0.3, random_state= 333)
            # train, val, test (0.3 * 0.5 is 0.15)
            x_lstm_val, x_lstm_test, y_lstm_val, y_lstm_test = train_test_split(x_lstm_testval, y_lstm_testval, stratify = y_lstm_testval, test_size = 0.5, random_state= 400)
            # checking number of features
            print(x_lstm_train.shape)

            # n_batches is passed to fit() as the batch size
            n_batches = 10
            n_epochs = 20
            n_timesteps = x_lstm_train.shape[1]
            n_features = x_lstm_train.shape[2]

            # lstm with one-hot-encoding
            lstm = Sequential()
            lstm.add(LSTM(20, input_shape = (n_timesteps, n_features), return_sequences = True))
            lstm.add(GlobalMaxPooling1D())
            lstm.add(Dropout(0.5))
            lstm.add(Dense(1, activation='sigmoid'))
            np.random.seed(40)
            # FIX: `learning_rate` replaces the deprecated `lr` keyword
            adam = optimizers.Adam(learning_rate=0.001)
            lstm.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
            print(lstm.summary())
            np.random.seed(40)
            history = lstm.fit(x_lstm_train, y_lstm_train, batch_size=n_batches, epochs=n_epochs, validation_data=(x_lstm_val, y_lstm_val), shuffle=True)

            # accuracy on the train and val sets
            plt.plot(history.history['accuracy'])
            plt.plot(history.history['val_accuracy'])
            plt.title('model accuracy with ' + str(k) + "bins in " + type_dataframe)
            plt.ylabel('accuracy')
            plt.xlabel('epoch')
            plt.legend(['train', 'val'], loc='upper left')
            plt.show()
            # loss on the train and val sets
            plt.plot(history.history['loss'])
            plt.plot(history.history['val_loss'])
            plt.title('model loss with ' + str(k) + "bins in " + type_dataframe)
            plt.ylabel('loss')
            plt.xlabel('epoch')
            plt.legend(['train', 'val'], loc='upper left')
            plt.show()
            # evaluate() returns [loss, accuracy] on the test set
            if type_dataframe != "only_events":
                lstm_scores_test.append(lstm.evaluate(x_lstm_test, y_lstm_test, verbose=0))
            else:
                # a single run stands for the four values of k
                lstm_scores_test.extend([lstm.evaluate(x_lstm_test, y_lstm_test, verbose=0)]*4)
    all_lstm_test.append(lstm_scores_test)

## Checking the essential parts and the results of the LSTM

In [None]:
# store the LSTM test scores and print them variant by variant
np.savez_compressed("results_lstm", np.array(all_lstm_test))
lstm_result_titles = [
    "dwell times between events",
    "only events",
    "only dwell times",
    "mixed features",
    "unrolled_events",
    "shuffled_dwell_times_and_events",
]
for position, title in enumerate(lstm_result_titles):
    print("raw results with " + title, all_lstm_test[position], "\n")

In [None]:
# Final visualization: one row per dataset variant, one dot per number of
# bins, using the value at index 1 of every stored test score.
types_dataframes = ["Dwell times between events", "Only events", "Only dwell times",
                       "Mixed features","Unrolled events","Shuffled dwell times between events"]
bin_columns = ["1 bin", "2 bins", "3 bins", "4 bins"]
bin_colours = {"4 bins": '#00ce00', "3 bins": '#009a00', "2 bins": '#006700', "1 bin": '#003400'}

lollipop_lstm = pd.DataFrame(
    np.array(all_lstm_test)[:,:,1],
    columns=bin_columns,
    index=["dataset 1", "dataset 2", "dataset 3", "dataset 4", "dataset5", "dataset 6"],
)
# grey segment between the 1-bin and the 4-bin accuracy of each variant
plt.hlines(y=range(6), xmin=lollipop_lstm["1 bin"], xmax=lollipop_lstm["4 bins"], color='grey', alpha=0.5)
# invisible markers that put the variant names on the y axis
plt.plot(lollipop_lstm['1 bin'], types_dataframes, "o", alpha=0)
for bin_name, bin_colour in bin_colours.items():
    plt.scatter(lollipop_lstm[bin_name], types_dataframes, color=bin_colour, label=bin_name)
plt.legend()
plt.xlim(0.5, 1)

# adding title and axis names
plt.title("Accuracy with LSTM", loc='left')