# Batch processing

In [1]:
import pandas as pd
from ast import literal_eval
from time import time
from IPython.display import display

from sequential.seq2pat import Seq2Pat, Attribute
from sequential.pat2feat import Pat2Feat
from sequential.dpm import dichotomic_pattern_mining, DichotomicAggregation
from sequential.utils import read_data

In [2]:
# Alternative inputs (disabled): swap in this dict to point at the text-file data instead.
# args = {
#     'data': '../tests/data/input.txt',
#     'attribute1': '../tests/data/input_att1.txt',
#     'attribute2': '../tests/data/input_att2.txt',
# }

# Notebook configuration: path of the sample CSV and the min_frequency value
# used for every pattern-mining call below.
args = {
    'data': '../tests/data/sample_data.csv',
    'min_frequency': 0.1,
}

## Sample Data
- This notebook runs Seq2Pat, on the entire set and in batch mode, on a sample sequence dataset extracted from the dataset published with E-commerce Shopper Intent Prediction (Requena et al., 2020). Each sequence is associated with a positive or negative label, e.g. purchase vs. non-purchase.

In [3]:
# Read the sample sequences from the CSV path configured in `args`.
sequence_df = pd.read_csv(filepath_or_buffer=args['data'])

# Preview the first five rows; as the cell's last expression it is rendered as a table.
sequence_df.head(n=5)

Unnamed: 0,event_sequence,event_time,label
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...","[13118, 17085, 11839, 41749, 35195, 3348, 3309...",1
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...","[50205, 32403, 51377, 4256, 52139, 15020, 6999...",1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]","[49647, 45922, 26113, 422659, 9128, 82561, 709...",1
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...","[355031, 50126, 26262, 44512, 39795, 49730, 14...",1
4,"[1, 2, 1, 1, 1]","[19173, 159782, 12811, 88544, 53858]",1


### Transform sequence from string to list

In [4]:
# Columns that the CSV stores as the string form of a Python list, e.g. "[1, 2, 3]".
literal_columns = ['event_sequence', 'event_time']

# Parse each stringified list back into a real list.
# Only `str` cells are parsed: the columns are overwritten in place, so on a re-run of
# this cell the values are already lists and `literal_eval` would raise ValueError on them.
# Skipping non-string cells keeps the cell idempotent.
for column in literal_columns:
    sequence_df[column] = sequence_df[column].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

# Input lists: one inner list per sequence (row order of sequence_df is preserved).
sequences = sequence_df['event_sequence'].tolist()
times = sequence_df['event_time'].tolist()

## Data Exploration

In [5]:
# Summary statistics: dataset size, longest / mean sequence length, and label balance.
# Sequence lengths are computed once and reused for both the max and the mean.
seq_lengths = sequence_df['event_sequence'].map(len)

num_sequences = sequence_df.shape[0]
max_len = seq_lengths.max()
avg_len = seq_lengths.mean()
# Rows whose label equals 1 are counted as positives; every other row counts as a negative.
num_pos = int((sequence_df['label'] == 1).sum())

for summary_line in (
    f'Number of sequences: {num_sequences}',
    f'Maximum length: {max_len}',
    f'Average length: {avg_len}',
    f'Number of positives: {num_pos}; Number of negatives: {num_sequences - num_pos}',
):
    print(summary_line)

Number of sequences: 2000
Maximum length: 155
Average length: 28.3755
Number of positives: 1000; Number of negatives: 1000


In [6]:
# Baseline run: mine patterns from all sequences at once (no batching).
seq2pat = Seq2Pat(sequences)

# Time only the mining call, not the construction of the Seq2Pat object.
t = time()
patterns_entire_set = seq2pat.get_patterns(min_frequency=args['min_frequency'])
elapsed_secs = round(time() - t, 3)
print("Runtime: ", elapsed_secs, " secs")

# Show only the first 30 patterns to keep the output short.
print(patterns_entire_set[:30])

Runtime:  8.277  secs
[[1, 1, 1977], [1, 1, 1, 1947], [1, 1, 1, 1, 1874], [2, 1, 1754], [1, 1, 1, 1, 1, 1734], [1, 2, 1734], [2, 1, 1, 1683], [1, 1, 2, 1676], [1, 2, 1, 1676], [1, 1, 2, 1, 1609], [1, 2, 1, 1, 1593], [2, 1, 1, 1, 1559], [1, 1, 1, 2, 1558], [2, 2, 1538], [1, 1, 2, 1, 1, 1504], [1, 1, 1, 1, 1, 1, 1478], [1, 1, 1, 2, 1, 1468], [2, 2, 1, 1462], [1, 2, 1, 1, 1, 1457], [1, 2, 2, 1450], [2, 1, 2, 1430], [2, 1, 1, 1, 1, 1407], [1, 1, 1, 1, 2, 1382], [1, 1, 2, 2, 1372], [1, 2, 1, 2, 1366], [2, 2, 1, 1, 1366], [1, 2, 2, 1, 1365], [2, 1, 2, 1, 1357], [1, 1, 2, 1, 1, 1, 1340], [1, 1, 1, 2, 1, 1, 1328]]


In [7]:
# Batch mode
# ----------
# When batch_size is not None, Seq2Pat splits the sequences dataset into batches of
# batch_size sequences each. Pattern mining with constraints is applied on each batch, and
# the patterns mined from all batches are finally aggregated by their frequencies.
#
# Parameters relevant to batch processing:
# - batch_size: Number of sequences in one batch.
# - n_jobs: Number of processes used when the mining tasks are applied on batches in parallel.
#           n_jobs=2 by default. If -1, all CPUs are used. If -2, all CPUs but one are used.
# - discount_factor: Reduces the minimum row count (min_frequency) when Seq2Pat is applied on a batch.
#                    The value should be a float in the range (0, 1.0). A higher value might result
#                    in missing patterns whose frequencies are lower than, but close to, the
#                    minimum row count.
#                    The default of 0.2 is recommended for robustness in practice.

# Run Seq2Pat on batches of sequences.
seq2pat = Seq2Pat(
    sequences,
    batch_size=1000,
    n_jobs=2,
    discount_factor=0.2,
)

# Time only the mining call so the figure is comparable with the entire-set run.
t = time()
patterns_batch = seq2pat.get_patterns(min_frequency=args['min_frequency'])
elapsed_secs = round(time() - t, 3)
print("Runtime: ", elapsed_secs, " secs")

print(patterns_batch[:30])

# Order-sensitive list comparison against the patterns mined from the entire set.
patterns_match = patterns_entire_set == patterns_batch
print("Patterns mined from entire set and batches are the same: ", patterns_match)

Runtime:  5.56  secs
[[1, 1, 1977], [1, 1, 1, 1947], [1, 1, 1, 1, 1874], [2, 1, 1754], [1, 1, 1, 1, 1, 1734], [1, 2, 1734], [2, 1, 1, 1683], [1, 1, 2, 1676], [1, 2, 1, 1676], [1, 1, 2, 1, 1609], [1, 2, 1, 1, 1593], [2, 1, 1, 1, 1559], [1, 1, 1, 2, 1558], [2, 2, 1538], [1, 1, 2, 1, 1, 1504], [1, 1, 1, 1, 1, 1, 1478], [1, 1, 1, 2, 1, 1468], [2, 2, 1, 1462], [1, 2, 1, 1, 1, 1457], [1, 2, 2, 1450], [2, 1, 2, 1430], [2, 1, 1, 1, 1, 1407], [1, 1, 1, 1, 2, 1382], [1, 1, 2, 2, 1372], [1, 2, 1, 2, 1366], [2, 2, 1, 1, 1366], [1, 2, 2, 1, 1365], [2, 1, 2, 1, 1357], [1, 1, 2, 1, 1, 1, 1340], [1, 1, 1, 2, 1, 1, 1328]]
Patterns mined from entire set and batches are the same:  True


- The above analysis shows that the patterns mined by running Seq2Pat on the entire set and on batches are the same.

- We also observe an improvement in runtime. In general, we recommend running the batch mode for performance benefits only when the data size is large, e.g. hundreds of thousands of sequences, such that the computing cost of mining on the entire set is high.