# Batch processing in Seq2Pat

In [1]:
import pandas as pd
from ast import literal_eval
from time import time
from IPython.display import display

from sequential.seq2pat import Seq2Pat, Attribute
from sequential.pat2feat import Pat2Feat
from sequential.dpm import dichotomic_pattern_mining, DichotomicAggregation
from sequential.utils import read_data

In [2]:
# Paths to the sample sequences and their two attribute files
# (relative to the notebook's working directory).
args = {
    'data': '../tests/data/input.txt',
    'attribute1': '../tests/data/input_att1.txt',
    'attribute2': '../tests/data/input_att2.txt',
}

In [3]:
# Read the sequences from the configured data file and report how many were loaded
data_path = args['data']
sequences = read_data(data_path)
n_sequences = len(sequences)
print("Number of sequences: ", n_sequences)


Number of sequences:  52619


In [4]:
# Baseline: mine patterns from the entire set of sequences in one pass
seq2pat = Seq2Pat(sequences)

# Time only the mining call (0.01 is the minimum frequency threshold)
start = time()
patterns_entire_set = seq2pat.get_patterns(0.01)
elapsed = time() - start
print("Runtime: ", round(elapsed, 3), " secs")

# Show only the first 30 patterns to keep the output short
print(patterns_entire_set[:30])

Runtime:  2.762  secs
[[10, 11, 1501], [9, 10, 1360], [9, 11, 1287], [1, 11, 1271], [1, 10, 1164], [11, 18, 1086], [1, 2, 1043], [1, 9, 1041], [7, 10, 1033], [7, 11, 1024], [9, 10, 11, 1013], [6, 11, 960], [10, 18, 957], [1, 7, 898], [6, 10, 892], [7, 9, 889], [2, 11, 864], [6, 7, 856], [2, 10, 824], [1, 6, 805], [9, 18, 801], [2, 7, 795], [1, 10, 11, 791], [5, 11, 791], [1, 9, 10, 771], [4, 6, 770], [6, 9, 761], [7, 10, 11, 747], [7, 9, 10, 744], [11, 12, 739]]


In [5]:
# Batch mode: when batch_size is not None, Seq2Pat splits the sequences dataset into
# batches of batch_size sequences each. Pattern mining with constraints is applied on
# each batch, and the patterns mined from all batches are then aggregated by their frequencies.
#
# Parameters relevant to batch processing:
# - batch_size: Number of sequences in one batch.
# - n_jobs: Number of processes used when mining tasks are applied on batches in parallel.
#           n_jobs=2 by default. If -1, all CPUs are used. If -2, all CPUs but one are used.
# - discount_factor: Used to reduce the minimum row count (min_frequency) when Seq2Pat is
#                    applied on a batch. The value should be a float in the range (0, 1.0).
#                    A higher value might result in missing patterns with lower frequencies
#                    close to the minimum row count.
#                    In practice, the recommended (and default) value is 0.2.
# NOTE(review): this demo passes discount_factor=0.8 rather than the recommended 0.2 -
#               confirm this is intentional.

# Run seq2pat on batches of sequences
seq2pat = Seq2Pat(sequences, batch_size=30000, n_jobs=2, discount_factor=0.8)

# Time only the mining call, using the same minimum frequency as the full-set run
start = time()
patterns_batch = seq2pat.get_patterns(0.01)
elapsed = time() - start
print("Runtime: ", round(elapsed, 2), " secs")

# Show only the first 30 patterns to keep the output short
print(patterns_batch[:30])

# The equality check covers the complete pattern lists, not just the 30 printed above
is_same = patterns_entire_set == patterns_batch
print("Patterns mined from entire set and batches are the same: ", is_same)

Runtime:  2.95  secs
[[10, 11, 1501], [9, 10, 1360], [9, 11, 1287], [1, 11, 1271], [1, 10, 1164], [11, 18, 1086], [1, 2, 1043], [1, 9, 1041], [7, 10, 1033], [7, 11, 1024], [9, 10, 11, 1013], [6, 11, 960], [10, 18, 957], [1, 7, 898], [6, 10, 892], [7, 9, 889], [2, 11, 864], [6, 7, 856], [2, 10, 824], [1, 6, 805], [9, 18, 801], [2, 7, 795], [1, 10, 11, 791], [5, 11, 791], [1, 9, 10, 771], [4, 6, 770], [6, 9, 761], [7, 10, 11, 747], [7, 9, 10, 744], [11, 12, 739]]
Patterns mined from entire set and batches are the same:  True


- Running Seq2Pat on the entire set and running it on batches yield the same patterns.

- Because the test data is comparatively small (52,619 sequences), we do not observe a runtime improvement here. We recommend using batch mode for runtime benefits only when the data size is large, e.g. hundreds of thousands of sequences, and mining on the entire set takes a long time.