# Dichotomic Pattern Mining (DPM)

In [1]:
import pandas as pd
from ast import literal_eval
from time import time
from IPython.display import display

from sequential.seq2pat import Seq2Pat, Attribute
from sequential.dpm import dichotomic_pattern_mining, get_one_hot_encodings, DichotomicAggregation

## Arguments

In [2]:
# Notebook configuration gathered in one place:
#   data                 - path of the CSV file holding the sample sequences
#   min_frequency_pos    - minimum frequency passed to DPM for the positive cohort
#   min_frequency_neg    - minimum frequency passed to DPM for the negative cohort
#   rolling_window_size  - rolling window size passed to the one-hot encoding step
args = {
    'data': '../tests/data/sample_data.csv',
    'min_frequency_pos': 0.3,
    'min_frequency_neg': 0.3,
    'rolling_window_size': 10,
}

## Sample Data

In [3]:
# Load the sample sequences from the CSV path configured in `args`.
sequence_df = pd.read_csv(filepath_or_buffer=args['data'])

# Preview the first five rows as the cell output.
sequence_df.head(n=5)

Unnamed: 0,event_sequence,event_time,event_order,label
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...","[13118, 17085, 11839, 41749, 35195, 3348, 3309...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...","[50205, 32403, 51377, 4256, 52139, 15020, 6999...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]","[49647, 45922, 26113, 422659, 9128, 82561, 709...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",1
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...","[355031, 50126, 26262, 44512, 39795, 49730, 14...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
4,"[1, 2, 1, 1, 1]","[19173, 159782, 12811, 88544, 53858]","[0, 1, 2, 3, 4]",1


### Transform sequence from string to list

In [4]:
# Columns whose list values are read from the CSV as strings.
literal_columns = ['event_sequence', 'event_time', 'event_order']

# Parse each string cell back into a Python list. Only string cells are
# parsed: this cell overwrites the columns of sequence_df in place, so on a
# second run the values are already lists, and literal_eval raises
# ValueError for non-string values. Skipping them makes the cell safe to
# re-run.
for column in literal_columns:
    sequence_df[column] = sequence_df[column].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

# Input lists: one entry per row of sequence_df
sequences = sequence_df['event_sequence'].tolist()
times = sequence_df['event_time'].tolist()
orders = sequence_df['event_order'].tolist()

## Data Exploration

In [5]:
# Summary statistics: number of sequences, longest and mean sequence length,
# and the positive / negative split. Negatives are reported as every row
# whose label is not 1.
sequence_lengths = sequence_df['event_sequence'].apply(len)
is_positive = sequence_df['label'] == 1

num_sequences = len(sequence_df)
max_len = sequence_lengths.max()
avg_len = sequence_lengths.mean()
num_pos = int(is_positive.sum())

print(f'Number of sequences: {num_sequences}')
print(f'Maximum length: {max_len}')
print(f'Average length: {avg_len}')
print(f'Number of positives: {num_pos}; Number of negatives: {num_sequences - num_pos}')

Number of sequences: 2000
Maximum length: 155
Average length: 28.3755
Number of positives: 1000; Number of negatives: 1000


## Seq2Pat for Positive Labels
- There are two attributes: `event_time` and `event_order`
- Constraint 1: enforce that the average event time is at least 20 sec (the code uses the threshold `20000`).
- Constraint 2: enforce that the span of event order is at most 9. This restricts the length of the sequence to be <= 10.

In [6]:
# Select the rows labelled 1 once, then pull out the event sequences and
# their two per-event attributes as plain lists.
positive_rows = sequence_df[sequence_df['label'] == 1]
sequences_pos, times_pos, orders_pos = (
    positive_rows[field].values.tolist()
    for field in ('event_sequence', 'event_time', 'event_order')
)

# Miner over the positive sequences only
seq2pat_pos = Seq2Pat(sequences_pos)

# Constraint on event time: average >= 20000 (described as 20 sec in this notebook)
time_attr_pos = Attribute(times_pos)
time_ct_pos = 20000 <= time_attr_pos.average()

# Constraint on event order: span <= 9, which limits the length to <= 10
order_attr_pos = Attribute(orders_pos)
order_ct_pos = order_attr_pos.span() <= 9

# Register both constraints on the positive miner
constraints = [time_ct_pos, order_ct_pos]
for constraint in constraints:
    seq2pat_pos.add_constraint(constraint)

## Seq2Pat for Negative Labels
- There are two attributes: `event_time` and `event_order`
- Constraint 1: enforce that the average event time is at least 20 sec (the code uses the threshold `20000`).
- Constraint 2: enforce that the span of event order is at most 9. This restricts the length of the sequence to be <= 10.

In [7]:
# Get sequences having negative labels (label == 0), and associated attributes.
# The label filter is evaluated once and reused for all three columns.
negative_rows = sequence_df[sequence_df['label'] == 0]
sequences_neg = negative_rows['event_sequence'].values.tolist()
times_neg = negative_rows['event_time'].values.tolist()
orders_neg = negative_rows['event_order'].values.tolist()

# Miner over the negative sequences only
seq2pat_neg = Seq2Pat(sequences_neg)

# Define a constraint on event time: average >= 20000
# (described as 20 sec in this notebook)
time_attr_neg = Attribute(times_neg)
time_ct_neg = 20000 <= time_attr_neg.average()

# Define a constraint on event order: span <= 9, which limits the length to <= 10
order_attr_neg = Attribute(orders_neg)
order_ct_neg = order_attr_neg.span() <= 9

# List of constraints
constraints = [time_ct_neg, order_ct_neg]

# Add constraints to seq2pat
for constraint in constraints:
    seq2pat_neg.add_constraint(constraint)

## Dichotomic Pattern Mining: From Sequences to Patterns

In [8]:
# Wall-clock start of the mining run
dpm_start = time()

# Mine the positive and negative miners with their minimum frequency
# thresholds; the result is a dict keyed by aggregation, each value holding
# the patterns of that aggregation.
aggregation_to_patterns = dichotomic_pattern_mining(
    seq2pat_pos,
    seq2pat_neg,
    args['min_frequency_pos'],
    args['min_frequency_neg'],
)
dpm_runtime = time() - dpm_start

print(f'DPM finished! Runtime: {dpm_runtime:.4f} sec')

# Report how many patterns each aggregation contains
for aggregation, patterns in aggregation_to_patterns.items():
    print("Aggregation: ", aggregation, " with number of patterns: ", len(patterns))

DPM finished! Runtime: 12.5875 sec
Aggregation:  intersection  with number of patterns:  215
Aggregation:  union  with number of patterns:  498
Aggregation:  unique_negative  with number of patterns:  29
Aggregation:  unique_positive  with number of patterns:  254


## From Patterns to Encodings

In [9]:
# Constraints for the encoding step, defined over the attributes of ALL
# sequences (both labels) with the same bounds used for mining.

# Constraint on event time: average >= 20000 (described as 20 sec in this notebook)
time_attr = Attribute(times)
time_ct = 20000 <= time_attr.average()

# Constraint on event order: span <= 9, which limits the length to <= 10
order_attr = Attribute(orders)
order_ct = order_attr.span() <= 9

# List of constraints
constraints = [time_ct, order_ct]

# Keep the encodings of every aggregation. Each call below is expensive, and
# rebinding `encodings` alone would discard all but the last result.
aggregation_to_encodings = {}

for aggregation, patterns in aggregation_to_patterns.items():
    print("Aggregation: ", aggregation)

    t = time()
    # find one hot encoding of each sequence for each pattern subject to constraints
    encodings = get_one_hot_encodings(sequences, patterns, constraints, args['rolling_window_size'])

    print(f'Encoding finished! Runtime: {time()-t:.4f} sec')

    # Store the result under its aggregation; `encodings` stays bound to the
    # most recent frame, as before.
    aggregation_to_encodings[aggregation] = encodings
    display(encodings.head())

Aggregation:  intersection
Encoding finished! Runtime: 227.5064 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat205,feat206,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,1,1,1,1,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Aggregation:  union
Encoding finished! Runtime: 430.7576 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat488,feat489,feat490,feat491,feat492,feat493,feat494,feat495,feat496,feat497
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Aggregation:  unique_negative
Encoding finished! Runtime: 18.0415 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat19,feat20,feat21,feat22,feat23,feat24,feat25,feat26,feat27,feat28
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Aggregation:  unique_positive
Encoding finished! Runtime: 183.6087 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat244,feat245,feat246,feat247,feat248,feat249,feat250,feat251,feat252,feat253
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
