# Dichotomic pattern mining

In [1]:
import pandas as pd
from ast import literal_eval
from time import time

from sequential.seq2pat import Seq2Pat, Attribute
from sequential.utils import dichotomic_pattern_mining, get_one_hot_encodings

## Sample Data

In [2]:
# Notebook configuration: path to the sample CSV and the minimum pattern frequency
args = {
    'data': '../tests/data/sample_data.csv',
    'min_frequency': 0.3,
}

In [3]:
# Read the sample data from the configured CSV path and preview the first rows
sequence_df = pd.read_csv(filepath_or_buffer=args['data'])
sequence_df.head(n=5)

Unnamed: 0,event_sequence,event_time,event_order,label
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...","[13118, 17085, 11839, 41749, 35195, 3348, 3309...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...","[50205, 32403, 51377, 4256, 52139, 15020, 6999...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]","[49647, 45922, 26113, 422659, 9128, 82561, 709...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",1
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...","[355031, 50126, 26262, 44512, 39795, 49730, 14...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
4,"[1, 2, 1, 1, 1]","[19173, 159782, 12811, 88544, 53858]","[0, 1, 2, 3, 4]",1


Transform sequence from string to list

In [4]:
# Columns whose cells are read from the CSV as strings and must become Python lists
literal_cols = ['event_sequence', 'event_time', 'event_order']

# Parse each cell with literal_eval. Cells that are already lists are passed through
# unchanged so that re-running this cell after a successful conversion is a no-op
# instead of raising ValueError from literal_eval. Any other non-string value
# (e.g. a missing entry) still goes through literal_eval and raises, as before.
for col in literal_cols:
    sequence_df[col] = sequence_df[col].apply(
        lambda value: value if isinstance(value, list) else literal_eval(value)
    )

EDA

In [5]:
# EDA: number of sequences, maximum and average sequence length,
# and the number of positive / negative labels
seq_lengths = sequence_df['event_sequence'].map(len)

num_sequences = sequence_df.shape[0]
max_len = seq_lengths.max()
avg_len = seq_lengths.mean()

# Rows whose label equals 1 are counted as positives; the rest are reported as negatives
num_pos = int((sequence_df['label'] == 1).sum())

print(f'Number of sequences: {num_sequences}')
print(f'Maximum length: {max_len}')
print(f'Average length: {avg_len}')
print(f'Number of positives: {num_pos}; Number of negatives: {num_sequences - num_pos}')

Number of sequences: 2000
Maximum length: 155
Average length: 28.3755
Number of positives: 1000; Number of negatives: 1000


## Define Attributes and Constraints
- There are two attributes: `event_time` and `event_order`
- Constraint 1: enforce that the average event time is at least 20 sec (`20000 <= average`)
- Constraint 2: enforce that the span of event order is at most 10. This restricts the length of the sequence.

In [6]:
# Event-time attribute with a lower bound on its average: 20000 <= average
# (described in this notebook as "average time >= 20 sec")
time_attr = Attribute(values=sequence_df['event_time'].tolist())
time_ct = 20000 <= time_attr.average()

# Event-order attribute with an upper bound on its span: span <= 10,
# used to restrict the sequence length
order_attr = Attribute(values=sequence_df['event_order'].tolist())
order_ct = order_attr.span() <= 10

# Map each attribute column name to the list of constraints defined on it
attr_col_to_constraints = {
    'event_time': [time_ct],
    'event_order': [order_ct],
}

## Run DPM and return the `union` of patterns from pos/neg groups

In [7]:
# Time a DPM run that aggregates the patterns with 'union'
t = time()
dpm_patterns = dichotomic_pattern_mining(
    sequence_df,
    sequence_col_name='event_sequence',
    label_col_name='label',
    attr_col_to_constraints=attr_col_to_constraints,
    min_frequency=args['min_frequency'],
    pattern_aggregation='union',
)

# Report elapsed wall-clock time and how many patterns were returned
print(f'DPM finished! Runtime: {time()-t:.4f} sec')
print(f'Number of DPM patterns: {len(dpm_patterns)}')

DPM finished! Runtime: 45.3411 sec
Number of DPM patterns: 691


Create encodings

In [8]:
# Plain Python list of the event sequences, one entry per row of sequence_df
sequences = sequence_df['event_sequence'].tolist()

# Time the one-hot encoding of the sequences against the current dpm_patterns
t = time()
encodings = get_one_hot_encodings(
    sequences,
    dpm_patterns,
    rolling_window_size=10,
    drop_pattern_frequency=False,
)

print(f'Encoding finished! Runtime: {time()-t:.4f} sec')
encodings.head(n=5)

Encoding finished! Runtime: 72.8731 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat681,feat682,feat683,feat684,feat685,feat686,feat687,feat688,feat689,feat690
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",1,1,1,1,1,0,0,0,0,...,0,0,1,1,1,1,0,1,1,1
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,0,1,...,0,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Run DPM and return the `intersection` of patterns from pos/neg groups

In [9]:
# Time a DPM run that aggregates the patterns with 'intersection'
t = time()

dpm_patterns = dichotomic_pattern_mining(
    sequence_df,
    sequence_col_name='event_sequence',
    label_col_name='label',
    attr_col_to_constraints=attr_col_to_constraints,
    min_frequency=args['min_frequency'],
    pattern_aggregation='intersection',
)

# Report elapsed wall-clock time and how many patterns were returned
print(f'DPM finished! Runtime: {time()-t:.4f} sec')
print(f'Number of DPM patterns: {len(dpm_patterns)}')

DPM finished! Runtime: 48.7303 sec
Number of DPM patterns: 294


Create encodings

In [10]:
# Time the one-hot encoding of the sequences against the 'intersection' patterns
t = time()
encodings = get_one_hot_encodings(
    sequences,
    dpm_patterns,
    rolling_window_size=10,
    drop_pattern_frequency=False,
)

print(f'Encoding finished! Runtime: {time()-t:.4f} sec')
encodings.head(n=5)

Encoding finished! Runtime: 22.1154 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat284,feat285,feat286,feat287,feat288,feat289,feat290,feat291,feat292,feat293
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Run DPM and return the patterns unique to the positive sequences (`unique_positive`)

In [11]:
# Time a DPM run that aggregates the patterns with 'unique_positive'
t = time()

dpm_patterns = dichotomic_pattern_mining(
    sequence_df,
    sequence_col_name='event_sequence',
    label_col_name='label',
    attr_col_to_constraints=attr_col_to_constraints,
    min_frequency=args['min_frequency'],
    pattern_aggregation='unique_positive',
)

# Report elapsed wall-clock time and how many patterns were returned
print(f'DPM finished! Runtime: {time()-t:.4f} sec')
print(f'Number of DPM patterns: {len(dpm_patterns)}')

DPM finished! Runtime: 49.6565 sec
Number of DPM patterns: 360


Create encodings

In [12]:
# Time the one-hot encoding of the sequences against the 'unique_positive' patterns
t = time()
encodings = get_one_hot_encodings(
    sequences,
    dpm_patterns,
    rolling_window_size=10,
    drop_pattern_frequency=False,
)

print(f'Encoding finished! Runtime: {time()-t:.4f} sec')
encodings.head(n=5)

Encoding finished! Runtime: 39.7772 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat350,feat351,feat352,feat353,feat354,feat355,feat356,feat357,feat358,feat359
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",0,1,0,1,0,0,0,1,1,...,0,0,1,1,1,1,0,1,1,1
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",0,1,0,1,0,0,0,1,1,...,0,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Run DPM and return the patterns unique to the negative sequences (`unique_negative`)

In [13]:
# Time a DPM run that aggregates the patterns with 'unique_negative'
t = time()

dpm_patterns = dichotomic_pattern_mining(
    sequence_df,
    sequence_col_name='event_sequence',
    label_col_name='label',
    attr_col_to_constraints=attr_col_to_constraints,
    min_frequency=args['min_frequency'],
    pattern_aggregation='unique_negative',
)

# Report elapsed wall-clock time and how many patterns were returned
print(f'DPM finished! Runtime: {time()-t:.4f} sec')
print(f'Number of DPM patterns: {len(dpm_patterns)}')

DPM finished! Runtime: 49.4787 sec
Number of DPM patterns: 37


Create encodings

In [14]:
# Time the one-hot encoding of the sequences against the 'unique_negative' patterns
t = time()
encodings = get_one_hot_encodings(
    sequences,
    dpm_patterns,
    rolling_window_size=10,
    drop_pattern_frequency=False,
)

print(f'Encoding finished! Runtime: {time()-t:.4f} sec')
encodings.head(n=5)

Encoding finished! Runtime: 3.9857 sec


Unnamed: 0,sequence,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat27,feat28,feat29,feat30,feat31,feat32,feat33,feat34,feat35,feat36
0,"[1, 1, 1, 2, 3, 1, 4, 1, 2, 3, 1, 4, 1, 2, 1, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, ...",1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
2,"[1, 1, 1, 2, 1, 2, 1, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[1, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[1, 2, 1, 1, 1]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Run DPM and return all four aggregation results

In [15]:
# Time a single DPM run with pattern_aggregation='all'
t = time()

dpm_patterns = dichotomic_pattern_mining(
    sequence_df,
    sequence_col_name='event_sequence',
    label_col_name='label',
    attr_col_to_constraints=attr_col_to_constraints,
    min_frequency=args['min_frequency'],
    pattern_aggregation='all',
)

# The result is indexed positionally below: 0 = union, 1 = intersection,
# 2 = unique_positive, 3 = unique_negative (as labelled by the print statements)
print(f'DPM finished! Runtime: {time()-t:.4f} sec')
print(f'Number of dpm_patterns with UNION aggregation: {len(dpm_patterns[0])}')
print(f'Number of dpm_patterns with INTERSECTION aggregation: {len(dpm_patterns[1])}')
print(f'Number of dpm_patterns with UNIQUE_POSITIVE aggregation: {len(dpm_patterns[2])}')
print(f'Number of dpm_patterns with UNIQUE_NEGATIVE aggregation: {len(dpm_patterns[3])}')


DPM finished! Runtime: 56.2757 sec
Number of dpm_patterns with UNION aggregation: 691
Number of dpm_patterns with INTERSECTION aggregation: 294
Number of dpm_patterns with UNIQUE_POSITIVE aggregation: 360
Number of dpm_patterns with UNIQUE_NEGATIVE aggregation: 37
