# Dichotomic Pattern Mining (DPM)

In [None]:
import pandas as pd
from ast import literal_eval
from time import time

from sequential.seq2pat import Seq2Pat, Attribute
from sequential.dpm import dichotomic_pattern_mining, get_one_hot_encodings, DichotomicAggregation

## Arguments

In [None]:
# Notebook-wide settings, collected in a single dict literal.
args = {
    # CSV file loaded into the sequence DataFrame
    'data': '../tests/data/sample_data.csv',
    # Minimum frequency thresholds handed to dichotomic pattern mining
    'min_frequency_pos': 0.3,
    'min_frequency_neg': 0.3,
    # Rolling window size handed to the one-hot encoding step
    'rolling_window_size': 10,
}

## Sample Data

In [None]:
# Load the sample data from the CSV path configured in args
data_path = args['data']
sequence_df = pd.read_csv(data_path)

# Preview the first rows (last expression of the cell, so the notebook displays it)
sequence_df.head()

### Transform sequence from string to list

In [None]:
# Columns that hold Python list literals stored as strings in the CSV
literal_columns = ['event_sequence', 'event_time', 'event_order']

# Parse each string cell back into a real Python list
for column in literal_columns:
    parsed = sequence_df[column].apply(literal_eval)
    sequence_df[column] = parsed

# Input lists: one list of lists per column, in the order of literal_columns
sequences, times, orders = (
    sequence_df[name].values.tolist() for name in literal_columns
)

## Data Exploration

In [None]:
# Summary statistics: number of sequences, longest and mean sequence length,
# and the positive / negative split (label == 1 counts as positive).
num_sequences = len(sequence_df)

# Compute the per-row sequence length once and reuse it for both statistics
sequence_lengths = sequence_df['event_sequence'].apply(len)
max_len = sequence_lengths.max()
avg_len = sequence_lengths.mean()

# Count the rows whose label equals 1
is_positive = sequence_df['label'] == 1
num_pos = int(is_positive.sum())

print(f'Number of sequences: {num_sequences}')
print(f'Maximum length: {max_len}')
print(f'Average length: {avg_len}')
print(f'Number of positives: {num_pos}; Number of negatives: {num_sequences - num_pos}')

## Seq2Pat for Positive Labels
- There are two attributes: `event_time` and `event_order`
- Constraint 1: enforce an average event time of at least 20 sec (the code uses the threshold `20000`)
- Constraint 2: enforce a span of event order of at most 10. This restricts the length of the sequence.

In [None]:
# Build the positive miner from the positive rows only (label == 1, the same
# definition of "positive" used in the data exploration). Mining all rows for
# both the positive and the negative model would give DPM two identical
# pattern sets and nothing to contrast.
pos_mask = sequence_df['label'] == 1
sequences_pos = sequence_df.loc[pos_mask, 'event_sequence'].values.tolist()
times_pos = sequence_df.loc[pos_mask, 'event_time'].values.tolist()
orders_pos = sequence_df.loc[pos_mask, 'event_order'].values.tolist()

seq2pat_pos = Seq2Pat(sequences_pos)

# Define a constraint on event time, average time >= 20 sec.
# The attribute is built from the positive rows so it lines up with sequences_pos.
time_attr_pos = Attribute(times_pos)
time_ct_pos = 20000 <= time_attr_pos.average()

# Define a constraint to restrict sequence length, span of sequence <= 10
order_attr_pos = Attribute(orders_pos)
order_ct_pos = order_attr_pos.span() <= 10

# Attach the constraints to the miner; defining them alone does not make
# seq2pat_pos enforce them.
seq2pat_pos.add_constraint(time_ct_pos)
seq2pat_pos.add_constraint(order_ct_pos)

## Seq2Pat for Negative Labels
- There are two attributes: `event_time` and `event_order`
- Constraint 1: enforce an average event time of at least 20 sec (the code uses the threshold `20000`)
- Constraint 2: enforce a span of event order of at most 10. This restricts the length of the sequence.

In [None]:
# Build the negative miner from the negative rows only. "Negative" is every
# row whose label is not 1, matching how negatives are counted in the data
# exploration (num_sequences - num_pos).
neg_mask = sequence_df['label'] != 1
sequences_neg = sequence_df.loc[neg_mask, 'event_sequence'].values.tolist()
times_neg = sequence_df.loc[neg_mask, 'event_time'].values.tolist()
orders_neg = sequence_df.loc[neg_mask, 'event_order'].values.tolist()

seq2pat_neg = Seq2Pat(sequences_neg)

# Define a constraint on event time, average time >= 20 sec.
# NOTE: do not name this attribute `time` -- that would shadow the `time`
# function imported from the time module and break the later `t = time()` calls.
time_attr_neg = Attribute(times_neg)
time_ct_neg = 20000 <= time_attr_neg.average()

# Define a constraint to restrict sequence length, span of sequence <= 10
order_attr_neg = Attribute(orders_neg)
order_ct_neg = order_attr_neg.span() <= 10

# Attach the constraints to the miner; defining them alone does not make
# seq2pat_neg enforce them.
seq2pat_neg.add_constraint(time_ct_neg)
seq2pat_neg.add_constraint(order_ct_neg)

# The same two constraints over ALL sequences. The encoding step passes
# `constraints` together with the full `sequences` list, so these attributes
# are built from the full `times` / `orders` lists.
time_attr = Attribute(times)
time_ct = 20000 <= time_attr.average()

order_attr = Attribute(orders)
order_ct = order_attr.span() <= 10

# List of constraints
constraints = [time_ct, order_ct]

## Dichotomic Pattern Mining: From Sequences to Patterns

In [None]:
start = time()

# Frequency thresholds for the positive and the negative miner
min_freq_pos = args['min_frequency_pos']
min_freq_neg = args['min_frequency_neg']

# Run DPM on the positive and negative miners; the result maps each pattern
# aggregation to its patterns
aggregation_to_patterns = dichotomic_pattern_mining(
    seq2pat_pos, seq2pat_neg, min_freq_pos, min_freq_neg
)

elapsed = time() - start
print(f'DPM finished! Runtime: {elapsed:.4f} sec')

# Report how many patterns each aggregation produced
for aggregation, patterns in aggregation_to_patterns.items():
    num_patterns = len(patterns)
    print("Aggregation: ", aggregation, " with number of patterns: ", num_patterns)

## From Patterns to Encodings

In [None]:
start = time()

# The window size is the same for every aggregation, so look it up once
window_size = args['rolling_window_size']

for aggregation, patterns in aggregation_to_patterns.items():
    print("Aggregation: ", aggregation)

    # One-hot encode each sequence against each pattern of this aggregation,
    # subject to the constraints
    encodings = get_one_hot_encodings(sequences, patterns, constraints, window_size)

    # The clock starts before the loop, so this is the time elapsed since the
    # first aggregation began, not the time for this aggregation alone
    elapsed = time() - start
    print(f'Encoding finished! Runtime: {elapsed:.4f} sec')
    print(encodings.head())