In [1]:
from utils import *

%load_ext autoreload
%autoreload 2

In [2]:
# --- Experiment configuration ---

# number of random subsequences drawn as shapelet candidates
n_candidates = 3

# seeded random generator, so candidate sampling is reproducible
rdm = np.random.RandomState(123)

# sequence dataset to be turned into a single-valued representation;
# the placeholder 'z' encodes a missing sequence
seqs = ['aaabbaacc', 'bbbaaaccaa', 'aaaabcbcab', 'bcbcb'] + ['z'] * 5

# class labels, aligned position-by-position with seqs
# (first the observed sequences, then the missing ones)
labels = [1, 0, 1, 0] + [1, 1, 1, 0, 1]

# strategy used to handle missing values: 'lr' or 'plain'
missing = 'lr'

In [3]:
# Shapelet-based transformation: draw random candidate subsequences, score each
# one, keep the best (the "shapelet") and map every sequence to a single value.
# Reads `seqs`, `labels`, `n_candidates`, `rdm` and `missing` defined earlier;
# `get_random_subsequence`, `sliding_ed`, `evaluate_candidate` and `entropy`
# are expected to be provided by the star import from `utils`.

#fail fast on inconsistent configuration: an unknown strategy would otherwise
#silently fall into the 'plain' branch
if missing not in ('lr', 'plain'):
    raise ValueError("unknown missing-value strategy {!r}; expected 'lr' or 'plain'".format(missing))
if len(seqs) != len(labels):
    raise ValueError('seqs and labels must have the same length ({} != {})'.format(len(seqs), len(labels)))

#get non-empty seqs and their labels ('z' marks a missing sequence)
observed = [(s, l) for s, l in zip(seqs, labels) if s != 'z']
if not observed:
    raise ValueError("no observed sequences: every entry of seqs is 'z'")
actual_seqs, actual_labels = zip(*observed)

#generate unique candidates (duplicates collapse, so there may be fewer than
#n_candidates); sort them to get a deterministic evaluation order
candidates = sorted({get_random_subsequence(actual_seqs, rdm=rdm) for _ in range(n_candidates)})

#evaluate candidates according to 'lr' or 'plain' method
print('1) Candidate evaluation')
#entropy of the full label set, computed once instead of once per candidate
#NOTE(review): assumes entropy() is a pure function of its argument -- confirm in utils
prior_entropy = entropy(labels)
if missing == 'lr':
    #'lr': candidates are scored on the observed sequences only; the labels of
    #the missing sequences are handed over separately
    missing_data_labels = [l for s, l in zip(seqs, labels) if s == 'z']
    candidate_evals = [evaluate_candidate(c,
                                          [sliding_ed(s, c) for s in actual_seqs],
                                          actual_labels,
                                          prior_entropy,
                                          missing_data_labels=missing_data_labels
                                          ) for c in candidates]
else:#'plain'
    #'plain': every sequence, including the 'z' placeholders, is scored as is
    candidate_evals = [evaluate_candidate(c,
                                          [sliding_ed(s, c) for s in seqs],
                                          labels,
                                          prior_entropy,
                                          missing='plain') for c in candidates]

#select candidate (shapelet) yielding maximum information gain (to break ties, max margin and min length);
#min() returns the first minimal item, i.e. the same element a stable sort would place first
shapelet = min(candidate_evals, key=lambda e: (-e['ig'],#max ig
                                               -e['margin'],#max margin
                                               len(e['subseq'])))#min length

print('\n2) Shapelet selection')
print('selected shapelet:{} ig:{:.3f} margin:{}'.format(shapelet['subseq'], shapelet['ig'], shapelet['margin']))

#transform sequence dataset based on the selected shapelet
if missing == 'lr':
    #missing sequences take the value stored under the shapelet's 'z' key
    transformed_seqs = [shapelet['z'] if s == 'z' else sliding_ed(s, shapelet['subseq']) for s in seqs]
else:
    transformed_seqs = [sliding_ed(s, shapelet['subseq']) for s in seqs]

print('\n3) Dataset transformation')
print('transformed sequences:')
#use a real loop variable: binding `_` at cell level would override IPython's
#last-output variable
for pair in zip(seqs, transformed_seqs):
    print(pair)

1) Candidate evaluation

candidate: aaab
[(0, 1), (0, 1), (1, 0), (3, 0)]
split:((1, 1), (0, 0)) ig:0.458 margin:1
split:((1, 1, 0), (0,)) ig:0.197 margin:2
best split: {'subseq': 'aaab', 'ig': 0.45810589515712374, 'z': 0, 'margin': 1, 'threshold': 1, 'index': 0}

candidate: aab
[(0, 1), (0, 1), (1, 0), (2, 0)]
split:((1, 1), (0, 0)) ig:0.458 margin:1
split:((1, 1, 0), (0,)) ig:0.197 margin:1
best split: {'subseq': 'aab', 'ig': 0.45810589515712374, 'z': 0, 'margin': 1, 'threshold': 1, 'index': 0}

candidate: b
[(0, 0), (0, 0), (0, 1), (0, 1)]

2) Shapelet selection
selected shapelet:aab ig:0.458 margin:1

3) Dataset transformation
transformed sequences:
('aaabbaacc', 0)
('bbbaaaccaa', 1)
('aaaabcbcab', 0)
('bcbcb', 2)
('z', 0)
('z', 0)
('z', 0)
('z', 0)
('z', 0)
