In [1]:
import random
import pickle
from collections import namedtuple 
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

import itertools

# One state change recorded in a procedure step: `entity` gets `value` for `property`,
# e.g. Transition(entity='xbox', property='isOpened', value='True').
# NOTE(review): the pickle loaded in the next cell presumably stores instances of this
# type, in which case it must be defined here before `pickle.load` runs -- confirm.
Transition = namedtuple('Transition', ['entity', 'property', 'value'])

In [2]:
# Load the raw dataset.
# SECURITY: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open('data/tuples_v2_with_add_info.pkl', 'rb') as f:
    d = pickle.load(f)
    data = d['data']                      # mapping: procedure key -> procedure (list of steps)
    topics_map = d['topics_map']          # mapping: procedure key -> topic/category label
    all_transitions_ = d['transitions']   # global collection of transitions (negatives are drawn from it later)

# Parallel lists, one entry per procedure:
#   X -- the procedure itself, y -- its category label, z -- '<category>/<procedure key>' name
X, y, z = [], [], []

for k, v in data.items():
    z.append(f'{topics_map[k]}/{k}')
    y.append(topics_map[k])
    X.append(v)

len(all_transitions_)

3235

In [3]:
# Count, per category, how often each transition occurs across all procedures.
# Result shape: {category: {transition: occurrence count}}.
# NOTE: this is `all_transitions` (no trailing underscore), distinct from the flat
# `all_transitions_` loaded from the pickle. It is built BEFORE the length filter below,
# so it also includes transitions from procedures that are dropped later.
all_transitions = {}

for cat, proc in zip(z, X):
    # z entries look like '<category>/<procedure key>'; keep only the category part
    cat = cat.split('/')[0]
    if cat not in all_transitions:
        all_transitions[cat] = {}
    for step in proc:
        for transition in step:
            all_transitions[cat][transition] = all_transitions[cat].get(transition, 0) + 1


all_transitions.keys()


dict_keys(['Controllers', 'Headphones', 'Keyboards', 'Linux', 'Mac', 'Microphones', 'Monitors', 'OS', 'OSX', 'Printers', 'Scanners', 'Ubuntu', 'Webcams', 'Windows', 'Windows10'])

In [12]:
# Ad-hoc inspection: list every (category, transition) pair whose entity is 'uninstall'.
for k, v in all_transitions.items():
    # print(k)
    for x in v:
        if x.entity=='uninstall':
            print(k, x)

Windows Transition(entity='uninstall', property='isSettingsChanged', value='True')
Windows10 Transition(entity='uninstall', property='isOpened', value='True')
Windows10 Transition(entity='uninstall', property='isSettingsChanged', value='True')


In [6]:
# Drop if there are more than `thresh` transitions in a single step
# (the WHOLE procedure is dropped as soon as one of its steps is too long).
# NOTE(review): presumably this bounds the number of per-step permutations generated
# later by `traverse_all_combinations` -- confirm.

X_, y_, z_ = [], [], []
thresh = 5

for i, j, k in zip(X,y,z):
    ignore = False
    for s in i:
        if len(s) > thresh:
            # report only the first offending step, then stop scanning this procedure
            print(f'Length {len(s)} > {thresh}. Skipping...')
            ignore = True
            break
    if not ignore:
        X_.append(i)
        y_.append(j)
        z_.append(k)



print(f'\n\n{len(X)} procedures --> {len(X_)}')

# Rebind the working lists to the filtered versions; the unfiltered lists are no longer
# reachable under these names after this cell runs.
X, y, z = X_, y_, z_


Length 8 > 5. Skipping...
Length 6 > 5. Skipping...
Length 15 > 5. Skipping...
Length 6 > 5. Skipping...
Length 7 > 5. Skipping...
Length 11 > 5. Skipping...
Length 9 > 5. Skipping...
Length 8 > 5. Skipping...
Length 6 > 5. Skipping...
Length 7 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...
Length 11 > 5. Skipping...
Length 6 > 5. Skipping...
Length 7 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...
Length 6 > 5. Skipping...


1243 procedures --> 1222


In [7]:
# Get a stratified train/val/test split.
# The test set is 15% of all procedures; the validation set is 10% of the REMAINING 85%,
# so the effective ratio is roughly 0.765 : 0.085 : 0.15 (not 0.75 : 0.1 : 0.15).
# Both splits are stratified on the category label and use a fixed random_state.
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(X, y, z, test_size=0.15, random_state=42, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val, z_train, z_val = train_test_split(X_train, y_train, z_train, test_size=0.1, random_state=42, stratify=y_train, shuffle=True)


len(X_train), len(X_val), len(X_test)

(934, 104, 184)

In [8]:
z_val[30]

'Ubuntu/686_1___Write_a_Shell_Script_Using_Bash_Shell_in_Ubuntu___Steps'

In [9]:
# sample

X_train[0]

[[Transition(entity='start', property='isOpened', value='True'),
  Transition(entity='xbox', property='isOpened', value='True')],
 [Transition(entity='setting', property='isOpened', value='True')],
 [Transition(entity='game dvr', property='isOpened', value='True')],
 [Transition(entity='game dvr', property='isSettingsChanged', value='True')],
 [Transition(entity='open folder', property='isOpened', value='True')]]

In [10]:
def traverse(procedure):
    '''
    Flatten a procedure (a sequence of steps, each a sequence of transitions) into a
    single tuple of transitions, keeping step order and the order within each step.
    '''
    return tuple(itertools.chain.from_iterable(procedure))

def get_traversals(procedures):
    '''
    Flatten every procedure in `procedures` with `traverse`.

    Returns a list with one flat tuple of transitions per input procedure, in input order.
    '''
    flattened = []
    for procedure in procedures:
        flattened.append(traverse(procedure))
    return flattened
    

In [11]:
def traverse_all_combinations(procedure):
    '''
    Enumerate every flat traversal of `procedure` obtained by permuting the transitions
    inside each step while keeping the steps themselves in order.

    Args:
        procedure: sequence of steps, each step a sequence of transitions.

    Returns:
        A list of tuples, one per combination of per-step orderings. The first entry is
        the traversal in the original order; later steps vary fastest.
        An empty procedure yields `[()]` (a single empty traversal) instead of `None`,
        so the result can always be iterated / chained by callers.
    '''
    # All orderings of each step, computed once per step.
    orderings_per_step = [list(itertools.permutations(step)) for step in procedure]

    # The cartesian product picks one ordering per step; concatenating the picks
    # gives one flat traversal.
    return [
        tuple(itertools.chain.from_iterable(choice))
        for choice in itertools.product(*orderings_per_step)
    ]

def get_traversals_all_combinations(procedures):
    '''
    Collect, into one flat list, every traversal produced by `traverse_all_combinations`
    for every procedure in `procedures` (a tqdm progress bar tracks the procedures).
    '''
    per_procedure = [traverse_all_combinations(procedure) for procedure in tqdm(procedures)]
    return list(itertools.chain.from_iterable(per_procedure))
    

In [12]:
# traverse([
#     ['1.1'],
#     ['2.1', '2.2'],
#     ['3.1', '3.2', '3.3'],
#     ['4.1']
# ])

In [13]:
# traverse_all_combinations([
#     ['1.1'],
#     ['2.1', '2.2'],
#     ['3.1', '3.2', '3.3'],
#     ['4.1']
# ])

In [14]:
# get_traversals([
#     [
#         ['1.1'],
#         ['2.1', '2.2', '2.3'],
#         ['3.1', '3.2'],
#     ],
#     [
#         ['1.1', '1.2'],
#         ['2.1', '2.2','2.3'],
#     ]
# ])

In [15]:
# get_traversals_all_combinations([
#     [
#         ['1.1'],
#         ['2.1'],
#         ['3.1', '3.2'],
#     ],
#     [
#         ['1.1', '1.2'],
#         ['2.1', '2.2','2.3'],
#     ]
# ])

In [16]:
# get one traversal per procedure: each split becomes a list of flat tuples of
# transitions, using the original order of transitions inside every step

train = get_traversals(X_train)
val = get_traversals(X_val)
test = get_traversals(X_test)

In [17]:
# Every within-step ordering of every procedure, per split.
train_all_combinations = get_traversals_all_combinations(X_train)
val_all_combinations = get_traversals_all_combinations(X_val)
test_all_combinations = get_traversals_all_combinations(X_test)

# Pool of traversals from ALL three splits. `_add_procedure_to_dataset` reads this global
# for the 'val'/'test' splits to find which transitions are known to follow a given prefix.
all_transitions_all_combinations = list(train_all_combinations + val_all_combinations + test_all_combinations)

len(train_all_combinations), len(val_all_combinations), len(test_all_combinations)

100%|██████████| 934/934 [00:05<00:00, 178.20it/s]
100%|██████████| 104/104 [00:00<00:00, 1436.13it/s]
100%|██████████| 184/184 [00:00<00:00, 6240.21it/s]


(721133, 2872, 29928)

In [18]:
len(all_transitions), len(all_transitions_all_combinations)

(15, 753933)

In [19]:
# sample
train[0]

(Transition(entity='start', property='isOpened', value='True'),
 Transition(entity='xbox', property='isOpened', value='True'),
 Transition(entity='setting', property='isOpened', value='True'),
 Transition(entity='game dvr', property='isOpened', value='True'),
 Transition(entity='game dvr', property='isSettingsChanged', value='True'),
 Transition(entity='open folder', property='isOpened', value='True'))

In [20]:
# # store the data
# with open('processed_data/traversed_data.pkl', 'wb') as f:
#     pickle.dump({'train': train, 'val': val, 'test': test}, f)

## Start dataset preparation

In [21]:
def _add_procedure_to_dataset(procedure, length, result, negative_transitions, procedure_name, num_negatives=5, split='train', cat=None):
    '''
    Given a `procedure`(format: [s1, s2, ..., sn]), add the `length`-length prefix to `results` with
        `num_negatives` as the universe of negative samples in case of `split` in ['train'] and 
        `all_transitions` as the universe of negtaives when `split` in ['val', 'test'].
    '''
    assert 0 < length < len(procedure), "Length of procedure must be between one and one less than the length of the procedure"
    assert split in ['train', 'val', 'test'], "Split must be one of 'train', 'val', or 'test'"

    candidates = [procedure[length]]

    if split == 'train':
        if len(procedure) - length - 1 > num_negatives:
            candidates.extend(random.sample(procedure[length+1:], num_negatives))
        else:
            candidates.extend(procedure[length+1:])
            num_to_sample = num_negatives - len(procedure[length+1:])
            if num_to_sample > 0:
                candidates.extend(random.sample(negative_transitions, num_to_sample))
    else:
        # for testing... (uncomment this block)
        # all_transitions = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0, '8': 0, '9': 0, '10': 0}
        # all_transitions_all_combinations = [
        #     ['1', '2', '3', '4', '5', '6', '7', '8'],
        #     ['10', '1', '2', '3', '4'],
        #     ['1', '2', '4']
        # ]
        negatives = set(all_transitions[cat].keys()).copy()
        count = 0
        for x in all_transitions_all_combinations:
            if x[:length] == procedure[:length]:
                if len(x) > length and x[length] in negatives:
                    negatives.remove(x[length])
                    count += 1
        # print(f'Removed {count} negatives')
        candidates.extend(list(negatives))
            
                

    result.append({
        'given': procedure[:length],
        'next': candidates,
        'answer': 0,
        'procedure_name': procedure_name,
    })

    if split == 'train':
        assert len(result[-1]['next']) == num_negatives + 1, "Candidate length must be equal to the number of negatives + 1"


In [22]:
# # testing...
# # uncomment the block in the `_add_procedure_to_dataset()` function if used

# random.seed(42)
# procedure = ['1', '2', '3', '4', '5', '6', '7', '8']

# result = []
# _add_procedure_to_dataset(procedure, 2, result, all_transitions.keys(), split='test')

# result

In [23]:
def sampling(data, procedure_names, lower_bound=3, upper_bound=7, sampling_prob_1=0.25, split='train'):
    '''
    Build the list of next-transition examples for one split.

    For every procedure with at least two transitions:
      * the longest prefix (all transitions but the last) is always added as an example;
      * every prefix length in [lower_bound - 1, len(procedure) - 2] is additionally added
        with probability `sampling_prob_1`.

    The global `random` generator is re-seeded with 42 on every call, so repeated calls
    with the same inputs sample the same prefixes.

    Args:
        data: flat traversals (one sequence of transitions per procedure).
        procedure_names: names aligned with `data`, formatted '<category>/<name>'; the
            category part is forwarded to `_add_procedure_to_dataset` as `cat`.
        lower_bound: smallest sampled prefix length plus one; must be >= 2.
        upper_bound: unused; kept only so existing calls keep working.
        sampling_prob_1: probability of keeping each additional prefix length.
        split: 'train', 'val' or 'test'; forwarded to `_add_procedure_to_dataset`.

    Returns:
        List of example dicts as appended by `_add_procedure_to_dataset`.
    '''

    assert lower_bound >= 2
    random.seed(42)
    result = []

    for procedure, procedure_name in zip(tqdm(data), procedure_names):
        proc_length = len(procedure)
        if proc_length <= 1:
            # nothing to predict: there is no transition after a non-empty prefix
            continue

        # Negatives may be any known transition that does not occur in this procedure.
        # The membership set is built once per procedure, not once per candidate.
        own_transitions = set(procedure)
        negative_transitions = [t for t in all_transitions_ if t not in own_transitions]

        cat = procedure_name.split('/')[0]

        # always add the (length - 1)-prefix as "given sequence"
        _add_procedure_to_dataset(procedure, proc_length - 1, result, negative_transitions, procedure_name, split=split, cat=cat)

        # randomly keep some of the shorter prefixes as well
        for given_length in range(lower_bound - 1, proc_length - 1):
            if random.random() < sampling_prob_1:
                _add_procedure_to_dataset(procedure, given_length, result, negative_transitions, procedure_name, split=split, cat=cat)

    return result


In [24]:
# Build the examples for each split. The 'val' and 'test' calls are much slower than
# 'train' because every example scans `all_transitions_all_combinations`.
_train = sampling(train, z_train, split='train')
_val = sampling(val, z_val, split='val')
_test = sampling(test, z_test, split='test')

len(_train), len(_val), len(_test)


100%|██████████| 934/934 [00:02<00:00, 329.32it/s]
100%|██████████| 104/104 [01:03<00:00,  1.64it/s]
100%|██████████| 184/184 [01:33<00:00,  1.98it/s]


(1504, 179, 290)

In [25]:
# # sanity check
# for i, x in enumerate(_val):
#     if len(x['next']) != 3235:
#         print(i, len(x['next']))

In [26]:
_train[0]['next']

[Transition(entity='open folder', property='isOpened', value='True'),
 Transition(entity='configure unallocated disk space', property='isOpened', value='True'),
 Transition(entity='netmask', property='isSettingsChanged', value='True'),
 Transition(entity='" change you language preference " menu', property='isOpened', value='True'),
 Transition(entity='microsoft app store', property='isOpened', value='True'),
 Transition(entity='mainmenu.xib', property='isOpened', value='True')]

In [27]:
# store the data: one pickle holding the three example lists under 'train', 'val', 'test'
# (the 'processed_data/' directory must already exist; open() does not create it)
with open('processed_data/sampled.0_25.new.cat_constrained.pkl', 'wb') as f:
    pickle.dump({'train': _train, 'val': _val, 'test': _test}, f)

### Some Analysis

In [23]:
(len(X_train), len(X_val), len(X_test)), (len(_train), len(_val), len(_test))

((934, 104, 184), (1504, 179, 290))

In [24]:
def get_distribution_of_lengths(arr):
    '''
    Histogram of prefix lengths: entry `i` of the returned array is the number of
    examples in `arr` whose 'given' sequence has length `i`.
    '''
    given_lengths = []
    for example in arr:
        given_lengths.append(len(example['given']))
    return np.bincount(given_lengths)

## get_distribution_of_lengths(_train), get_distribution_of_lengths(_val), get_distribution_of_lengths(_test)

In [25]:
#  def test_delimiters():
#     data_ = get_traversals(X)

#     # check if ':::' present in data
#     for procedure in data_:
#         for transition in procedure:
#             for elements in transition:
#                 if '###' in elements:
#                     print(transition)
#                     break
    
# # test_delimiters()

In [26]:
# # in case we want to generate a text file
# def to_text(data):
#     result = []
#     for procedure in data:
#         datapoints = ' ::::: '.join([f'{x.entity}:::{x.property}:::{x.value}' for x in procedure['given']])
#         for idx, next in enumerate(procedure['next']):
#             result.append(f'{datapoints}\t:::::::\t{next.entity}:::{next.property}:::{next.value}\t:::::::\t{str(int(idx==procedure["answer"]))}')
#     return result
#
# # print(*to_text([_train[0]]), sep='\n')

In [27]:
def pretty_print_array(arr):
    '''Print one `index;value` line per element of `arr`.'''
    position = 0
    for value in arr:
        print(';'.join((str(position), str(value))))
        position += 1

pretty_print_array(get_distribution_of_lengths(_train))

0;0
1;99
2;294
3;285
4;232
5;174
6;125
7;83
8;67
9;38
10;30
11;22
12;16
13;12
14;8
15;7
16;2
17;4
18;1
19;2
20;1
21;1
22;0
23;1
