In [14]:
import random
import pickle
from collections import namedtuple 
from sklearn.model_selection import train_test_split
import numpy as np

# Record type for one state change: which entity, which of its properties, and the value.
# NOTE(review): presumably the pickled data below holds instances of this class, in which
# case it must be defined before unpickling -- confirm against the script that wrote the file.
Transition = namedtuple('Transition', 'entity property value')

In [15]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
with open('data/tuples_v2_with_add_info.pkl', 'rb') as f:
    d = pickle.load(f)

data = d['data']
topics_map = d['topics_map']
all_transitions = d['transitions']

# X holds the values of `data`; y holds the topic looked up for each key, in the
# same order, so X[i] and y[i] belong together.
X = list(data.values())
y = [topics_map[key] for key in data]


In [16]:
# Stratified train/val/test split. 15% of the data is held out for test, then
# 10% of the remaining 85% becomes validation, so the overall ratio is roughly
# 0.765 : 0.085 : 0.15 (not exactly 0.75 : 0.1 : 0.15).
X_train, X_test, y_train, _ = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
    shuffle=True,
)
# The labels of the second split are not kept; only the features are needed below.
X_train, X_val = train_test_split(
    X_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
    shuffle=True,
)


In [17]:
# Number of items in the train / val / test splits.
tuple(map(len, (X_train, X_val, X_test)))

(950, 106, 187)

In [18]:
def traverse(procedure):
    """Flatten a procedure (an iterable of steps) into one list of transitions.

    Each step is itself an iterable of transitions; the result keeps the steps
    in order and, within a step, the transitions in order.
    """
    flattened = []
    for step in procedure:
        flattened.extend(step)
    return flattened

def get_traversals(procedures):
    """Flatten every procedure in ``procedures`` with ``traverse``.

    Returns a list with one flat list of transitions per input procedure, in
    input order.
    """
    return list(map(traverse, procedures))
    

In [19]:
# Sanity check: four steps holding 1, 2, 3 and 1 transitions should flatten
# into a single list of 7 items, in order.
traverse([
    ['1.1'],
    ['2.1', '2.2'],
    ['3.1', '3.2', '3.3'],
    ['4.1']
])

['1.1', '2.1', '2.2', '3.1', '3.2', '3.3', '4.1']

In [20]:
# Sanity check: each of the two procedures is flattened independently, giving
# one flat list per procedure (6 and 4 items respectively).
get_traversals([
    [
        ['1.1'],
        ['2.1', '2.2'],
        ['3.1', '3.2', '3.3'],
    ],
    [
        ['1.1'],
        ['2.1', '2.2','2.3'],
    ]
])

[['1.1', '2.1', '2.2', '3.1', '3.2', '3.3'], ['1.1', '2.1', '2.2', '2.3']]

In [21]:
# Flatten every split from step-structured procedures into flat transition lists.
train, val, test = map(get_traversals, (X_train, X_val, X_test))

In [22]:
# Persist the flattened splits as one dict keyed by split name.
with open('processed_data/traversed_data.pkl', 'wb') as f:
    pickle.dump(dict(train=train, val=val, test=test), f)

In [23]:
def _add_procedure_to_dataset(procedure, length, negatives, result):
    # print(procedure, length, negatives)
    assert 0 < length < len(procedure), "Length of procedure must be between one and one less than the length of the procedure"
    negatives_length = len(negatives) - (len(procedure) - length) + 1
    result.append({
        'given': procedure[:length],
        'next': procedure[length:] + negatives[:negatives_length],
        'answer': 0,
    })


In [24]:


# def sampling1(data):
#     random.seed(42)
#     result = []
#     for procedure in data:
#         # randomly sample 5 transitions that is not in current procedure
#         negative_transitions = [t for t in all_transitions if t not in set(procedure)]
#         sampled_transitions = random.sample(negative_transitions, 5)

#         if len(procedure) <= 1:
#             continue
#         elif len(procedure) <= 2:
#             result.append({'given': [procedure[0]], 'next': [procedure[1]]+sampled_transitions, 'answer': 0})
#         elif len(procedure) <= 3:
#             result.append({'given': [procedure[0], procedure[1]], 'next': [procedure[2]]+sampled_transitions, 'answer': 0})
#         elif len(procedure) <= 4:
#             # generate a random number between 2 and 3 with probability 0.9 and 0.1
#             if random.random() <= 0.15:
#                 result.append({'given': [procedure[0], procedure[1]], 'next': [procedure[2], procedure[3]]+sampled_transitions[:4], 'answer': 0})
#             result.append({'given': [procedure[0], procedure[1], procedure[2]], 'next': [procedure[3]]+sampled_transitions, 'answer': 0})
#         elif len(procedure) <= 5:
#             if random.random() <= 0.15:
#                 result.append({'given': [procedure[0], procedure[1]], 'next': [procedure[2], procedure[3], procedure[4]]+sampled_transitions[:3], 'answer': 0})
#             elif random.random() <= 0.3:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2]], 'next': [procedure[3], procedure[4]]+sampled_transitions[:4], 'answer': 0})
#             result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3]], 'next': [procedure[4]]+sampled_transitions, 'answer': 0})
#         elif len(procedure) <= 6:
#             if random.random() <= 0.15:
#                 result.append({'given': [procedure[0], procedure[1]], 'next': [procedure[2], procedure[3], procedure[4], procedure[5]]+sampled_transitions[:2], 'answer': 0})
#             elif random.random() <= 0.3:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2]], 'next': [procedure[3], procedure[4], procedure[5]]+sampled_transitions[:3], 'answer': 0})
#             elif random.random() <= 0.45:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3]], 'next': [procedure[4], procedure[5]]+sampled_transitions[:4], 'answer': 0})
#             result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3], procedure[4]], 'next': [procedure[5]]+sampled_transitions, 'answer': 0})
#         elif len(procedure) <= 7:
#             if random.random() <= 0.15:
#                 result.append({'given': [procedure[0], procedure[1]], 'next': [procedure[2], procedure[3], procedure[4], procedure[5], procedure[6]]+sampled_transitions[:1], 'answer': 0})
#             elif random.random() <= 0.3:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2]], 'next': [procedure[3], procedure[4], procedure[5], procedure[6]]+sampled_transitions[:2], 'answer': 0})
#             elif random.random() <= 0.45:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3]], 'next': [procedure[4], procedure[5], procedure[6]]+sampled_transitions[:3], 'answer': 0})
#             elif random.random() <= 0.6:
#                 result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3], procedure[4]], 'next': [procedure[5], procedure[6]]+sampled_transitions[:4], 'answer': 0})
#             result.append({'given': [procedure[0], procedure[1], procedure[2], procedure[3], procedure[4], procedure[5]], 'next': [procedure[6]]+sampled_transitions, 'answer': 0})
#         else:
#             # randomly choose the number of steps to retain
#             retain_num = random.randint(4, len(procedure)-1)
#             result.append({'given': [procedure[i] for i in range(retain_num)], 'next': ([procedure[i] for i in range(retain_num, len(procedure))]+sampled_transitions)[:6], 'answer': 0})
#             retain_num_2 = random.randint(4, len(procedure)-1)
#             while retain_num_2 == retain_num:
#                 retain_num_2 = random.randint(5, len(procedure)-1)
#             result.append({'given': [procedure[i] for i in range(retain_num_2)], 'next': ([procedure[i] for i in range(retain_num_2, len(procedure))]+sampled_transitions)[:6], 'answer': 0})
#     return result


In [25]:


def sampling(data, lower_bound=3, upper_bound=7, sampling_prob_1=0.25):
    """Turn flat procedures into next-transition examples with sampled negatives.

    For every procedure, 5 transitions that do not occur in it are drawn from
    the module-level ``all_transitions``. Examples are then produced according
    to the procedure's length ``n``:

    * ``n <= 1``: skipped.
    * ``2 <= n <= lower_bound``: one example whose context is everything but
      the last transition.
    * ``lower_bound < n <= upper_bound``: the "all but last" example, preceded
      by at most one extra example with a shorter context. A single uniform
      draw ``u`` is compared against ``(g - lower_bound + 2) * sampling_prob_1``
      for context lengths ``g = lower_bound - 1, ..., n - 2``; the first ``g``
      whose threshold exceeds ``u`` is used.
    * ``n > upper_bound``: two examples with different random context lengths;
      the candidate list is the remaining transitions followed by the
      negatives, cut to 6 entries.

    Args:
        data: iterable of flat procedures (lists of hashable transitions).
        lower_bound: largest length handled by the single-example bucket; must
            be at least 2.
        upper_bound: largest length handled by the probabilistic bucket.
        sampling_prob_1: width of each probability band for the extra example.

    Returns:
        List of dicts with keys ``'given'``, ``'next'`` and ``'answer'``
        (``'answer'`` is always 0).

    Raises:
        AssertionError: if ``lower_bound < 2``.
        ValueError: from ``random.sample`` if fewer than 5 transitions of
            ``all_transitions`` are absent from a procedure.

    Side effects:
        Re-seeds the global ``random`` generator with 42 on every call, so the
        output is deterministic for a given input.
    """
    # @NOTE: length buckets are [1,1], [2,lower_bound], [lower_bound+1,upper_bound], [upper_bound+1,*]
    assert lower_bound >= 2
    random.seed(42)
    result = []
    for procedure in data:
        # Randomly sample 5 transitions that are not in the current procedure.
        # The membership set is built once per procedure rather than once per
        # candidate transition; the filtered list (and so the draw) is unchanged.
        in_procedure = set(procedure)
        negative_transitions = [t for t in all_transitions if t not in in_procedure]
        # Drawn before the length check on purpose: skipping the draw for short
        # procedures would shift the random stream and change every later example.
        sampled_transitions = random.sample(negative_transitions, 5)
        proc_length = len(procedure)
        if proc_length <= 1:
            continue
        elif proc_length <= lower_bound:
            _add_procedure_to_dataset(procedure, proc_length - 1, sampled_transitions, result)
        elif proc_length <= upper_bound:
            random_num = random.random()
            for given_length in range(lower_bound - 1, proc_length - 1):
                # Cumulative bands: each context length owns a slice of width
                # sampling_prob_1 of the unit interval.
                if random_num < (given_length - lower_bound + 2) * sampling_prob_1:
                    _add_procedure_to_dataset(procedure, given_length, sampled_transitions, result)
                    break
            _add_procedure_to_dataset(procedure, proc_length - 1, sampled_transitions, result)
        else:
            # Randomly choose two different numbers of steps to retain.
            # NOTE(review): the bounds 4/5 and the cut-off 6 are hard-coded and
            # appear to assume upper_bound >= 6; with shorter procedures the
            # randint ranges can be empty or the redraw loop may never find a
            # different value -- confirm before lowering upper_bound.
            retain_num = random.randint(4, proc_length - 1)
            retain_num_2 = random.randint(4, proc_length - 1)
            while retain_num_2 == retain_num:
                # NOTE(review): the redraw starts at 5 while the first draw
                # starts at 4. Left as is because changing it would alter the
                # seeded output; confirm whether this is intended.
                retain_num_2 = random.randint(5, proc_length - 1)
            for keep in (retain_num, retain_num_2):
                result.append({
                    'given': list(procedure[:keep]),
                    'next': (list(procedure[keep:]) + sampled_transitions)[:6],
                    'answer': 0,
                })
    return result


In [26]:
# Build the sampled examples for each split (sampling re-seeds itself on every call).
_train, _val, _test = map(sampling, (train, val, test))


In [27]:
# Number of sampled examples in the train / val / test splits.
tuple(map(len, (_train, _val, _test)))

(1348, 162, 266)

In [28]:
def get_distribution_of_lengths(arr):
    """Count how many examples in ``arr`` have each context length.

    Each item of ``arr`` must have a ``'given'`` entry supporting ``len``.
    Returns the ``np.bincount`` of those lengths: position ``i`` holds the
    number of examples whose ``'given'`` has exactly ``i`` elements.
    """
    given_sizes = []
    for example in arr:
        given_sizes.append(len(example['given']))
    return np.bincount(given_sizes)

## get_distribution_of_lengths(_train), get_distribution_of_lengths(_val), get_distribution_of_lengths(_test)

In [29]:
# Split sizes before sampling (procedures) vs. after sampling (examples).
tuple(map(len, (X_train, X_val, X_test))), tuple(map(len, (_train, _val, _test)))

((950, 106, 187), (1348, 162, 266))

In [30]:
# _train[0]

In [31]:
 def test_delimiters(delimiter='###'):
    """Print every transition in ``X`` that has a field containing ``delimiter``.

    Intended as a check that a separator string does not already occur in the
    data. The default ``'###'`` is what this check has always looked for; note
    that ``to_text`` in this notebook separates fields with ``':::'``, so pass
    ``delimiter=':::'`` to validate the data against that separator.

    Reads the module-level ``X`` and flattens it with ``get_traversals``.
    Assumes every field of a transition is a string -- TODO confirm; a
    non-iterable field makes the ``in`` test raise ``TypeError``.
    """
    data_ = get_traversals(X)

    # Report each offending transition once, however many of its fields match.
    for procedure in data_:
        for transition in procedure:
            if any(delimiter in element for element in transition):
                print(transition)
    
# test_delimiters()

In [32]:
def to_text(data):
    """Serialise sampled examples into tab-separated text lines, one per candidate.

    Every transition is written as ``entity:::property:::value``. The context
    transitions of an example are joined with ``' ::::: '``. Each line has the
    form ``<context>\\t:::::::\\t<candidate>\\t:::::::\\t<label>`` where the label
    is ``1`` for the candidate whose index equals the example's ``'answer'``
    and ``0`` otherwise.
    """
    lines = []
    for example in data:
        context = ' ::::: '.join(
            f'{step.entity}:::{step.property}:::{step.value}' for step in example['given']
        )
        for position, candidate in enumerate(example['next']):
            label = str(int(position == example["answer"]))
            candidate_text = f'{candidate.entity}:::{candidate.property}:::{candidate.value}'
            lines.append(f'{context}\t:::::::\t{candidate_text}\t:::::::\t{label}')
    return lines



In [33]:
## print(*to_text([_train[0]]), sep='\n')

In [34]:
# # store the data
# with open('processed_data/sampled.0_25.pkl', 'wb') as f:
#     pickle.dump({'train': _train, 'val': _val, 'test': _test}, f)

In [35]:
def pretty_print_array(arr):
    """Print each element of ``arr`` on its own line as ``<index>\\t<element>``."""
    position = 0
    for entry in arr:
        print('\t'.join((str(position), str(entry))))
        position += 1


In [36]:


# Distribution of context ('given') lengths in the sampled training set:
# left column = length, right column = number of examples with that length.
pretty_print_array(get_distribution_of_lengths(_train))

0	0
1	101
2	242
3	232
4	247
5	190
6	160
7	57
8	43
9	25
10	18
11	11
12	9
13	5
14	3
15	2
16	0
17	0
18	0
19	0
20	0
21	1
22	0
23	1
24	0
25	0
26	0
27	0
28	0
29	0
30	0
31	0
32	0
33	0
34	0
35	1
