In [1]:
import json
import numpy as np
import random
from tqdm.auto import tqdm
import itertools
import os
from copy import deepcopy
import matplotlib.pyplot as plt

In [2]:
def build_dicts(entities):
    """Assign consecutive integer ids to the distinct values in ``entities``.

    Ids follow first-occurrence order; a repeated value keeps the id of its
    first occurrence.

    Args:
        entities: iterable of hashable items (they are used as dict keys).

    Returns:
        ``(ind2entity, entity2ind)``: a list mapping id -> item and a dict
        mapping item -> id.
    """
    entity2ind = dict()
    ind2entity = []
    for entity in entities:
        # Test membership against the dict (O(1)); scanning the list here made
        # the whole function quadratic in the number of entities.
        if entity not in entity2ind:
            entity2ind[entity] = len(ind2entity)
            ind2entity.append(entity)
    return ind2entity, entity2ind

def choose(arr, ratio_or_count):
    """Return a uniformly random subset of ``arr``, sampled without replacement.

    Args:
        arr: sequence to sample from.
        ratio_or_count: a float is read as a fraction of ``len(arr)`` (rounded
            to the nearest count); an int is read as an absolute count. NumPy
            scalar ints/floats are accepted as well.

    Returns:
        A new list. When the requested count is at least ``len(arr)`` every
        element is returned in its original order; otherwise the elements come
        back in the order drawn by ``np.random.choice``.

    Raises:
        TypeError: if ``ratio_or_count`` is neither an int nor a float.
    """
    # bool is a subclass of int, but True/False is never a meaningful count.
    if isinstance(ratio_or_count, bool):
        raise TypeError("ratio_or_count must be an int or a float, got bool")
    if isinstance(ratio_or_count, (float, np.floating)):
        num = int(round(float(ratio_or_count) * len(arr)))
    elif isinstance(ratio_or_count, (int, np.integer)):
        num = int(ratio_or_count)
    else:
        raise TypeError(
            "ratio_or_count must be an int or a float, got {}".format(type(ratio_or_count).__name__)
        )
    if num >= len(arr):
        # Copy so the caller never receives an alias of its own input.
        return list(arr)
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    return [arr[i] for i in rand_inds]
    
def split(arr, ratio_or_count):
    """Randomly partition ``arr`` into a selected part and the remainder.

    Args:
        arr: sequence to partition.
        ratio_or_count: a float is read as a fraction of ``len(arr)`` (rounded
            to the nearest count); an int is read as an absolute count. NumPy
            scalar ints/floats are accepted as well. A count larger than
            ``len(arr)`` is clamped, so everything ends up in the selected part.

    Returns:
        ``[selected, rest]``: two lists that together contain every element of
        ``arr`` exactly once, each preserving the relative order of ``arr``.
        ``selected`` holds the randomly drawn elements.

    Raises:
        TypeError: if ``ratio_or_count`` is neither an int nor a float.
    """
    n_items = len(arr)
    # bool is a subclass of int, but True/False is never a meaningful count.
    if isinstance(ratio_or_count, bool):
        raise TypeError("ratio_or_count must be an int or a float, got bool")
    if isinstance(ratio_or_count, (float, np.floating)):
        num = int(round(float(ratio_or_count) * n_items))
    elif isinstance(ratio_or_count, (int, np.integer)):
        num = int(ratio_or_count)
    else:
        raise TypeError(
            "ratio_or_count must be an int or a float, got {}".format(type(ratio_or_count).__name__)
        )
    # Sampling without replacement cannot draw more than the population.
    num = min(num, n_items)

    # A set makes the membership test below O(1); a list made the loop O(n * num).
    selected_inds = set(np.random.choice(n_items, num, replace=False).tolist())
    selected, rest = [], []
    for i in tqdm(range(n_items)):
        if i in selected_inds:
            selected.append(arr[i])
        else:
            rest.append(arr[i])
    return [selected, rest]

def form_items(c, t, b=None, noise=0):
    """Build one example as an ``input_text`` / ``target_text`` pair.

    Args:
        c: list of token strings forming the query; callers pass ``[h, r]``
            for an atomic fact and ``[h, r1, r2]`` for a two-hop fact.
        t: answer (tail entity) token.
        b: bridge entity token. Only used when ``c`` has exactly three tokens.
        noise: value stored under ``"train_noise"`` on two-hop items.

    Returns:
        A dict. When ``c`` has three tokens and ``b`` is given, the target is
        the query followed by ``b``, ``t`` and ``"</a>"`` (the bridge entity is
        spelled out before the answer) and the dict also carries
        ``"train_noise"``. Otherwise the target is the query followed by ``t``
        and ``"</a>"``, and there is no ``"train_noise"`` key.
    """
    input_text = "".join(c)
    if len(c) == 3 and b is not None:
        # Two-hop item: h r1 r2 -> h r1 r2 b t </a>
        return {
            "input_text": input_text,
            "target_text": input_text + "".join([b, t, "</a>"]),
            "train_noise": noise,
        }
    return {
        "input_text": input_text,
        "target_text": input_text + "".join([t, "</a>"]),
    }

In [22]:
def build_dataset(num_entities, num_relations, out_degree=20, split_train_inferred=False,
                  ood_ratio=0.05, train_noise=True, lambda_noise=0.4, iid_test_ratio=0.005,
                  seed=None):
    """Generate a random knowledge graph plus its atomic and two-hop fact datasets.

    Every entity ``<e_i>`` gets ``out_degree`` distinct outgoing relations, each
    pointing at a uniformly random tail entity. Two-hop ("inferred") facts are
    all compositions of an atomic fact ``(h, r1, b)`` with an atomic fact
    ``(b, r2, t)``.

    Args:
        num_entities: number of entity tokens ``<e_0> .. <e_{n-1}>``.
        num_relations: number of relation tokens ``<r_0> .. <r_{m-1}>``.
        out_degree: atomic facts per head entity; must not exceed
            ``num_relations`` because relations are drawn without replacement.
        split_train_inferred: if False, return every atomic and two-hop fact
            unsplit; if True, produce the ID/OOD split described below.
        ood_ratio: fraction of atomic facts marked as OOD. A two-hop fact that
            uses any OOD atomic fact is never placed in training; it becomes an
            OOD test item only when both of its hops are OOD.
        train_noise: whether to corrupt part of the two-hop training facts.
        lambda_noise: probability that a two-hop training fact is corrupted
            (only used when ``train_noise`` is true).
        iid_test_ratio: probability that a two-hop fact built purely from ID
            atomic facts is held out as an in-distribution test item.
        seed: if given, seeds both ``np.random`` and ``random`` (global state)
            before anything is drawn.

    Returns:
        With ``split_train_inferred=False``:
            ``(entities, relations, atomic_facts, inferred_facts)``.
        With ``split_train_inferred=True``:
            ``(entities, relations, id_atomic_facts, ood_atomic_facts,
            train_inferred_facts, test_inferred_iid, test_inferred_ood,
            lambda_noise)``.
        All fact lists contain dicts produced by ``form_items``.

    Raises:
        ValueError: if ``out_degree`` exceeds ``num_relations``.
    """
    if out_degree > num_relations:
        raise ValueError(
            "out_degree ({}) cannot exceed num_relations ({})".format(out_degree, num_relations)
        )
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    relations = ["<r_{}>".format(i) for i in range(num_relations)]

    # Maps a head entity to its list of (r, t) pairs. Pre-filled so that an
    # entity without outgoing facts (out_degree == 0) does not raise KeyError.
    atomic_dict = {ent: [] for ent in entities}
    atomic_facts = []
    atomics = []

    for h in tqdm(entities):
        # For each head, pick distinct outgoing relations, each with a random tail.
        rel_inds = np.random.choice(num_relations, size=out_degree, replace=False).tolist()
        for rel_ind in rel_inds:
            r = relations[rel_ind]
            t = entities[np.random.randint(num_entities)]
            atomic_facts.append(form_items([h, r], t))
            atomics.append((h, r, t))
            atomic_dict[h].append((r, t))

    if not split_train_inferred:
        inferred_facts = []
        for ent in tqdm(entities):
            for (r1, b) in atomic_dict[ent]:
                for (r2, t) in atomic_dict[b]:
                    inferred_facts.append(form_items([ent, r1, r2], t, b))
        return entities, relations, atomic_facts, inferred_facts

    # Split atomic facts into OOD / ID. The ordered lists are used to build the
    # output (so the result does not depend on set/hash ordering); the set is
    # used only for fast membership tests.
    ood_atomics, id_atomics = split(atomics, round(len(atomics) * ood_ratio))
    ood_lookup = set(ood_atomics)

    id_atomic_facts = [form_items([h, r], t) for (h, r, t) in id_atomics]
    ood_atomic_facts = [form_items([h, r], t) for (h, r, t) in ood_atomics]

    noise_num = 0
    train_inferred_facts, test_inferred_iid, test_inferred_ood = [], [], []
    for ent in tqdm(entities):
        for (r1, b) in atomic_dict[ent]:
            first_hop_ood = (ent, r1, b) in ood_lookup
            for (r2, t) in atomic_dict[b]:
                second_hop_ood = (b, r2, t) in ood_lookup
                if first_hop_ood or second_hop_ood:
                    # Mixed ID/OOD compositions are dropped entirely.
                    if first_hop_ood and second_hop_ood:
                        test_inferred_ood.append(form_items([ent, r1, r2], t, b))
                    continue
                if np.random.uniform() <= iid_test_ratio:
                    test_inferred_iid.append(form_items([ent, r1, r2], t, b))
                    continue
                if train_noise and np.random.rand() <= lambda_noise:
                    # Second-hop noise: keep the bridge entity, replace the answer.
                    # NOTE(review): the random entity may coincide with the true
                    # answer, in which case the item is still flagged as noise.
                    noisy_t = random.choice(entities)
                    noise_num += 1
                    train_inferred_facts.append(form_items([ent, r1, r2], noisy_t, b, noise=1))
                else:
                    train_inferred_facts.append(form_items([ent, r1, r2], t, b))
    print(lambda_noise)
    print(noise_num)

    return entities, relations, id_atomic_facts, ood_atomic_facts, train_inferred_facts, test_inferred_iid, test_inferred_ood, lambda_noise
    
NUM_ENTITY_IN = 2000
NUM_RELATION = 200
SEED = 0

# The generation code draws from NumPy's global RNG and from the stdlib `random`
# module, so both are seeded to make the random draws repeatable across kernel
# restarts.
np.random.seed(SEED)
random.seed(SEED)

(
    train_entities,
    train_relations,
    id_atomic_facts,
    ood_atomic_facts,
    train_inferred_facts,
    test_inferred_iid,
    test_inferred_facts,  # two-hop test facts whose two hops are both OOD
    lambda_noise,
) = build_dataset(NUM_ENTITY_IN, NUM_RELATION, split_train_inferred=True)

  0%|          | 0/2000 [00:00<?, ?it/s]

100%|██████████| 2000/2000 [00:00<00:00, 7432.27it/s]
100%|██████████| 40000/40000 [00:00<00:00, 51268.42it/s]
100%|██████████| 2000/2000 [00:04<00:00, 418.54it/s]

0.4
300356





In [23]:
# Number of two-hop training facts before downsampling.
len(train_inferred_facts)

750886

In [24]:
# Vocabulary layout: entity tokens, then relation tokens, then the special tokens.
vocab = [
    *train_entities,
    *train_relations,
    # special tokens
    "<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>", "<b>",
]
# Every token must be unique.
assert len(set(vocab)) == len(vocab)
print("vocab size:", len(vocab))

vocab size: 2207


In [25]:
test_size = 3000

# Full set of atomic facts (ID followed by OOD).
all_atomics = [*id_atomic_facts, *ood_atomic_facts]

# Cap each evaluation set at `test_size` randomly chosen examples.
id_atomic_facts_ds = choose(id_atomic_facts, test_size)
ood_atomic_facts_ds = choose(ood_atomic_facts, test_size)
# NOTE: this rebinds the name returned by build_dataset to its downsampled version.
test_inferred_iid = choose(test_inferred_iid, test_size)
test_inferred_facts_ds = choose(test_inferred_facts, test_size)  # ood

len(id_atomic_facts)

38000

In [26]:
# Downsample the two-hop training facts to `phi` times the number of ID atomic
# facts, then write train / valid / test / vocab JSON files for that ratio.
# Other ratios tried: 18.0, 12.6, 9.0, 7.2, 5.4, 3.6
for phi in [7.2]:
    dataset_name = "composition1_{}_{}_{}_noise_{}".format(NUM_ENTITY_IN, NUM_RELATION, phi, lambda_noise)
    out_dir = "data/{}".format(dataset_name)
    os.makedirs(out_dir, exist_ok=True)
    train_inferred_facts_ds = choose(train_inferred_facts, round(phi * len(id_atomic_facts)))

    # (probe type, examples) in the order they are written to test.json.
    probe_groups = [
        ("id_atomic", id_atomic_facts_ds),
        ("ood_atomic", ood_atomic_facts_ds),
        ("train_inferred", choose(train_inferred_facts_ds, test_size)),
        ("test_inferred_iid", test_inferred_iid),
        ("test_inferred_ood", test_inferred_facts_ds),
    ]
    # Items are flat dicts of immutable values, so a shallow copy is enough to
    # tag each probe without touching the original example.
    probes = [
        {**item, "type": probe_type}
        for probe_type, items in probe_groups
        for item in items
    ]

    outputs = {
        "train.json": all_atomics + train_inferred_facts_ds,  # all atomic facts + downsampled two-hop facts
        "valid.json": test_inferred_facts_ds,                 # two-hop OOD facts
        "test.json": probes,                                  # every probe group, tagged by "type"
        "vocab.json": vocab,
    }
    for file_name, payload in outputs.items():
        with open("{}/{}".format(out_dir, file_name), "w", encoding='utf-8') as f:
            json.dump(payload, f)