# Simple BC task:

- p = A, B
- h = A [mask] B
    - [mask] = and 

- p = A, C
- h = A [mask] B
    - [mask] = or 

#### In this notebook we will create a df with columns 'sentence1', 'sentence2', 'sentence2_masked' 'label', and a txt for training

In [1]:
import pandas as pd
import numpy as np
from inference.text_generation.vocab import male_names, female_names, cities_and_states, countries
from inference.text_generation.util import get_new_item, get_n_different_items
from inference.text_generation.util import vi, not_vi

In [2]:
def and_entailment(person_list,
                   place_list,
                   n,
                   vi_function,
                   not_vi_function,
                   mask_token="[MASK]"):
    """
    $P:= pm V(x_1, y_1) , dots, pm V(x_n, y_n)$
    $H:= pm V(x_i, y_i) and pm V(x_j, y_j)$
    """
    Subjects = get_n_different_items(person_list, n)
    people_O = [get_new_item(Subjects, person_list) for _ in range(n)]
    places = get_n_different_items(place_list, n)
    Objects = get_n_different_items(people_O + places, n)
    fs = np.random.choice([vi_function, not_vi_function], n)
    sentence1 = [f(x, y) for f, x, y in zip(fs, Subjects, Objects)]
    ids = get_n_different_items(range(len(Subjects)), 2)
    sentence2 = sentence1[ids[0]] + " and " + sentence1[ids[1]]
    sentence2_masked = sentence1[ids[0]] + " {} ".format(mask_token) + sentence1[ids[1]] 
    sentence1 = ", ".join(sentence1)
    sentence1 += "." 
    label = "and"
    people_O = list(set(Objects).intersection(people_O))
    places = list(set(Objects).intersection(places))
    people = ", ".join(Subjects + people_O)
    Subjects = ", ".join(Subjects)
    Objects = ", ".join(Objects)
    places = ", ".join(places)
    ids.sort()
    ids = ", ".join(map(lambda x: str(x), ids))

    return sentence1, sentence2, sentence2_masked, label, Subjects, Objects, ids, people, places



In [3]:
and_entailment(person_list=male_names,
               place_list=cities_and_states,
               n=2,
               vi_function=vi,
               not_vi_function=not_vi)

("Andy didn't visit Donald, Cory has visited Olympia.",
 "Cory has visited Olympia and Andy didn't visit Donald",
 "Cory has visited Olympia [MASK] Andy didn't visit Donald",
 'and',
 'Andy, Cory',
 'Donald, Olympia',
 '0, 1',
 'Andy, Cory, Donald',
 'Olympia')

In [4]:
def or_entailment(person_list,
                  place_list,
                  n,
                  vi_function,
                  not_vi_function,
                  mask_token="[MASK]"):
    """
    $P:= pm V(x_1, y_1) , dots, pm V(x_n, y_n)$
    $H:= pm V(x_i, y_i) or pm V(x*, y*)$
    """
    Subjects = get_n_different_items(person_list, n)
    people_O = [get_new_item(Subjects, person_list) for _ in range(n)]
    places = get_n_different_items(place_list, n)
    Objects = get_n_different_items(people_O + places, n)
    fs = np.random.choice([vi_function, not_vi_function], n)
    sentence1 = [f(x, y) for f, x, y in zip(fs, Subjects, Objects)]
    fs2 = np.random.choice([vi_function, not_vi_function])
    ids = get_n_different_items(range(len(Subjects)), 1)
    Subject2 = get_new_item(Subjects + people_O, person_list)
    Object2 = [get_new_item(Subjects + people_O + [Subject2], person_list)]
    place2 = get_new_item(places, place_list)
    Object2 += [place2]
    Object2 = np.random.choice(Object2)
    sentence2_l = [sentence1[ids[0]], fs2(Subject2, Object2)]
    np.random.shuffle(sentence2_l)
    sentence2 = sentence2_l[0] + " or " + sentence2_l[1]
    sentence2_masked = sentence2_l[0] + " {} ".format(mask_token) + sentence2_l[1] 
    sentence1 = ", ".join(sentence1)
    sentence1 += "."
    label = "or"
    people_O = list(set(Objects).intersection(people_O))
    people = ", ".join(Subjects + people_O + [Subject2])
    places = list(set(Objects + [Object2]).intersection(places + [place2]))
    Subjects = ", ".join(Subjects + [Subject2])
    Objects = ", ".join(Objects + [Object2])
    places = ", ".join(places)
    ids.sort()
    ids = ", ".join(map(lambda x: str(x), ids))

    return sentence1, sentence2, sentence2_masked, label, Subjects, Objects, ids, people, places

In [5]:
or_entailment(person_list=male_names,
               place_list=cities_and_states,
               n=2,
               vi_function=vi,
               not_vi_function=not_vi)

('Raul has visited El Centro, Darren has visited Greg.',
 "Neil didn't visit Dwayne or Raul has visited El Centro",
 "Neil didn't visit Dwayne [MASK] Raul has visited El Centro",
 'or',
 'Raul, Darren, Neil',
 'El Centro, Greg, Dwayne',
 '0',
 'Raul, Darren, Greg, Neil',
 'El Centro')

In [6]:
def create_csv(out_path,
               size,
               type1_instances_list,
               type2_instances_list,
               person_list,
               place_list,
               n,
               min_n):

    sentence1 = []
    sentence2 = []
    sentence2_masked = []
    label = []
    subjects = []
    objects = []
    ids = []
    people = []
    places = []

    type1_examples = int(size / 2)
    type2_examples = int(size / 2)
    type1_len = len(type1_instances_list)
    type2_len = len(type2_instances_list)
    type1s = [int(type1_examples / type1_len) for _ in type1_instances_list]  # noqa
    type2s = [int(type2_examples / type2_len) for _ in type2_instances_list]  # noqa

    for i, f in zip(type1s, type1_instances_list):
        for _ in range(i):
            current_n = np.random.choice(range(min_n, n + 1))
            s1, s2, s2_m, l, s, o, id_, pe, pl = f(person_list, place_list, current_n)  # noqa
            sentence1.append(s1)
            sentence2.append(s2)
            sentence2_masked.append(s2_m)
            label.append(l)
            subjects.append(s)
            objects.append(o)
            ids.append(id_)
            people.append(pe)
            places.append(pl)

    for i, f in zip(type2s, type2_instances_list):
        for _ in range(i):
            current_n = np.random.choice(range(min_n, n + 1))
            s1, s2, s2_m, l, s, o, id_, pe, pl = f(person_list, place_list, current_n)  # noqa
            sentence1.append(s1)
            sentence2.append(s2)
            sentence2_masked.append(s2_m)
            label.append(l)
            subjects.append(s)
            objects.append(o)
            ids.append(id_)
            people.append(pe)
            places.append(pl)

    df = pd.DataFrame({"sentence1": sentence1,
                       "sentence2": sentence2,
                       "sentence2_masked": sentence2_masked,
                       "label": label,
                       "subjects": subjects,
                       "objects": objects,
                       "ids": ids,
                       "people": people,
                       "places": places})
    df = df.sample(frac=1).reset_index(drop=True)
    df.to_csv(out_path, header=True, index=False)

In [7]:
def i2eng(f):
    return lambda x, y, z: f(x, y, z, vi_function=vi, not_vi_function=not_vi)  # noqa

In [8]:
create_csv(out_path='data/generation/BC_train.csv',
           size=10000,
           type1_instances_list=[i2eng(and_entailment)],
           type2_instances_list=[i2eng(or_entailment)],
           person_list=male_names,
           place_list=cities_and_states,
           n=2,
           min_n=2)

create_csv(out_path='data/generation/BC_test.csv',
           size=1000,
           type1_instances_list=[i2eng(and_entailment)],
           type2_instances_list=[i2eng(or_entailment)],
           person_list=female_names,
           place_list=countries,
           n=2,
           min_n=2)

In [9]:
def create_train_txt(in_path,
                     out_path):
    df = pd.read_csv(in_path)
    ps = df["sentence1"].values
    hs = df["sentence2"].values
    with open(out_path, "w") as file:
        for p,h in zip(ps, hs):
            line = p + "\n" + h + "\n"
            file.write(line)
        file.write("\n")

In [10]:
create_train_txt(in_path='data/generation/BC_train.csv',
                 out_path='data/generation/BC_train.txt')