# Count task:

- p = $\varphi_1, \dots, \varphi_m$, x visited $p_1 , \dots, p_n$
- h = x visited at least [mask] places
    - [mask] = $n$ 

#### In this notebook we will create a df with columns 'sentence1', 'sentence2', 'sentence2_masked', 'label', and a txt file for training

In [1]:
import pandas as pd
import numpy as np
from inference.text_generation.vocab import male_names, female_names, cities_and_states, countries
from inference.text_generation.util import get_new_item, get_n_different_items
from inference.text_generation.util import vi, not_vi, num2word

In [2]:
def count_entailment(person_list,
                     place_list,
                     n,
                     number_of_other_sentences,
                     vi_function,
                     not_vi_function,
                     number2str,
                     complement,
                     mask_token="[MASK]",
                     replace_symbol="%"):
    r"""
    Generate one counting-entailment example.

    $P := S_1, \dots, S_m, V(x, p_1), \dots, V(x, p_n)$
    $H := x$ has visited at least $n$ places

    Each distractor $S_j$ is built with ``vi_function`` or
    ``not_vi_function`` (picked at random) for a person other than $x$.
    With probability 1/2 the first distractor is replaced by a *negated*
    statement about $x$ itself, i.e. $S_1$ can be "not $V(x, p^*)$".
    The clauses of $P$ are shuffled before being joined with ", ".

    :param person_list: pool of names; ``m + 1`` of them are drawn
    :param place_list: pool of places; ``n + m`` of them are drawn
    :param n: number of places the main subject visits
    :param number_of_other_sentences: number ``m`` of distractor clauses
        (``0`` is allowed and yields a premise without distractors)
    :param vi_function: ``(person, place) -> str`` affirmative clause
    :param not_vi_function: ``(person, place) -> str`` negated clause
    :param number2str: mapping from ``n`` to the word used as label
    :param complement: hypothesis template containing ``replace_symbol``
    :param mask_token: token substituted for the numeral in the masked
        hypothesis
    :param replace_symbol: placeholder inside ``complement``
    :return: tuple ``(sentence1, sentence2, sentence2_masked, label,
        subjects, objects, ids, people, places)``; ``people`` and
        ``places`` repeat ``subjects`` and ``objects``, and every list is
        rendered as a ", "-joined string
    """
    m = number_of_other_sentences
    # NOTE(review): assumes get_n_different_items returns a list of exactly
    # the requested number of distinct items -- confirm against util.
    subjects = get_n_different_items(person_list, m + 1)
    objects = get_n_different_items(place_list, n + m)
    subjects_before = subjects[:m]
    subjects_after = subjects[m:] * n
    objects_before = objects[:m]
    objects_after = objects[m:]
    fs_before = np.random.choice([vi_function, not_vi_function], m)
    fs_after = [vi_function] * n
    main_subject = subjects_after[0]

    # The coin is always flipped so the random stream does not depend on m.
    negate_main_subject = np.random.choice([True, False])

    # Without distractors there is no clause to rewrite; indexing
    # subjects_before[0] would raise IndexError in that case.
    if negate_main_subject and m > 0:
        subjects.remove(subjects_before[0])
        subjects_before[0] = main_subject
        fs_before[0] = not_vi_function

    clauses = [f(x, y) for f, x, y
               in zip(fs_before, subjects_before, objects_before)]
    clauses += [f(x, y) for f, x, y
                in zip(fs_after, subjects_after, objects_after)]

    # Affirmative clause for a dummy place "x" with the last character
    # dropped; presumably the place ends the clause, which leaves the
    # "<subject> <verb> " prefix used to spot the counted clauses.
    indicator = vi_function(main_subject, "x")[:-1]
    np.random.shuffle(clauses)
    # Positions are taken after shuffling, so they index the final premise.
    ids = [i for i, clause in enumerate(clauses) if indicator in clause]
    sentence1 = ", ".join(clauses) + "."

    numeral = number2str[n]
    sentence2 = vi_function(main_subject,
                            complement.replace(replace_symbol, numeral))
    sentence2_masked = vi_function(
        main_subject, complement.replace(replace_symbol, mask_token))

    subjects_str = ", ".join(subjects)
    objects_str = ", ".join(objects)
    ids_str = ", ".join(map(str, ids))

    return (sentence1, sentence2, sentence2_masked, numeral,
            subjects_str, objects_str, ids_str, subjects_str, objects_str)

In [3]:
# Sanity check: generate a single example with n=4 visited places and one
# distractor sentence; the returned 9-tuple is displayed as the cell output.
count_entailment(person_list=male_names,
                 place_list=cities_and_states,
                 n=4,
                 number_of_other_sentences=1,
                 vi_function=vi,
                 not_vi_function=not_vi,
                 number2str=num2word,
                 complement="at least % places")

("Jerome has visited Campbellsville, Jerome has visited Spring Green, Jerome has visited Park Ridge, Jerome didn't visit Demopolis, Jerome has visited Pagosa Springs.",
 'Jerome has visited at least four places',
 'Jerome has visited at least [MASK] places',
 'four',
 'Jerome',
 'Demopolis, Spring Green, Park Ridge, Campbellsville, Pagosa Springs',
 '0, 1, 2, 4',
 'Jerome',
 'Demopolis, Spring Green, Park Ridge, Campbellsville, Pagosa Springs')

In [4]:
def i2eng(f, m):
    """
    Bind the English-language settings to an instance generator.

    The returned callable takes three positional arguments and forwards
    them to ``f`` together with fixed keyword arguments: ``m`` distractor
    sentences, the ``vi`` / ``not_vi`` clause builders, the ``num2word``
    table and the "at least % places" hypothesis template.
    """
    def english_instance(person_list, place_list, n):
        # Module-level helpers are looked up at call time, not here.
        return f(person_list,
                 place_list,
                 n,
                 number_of_other_sentences=m,
                 vi_function=vi,
                 not_vi_function=not_vi,
                 number2str=num2word,
                 complement="at least % places")

    return english_instance

In [5]:
# Type-1 generators use 1-4 distractor sentences, type-2 generators use 5-6.
type1_instances_list = [i2eng(count_entailment, m) for m in range(1, 5)]
type2_instances_list = [i2eng(count_entailment, m) for m in range(5, 7)]

In [6]:
def _split_evenly(total, parts):
    """Split ``total`` into ``parts`` ints that sum to it and differ by <= 1."""
    if parts == 0:
        return []
    base, extra = divmod(total, parts)
    return [base + 1 if i < extra else base for i in range(parts)]


def create_csv(out_path,
               size,
               type1_instances_list,
               type2_instances_list,
               person_list,
               place_list,
               n,
               min_n):
    """
    Generate ``size`` examples and write them, shuffled, to a CSV file.

    Half of the examples come from ``type1_instances_list`` and half from
    ``type2_instances_list``; inside each half the quota is spread over the
    generators as evenly as possible. Remainders are handed out one by one
    (earlier generators first, the odd example to the type-2 half), so the
    file holds exactly ``size`` rows as long as both lists are non-empty.
    The share of an empty list is dropped.

    :param out_path: destination CSV path
    :param size: total number of examples to generate
    :param type1_instances_list: callables ``(person_list, place_list, n)``
        returning the 9 column values of one example
    :param type2_instances_list: same as above, for the second half
    :param person_list: passed through to every generator
    :param place_list: passed through to every generator
    :param n: largest count drawn for an example (inclusive)
    :param min_n: smallest count drawn for an example (inclusive)
    """
    column_names = ["sentence1",
                    "sentence2",
                    "sentence2_masked",
                    "label",
                    "subjects",
                    "objects",
                    "ids",
                    "people",
                    "places"]

    size = int(size)
    type1_examples = size // 2
    type2_examples = size - type1_examples
    quotas = (_split_evenly(type1_examples, len(type1_instances_list))
              + _split_evenly(type2_examples, len(type2_instances_list)))
    generators = list(type1_instances_list) + list(type2_instances_list)

    rows = []
    for quota, generate in zip(quotas, generators):
        for _ in range(quota):
            # One count per example, uniform over [min_n, n].
            current_n = np.random.choice(range(min_n, n + 1))
            rows.append(generate(person_list, place_list, current_n))

    df = pd.DataFrame(rows, columns=column_names)
    # Shuffle so the generators are interleaved in the output file.
    df = df.sample(frac=1).reset_index(drop=True)
    df.to_csv(out_path, header=True, index=False)

In [7]:
# Training split: 10,000 examples built from male_names and
# cities_and_states, with counts drawn between 1 and 20.
create_csv(out_path='data/generation/count_train.csv',
           size=10000,
           type1_instances_list=type1_instances_list,
           type2_instances_list=type2_instances_list,
           person_list=male_names,
           place_list=cities_and_states,
           n=20,
           min_n=1)

# Test split: 1,000 examples built from female_names and countries, i.e.
# different person/place lists than the training split.
create_csv(out_path='data/generation/count_test.csv',
           size=1000,
           type1_instances_list=type1_instances_list,
           type2_instances_list=type2_instances_list,
           person_list=female_names,
           place_list=countries,
           n=20,
           min_n=1)

In [8]:
def create_train_txt(in_path,
                     out_path):
    """
    Dump the premise/hypothesis pairs of a CSV into a plain-text file.

    Every row contributes two lines, ``sentence1`` followed by
    ``sentence2``; one empty line is appended after the last pair.

    :param in_path: CSV file with ``sentence1`` and ``sentence2`` columns
    :param out_path: text file to create (overwritten if it exists)
    """
    df = pd.read_csv(in_path)
    premises = df["sentence1"].values
    hypotheses = df["sentence2"].values
    # pandas reads the CSV as UTF-8 by default; write the same encoding
    # explicitly instead of relying on the platform locale.
    with open(out_path, "w", encoding="utf-8") as file:
        for premise, hypothesis in zip(premises, hypotheses):
            file.write(premise + "\n" + hypothesis + "\n")
        file.write("\n")

In [9]:
# Convert the training CSV into the plain-text training file.
create_train_txt(in_path='data/generation/count_train.csv',
                 out_path='data/generation/count_train.txt')