# Synthetic QA data generation

Experiments:

* Single question
    * [noise] [1 word answer] [noise], limited number of possible answer words, answer words do not overlap with the noise dictionary
    * [noise] [key word] [1 word answer] [noise], limited number of possible answer words
    * [noise] [key word] [1 - 3 answer words] [noise], limited number of possible answer words
    * [noise] [many key words] [1 word answer] [noise], any word is a possible answer
    * [noise] [key word] [many answer words] [stop word] [noise], any answer words
    * [noise] [many key words] [many answer words] [many stop words] [noise], any answer words
    * [noise] [many key words] [many answer words] [many stop words] [noise], any answer words, keywords should be properly ordered
    * [noise] [many key words] [many answer words] [many stop words] [noise], any answer words, some words should be replaced
    * [noise] [many key words] [many answer words] [many stop words] [noise], any answer words, some words should be replaced and real answer words should be reordered
* All previous + multiple questions

In [1]:
import numpy as np
import random
import string
import math

In [2]:
def gen_word(word_length):
    """Return a random lowercase word built from distinct letters.

    ``word_length`` is unpacked as the positional arguments of
    ``np.random.randint``; for a ``(low, high)`` pair the word length is
    therefore drawn from the half-open range ``[low, high)``.
    Letters are taken with ``random.sample``, so no letter repeats
    within a word.
    """
    n_letters = np.random.randint(*word_length)
    alphabet = string.ascii_lowercase
    picked = random.sample(alphabet, n_letters)
    return ''.join(picked)

In [3]:
def _pick_words(vocabulary, size, label):
    """Draw a random number of words, with replacement, from ``vocabulary``.

    ``size`` is an inclusive ``(min, max)`` pair handed to
    ``random.randint``. ``label`` only names the part in error messages.
    """
    count = random.randint(*size)
    if count <= 0:
        # Nothing to draw: a missing (None) or empty vocabulary is fine here,
        # and no further random numbers are consumed.
        return []
    if vocabulary is None or len(vocabulary) == 0:
        raise ValueError(
            f"cannot draw {count} {label} word(s): the {label} dictionary is empty or None"
        )
    return random.choices(vocabulary, k=count)


def sentence_generator(
    noise_size=(1, 2),
    noise_dict=None,
    keyword_size=(1, 2),
    keyword_dict=None,
    answer_size=(1, 2),
    answer_dict=None,
    answer_mapping_foo=None,
    stop_word_size=(0, 1),
    stop_word_dict=None,
):
    """Assemble one synthetic QA sample.

    The sentence is laid out as
    ``noise + keywords + answer words + stop words + noise``; every part
    is drawn with replacement from its own vocabulary.

    Args:
        noise_size: Inclusive ``(min, max)`` word count for each noise run;
            the leading and trailing runs are drawn independently.
        noise_dict: Vocabulary for noise words.
        keyword_size: Inclusive ``(min, max)`` number of keywords.
        keyword_dict: Vocabulary for keywords.
        answer_size: Inclusive ``(min, max)`` number of answer words.
        answer_dict: Vocabulary for answer words.
        answer_mapping_foo: Optional callable applied to the list of answer
            words placed in the sentence; its result is returned as the
            answer. When omitted, the answer words are returned unchanged.
        stop_word_size: Inclusive ``(min, max)`` number of stop words.
        stop_word_dict: Vocabulary for stop words.

    Returns:
        A ``(sentence, answer)`` tuple: ``sentence`` is the list of tokens,
        ``answer`` is the (possibly mapped) list of answer words.

    Raises:
        ValueError: If at least one word has to be drawn from a vocabulary
            that is ``None`` or empty.
    """
    # The draw order (noise, noise, keywords, stop words, answer) is kept
    # fixed so that seeded runs produce the same samples as before.
    noise_start = _pick_words(noise_dict, noise_size, "noise")
    noise_end = _pick_words(noise_dict, noise_size, "noise")

    keywords = _pick_words(keyword_dict, keyword_size, "keyword")
    stopwords = _pick_words(stop_word_dict, stop_word_size, "stop")

    answer_text = _pick_words(answer_dict, answer_size, "answer")

    # Explicit None check: only "no mapping supplied" skips the mapping step.
    if answer_mapping_foo is not None:
        answer = answer_mapping_foo(answer_text)
    else:
        answer = answer_text

    sentence = noise_start + keywords + answer_text + stopwords + noise_end
    return sentence, answer

In [52]:
# Experiment: [noise] [1 word answer] [noise].
# Noise and answer vocabularies are generated with different length ranges.
noise_dict = [gen_word((3, 4)) for _ in range(100)]
answer_dict = [gen_word((4, 5)) for _ in range(10)]

# Keywords and stop words are switched off: size (0, 0), empty vocabularies.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 6),
    answer_dict=answer_dict,
    answer_size=(1, 1),
    keyword_dict=[],
    keyword_size=(0, 0),
    stop_word_dict=[],
    stop_word_size=(0, 0),
)

(['fkg', 'dzn', 'vra', 'bsmz', 'dpg', 'ovj', 'dzn', 'jsw', 'zob'], ['bsmz'])

In [55]:
# Experiment: [noise] [key word] [1 word answer] [noise].
# The 10 possible answers are a subset of the noise vocabulary.
keyword_dict = ['key', 'start', 'begin', 'init', 'here']
noise_dict = [gen_word((3, 4)) for _ in range(100)]
answer_dict = random.sample(noise_dict, 10)

# Exactly one keyword and one answer word; no stop words.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 6),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=answer_dict,
    answer_size=(1, 1),
    stop_word_dict=[],
    stop_word_size=(0, 0),
)

(['cpm', 'start', 'kog', 'alh', 'feu'], ['kog'])

In [58]:
# Experiment: [noise] [key word] [1 - 3 answer words] [noise].
# The 10 possible answer words are a subset of the noise vocabulary.
keyword_dict = ['key', 'start', 'begin', 'init', 'here']
noise_dict = [gen_word((3, 4)) for _ in range(100)]
answer_dict = random.sample(noise_dict, 10)

# One keyword followed by one to three answer words; no stop words.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=answer_dict,
    answer_size=(1, 3),
    stop_word_dict=[],
    stop_word_size=(0, 0),
)

(['ldo', 'jfq', 'start', 'nvy', 'xuk', 'xgf', 'mbc', 'dpi'], ['nvy', 'xuk'])

In [74]:
# Experiment: [noise] [key word] [1 word answer] [noise], where the answer
# may be any word of the noise vocabulary.
keyword_dict = ['key', 'start', 'begin', 'init', 'here']
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# The noise vocabulary doubles as the answer vocabulary; no stop words.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=noise_dict,
    answer_size=(1, 1),
    stop_word_dict=[],
    stop_word_size=(0, 0),
)

(['usr', 'init', 'epz', 'ejt', 'jfd'], ['epz'])

In [90]:
# Experiment: [noise] [key word] [1 - 4 answer words] [stop word] [noise].
keyword_dict = ['key', 'start', 'begin', 'init', 'here']
stop_word_dict = ['stop', 'end', 'exit', 'finish']
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# The answer span is delimited by one keyword before and one stop word after;
# answer words come from the noise vocabulary.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=noise_dict,
    answer_size=(1, 4),
    stop_word_dict=stop_word_dict,
    stop_word_size=(1, 1),
)

(['via',
  'mtj',
  'qiu',
  'osf',
  'here',
  'evh',
  'yvg',
  'bxl',
  'tbw',
  'end',
  'pea'],
 ['evh', 'yvg', 'bxl', 'tbw'])

In [91]:
# Experiment: [noise] [2 key words] [1 - 4 answer words] [2 stop words] [noise].
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# Keywords and stop words are two independent 10-word samples of the noise
# vocabulary, so they look like ordinary noise words (and may overlap).
keyword_dict = random.sample(noise_dict, 10)
stopword_dict = random.sample(noise_dict, 10)


sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(2, 2),
    answer_dict=noise_dict,
    answer_size=(1, 4),
    stop_word_dict=stopword_dict,
    stop_word_size=(2, 2),
)

(['oln',
  'rpl',
  'mzy',
  'dib',
  'dqg',
  'bdf',
  'hxp',
  'ika',
  'rzb',
  'lkw',
  'atv',
  'exp',
  'pfa',
  'fsw'],
 ['dqg', 'bdf', 'hxp', 'ika'])

In [95]:
# Experiment: markers are ordered two-word phrases built from noise words.
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# Pair up 10 sampled words (odd-indexed word first, then its even-indexed
# neighbour) into 5 space-separated two-word keywords.
keyword_subwords = random.sample(noise_dict, 10)
keyword_dict = [
    " ".join(pair)
    for pair in zip(keyword_subwords[1::2], keyword_subwords[::2])
]

# Same construction for the stop phrases, from an independent sample.
stopword_subwords = random.sample(noise_dict, 10)
stopword_dict = [
    " ".join(pair)
    for pair in zip(stopword_subwords[1::2], stopword_subwords[::2])
]


sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=noise_dict,
    answer_size=(1, 4),
    stop_word_dict=stopword_dict,
    stop_word_size=(1, 1),
)

(['hdp', 'twz njd', 'nib', 'wpf', 'qdy', 'uac', 'gln blz', 'frg', 'qdy'],
 ['nib', 'wpf', 'qdy', 'uac'])

In [102]:
# Experiment setup: answer words must be replaced through a fixed mapping.
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# Two-word key phrases: odd-indexed sampled word first, then the even-indexed one.
keyword_subwords = random.sample(noise_dict, 10)
keyword_dict = [
    " ".join(pair)
    for pair in zip(keyword_subwords[1::2], keyword_subwords[::2])
]

# Two-word stop phrases, built the same way from an independent sample.
stopword_subwords = random.sample(noise_dict, 10)
stopword_dict = [
    " ".join(pair)
    for pair in zip(stopword_subwords[1::2], stopword_subwords[::2])
]

# 20 answer words, plus a replacement table pairing one shuffled copy of
# them (keys) with a second, independently shuffled copy (values).
answ_dict = random.sample(noise_dict, 20)
repl_map = {
    source: target
    for source, target in zip(
        random.sample(answ_dict, 20),
        random.sample(answ_dict, 20),
    )
}

def answer_mapping_foo(answer):
    """Replace every answer word by its entry in the module-level ``repl_map``.

    Word order is preserved; a word missing from ``repl_map`` raises
    ``KeyError``.
    """
    return [repl_map[word] for word in answer]

# The sentence contains words from answ_dict, while the returned answer is
# whatever answer_mapping_foo produces from them.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=answ_dict,
    answer_size=(1, 4),
    answer_mapping_foo=answer_mapping_foo,
    stop_word_dict=stopword_dict,
    stop_word_size=(1, 1),
)

(['cgn', 'fzr', 'fny', 'sqv', 'wac gzo', 'tlr', 'akc', 'jmb jeb', 'wac'],
 ['rut', 'tlr'])

In [104]:
# Experiment setup: answer words must be replaced and then reordered.
noise_dict = [gen_word((3, 4)) for _ in range(100)]

# Two-word key phrases: odd-indexed sampled word first, then the even-indexed one.
keyword_subwords = random.sample(noise_dict, 10)
keyword_dict = [
    " ".join(pair)
    for pair in zip(keyword_subwords[1::2], keyword_subwords[::2])
]

# Two-word stop phrases, built the same way from an independent sample.
stopword_subwords = random.sample(noise_dict, 10)
stopword_dict = [
    " ".join(pair)
    for pair in zip(stopword_subwords[1::2], stopword_subwords[::2])
]

# 20 answer words, plus a replacement table pairing one shuffled copy of
# them (keys) with a second, independently shuffled copy (values).
answ_dict = random.sample(noise_dict, 20)
repl_map = {
    source: target
    for source, target in zip(
        random.sample(answ_dict, 20),
        random.sample(answ_dict, 20),
    )
}

def answer_mapping_foo(answer):
    """Replace every answer word via the module-level ``repl_map`` and sort.

    Returns the replacement words as a new list in ascending order; a word
    missing from ``repl_map`` raises ``KeyError``.
    """
    return sorted(repl_map[word] for word in answer)

# The sentence contains words from answ_dict, while the returned answer is
# the replaced-and-sorted list produced by answer_mapping_foo.
sentence_generator(
    noise_dict=noise_dict,
    noise_size=(1, 4),
    keyword_dict=keyword_dict,
    keyword_size=(1, 1),
    answer_dict=answ_dict,
    answer_size=(1, 4),
    answer_mapping_foo=answer_mapping_foo,
    stop_word_dict=stopword_dict,
    stop_word_size=(1, 1),
)

(['iqj',
  'gzf',
  'rsi',
  'udi',
  'sjt uvl',
  'ins',
  'hte',
  'oyj',
  'ins',
  'yfg rma',
  'kfa',
  'uhd',
  'wtf',
  'mkt'],
 ['ins', 'ins', 'jtx', 'voc'])

In [None]:
# Shared 30-word vocabulary used for both noise and answers by g1, g2 and g3.
noise_dict = [gen_word((3, 4)) for _ in range(30)]


def g1():
    """Generate one sample whose one-word answer follows the 'ans1' marker.

    Returns the ``(sentence, answer)`` pair produced by
    ``sentence_generator``. The module-level ``noise_dict`` supplies both
    the noise words (``noise_size=(0, 4)``) and the answer word; stop
    words are disabled.
    """
    sample = sentence_generator(
        keyword_dict=['ans1'],
        keyword_size=(1, 1),
        answer_dict=noise_dict,
        answer_size=(1, 1),
        noise_dict=noise_dict,
        noise_size=(0, 4),
        stop_word_dict=[],
        stop_word_size=(0, 0),
    )
    return sample

def g2():
    """Generate one sample whose two-word answer follows the 'ans2' marker.

    Returns the ``(sentence, answer)`` pair produced by
    ``sentence_generator``. The module-level ``noise_dict`` supplies both
    the noise words (``noise_size=(0, 4)``) and the answer words; stop
    words are disabled.
    """
    sample = sentence_generator(
        keyword_dict=['ans2'],
        keyword_size=(1, 1),
        answer_dict=noise_dict,
        answer_size=(2, 2),
        noise_dict=noise_dict,
        noise_size=(0, 4),
        stop_word_dict=[],
        stop_word_size=(0, 0),
    )
    return sample

def g3():
    """Generate one sample whose three-word answer follows the 'ans3' marker.

    Returns the ``(sentence, answer)`` pair produced by
    ``sentence_generator``. The module-level ``noise_dict`` supplies both
    the noise words (``noise_size=(0, 4)``) and the answer words; stop
    words are disabled.
    """
    sample = sentence_generator(
        keyword_dict=['ans3'],
        keyword_size=(1, 1),
        answer_dict=noise_dict,
        answer_size=(3, 3),
        noise_dict=noise_dict,
        noise_size=(0, 4),
        stop_word_dict=[],
        stop_word_size=(0, 0),
    )
    return sample


def _write_samples(path, n_samples):
    """Write ``n_samples`` random samples to ``path``, one per line.

    Each line is ``<sentence words joined by spaces>\t<answer words joined
    by spaces>``. For every sample one of ``g1``, ``g2``, ``g3`` is chosen
    uniformly at random.
    """
    import os  # local import keeps this cell self-contained

    # open(..., 'w') does not create missing directories, so make sure the
    # target directory exists before writing.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as out:
        for _ in range(n_samples):
            p, a = random.choice([g1, g2, g3])()
            out.write(f"{' '.join(p)}\t{' '.join(a)}\n")


# The train split is written before the test split, as before, so the
# random stream is consumed in the same order.
_write_samples('./data/train.txt', 5000)
_write_samples('./data/test.txt', 100)