In [1]:
from parrot import Parrot
import warnings
import pandas as pd

from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

warnings.filterwarnings("ignore")


class Program:
    """Lazy wrapper around the Parrot paraphraser.

    The model is only instantiated on first use (see ``init_model``), so
    creating a ``Program`` is cheap.
    """

    # Parrot instance; stays None until init_model() has run.
    model = None
    # Not referenced by any method of this class.
    translator = None
    # Default matching init_model()'s default, so predict() can read
    # self.use_gpu even when ``model`` was assigned from outside.
    use_gpu = True

    def init_model(self, use_gpu=True):
        """Load the Parrot model once; later calls are no-ops.

        Args:
            use_gpu: forwarded to ``Parrot`` and remembered for ``predict``.
                Ignored when a model is already loaded.
        """
        if self.model is None:
            self.model = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=use_gpu)
            self.use_gpu = use_gpu

    def predict(self, phrases):
        """Paraphrase every phrase and return all paraphrases as one flat list.

        Args:
            phrases: iterable of input strings.

        Returns:
            list[str]: the first element of every item returned by
            ``model.augment`` for every phrase, in input order. Phrases for
            which the model returns ``None`` or raises contribute nothing.
        """
        self.init_model()
        sentences = []
        for phrase in phrases:
            print("Paraphrasing [" + phrase + "]")
            try:
                para_phrases = self.model.augment(
                    input_phrase=phrase,
                    do_diverse=True,
                    max_return_phrases=25,
                    max_length=256,
                    adequacy_threshold=0.50,
                    fluency_threshold=0.50,
                    use_gpu=self.use_gpu
                )
                if para_phrases is None:
                    # No paraphrase for this phrase: skip it, but keep the
                    # results already collected for the other phrases.
                    continue
                for para_phrase in para_phrases:
                    sentences.append(str(para_phrase[0]))
            except Exception as exc:
                # Best-effort: report the failure and move on to the next
                # phrase. KeyboardInterrupt is deliberately not caught so a
                # long run can still be stopped.
                print("Paraphrasing failed for [" + phrase + "]: " + repr(exc))
        return sentences

In [2]:
# Create the wrapper; the Parrot model itself is only loaded by the first
# predict() call, which is used here as a smoke test on a single statement.
parrot = Program()
parrot.predict(['This is a test sentence.'])

Paraphrasing [This is a test sentence.]


['these sentences are for testing purposes only',
 "it's a test",
 'these sentences are for testing',
 'these sentences are tests',
 "it's a test phrase",
 "it's a test sentence for me",
 'the test sentence is a test phrase',
 "i think it's a test phrase",
 'the test sentence is for the test sentence',
 'this is a test',
 'the test sentence is',
 'the test sentence',
 'i think this sentence is a test sentence',
 "it's a test sentence",
 "i think it's a test sentence",
 'this is the test sentence',
 'the test sentence is a test sentence',
 'it is a test sentence',
 "this is a test sentence ''",
 "all right it's a test sentence",
 'this is a test sentence']

In [3]:
# Same smoke test, this time with a question instead of a statement.
parrot.predict(['How many apples does the man have?'])

Paraphrasing [How many apples does the man have?]


['how many apples does a man have? how many are there?',
 'can you list the number of apple trees that a man has?',
 'show the number of apples in a man?',
 'how many apples does a man have? how many apples?',
 'show the number of apples a man has?',
 'is there a limit to how many apples a person has?',
 'show the number of apples that a man has?',
 'how many apples does a man have in his hand?',
 'is there a limit to how many apples a man has?',
 'show the number of apples a man can have?',
 'how many apples does a man have? how many?',
 'how many apples does a man possess?',
 'is it true how many apples a man has?',
 'how many apples do a man have?',
 'how many apples does man have?',
 'how many apples does a man have?',
 'is it true how many apples do a man have?',
 'is it true how many apples does a man have?',
 'how many apples does the man have?']

In [3]:
# Load the present-simple word problems; later cells read the 'problem' column
# of this frame and reuse data_dir as the output directory prefix.
data_dir='data/ACE - Word Problems/'
csv_path = data_dir+'ACE - Word Problems - Present Simple 2.csv'
present_simple_df = pd.read_csv(csv_path)
# NOTE(review): a later cell concatenates gen_2_df and past_present_df, which
# are only defined in the commented-out lines below - on a fresh kernel that
# cell raises NameError unless these lines are re-enabled.
# past_simple_df = pd.read_csv(data_dir+'ACE - Word Problems - Past Simple.csv')
# gen_2_df = pd.read_csv(data_dir+'gen_2_pairs.csv')
# past_present_df = pd.DataFrame(zip(past_simple_df['problem'].tolist(), present_simple_df['problem'].tolist()), columns=['input_text', 'target_text'])
# past_present_df.head()

In [11]:
def form_pairs(target_text='ACE - Word Problems - Present Simple.csv', input_texts=['ACE - Word Problems - Past Simple.csv'], data_dir='data/', target_as_input=True):
    """Build an (input_text, target_text) training table from problem CSVs.

    Every CSV is read from ``data_dir + <file name>`` and must have a
    ``problem`` column. Rows are paired positionally with ``zip``, so a
    shorter file silently truncates the pairing.

    Args:
        target_text: file name of the CSV holding the target problems.
        input_texts: file names of the CSVs whose problems become inputs.
        data_dir: prefix concatenated (not path-joined) with each file name.
        target_as_input: when true, also emit each target paired with itself.

    Returns:
        pandas.DataFrame with columns ``input_text`` and ``target_text``;
        identity pairs first (if requested), then one run of pairs per
        input file, in the order given.
    """
    def _problems(frame):
        # Plain Python list of the 'problem' column.
        return frame['problem'].tolist()

    target_frame = pd.read_csv(data_dir + target_text)
    pairs = []

    if target_as_input:
        pairs += zip(_problems(target_frame), _problems(target_frame))

    for file_name in input_texts:
        source_frame = pd.read_csv(data_dir + file_name)
        pairs += zip(_problems(source_frame), _problems(target_frame))

    return pd.DataFrame(pairs, columns=['input_text', 'target_text'])

# Pair the "Present Simple 2" targets with themselves (target_as_input defaults
# to True) and with the Past Simple and Composition variants; the resulting
# frame is only displayed, not stored.
form_pairs(target_text='ACE - Word Problems - Present Simple 2.csv', input_texts=['ACE - Word Problems - Past Simple.csv', 'ACE - Word Problems - Composition.csv'], data_dir='data/ACE - Word Problems/')

Unnamed: 0,input_text,target_text
0,A man has 3 apples. A woman gives 4 apples to ...,A man has 3 apples. A woman gives 4 apples to ...
1,A woman has 1 ball. A man gives 9 balls to the...,A woman has 1 ball. A man gives 9 balls to the...
2,A boy has 8 bananas. A girl gives 1 banana to ...,A boy has 8 bananas. A girl gives 1 banana to ...
3,A girl has 3 melons. A boy gives 6 melons to t...,A girl has 3 melons. A boy gives 6 melons to t...
4,A restaurant has 175 normal chairs. The restau...,A restaurant has 175 normal chairs. The restau...
...,...,...
235,There are 5 tables and if there is a table the...,There are 5 tables. If there is a table then i...
236,There are 3 chalkboards. Each chalkboard has 2...,There are 3 chalkboards. Each chalkboard has 2...
237,There are 5 cars. Each car has 4 wheels. A man...,There are 5 cars. Each car has 4 wheels. A man...
238,There are 2 spiders. Each spider has 8 legs an...,There are 2 spiders. Each spider has 8 legs. A...


In [27]:
# Merge the previously generated pairs with the past->present pairs and save them.
# NOTE(review): gen_2_df and past_present_df are not created by any active code
# in this notebook (only by commented-out lines in the data-loading cell), so
# this cell needs them to be defined first on a fresh kernel.
gen_3_df = pd.concat([gen_2_df, past_present_df])
# Hand pandas the path instead of a text handle opened with platform defaults:
# pandas then opens the file itself (UTF-8, no newline translation), which
# avoids doubled line endings on Windows.
gen_3_df.to_csv(data_dir+'gen_3_pairs.csv', index=False)

In [15]:
# Probe a single rephrase call. parrot.model is only set after init_model()
# has run (e.g. via an earlier predict() call); otherwise it is still None.
# In the runs shown in this notebook rephrase() returns a (phrase, score)
# tuple, so [0] keeps just the phrase.
parrot.model.rephrase('How many apples does the man have?', do_diverse=True, adequacy_threshold=0.8)[0]

'how many apples does a man have in his hand?'

In [9]:
def validate_sentence(key, sentence):
    """Return 1 if ``sentence`` is an acceptable paraphrase, else 0.

    A sentence is rejected when it contains more than one '.', more than
    two '?', or when its lower-cased form equals ``key``.

    Args:
        key: reference text to compare against (callers pass it lower-cased).
        sentence: candidate paraphrase.
    """
    if sentence.count('.') > 1:
        return 0
    if sentence.count('?') > 2:
        return 0
    # Identical to the reference (case-insensitively) means nothing changed.
    return int(key != sentence.lower())

def _phrase_text(result):
    """Return the phrase of a rephrase result, unwrapping a (phrase, score) tuple."""
    if isinstance(result, tuple):
        return result[0]
    return result


def check_sentence(sentence, adequacy_threshold=0.90, fluency_threshold=0.90, use_input=True):
    """Re-paraphrase ``sentence`` until ``validate_sentence`` accepts it.

    The lower-cased, stripped input is used as the reference key. While the
    current candidate is rejected, the adequacy threshold is lowered by 0.1
    per attempt and the candidate is rephrased again; if that is exhausted,
    the same is done with the fluency threshold. If both fail, the user is
    prompted (``use_input=True``) or ``ValueError`` is raised.

    NOTE(review): the key is derived from the sentence passed in, so a
    candidate without surrounding whitespace is always rejected on the first
    check and rephrased at least once - confirm this is intended.

    Args:
        sentence: candidate text, or a ``(phrase, score)`` tuple.
        adequacy_threshold: starting adequacy threshold for the first stage.
        fluency_threshold: starting fluency threshold for the second stage.
        use_input: ask for a manual replacement via ``input`` instead of raising.

    Returns:
        The accepted (or manually entered) sentence.

    Raises:
        ValueError: no acceptable paraphrase was found and ``use_input`` is false.
    """
    sentence = _phrase_text(sentence)
    key = sentence.lower().strip()

    # Stage 1: relax the adequacy threshold step by step.
    # rephrase() is seen returning a (phrase, score) tuple in this notebook;
    # the tuple checks suggest it can also return a bare string (TODO confirm
    # against the Parrot source). Indexing the result with [0]
    # unconditionally would keep only the first character of such a string,
    # so only tuples are unwrapped.
    while adequacy_threshold >= 0.05 and not validate_sentence(key, sentence):
        adequacy_threshold = adequacy_threshold - 0.1
        sentence = _phrase_text(
            parrot.model.rephrase(sentence, do_diverse=False, adequacy_threshold=adequacy_threshold)
        )

    if adequacy_threshold < 0.05 or not validate_sentence(key, sentence):
        # Stage 2: relax the fluency threshold step by step.
        while fluency_threshold >= 0.05 and not validate_sentence(key, sentence):
            fluency_threshold = fluency_threshold - 0.1
            sentence = _phrase_text(
                parrot.model.rephrase(sentence, do_diverse=False, fluency_threshold=fluency_threshold)
            )
        if fluency_threshold < 0.05 or not validate_sentence(key, sentence):
            # Both stages exhausted: fall back to a human or fail loudly.
            if use_input:
                sentence = input(f'Cant transform: "{sentence}" | manual = ')
            else:
                raise ValueError(f'Cant transform: {sentence}')
    return sentence

def format_sentence(sentence):
    """Normalise a sentence: trim, capitalise, and ensure closing punctuation.

    ``str.capitalize`` upper-cases the first character and lower-cases the
    rest, so any inner capitals are lost.

    Args:
        sentence: raw sentence text.

    Returns:
        The trimmed, capitalised sentence. A '.' is appended unless the
        sentence contains a '?' or already ends with '.' or '!', so text that
        is already terminated does not get a doubled period.
    """
    sentence = sentence.strip().capitalize()
    if '?' in sentence or sentence.endswith(('.', '!')):
        return sentence
    return sentence + '.'

def print_output(key, value):
    """Print the original text, then every new variant on its own indented line.

    Args:
        key: the original text.
        value: iterable of generated variants.
    """
    print(f'original: {key}\nnew:\n')
    indented = ''.join(f'  {s}\n' for s in value)
    # Each entry already carries its newline, so suppress print's own.
    print(indented, end='')


In [21]:
# Scratch experiment: how does rephrase() treat a multi-sentence input?
s1 = 'A man has 3 apples. A woman gives 4 apples to the man.'
s2 = 'How many apples does the man have?'
# Full problem text; only used by the commented-out check_sentence call below.
s = f'{s1} {s2}'
# s1 is reassigned here, so the rephrase call below sees the two-sentence
# "trees" text, not the apples text that went into s.
s1 = 'There are 5 trees. Each tree has 10 branches.'
# check_sentence(s)
# With a very low adequacy threshold the displayed result keeps only the
# second sentence, which is why later cells paraphrase sentence by sentence.
parrot.model.rephrase(s1, do_diverse=False, adequacy_threshold=0.1)

('each tree has 10 branches', 32)

In [10]:
# For every problem, paraphrase it one sentence at a time and collect:
#   - one variant per sentence, where only that sentence is replaced, and
#   - one final variant where every sentence is replaced.
# The result maps the original problem text to its list of variants.
# NOTE: duplicate problem texts share one dict key, so a later duplicate
# overwrites the earlier entry.
new_sentences = {}
for key in tqdm(present_simple_df['problem'].tolist()):
    value = []
    # Skip blank fragments (e.g. after a trailing '.'), which would otherwise
    # be formatted as a lone '.' and sent to the model.
    sentences = [format_sentence(part) for part in key.split('.') if part.strip()]
    single_add = []
    for idx, sentence in enumerate(sentences):
        s_copy = sentences.copy()
        # Pass the raw rephrase result on: check_sentence unwraps a
        # (phrase, score) tuple itself, whereas indexing with [0] here would
        # keep only the first character if a bare string came back.
        new_sentence = parrot.model.rephrase(sentence, do_diverse=True)
        new_sentence = check_sentence(new_sentence)
        new_sentence = format_sentence(new_sentence)
        s_copy[idx] = new_sentence
        value.append(" ".join(s_copy))
        single_add.append(new_sentence)
    value.append(" ".join(single_add))
    new_sentences[key] = value


100%|██████████| 80/80 [28:37<00:00, 21.46s/it]


In [33]:
# Scratch cell: the rephrase result on the first line is discarded (only the
# last expression of a cell is displayed); the visible output is the number of
# problems in the present-simple frame.
parrot.model.rephrase('An influencer has two accounts.', do_diverse=False, adequacy_threshold=0.2)[0]
len(present_simple_df['problem'].tolist())

80

In [43]:
# Merge two generation runs into one mapping; on duplicate keys the entries
# from new_sentences win because they are applied last.
# NOTE(review): new_sentences1 is not defined in any cell of this notebook -
# presumably an earlier run kept in kernel memory; this cell fails on a fresh kernel.
compound = dict()
compound.update(new_sentences1)
compound.update(new_sentences)

In [12]:
# Number of distinct problems that received paraphrases.
len(new_sentences)
# Disabled scratch code below: it printed duplicated problem texts in the tail
# of the frame (rows 48 onward).
# len(present_simple_df['problem'].tolist()[48:])
# ss = []
# for i, a in enumerate(present_simple_df['problem'].tolist()[48:]):
#     if a not in ss:
#         ss.append(a)
#     else:
#         print(a)
#         # print(a)


80

In [13]:
# Flatten {original: [variants...]} into a table with one row per original
# problem: column 0 holds the original, the remaining columns its variants
# (rows with fewer variants are padded with missing values by pandas).
rows = [[original, *variants] for original, variants in new_sentences.items()]
df1 = pd.DataFrame(rows)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6
0,A man has 3 apples. A woman gives 4 apples to ...,There's three apples. A woman gives 4 apples t...,A man has 3 apples. A woman gives him four app...,A man has 3 apples. A woman gives 4 apples to ...,There's three apples. A woman gives him four a...,,
1,A woman has 1 ball. A man gives 9 balls to the...,There's only one ball. A man gives 9 balls to ...,A woman has 1 ball. A man has nine balls to gi...,A woman has 1 ball. A man gives 9 balls to the...,There's only one ball. A man has nine balls to...,,
2,A boy has 8 bananas. A girl gives 1 banana to ...,A boy has 8 bananas in his hand. A girl gives ...,A boy has 8 bananas. A girl gives it to the bo...,A boy has 8 bananas. A girl gives 1 banana to ...,A boy has 8 bananas in his hand. A girl gives ...,,
3,A girl has 3 melons. A boy gives 6 melons to t...,I think a girl has three melons. A boy gives 6...,A girl has 3 melons. A boy gives her six melon...,A girl has 3 melons. A boy gives 6 melons to t...,I think a girl has three melons. A boy gives h...,,
4,A restaurant has 175 normal chairs. The restau...,The restaurant has 175 seats. The restaurant h...,A restaurant has 175 normal chairs. The restau...,A restaurant has 175 normal chairs. The restau...,The restaurant has 175 seats. The restaurant h...,,


In [14]:
# Save the wide original/variants table (index column included, as before).
# The path is passed straight to pandas so it opens the file itself (UTF-8,
# no newline translation) instead of writing through a text handle opened
# with platform defaults, which doubles line endings on Windows.
df1.to_csv(data_dir+'gen_4_perm_true_lt_a_9_f_9_table.csv')

In [49]:
def form_pairs(df):
    """Turn a wide original/variants table into (input_text, target_text) pairs.

    NOTE: this redefines the CSV-based ``form_pairs`` from an earlier cell;
    after this cell runs, the name refers to this table-based version.

    Args:
        df: DataFrame whose first column holds the original problem and whose
            remaining columns hold paraphrased variants (missing where a row
            has fewer variants).

    Returns:
        pandas.DataFrame with columns ``input_text`` and ``target_text``: for
        each row, one pair per non-missing variant (variant -> original),
        followed by the identity pair (original -> original).
    """
    pairs = []
    for _, values in df.iterrows():
        # Positional access: works for integer column labels (freshly built
        # table) and for string labels (table re-read from CSV) alike.
        inp = values.iloc[0]
        for out in values.iloc[1:]:
            # pd.isna covers both None and NaN; replacing NaN with None via
            # DataFrame.where is not reliable for float (all-NaN) columns.
            if pd.isna(out):
                continue
            pairs.append([out, inp])
        pairs.append([inp, inp])
    return pd.DataFrame(pairs, columns=['input_text', 'target_text'])
    

        
# Convert the wide table into training pairs with the table-based form_pairs
# and preview the first rows.
df_pairs = form_pairs(df1)
df_pairs.head()

Unnamed: 0,input_text,target_text
0,I'll have three apples. A woman gives 4 apples...,A man has 3 apples. A woman gives 4 apples to ...
1,A man has 3 apples. She gives him four apples....,A man has 3 apples. A woman gives 4 apples to ...
2,A man has 3 apples. A woman gives 4 apples to ...,A man has 3 apples. A woman gives 4 apples to ...
3,I'll have three apples. She gives him four app...,A man has 3 apples. A woman gives 4 apples to ...
4,A man has 3 apples. A woman gives 4 apples to ...,A man has 3 apples. A woman gives 4 apples to ...


In [50]:
# Save the training pairs without the index column. The path is passed
# straight to pandas so it opens the file itself (UTF-8, no newline
# translation) instead of writing through a text handle opened with platform
# defaults, which doubles line endings on Windows.
df_pairs.to_csv(data_dir+'gen_2_pairs.csv', index=False)

In [40]:
# NOTE(review): DataFrame.max() on a text column returns the lexicographically
# largest string, not the longest one. If the goal was to size the model's max
# length, df_pairs['input_text'].str.len().max() would be the relevant figure - confirm intent.
df_pairs.max()['input_text']

'There is a mother. There is a father. A bottle has 6 glasses. If a bottle has 2 glasses then the father gets 1 glass from the bottle and the mother gets 1 glass from the bottle. How many glasses does the mother have?'

In [42]:
# Reload the first-generation pairs file for inspection.
df1 = pd.read_csv(data_dir+'gen_1_pairs.csv')
df1.head()


# df1 is rebound here to the wide original/variants table, replacing the pairs
# frame loaded just above; only this second head() is displayed.
df1 = pd.read_csv(data_dir+'gen_1_perm_true_lt_a_9_f_9_table.csv')
df1.head()

Unnamed: 0,0,1,2,3,4,5,6
0,A man has 3 apples. A woman gives 4 apples to ...,I'll have three apples. A woman gives 4 apples...,A man has 3 apples. She gives him four apples....,A man has 3 apples. A woman gives 4 apples to ...,I'll have three apples. She gives him four app...,,
1,A woman has 1 ball. A man gives 9 balls to the...,There's a ball. A man gives 9 balls to the wom...,A woman has 1 ball. An unidentified man gave h...,A woman has 1 ball. A man gives 9 balls to the...,There's a ball. An unidentified man gave her n...,,
2,A boy has 8 bananas. A girl gives 1 banana to ...,One boy has 8 bananas in his hand. A girl give...,A boy has 8 bananas. A girl gave him a banana....,A boy has 8 bananas. A girl gives 1 banana to ...,One boy has 8 bananas in his hand. A girl gave...,,
3,A girl has 3 melons. A boy gives 6 melons to t...,A girl is wearing a lot of melons. A boy gives...,A girl has 3 melons. A boy gives her a handful...,A girl has 3 melons. A boy gives 6 melons to t...,A girl is wearing a lot of melons. A boy gives...,,
4,A restaurant has 175 normal chairs and 20 juni...,A restaurant has 175 standard seats and 20 jun...,A restaurant has 175 normal chairs and 20 juni...,A restaurant has 175 standard seats and 20 jun...,,,


In [6]:
# Relabel the two columns in place. This requires df1 to have exactly two
# columns at this point - presumably the gen_1 pairs frame, not the wide
# table; verify which df1 is live (execution counts are out of order).
df1.columns=['target_text', 'input_text']

In [8]:
# Reorder the columns so input_text comes first, matching the other pair files.
df1  = df1[['input_text', 'target_text']]

In [10]:
# Write the relabelled pairs back over gen_1_pairs.csv, without the index
# column. The path is passed straight to pandas so it opens the file itself
# (UTF-8, no newline translation) instead of writing through a text handle
# opened with platform defaults, which doubles line endings on Windows.
df1.to_csv(data_dir+'gen_1_pairs.csv', index=False)

In [12]:
# Preview the relabelled (input_text, target_text) pairs.
df1.head()

Unnamed: 0,input_text,target_text
0,I'll have three apples. A woman gives 4 apples...,A man has 3 apples. A woman gives 4 apples to ...
1,A man has 3 apples. She gives him four apples....,A man has 3 apples. A woman gives 4 apples to ...
2,A man has 3 apples. A woman gives 4 apples to ...,A man has 3 apples. A woman gives 4 apples to ...
3,I'll have three apples. She gives him four app...,A man has 3 apples. A woman gives 4 apples to ...
4,There's a ball. A man gives 9 balls to the wom...,A woman has 1 ball. A man gives 9 balls to the...


In [6]:
# Rebuild the wide table from the pairs frame: group every value of the first
# column under the value of the second column, keeping first-seen order.
# .iloc makes the positional access explicit; plain v[0] / v[1] on a Series
# with string labels relies on a deprecated positional fallback whose warning
# is hidden by the notebook-wide filterwarnings("ignore").
d = dict()
for _, row in df1.iterrows():
    d.setdefault(row.iloc[1], []).append(row.iloc[0])

# One row per group: the grouping value first, then its collected values.
ls = [[group_key, *members] for group_key, members in d.items()]

df2 = pd.DataFrame(ls)

In [7]:
# Save the regrouped wide table without the index column. The path is passed
# straight to pandas so it opens the file itself (UTF-8, no newline
# translation) instead of writing through a text handle opened with platform
# defaults, which doubles line endings on Windows.
df2.to_csv(data_dir+'gen_1_perm_true_lt_a_9_f_9_table.csv', index=False)