In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import numpy as np
import difflib

In [2]:
def avg_p(labels, preds):
    """Exact-match accuracy between references and predictions.

    Args:
        labels: Reference values; any array-like (list, tuple, ndarray, ...).
        preds: Predicted values; array-like with the same shape as ``labels``.

    Returns:
        tuple: ``(score, mismatch_mask)`` where ``score`` is the mean of the
        element-wise equality and ``mismatch_mask`` is a boolean array that is
        True at every position where ``labels`` and ``preds`` differ.

    Raises:
        ValueError: If ``labels`` and ``preds`` do not have the same shape.
    """
    # asarray accepts every array-like (not only lists) and leaves existing
    # ndarrays untouched.
    labels = np.asarray(labels)
    preds = np.asarray(preds)
    if labels.shape != preds.shape:
        # An element-wise comparison is meaningless for misaligned inputs and
        # would otherwise yield a mask that cannot index the inputs correctly.
        raise ValueError(
            f"labels and preds must have the same shape, got {labels.shape} and {preds.shape}"
        )
    check = labels == preds
    invert = np.invert(check)
    return check.mean(), invert

class Experiment:
    """Evaluation harness around a pretrained simpletransformers BART ``Seq2SeqModel``.

    The model is loaded from ``model_path + model_name``. The methods generate
    predictions, score them against reference strings and build character-level
    diff reports for the pairs selected by the scoring function's mask.
    """

    def __init__(self, model_name="20211207153056", model_path="/home/fabian/Documents/2021-2022/word-problem-solver/Experiments/models/", use_cuda=True) -> None:
        """Load the model identified by ``model_path + model_name``.

        Args:
            model_name: Name of the checkpoint directory (appended to ``model_path``).
            model_path: Directory prefix; concatenated as-is, so it must end
                with a path separator.
            use_cuda: Forwarded to ``Seq2SeqModel``.
        """
        self.model = self.init_model(model_name, model_path, use_cuda)
        # Stored as an attribute only: no method of this class prepends it to inputs.
        self.prefix = "nltocl:"

    def init_model(self, model_name, model_path, use_cuda=True):
        """Build the ``Seq2SeqModel`` with the generation settings used in this notebook.

        Args:
            model_name: Name of the checkpoint directory.
            model_path: Directory prefix the name is appended to.
            use_cuda: Forwarded to ``Seq2SeqModel``.

        Returns:
            The constructed ``Seq2SeqModel``.
        """
        # Model arguments
        model_args = Seq2SeqArgs()

        model_args.num_return_sequences = 1
        model_args.max_length = 256
        # NOTE(review): sampling is not explicitly enabled here; presumably
        # top_k/top_p only matter when sampling is on -- confirm.
        model_args.top_k = 50
        model_args.top_p = 0.95

        model_args.wandb_project = "NL to CL with BART"
        model = Seq2SeqModel(
            encoder_decoder_type="bart",
            encoder_decoder_name=model_path+model_name,
            args=model_args,
            use_cuda=use_cuda
        )
        return model

    @staticmethod
    def _flush_runs(report, del_word, add_word, show_output):
        """Move the pending deletion/insertion runs into ``report`` and reset them."""
        for run in (del_word, add_word):
            if run:
                text = "".join(run)
                report.append(text)
                if show_output:
                    print(text)
        del_word.clear()
        add_word.clear()

    def difference(self, labels, preds, show_output=True):
        """Build a character-level diff report for each (pred, label) pair.

        Each report starts with a ``pred  : ... => label : ...`` header followed
        by one line per contiguous run of removed (``-- i : chars``) or added
        (``++ i : chars``) characters, where ``i`` is the index of the run's
        first entry in the ``difflib.ndiff`` stream (not an offset into either
        string).

        Args:
            labels: Iterable of reference strings.
            preds: Iterable of predicted strings, paired positionally with ``labels``.
            show_output: When True, the report is also printed as it is built.

        Returns:
            list[str]: One newline-joined report per pair.
        """
        outputs = []
        for pred, label in zip(preds, labels):
            report = ['pred  : {}\n=>\nlabel : {}'.format(pred, label)]
            if show_output:
                print(report[0])
            del_word = []
            add_word = []
            for i, s in enumerate(difflib.ndiff(pred, label)):
                if s[0] == ' ':
                    # A common character closes any open deletion/insertion run.
                    Experiment._flush_runs(report, del_word, add_word, show_output)
                elif s[0] == '-':
                    if not del_word:
                        del_word.append(f'-- {i} : ')
                    del_word.append(s[-1])
                elif s[0] == '+':
                    if not add_word:
                        add_word.append(f'++ {i} : ')
                    add_word.append(s[-1])
            # Runs still open when the diff ends (strings differing in their
            # final characters) must be emitted too; otherwise they are lost.
            Experiment._flush_runs(report, del_word, add_word, show_output)
            outputs.append("\n".join(report))
            if show_output:
                print()
        return outputs

    def calculate_difference(self, labels, preds,f=avg_p, lower=False, show_output=False):
        """Score predictions with ``f`` and diff the pairs selected by its mask.

        Prints ``- <f.__name__>: <score>``.

        Args:
            labels: Array-like of reference strings.
            preds: Array-like of predicted strings.
            f: Callable ``f(labels, preds) -> (score, mask)``; ``mask`` is used
                to index both arrays and select the pairs that get diffed.
            lower: When True, both arrays are lowercased before scoring.
            show_output: Forwarded to :meth:`difference`.

        Returns:
            tuple: ``(outputs, mask)`` with the diff reports and the mask from ``f``.
        """
        # asarray handles any array-like (lists, tuples, ...) so the boolean
        # mask indexing below always works.
        labels = np.asarray(labels)
        preds = np.asarray(preds)
        if lower:
            labels = np.char.lower(labels)
            preds = np.char.lower(preds)

        score, mask = f(labels, preds)
        outputs = self.difference(labels[mask], preds[mask], show_output=show_output)
        print(f'- {f.__name__}: {score}')
        return outputs, mask

    def custom_eval(self, df, f=avg_p, lower=False):
        """Predict on ``df['input_text']`` and score against ``df['target_text']``.

        Args:
            df: DataFrame with ``input_text`` and ``target_text`` columns.
            f: Scoring callable, forwarded to :meth:`calculate_difference`.
            lower: When True, comparison is case-insensitive (see
                :meth:`calculate_difference`).

        Returns:
            tuple: ``(outputs, labels, preds, mask)``; ``labels`` and ``preds``
            are the original (not lowercased) arrays.
        """
        labels = np.array(df['target_text'].tolist())
        input_text = df['input_text'].tolist()
        preds = np.array(self.model.predict(input_text))
        # Forward the caller's metric; it was previously accepted but ignored.
        outputs, mask = self.calculate_difference(labels, preds, f=f, lower=lower, show_output=False)
        return outputs, labels, preds, mask

    def eval_train_test(self, train_df, test_df, lower=False):
        """Run :meth:`custom_eval` on the train and test frames.

        Args:
            train_df: Training DataFrame (``input_text`` / ``target_text``).
            test_df: Test DataFrame with the same columns.
            lower: Forwarded to :meth:`custom_eval`.

        Returns:
            dict: ``{"train": {...}, "test": {...}}``, each holding the keys
            ``outputs``, ``labels``, ``preds`` and ``mask``.
        """
        print('Eval train:')
        outputs_train, labels_train, preds_train, mask_train = self.custom_eval(train_df, lower=lower)
        print('\nEval test:')
        outputs_test, labels_test, preds_test, mask_test = self.custom_eval(test_df, lower=lower)
        return {"train":{"outputs":outputs_train, "labels":labels_train, "preds":preds_train, "mask":mask_train},\
               "test":{"outputs":outputs_test, "labels":labels_test, "preds":preds_test, "mask":mask_test}}

    def predict(self, problems):
        """Generate outputs for one problem string or a list of them.

        Args:
            problems: A single string (wrapped into a one-element list) or a
                list of strings.

        Returns:
            Whatever ``self.model.predict`` returns for the list of problems.
        """
        if isinstance(problems, str):
            problems = [problems]
        return self.model.predict(problems)

# Load checkpoint "20211211182517" from the default model_path; `E` is the
# experiment instance the following cells call.
E = Experiment(model_name="20211211182517") 

In [15]:
# Single-sentence probe; the "nltocl:" prefix is written into the input text
# by hand here (Experiment.predict does not add it).
E.predict(["nltocl:A man Mike had 4 apples."])

Generating outputs: 100%|██████████| 1/1 [00:00<00:00,  4.61it/s]


['A man has 4 apples. A woman puts 4 apples in the apple. How many apples does the man have?']

In [3]:
# Past-tense word problem, passed without the "nltocl:" prefix.
E.predict(["A man used to have 1 apples. A woman gave 3 apples to the man. How many apples did the man have?"])

Generating outputs: 100%|██████████| 1/1 [00:00<00:00,  3.66it/s]


['A man has 1 apples. A woman gives 3 apples to the man. How many apples does the man have?']

In [5]:
# Directory holding the CSV inputs; also used by the cell that loads the ACE files.
data_dir = 'data/'
df = pd.read_csv(data_dir+'gen_2_pairs.csv')
# Constant prefix column (note: no trailing colon, unlike Experiment.prefix).
df["prefix"] = "nltocl"
# Keep only the three columns used downstream, in this order.
df = df[["prefix", "input_text", "target_text"]]
# Fixed random_state makes the split reproducible; no test_size is given, so
# train_test_split uses its default proportions.
q_train, q_test = train_test_split(df, random_state=10)
print(q_train.shape, q_test.shape)
q_train.head()

(322, 3) (108, 3)


Unnamed: 0,prefix,input_text,target_text
27,nltocl,A store has 375 black shirts and 40 white shir...,A store has 375 black shirts and 40 white shir...
295,nltocl,An store has 52 cars. In each vehicle there ar...,An store has 52 cars. Each car has 4 doors. Ea...
116,nltocl,A man has a thousand ants in his stomach. He s...,A man has 1000 ants. He sells 213 ants. How ma...
105,nltocl,A dog has 2 birds. 1 bird flees from the dog. ...,A dog has 2 birds. 1 bird flees from the dog. ...
146,nltocl,Mom has ten cookies. Three cookies are given t...,A mother has 10 cookies. The mother gives 3 co...


In [6]:
# Load the two ACE word-problem sheets (present-simple and past-simple).
present_simple =  pd.read_csv(data_dir+'ACE - Word Problems - Present Simple.csv')
past_simple = pd.read_csv(data_dir+'ACE - Word Problems - Past Simple.csv')
# Preview the past-simple sheet.
past_simple.head()

Unnamed: 0,index,problem,answer
0,1.1.1,A man had 3 apples. A woman gave 4 apples to t...,7
1,1.1.2,A woman had 1 ball. A man gave 9 balls to the ...,10
2,1.1.3,A boy had 8 bananas. A girl gave 1 banana to t...,9
3,1.1.4,A girl had 3 melons. A boy gave 6 melons to th...,9
4,1.2.1,A restaurant had 175 normal chairs and 20 juni...,195


In [10]:
# References are the present-simple problems; predictions are generated from
# the past-simple problems.
# NOTE(review): presumably the two sheets are row-aligned (same problem at the
# same position) -- confirm.
labels = present_simple['problem'].tolist()
past_simple_pred = E.predict(past_simple['problem'].tolist())

# Each call prints the score and returns a (outputs, mask) tuple.
past_simple_results = E.calculate_difference(labels=labels, preds=past_simple_pred)
print('lower:')
# Same comparison after lowercasing both sides.
past_simple_results_lower = E.calculate_difference(labels=labels, preds=past_simple_pred, lower=True)

Generating outputs: 100%|██████████| 10/10 [00:03<00:00,  2.81it/s]

- avg_p: 0.7125
lower:
- avg_p: 0.775





In [11]:
# Case-insensitive evaluation of `E` on the train/test split; `results` maps
# "train"/"test" to dicts with outputs, labels, preds and mask.
results = E.eval_train_test(train_df=q_train, test_df=q_test, lower=True)

Eval train:


Generating outputs: 100%|██████████| 41/41 [00:17<00:00,  2.40it/s]


- avg_p: 0.7515527950310559

Eval test:


Generating outputs: 100%|██████████| 14/14 [00:05<00:00,  2.46it/s]

- avg_p: 0.7222222222222222





In [136]:
# Diff report of the first mismatching training pair. `results` is assigned by
# more than one cell (the E and the E2 evaluation), so this shows whichever
# evaluation ran last.
print(results['train']['outputs'][0])

pred  : a mother has 10 cookies. the mother gives 3 cookies to a son. the woman gives 6 cookies to an aunt. how many cookies does the mother have?
=>
label : a mother has 10 cookies. the mother gives 3 cookies to a son. the mother gives 6 cookies to a daughter. how many cookies does the mother have?
-- 66 : w
++ 67 : m
-- 69 : man
++ 72 : ther
-- 97 : n
++ 99 : d
-- 102 : n
++ 103 : gh
++ 106 : er


In [8]:
# Second experiment instance, loading checkpoint "20211207153056" (the same
# value as Experiment's default model_name).
E2 = Experiment(model_name="20211207153056")

In [14]:
# Same train/test evaluation for `E2`. This rebinds `results`, replacing the
# dict produced by the `E` evaluation.
results = E2.eval_train_test(train_df=q_train, test_df=q_test, lower=True)

Eval train:


Generating outputs: 100%|██████████| 41/41 [00:17<00:00,  2.38it/s]


- avg_p: 0.7763975155279503

Eval test:


Generating outputs: 100%|██████████| 14/14 [00:05<00:00,  2.48it/s]

- avg_p: 0.6666666666666666





In [15]:
# Here inputs and references are the same present-simple problems, so the
# score measures how often `E` reproduces its input unchanged.
labels = present_simple['problem'].tolist()
present_simple_pred = E.predict(present_simple['problem'].tolist())

# Each call prints the score and returns a (outputs, mask) tuple.
present_simple_results = E.calculate_difference(labels=labels, preds=present_simple_pred)
print('lower:')
# Same comparison after lowercasing both sides.
present_simple_results_lower = E.calculate_difference(labels=labels, preds=present_simple_pred, lower=True)

Generating outputs: 100%|██████████| 10/10 [00:03<00:00,  2.80it/s]

- avg_p: 0.7375
lower:
- avg_p: 0.8125





In [16]:
# Present-simple evaluation repeated with `E2`. Relies on `labels` from an
# earlier cell and rebinds present_simple_pred / present_simple_results*,
# replacing the values computed with `E`.
present_simple_pred = E2.predict(present_simple['problem'].tolist())

present_simple_results = E2.calculate_difference(labels=labels, preds=present_simple_pred)
print('lower:')
present_simple_results_lower = E2.calculate_difference(labels=labels, preds=present_simple_pred, lower=True)

Generating outputs: 100%|██████████| 10/10 [00:03<00:00,  2.79it/s]

- avg_p: 0.75
lower:
- avg_p: 0.8



