In [1]:
import torch

print(torch.cuda.is_available())
device = torch.device("cpu")


False


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
import time

In [6]:
from torch.utils.data import Dataset

class GeneralDataset(Dataset):
    """
    Ideally, should work with any dataset.
    Just pass the inference lines to the constructor.
    """
    def __init__(self, **kwargs):
        # create attribute for all kwargs
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __len__(self):
        for k in self.__dict__:
            return len(self.__dict__[k])
        return -1

    def __getitem__(self, item):
        # print all attributes
        return dict((k, self.__dict__[k][item]) for k in self.__dict__)

def generate_sample(model, tokenizer, dataloader, device):
    """
    generate negative samples using the model for revise training
    """
    samples = []
    model = model.module if hasattr(model, "module") else model
    model.eval()
    beam_size = 10
    samples_list = []
    for data in tqdm(dataloader):
        prob, label = data["prob"], data["label"]
        gen_prob = prob
        batch = tokenizer.prepare_seq2seq_batch(gen_prob, return_tensors="pt")
        for k, v in batch.items():
            batch[k] = v.to(device)

        text = model.generate(
            **batch,
            num_beams=beam_size,
            early_stopping=True,
            max_length=64,
            num_return_sequences=beam_size,
        )  # batch * 10, len
        text = tokenizer.batch_decode(text, skip_special_tokens=True)
        text = [clean_text(t) for t in text]

        label = [clean_text(t) for t in label]
        
        idx = 0


        for p, e in zip(prob, label):
            local_samples_list = dict()
            local_samples_list["prob"] = p
            local_samples_list["equations"] = []
            local_samples_list["gt"] = []
            local_samples_list["correct_answer"] = e

            samples.append((p, "<mask>", e, 0))
            samples.append((p, e, e, 1))
            beam = text[idx * beam_size : (idx + 1) * beam_size]
            for b in beam:
                if is_equal(e, b, number_filler=True):
                    samples.append((p, b, b, 1))
                    local_samples_list["equations"].append(b)
                    local_samples_list["gt"].append(1)
                else:
                    samples.append((p, b, e, 0))
                    local_samples_list["equations"].append(b)
                    local_samples_list["gt"].append(0)
            
            samples_list.append(local_samples_list)

            idx += 1

    return samples_list




In [4]:
import sys
sys.path.append("t5_codet5_based/")

from t5_GenerateRankModel import MyT5ForSequenceClassificationAndGeneration
from transformers import T5Config, T5Tokenizer
from utils import read_json, clean_text, is_equal
from tqdm import tqdm
from data_utils import extract_text_label


# open a json file and read it where text is in "text" key and infix equation is in "template_equ" key
train_file = "data/mawps_asdiv-a_svamp/testset_nodup.json"
data_limit = -1
batch_size = 16
eqn_order = "infix"
model_path = "debugmodels/svamp_t5_batch_output/generator_Mar_03_2023_svamp_infix/saved_model/"


data = read_json(train_file)
lines = []
labels = []
numbers_list = []
for i, item in enumerate(tqdm(data, desc="Prepare train data")):
    goal, proof, numbers = extract_text_label(item, eqn_order)
    lines.append(goal)
    labels.append(proof)
    numbers_list.append(numbers)
    if data_limit > 0 and i > data_limit:
        break
raw_train_dataset = GeneralDataset(
    prob=lines, label=labels, numbers=numbers_list
)

extra_args = {}
raw_train_dataloader = torch.utils.data.DataLoader(
    raw_train_dataset,
    batch_size=batch_size,
    drop_last=False,
    **extra_args,
    )

tokenizer = T5Tokenizer.from_pretrained(
    model_path, do_lower_case=False
)

print(f"load model from {model_path}")
config = T5Config.from_pretrained(model_path)
config.num_labels = 2
config.id2label = {"0": "LABEL_0", "1": "LABEL_1"}
config.label2id = {"LABEL_0": 0, "LABEL_1": 1}

model = MyT5ForSequenceClassificationAndGeneration(
    modelpath= model_path, config=config, d_model=config.d_model, num_labels=2
)
model.resize_token_embeddings(len(tokenizer))
print("model load done")

config = model.config
model.to(device)

# model size
size = 0
for n, p in model.named_parameters():
    size += p.nelement()
print("Total parameters: {}".format(size))


Prepare train data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 151500.96it/s]


load model from debugmodels/svamp_t5_batch_output/generator_Mar_03_2023_svamp_infix/saved_model/
model load done
Total parameters: 1476393986


In [5]:
# Training
# generate samples for revise

print("start generate samples")
gen_start_time = time.time()
samples_ans = generate_sample(
    model, tokenizer, raw_train_dataloader, device
)
print(f"finish generate samples in {time.time() - gen_start_time}")


start generate samples


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

  2%|██▋                                                                                                                                                                    | 1/63 [00:24<25:20, 24.52s/it]


KeyboardInterrupt: 

In [6]:
import json
with open("t5_preds.json", "w") as fh:
    json.dump(samples_ans, fh, indent=4)