In [1]:
from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, HfArgumentParser,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments,
                          default_data_collator, set_seed)
from tqdm import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
# Path of the local dataset-loading script handed to `load_dataset`.
DATASET_SCRIPT = '../seq2seq/load_sqlike_dataset_both.py'

dataset = load_dataset(DATASET_SCRIPT)
# Keep a direct handle on the validation split for the cells below.
val_dataset = dataset['validation']

Reusing dataset load_sqlike_dataset_both (/root/.cache/huggingface/datasets/load_sqlike_dataset_both/plain_text/1.0.0/9b47ac056002c467291a47147df7101617c2444e1c614525f177ec2a19c39023)
100%|██████████| 2/2 [00:00<00:00, 738.95it/s]


In [4]:
from transformers import AutoModelWithLMHead, AutoTokenizer

# Directory from which both the tokenizer and the seq2seq model are loaded.
path = '../experiments/000-squad-clickbait-t5-base'

# Tokenizer first, then the model weights, both from the same directory.
tokenizer, model = (
    AutoTokenizer.from_pretrained(path),
    AutoModelForSeq2SeqLM.from_pretrained(path),
)

def get_answer(question, context, max_length=500):
  """Generate the model's answer to `question` given `context`.

  Builds the prompt "question : <question>  context : <context>", tokenizes
  it with the module-level `tokenizer`, runs the module-level
  `model.generate` on it and decodes the first generated sequence with
  special tokens stripped.

  Args:
    question: Question text.
    context: Passage the answer should be drawn from.
    max_length: Forwarded to `model.generate` as `max_length`. Defaults to
      500, the value that used to be hard-coded.

  Returns:
    The decoded answer string.

  Note:
    The prompt is not truncated; the tokenizer warns when the encoded
    prompt is longer than the model's configured maximum sequence length.
  """
  input_text = "question : %s  context : %s" % (question, context)
  # Place the encoded inputs on the same device as the model so that
  # generation also works after the model has been moved to a GPU.
  features = tokenizer([input_text], return_tensors='pt').to(model.device)

  out = model.generate(
      input_ids=features['input_ids'],
      attention_mask=features['attention_mask'],
      max_length=max_length,
  )

  # Exactly one prompt was submitted, so only the first sequence matters.
  return tokenizer.decode(out[0], skip_special_tokens=True)



# Smoke test of `get_answer` on a single hand-written example.
question = "What is Valhalla ?"
context = "In Norse mythology, Valhalla is a majestic, enormous hall located in Asgard, ruled over by the god Odin."

# Expected answer: 'a majestic, enormous hall located in Asgard, ruled over by the god Odin'
# (the recorded run of this cell returned the whole context sentence instead).
get_answer(question=question, context=context)

'In Norse mythology, Valhalla is a majestic, enormous hall located in Asgard, ruled over by the god Odin.'

In [4]:
def create_input(i):
    """Return the (question, context) pair of validation example `i`.

    Reads the example from the module-level `dataset['validation']`.
    """
    example = dataset['validation'][i]
    question, context = example['question'], example['context']
    return question, context

In [10]:
# Answer every validation example and collect one {'uuid', 'spoiler'} record each.
results = []
for i in tqdm(range(len(val_dataset))):
    example = val_dataset[i]  # look the row up once and reuse it
    uuid = example['id']
    question = example['question']
    context = example['context']
    answer = get_answer(question, context)
    results.append(dict(uuid=uuid, spoiler=answer))

  0%|          | 0/800 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 7/800 [00:22<42:57,  3.25s/it]


KeyboardInterrupt: 

In [73]:
# Display the validation split's summary (feature names and row count).
val_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answer'],
    num_rows: 800
})

In [16]:
def get_answer_batched(questions, contexts, max_length=500):
  """Run generation for several (question, context) pairs in one batch.

  Each pair is formatted as "question : <question>  context : <context>".
  The prompts are tokenized together with padding by the module-level
  `tokenizer` and passed to the module-level `model.generate`.

  Args:
    questions: Iterable of question strings.
    contexts: Iterable of context strings, paired with `questions` by
      position (pairing uses `zip`, so surplus items in the longer iterable
      are ignored).
    max_length: Forwarded to `model.generate` as `max_length`. Defaults to
      500, the value that used to be hard-coded.

  Returns:
    The raw output of `model.generate`; it is NOT decoded to text. Use
    `tokenizer.batch_decode(outs, skip_special_tokens=True)` to obtain
    strings (`tokenizer.decode` handles a single sequence only).
  """
  input_texts = [
      "question : %s  context : %s" % (question, context)
      for question, context in zip(questions, contexts)
  ]
  # Place the padded batch on the same device as the model so that
  # generation also works after the model has been moved to a GPU.
  features = tokenizer(input_texts, return_tensors='pt', padding=True).to(model.device)

  outs = model.generate(
      input_ids=features['input_ids'],
      attention_mask=features['attention_mask'],
      max_length=max_length,
  )
  return outs