In [4]:
from datasets import load_dataset

In [7]:
# Let's use the QASC dataset from AllenAI Institute
qasc = load_dataset("qasc", split="train")

In [8]:
# All the answers in the QASC dataset come in this format
keys = ["A", "B", "C", "D", "E", "F", "G", "H"]

In [9]:
qasc

Dataset({
    features: ['id', 'question', 'choices', 'answerKey', 'fact1', 'fact2', 'combinedfact', 'formatted_question'],
    num_rows: 8134
})

In [33]:
qasc["test"][9]

{'id': '3TMSXRD2X6Z77PSX9W0GF5UB1E9W1E',
 'question': 'What animal can hunt at night?',
 'choices': {'text': ['food',
   'fish',
   'bird',
   'owl',
   'deer',
   'cows',
   'rhinos',
   'frog'],
  'label': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']},
 'answerKey': '',
 'fact1': '',
 'fact2': '',
 'combinedfact': '',
 'formatted_question': 'What animal can hunt at night? (A) food (B) fish (C) bird (D) owl (E) deer (F) cows (G) rhinos (H) frog'}

In [26]:
# We must remove any questions for which the answerKey is not present
qasc.filter(lambda example: example["answerKey"] in keys)

Filter:   0%|          | 0/8134 [00:00<?, ? examples/s]

Filter:   0%|          | 0/920 [00:00<?, ? examples/s]

Filter:   0%|          | 0/926 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'fact1', 'fact2', 'combinedfact', 'formatted_question'],
        num_rows: 8134
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'fact1', 'fact2', 'combinedfact', 'formatted_question'],
        num_rows: 0
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey', 'fact1', 'fact2', 'combinedfact', 'formatted_question'],
        num_rows: 926
    })
})

In [6]:
# Just to get an idea of what each sample looks like!
qasc["train"][0]

{'id': '3E7TUJ2EGCLQNOV1WEAJ2NN9ROPD9K',
 'question': 'What type of water formation is formed by clouds?',
 'choices': {'text': ['pearls',
   'streams',
   'shells',
   'diamonds',
   'rain',
   'beads',
   'cooled',
   'liquid'],
  'label': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']},
 'answerKey': 'F',
 'fact1': 'beads of water are formed by water vapor condensing',
 'fact2': 'Clouds are made of water vapor.',
 'combinedfact': 'Beads of water can be formed by clouds.',
 'formatted_question': 'What type of water formation is formed by clouds? (A) pearls (B) streams (C) shells (D) diamonds (E) rain (F) beads (G) cooled (H) liquid'}

In [7]:
# For our model of choice, let's use the T5 small
checkpoint = "google-t5/t5-small"

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [17]:
def generate_targets(examples):
    """
    Each statement should correspond to a "Question? Answer" output
    """
    # Answers are always listed alphabetically in series of 8
    keys = ["A", "B", "C", "D", "E", "F", "G", "H"]
    
    # We get each question to each sentence
    questions = examples["question"]
    # We extract the answers to each question
    answers = list(map(
        lambda choices, answerKey: choices[keys.index(answerKey)],
        [x["text"] for x in examples["choices"]],
        examples["answerKey"]
    ))

    # The form is "question? answer"
    targets = list(map(
        lambda q, a: f"{q} {a}",
        questions,
        answers
    ))
    
    return targets

In [23]:
print(qasc["train"][:3]["combinedfact"])
print(generate_targets(qasc["train"][:4]))

['Beads of water can be formed by clouds.', 'Vapor turning into a liquid leaves behind beads of water', 'Steam forms beads of water.']
['What type of water formation is formed by clouds? beads', 'Where do beads of water come from? Vapor turning into a liquid', 'What forms beads of water?  Steam.', 'what kind of beads are formed from vapor condensing? h2o']


In [21]:
def preprocess_function(examples):
    """ 
    The objective of our model is to transform a sentence into a question.
    To do so, the input to the model will be the sentence itself.
    The output of the model, then, must be the question with the answer.
    The model inputs will be the combined facts.
    The model targets are the questions followed by the answer.
    """
    prompt = "ask: " # Each input will be formatted as "ask: sentence..."
    inputs = [prompt + sen for sen in examples["combinedfact"]]
    # We get each question to each sentence
    questions = examples["question"]
    # We generate a target for each statement, a question and its answer
    targets = generate_targets(examples)

    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
tokenized_qasc = qasc.map(preprocess_function, batched=True)

Map:   0%|          | 0/8134 [00:00<?, ? examples/s]

Map:   0%|          | 0/920 [00:00<?, ? examples/s]

ValueError: '' is not in list

In [56]:
tokenized_qasc["train"][:4]

{'id': ['3U0SRXB7CD45D0I0FPO8PDZXRHSRNK',
  '351SEKWQS0G5U8EVLNEO79TTV8MMDS',
  '39LNWE0K4UV5FRZQM36LPGQ02Y3IUO',
  '3H8DHMCCW9AA4KES0B18SW1P5OLKDK'],
 'question': ['Harming a certain kind of animal will cause the population of that animal to do what?',
  'Meiosis is the type of cell division that produces:',
  'Blowing on a fire increases what near a fire?',
  'What reduces heat necessary for maximum predatory activity?'],
 'choices': [{'text': ['decrease',
    'eat more',
    'increase',
    'break off',
    'threatened',
    'reproduce',
    'reduce',
    'kill them'],
   'label': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']},
  {'text': ['animals',
    'Plant reproduction',
    'Most plants',
    'peachleaf willow',
    'Plants growth',
    'haploid cells',
    'spread flower seeds',
    'rapid expansion'],
   'label': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']},
  {'text': ['the amount of oxygen',
    'the sounds',
    'kinetic energy',
    'chlorofluorocarbons',
    'the firewood',
   

In [39]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [40]:
import evaluate

rouge = evaluate.load("rouge")

In [41]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [42]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_qasc",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    logging_steps=10,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_qasc["train"],
    eval_dataset=tokenized_qasc["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [46]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4517,1.238196,0.8311,0.5492,0.6899,0.6895,13.7027
2,1.2303,1.137372,0.8317,0.5525,0.6951,0.6948,13.5614
3,1.3259,1.102398,0.8328,0.5537,0.6969,0.6968,13.5897
4,1.3673,1.094201,0.8334,0.5565,0.6986,0.6985,13.57




TrainOutput(global_step=1832, training_loss=1.462356892735677, metrics={'train_runtime': 869.3841, 'train_samples_per_second': 33.679, 'train_steps_per_second': 2.107, 'total_flos': 186238777688064.0, 'train_loss': 1.462356892735677, 'epoch': 4.0})

In [169]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

mcq_model = AutoModelForSeq2SeqLM.from_pretrained("results_old_qasc/checkpoint-1374")

In [3]:
# Don't forget to add the "ask: " prompt!
input_text = "ask: Genetic engineering is manipulation of an organism's genes using technology."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = mcq_model.generate(input_ids, max_new_tokens=100, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

NameError: name 'tokenizer' is not defined