In [1]:
import numpy as np
import random
import torch
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
ans, qc, spc, corsp, wpc, corwp = 0, 0, 0, 0, 0, 0
wp, sp = False, False

In [3]:
sp_train = np.load('/content/SP-train.npy', allow_pickle=True)
wp_train = np.load('/content/WP-train.npy', allow_pickle=True)

In [4]:
print(len(sp_train))
print(len(wp_train))

507
396


In [5]:
print(sp_train[0])
print(wp_train[0])

{'id': 'SP-0', 'question': 'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?', 'answer': 'Each daughter shares the same brother.', 'distractor1': 'Some daughters get married and have their own family.', 'distractor2': 'Some brothers were not loved by family and moved away.', 'distractor(unsure)': 'None of above.', 'label': 1, 'choice_list': ['Some daughters get married and have their own family.', 'Each daughter shares the same brother.', 'Some brothers were not loved by family and moved away.', 'None of above.'], 'choice_order': [1, 0, 2, 3]}
{'id': 'WP-0', 'question': 'How do you spell COW in thirteen letters?', 'answer': 'SEE O DOUBLE YOU.', 'distractor1': 'COWCOWCOWCOWW', 'distractor2': 'SEE OH DEREFORD', 'distractor(unsure)': 'None of above.', 'label': 1, 'choice_list': ['SEE OH DEREFORD', 'SEE O DOUBLE YOU.', 'COWCOWCOWCOWW', 'None of above.'], 'choice_order': [2, 0, 1, 3]}


In [6]:
def format(data):
    id = data['id']
    ques = data['question']
    ch_list = data['choice_list']
    ch_order = data['choice_order']
    ch = [f"{chr(97 + i)}. {ch_list[idx]}" for i, idx in enumerate(ch_order)]
    qtn = f"""Question: {ques}
    {ch[0]}
    {ch[1]}
    {ch[2]}
    {ch[3]}"""
    return qtn, id

In [7]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 200

In [10]:
for epoch in range(num_epochs):
  for data in np.concatenate((sp_train, wp_train)):
    wp = False
    sp = False
    qtn, id = format(data)
    if '_' in id:
      continue
    if "WP" in id:
      wp = True
    if "SP" in id:
      sp = True
    qc += 1
    if wp:
      wpc += 1
    if sp:
      spc += 1
    inp = tokenizer(qtn, return_tensors="pt", padding=True, truncation=True).to(device)
    if data['answer'] not in data['choice_list']:
      print(f"Skipping question {id} because answer '{data['answer']}' is not in the list of choices.")
      continue
    lbl = data['choice_list'].index(data['answer'])
    op = model(input_ids = inp.input_ids, attention_mask = inp.attention_mask, labels = torch.tensor([lbl]).unsqueeze(0).to(device))
    loss = op.loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    plbl = torch.argmax(op.logits, dim=-1)
    if plbl == lbl:
      ans += 1
      if wp:
        corwp += 1
      if sp:
        corsp += 1
    print(f"Epoch {epoch+1}, Question {qc}: Loss: {loss.item()}")

accuracy = ans / qc
print(f"Total Accuracy: {accuracy:.2f}")
print(f"WP Accuracy: {corwp / wpc:.2f}")
print(f"SP Accuracy: {corsp / spc:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 184, Question 55204: Loss: 6.198863957251888e-06
Epoch 184, Question 55205: Loss: 4.768370445162873e-07
Epoch 184, Question 55206: Loss: 6.55629628454335e-05
Epoch 184, Question 55207: Loss: 0.0
Epoch 184, Question 55208: Loss: 0.006190530024468899
Epoch 184, Question 55209: Loss: 0.00027259447961114347
Epoch 184, Question 55210: Loss: 5.006777428206988e-06
Epoch 184, Question 55211: Loss: 0.0
Epoch 184, Question 55212: Loss: 1.6689286894688848e-06
Epoch 184, Question 55213: Loss: 4.136476854910143e-05
Epoch 184, Question 55214: Loss: 0.0
Epoch 184, Question 55215: Loss: 2.3841830625315197e-06
Epoch 184, Question 55216: Loss: 1.0371154530730564e-05
Epoch 184, Question 55217: Loss: 1.2636104656849056e-05
Epoch 184, Question 55218: Loss: 3.6954811548639555e-06
Epoch 184, Question 55219: Loss: 1.6689286894688848e-06
Epoch 184, Question 55220: Loss: 7.152555099310121e-07
Epoch 184, Question 55221: Loss: 4.74441731057595

In [11]:
# Prediction
print('Test Example:')
print(qtn)
print('Answer:', data['answer'])
print('Predicted:', chr(97 + plbl.item()))

Test Example:
Question: What kind of ice doesn't contain water?
    a. Dry ice.
    b. Flaked ice.
    c. Glacier ice.
    d. None of above.
Answer: Dry ice.
Predicted: a
