In [None]:
from transformers import AutoTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer
import torch.optim as optim
import torch
import math
import time
import sys
import json
import numpy as np
import torch.nn as nn
import shutil
import random
import os
from google.colab import drive
import argparse

# Mount Google Drive at /content/drive so the course folder below is reachable.
drive.mount('/content/drive')
# Drive folder that holds the dataset files (and later receives the saved model).
cs461dir = '/content/drive/My Drive/Colab Notebooks/CS461/'
# Dataset splits expected in the current working directory (paths are relative).
file_paths = ['train_complete.jsonl', 'dev_complete.jsonl', 'test_complete.jsonl']
# Copy the Drive folder's contents into /content/ only when at least one split is missing.
if any(not os.path.exists(filepath) for filepath in file_paths):
  # IPython shell escape; {cs461dir} is interpolated from the Python variable above.
  !cp -r "{cs461dir}"* /content/
  print("Files imported")

Mounted at /content/drive
Files imported


In [None]:
# Cell to load data. Per-record debug printing is suppressed; uncomment the
# print in _load_split if you need to inspect the parsed records.
def _load_split(file_name, answers=('A', 'B', 'C', 'D')):
    """Read one ``.jsonl`` split and build per-question candidate lists.

    Every non-blank line must be a JSON object with the keys ``fact1``,
    ``answerKey`` and ``question``; ``question`` holds ``stem`` and a
    ``choices`` list whose items have a ``text`` field. Choices are taken
    positionally, i.e. ``choices[j]`` is assumed to correspond to
    ``answers[j]``.

    Args:
        file_name: path of the JSON-lines file to read.
        answers: answer keys in positional order.

    Returns:
        A list with one entry per question. Each entry is a list of
        ``[text, label]`` pairs, one per answer key, where ``text`` is
        ``"<fact1> [SEP] <stem> <choice> [SEP]"`` and ``label`` is 1 for the
        choice named by ``answerKey`` and 0 otherwise.

    Raises:
        ValueError: if ``answerKey`` is not in ``answers`` or a line is not
            valid JSON.
    """
    examples = []
    with open(file_name, encoding='utf-8') as json_file:
        for json_str in json_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not json_str.strip():
                continue
            result = json.loads(json_str)

            base = result['fact1'] + ' [SEP] ' + result['question']['stem']
            ans = answers.index(result['answerKey'])
            choices = result['question']['choices']

            # A space separates stem and choice so that the last word of the
            # stem and the first word of the choice are not fused together.
            obs = [
                [base + ' ' + choices[j]['text'] + ' [SEP]', 1 if j == ans else 0]
                for j in range(len(answers))
            ]
            # print(obs)
            examples.append(obs)
    return examples


def load_data():
    """Load the train, dev and test splits from the working directory.

    Reads ``train_complete.jsonl``, ``dev_complete.jsonl`` and
    ``test_complete.jsonl`` and, as a side effect, seeds torch's RNG with 0.

    Returns:
        ``(train, valid, test)``; each is a list of questions in the format
        produced by ``_load_split``.
    """
    torch.manual_seed(0)

    train = _load_split('train_complete.jsonl')
    valid = _load_split('dev_complete.jsonl')
    test = _load_split('test_complete.jsonl')

    return train, valid, test

# Add code to fine-tune and test your MCQA classifier.

In [None]:
# Classification Approach
def train(model, opt):
  """Fine-tune ``model`` together with ``opt.classifier`` on ``opt.train``.

  Every question contributes all of its candidate texts to the batch; the
  classifier scores each candidate from the first-position hidden state and
  the scores are regrouped per question so cross-entropy picks the correct
  candidate index.

  Args:
    model: encoder called as ``model(**inputs)`` whose output exposes
      ``last_hidden_state``.
    opt: namespace providing ``device``, ``classifier``, ``tokenizer``,
      ``optimizer``, ``train`` (list of questions, each a list of
      ``[text, label]`` pairs with exactly one label equal to 1; all
      questions in a batch must have the same number of candidates),
      ``epochs`` and ``batchsize``.

  Side effects:
    Shuffles ``opt.train`` in place every epoch, updates the parameters held
    by ``opt.optimizer`` and prints the mean batch loss per epoch.
  """
  print("training model...")
  model.to(opt.device)
  # The head consumes the encoder output, so it must be on the same device;
  # do not rely on an earlier test() call having moved it.
  opt.classifier.to(opt.device)
  model.train()
  opt.classifier.train()

  data = opt.train
  criterion = nn.CrossEntropyLoss()
  num_samples = len(data)

  for epoch in range(opt.epochs):
    total_loss = 0.0
    num_batches = 0
    random.shuffle(data)

    for start in range(0, num_samples, opt.batchsize):
      batch = data[start : start + opt.batchsize]
      texts = []
      targets = []

      for question in batch:
        texts.extend(cand[0] for cand in question)
        targets.append(next(idx for idx, cand in enumerate(question) if cand[1] == 1))

      # tokenize texts and move every returned tensor to the target device
      inputs = opt.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
      inputs = {name: tensor.to(opt.device) for name, tensor in inputs.items()}

      # pass through the encoder
      outputs = model(**inputs)

      # first-position embedding -> one score per candidate -> one row per question
      embeddings = outputs.last_hidden_state[:, 0, :]
      logits = opt.classifier(embeddings).view(len(batch), -1)

      targets_tensor = torch.tensor(targets, device=opt.device)
      loss = criterion(logits, targets_tensor)

      opt.optimizer.zero_grad()
      loss.backward()
      opt.optimizer.step()

      total_loss += loss.item()
      num_batches += 1

    # Average over the batches actually processed (the last one may be partial);
    # max() keeps an empty training set from dividing by zero.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{opt.epochs}, Loss: {avg_loss:.4f}")


def test(model, opt, file_path='dev_complete.jsonl'):
  model.to(opt.device)
  opt.classifier.to(opt.device)
  model.eval()
  opt.classifier.eval()

  # need to get and load data here
  if file_path == 'dev_complete.jsonl':
    data = opt.valid
  else:
    data = opt.test

  num_samples = len(data)/opt.batchsize
  criterion = nn.CrossEntropyLoss()


  criterion = nn.CrossEntropyLoss()
  total_correct = 0
  total_loss = 0.0
  total_questions = 0

  batch_size = opt.batchsize
  num_batches = (len(data) + batch_size - 1) // batch_size

  with torch.no_grad():
    for batch_idx in range(num_batches):
        batch = data[batch_idx * batch_size : (batch_idx+1) * batch_size]
        texts = []
        targets = []

        for question in batch:
            candidate_texts = [cand[0] for cand in question]
            correct_idx = next(idx for idx, cand in enumerate(question) if cand[1] == 1)
            texts.extend(candidate_texts)
            targets.append(correct_idx)

        inputs = opt.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        for input in inputs:
            inputs[input] = inputs[input].to(opt.device)

        # pass through BERT
        outputs = model(**inputs)

        # Extract embeddings and then pass through classifier head
        embeddings = outputs.last_hidden_state[:, 0, :]
        logits = opt.classifier(embeddings)

        current_batch_size = len(batch)
        logits = logits.view(current_batch_size, 4)

        targets_tensor = torch.tensor(targets, device=opt.device)
        loss = criterion(logits, targets_tensor)
        total_loss += loss.item() * current_batch_size

        # Compute predictions
        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == targets_tensor).sum().item()
        total_questions += current_batch_size

  avg_loss = total_loss / total_questions
  accuracy = total_correct / total_questions
  # print(f"Loss: {avg_loss:.4f} - Accuracy: {accuracy:.4f}")
  return accuracy, avg_loss

def main():
  """Run the full pipeline: evaluate the pre-trained model, fine-tune, re-evaluate, save.

  Builds the option namespace from argparse defaults, loads
  ``bert-base-uncased`` with a single-output linear head, reports
  validation/test accuracy and loss before and after fine-tuning, then writes
  the weights to ``./model_weights/<savename>`` and copies that file to the
  module-level ``cs461dir`` folder.

  Relies on the module-level names ``load_data``, ``train``, ``test`` and
  ``cs461dir``.
  """
  random.seed(10)
  # Seed torch as well so the classifier head below is initialised reproducibly
  # (load_data() re-seeds torch later).
  torch.manual_seed(10)

  parser = argparse.ArgumentParser()
  parser.add_argument('-no_cuda', action='store_true')
  parser.add_argument('-SGDR', action='store_true')
  parser.add_argument('-epochs', type=int, default=20)
  parser.add_argument('-batchsize', type=int, default=16)
  parser.add_argument('-printevery', type=int, default=100)
  parser.add_argument('-lr', type=float, default=0.00001)
  parser.add_argument('-savename', type=str, default="bert-base-uncased.pth")
  parser.add_argument('-dir_name', type=str,default='model')
  parser.add_argument('-norm', type=float, default=2.0)

  # Under Colab, ignore the process argv and use the defaults; passing an
  # explicit empty list avoids overwriting sys.argv for the rest of the session.
  cli_args = [] if "google.colab" in sys.modules else None
  opt = parser.parse_args(cli_args)
  opt.verbose = False

  opt.device = torch.device("cuda" if torch.cuda.is_available() and not opt.no_cuda else "cpu")

  opt.time_name = time.strftime("%y%m%d_%H%M%S")
  dir_name = "saved/%s" % (opt.dir_name)
  os.makedirs(dir_name, exist_ok=True)

  print(str(opt))
  os.makedirs("./model_weights", exist_ok=True)

  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
  model = BertModel.from_pretrained("bert-base-uncased")
  # One score per candidate; 768 is the hidden size the head expects as input.
  classifier = nn.Linear(768, 1)
  opt.tokenizer = tokenizer
  opt.classifier = classifier
  # A single optimizer updates both the encoder and the classifier head.
  opt.optimizer = torch.optim.Adam(list(model.parameters()) + list(classifier.parameters()),
                                 lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

  # data is in train, valid, and test, all []
  train_data, valid_data, test_data = load_data()
  opt.train = train_data
  opt.valid = valid_data
  opt.test = test_data

  print("testing model pre-trained model...")
  zero_shot_valid_acc, zero_shot_valid_loss = test(model, opt, file_path='dev_complete.jsonl')
  zero_shot_test_acc, zero_shot_test_loss = test(model, opt, file_path='test_complete.jsonl')
  print(f'Zero-shot Valid Accuracy: {zero_shot_valid_acc} - Zero-shot Valid Loss: {zero_shot_valid_loss}')
  print(f'Zero-shot Test Accuracy: {zero_shot_test_acc} - Zero-shot Test Loss: {zero_shot_test_loss}')
  print("Now fine-tuning...")

  train(model, opt)

  print("testing model fine-tuned model...")
  finetune_valid_acc, finetune_valid_loss = test(model, opt, file_path='dev_complete.jsonl')
  finetune_test_acc, finetune_test_loss = test(model, opt, file_path='test_complete.jsonl')
  print(f'Fine-tune Valid Accuracy: {finetune_valid_acc} - Fine-tune Valid Loss: {finetune_valid_loss}')
  print(f'Fine-tune Test Accuracy: {finetune_test_acc} - Fine-tune Test Loss: {finetune_test_loss}')

  # Save under opt.savename so the file written here is the one copied below,
  # whatever -savename was given.
  save_path = os.path.join("./model_weights", opt.savename)
  torch.save({
    'model_state_dict': model.state_dict(),
    'classifier_state_dict': classifier.state_dict()
  }, save_path)

  # save a copy to drive
  shutil.copy(save_path, cs461dir)
  print("Saving model")

# Run the whole pipeline when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()

Namespace(no_cuda=False, SGDR=False, epochs=20, batchsize=16, printevery=100, lr=1e-05, savename='bert-base-uncased.pth', dir_name='model', norm=2.0, verbose=False, device=device(type='cuda'), time_name='250305_193844')
testing model pre-trained model...
Zero-shot Valid Accuracy: 0.276 - Zero-shot Valid Loss: 1.3842072486877441
Zero-shot Test Accuracy: 0.272 - Zero-shot Test Loss: 1.3835228042602539
Now fine-tuning...
training model...
Epoch 1/20, Loss: 1.2250
Epoch 2/20, Loss: 0.9568
Epoch 3/20, Loss: 0.7412
Epoch 4/20, Loss: 0.5326
Epoch 5/20, Loss: 0.3669
Epoch 6/20, Loss: 0.2473
Epoch 7/20, Loss: 0.1716
Epoch 8/20, Loss: 0.1238
Epoch 9/20, Loss: 0.1030
Epoch 10/20, Loss: 0.0817
Epoch 11/20, Loss: 0.0576
Epoch 12/20, Loss: 0.0519
Epoch 13/20, Loss: 0.0415
Epoch 14/20, Loss: 0.0335
Epoch 15/20, Loss: 0.0228
Epoch 16/20, Loss: 0.0254
Epoch 17/20, Loss: 0.0231
Epoch 18/20, Loss: 0.0243
Epoch 19/20, Loss: 0.0212
Epoch 20/20, Loss: 0.0165
testing model fine-tuned model...
Fine-tune Valid