In [4]:
pip install --upgrade transformers huggingface-hub


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/ed/ad/c9b96572ab7994e73c64588f8875741823f2daba70e746547fff9a2d9a54/transformers-4.46.2-py3-none-any.whl.metadata
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m232.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Obtaining dependency information for tokenizers<0.21,>=0.20 from https://files.pythonhosted.org/packages/50/f6/2841de926bc4118af996eaf0bdf0ea5b

In [4]:

import json
def read_squad(path):
    # open JSON file and load intro dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

# execute our read SQuAD function for training and validation sets
train_contexts, train_questions, train_answers = read_squad('spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = read_squad('spoken_test-v1.1.json')

In [5]:

def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
            
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [6]:
from transformers import DistilBertTokenizerFast
# initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# tokenize
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [7]:

def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift position until found
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [8]:

import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [9]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

2024-11-08 12:05:55.181613: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 12:05:55.192214: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 12:05:55.204478: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 12:05:55.208075: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 12:05:55.217806: I tensorflow/core/platform/cpu_feature_guar

In [10]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=2e-6)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(10):
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 0: 100%|██████████| 2320/2320 [02:34<00:00, 14.99it/s, loss=2.18]
Epoch 1: 100%|██████████| 2320/2320 [02:34<00:00, 15.03it/s, loss=3.36]
Epoch 2: 100%|██████████| 2320/2320 [02:34<00:00, 15.03it/s, loss=2.25]
Epoch 3: 100%|██████████| 2320/2320 [02:34<00:00, 15.03it/s, loss=1.04] 
Epoch 4: 100%|██████████| 2320/2320 [02:34<00:00, 15.03it/s, loss=1.36] 
Epoch 5: 100%|██████████| 2320/2320 [02:34<00:00, 15.03it/s, loss=1.31] 

In [15]:
# import os
# if not os.path.exists('../models'):
#    os.makedirs('../models')
# model_path = 'models/distilbert-custom'
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

In [12]:
import os

# Create a directory in your home directory
home_dir = os.path.expanduser("~")
model_dir = os.path.join(home_dir, "models")

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the model
model_path = os.path.join(model_dir, "distilbert-custom")
model.save_pretrained(model_path)


In [16]:
model = DistilBertForQuestionAnswering.from_pretrained("hw3-bert-base-model-dataset")
model.to(device)

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
     

In [None]:
# # switch model out of training mode
# model.eval()

# #val_sampler = SequentialSampler(val_dataset)
# val_loader = DataLoader(val_dataset, batch_size=16)

# true_starts = []
# true_ends = []
# pred_starts = []
# pred_ends = []

# # initialize loop for progress bar
# loop = tqdm(val_loader)
# # loop through batches
# for batch in loop:
#     # we don't need to calculate gradients as we're not training
#     with torch.no_grad():
#         # pull batched items from loader
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         start_true = batch['start_positions'].to(device)
#         end_true = batch['end_positions'].to(device)
#         # make predictions
#         outputs = model(input_ids, attention_mask=attention_mask)
#         # pull preds out
#         start_pred = torch.argmax(outputs['start_logits'], dim=1)
#         end_pred = torch.argmax(outputs['end_logits'], dim=1)
#         # append predictions and true values to lists
#         true_starts.extend(start_true.cpu().numpy())
#         true_ends.extend(end_true.cpu().numpy())
#         pred_starts.extend(start_pred.cpu().numpy())
#         pred_ends.extend(end_pred.cpu().numpy())
# import numpy as np
# # calculate precision and recall
# true_starts = np.array(true_starts)
# true_ends = np.array(true_ends)
# pred_starts = np.array(pred_starts)
# pred_ends = np.array(pred_ends)

# true_pos_starts = np.sum(np.logical_and(true_starts == pred_starts, true_starts != -1))
# true_pos_ends = np.sum(np.logical_and(true_ends == pred_ends, true_ends != -1))
# false_pos_starts = np.sum(np.logical_and(true_starts != pred_starts, pred_starts != -1))
# false_pos_ends = np.sum(np.logical_and(true_ends != pred_ends, pred_ends != -1))
# false_neg_starts = np.sum(np.logical_and(true_starts != pred_starts, true_starts != -1))
# false_neg_ends = np.sum(np.logical_and(true_ends != pred_ends, true_ends != -1))

# precision_starts = true_pos_starts / (true_pos_starts + false_pos_starts + 1e-9)
# recall_starts = true_pos_starts / (true_pos_starts + false_neg_starts + 1e-9)
# precision_ends = true_pos_ends / (true_pos_ends + false_pos_ends + 1e-9)
# recall_ends = true_pos_ends / (true_pos_ends + false_neg_ends + 1e-9)

# # calculate F1 score
# f1_starts = 2 * (precision_starts * recall_starts) / (precision_starts + recall_starts + 1e-9)
# f1_ends = 2 * (precision_ends * recall_ends) / (precision_ends + recall_ends + 1e-9)
# f1 = (f1_starts + f1_ends) / 2

# print("F1 score: {:.4f}".format(f1))

In [17]:
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(val_dataset, batch_size=16)

acc = []

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
answers = []
references = []
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = ' '.join(all_tokens[start_pred[i] : end_pred[i]+1])
            ref = ' '.join(all_tokens[start_true[i] : end_true[i]+1])
            ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
            answer = tokenizer.decode(ans_ids)
            answers.append(answer)
            references.append(ref)
# calculate average accuracy in total


100%|██████████| 993/993 [00:30<00:00, 32.25it/s]


In [31]:

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    if len(scores_for_ground_truths)==0: return 0
    return max(scores_for_ground_truths)

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
          f1_score, prediction, [ground_truths])
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'f1': f1}
     

In [32]:
evaluate(references,answers)

{'f1': 52.25009544921429}