In [1]:
from transformers import AutoTokenizer,AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoConfig
from transformers import DefaultDataCollator
from transformers import TrainingArguments
from transformers import HfArgumentParser
from transformers import Trainer
from datasets import load_dataset
import torch
from transformers import DistilBertModel
from datasets import load_dataset
from transformers import PreTrainedModel,PretrainedConfig
from transformers.modeling_outputs import QuestionAnsweringModelOutput
import numpy as np
import re
import string
import collections

In [4]:
squad = load_dataset("squad", split="train[:5000]")

squad = squad.train_test_split(test_size=0.2)


my_dataset = squad

Found cached dataset squad (/home/vp.shivasan/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["context"] = examples["context"]
    inputs["answer"] = answers
    return inputs

tokenized_data = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
class DistillBERTQA(PreTrainedModel):
    def __init__(self,config: PretrainedConfig):
        # super(DistillBERTQA, selfconfig).__init__()
        super().__init__(config)
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.qa_outputs = torch.nn.Linear(768, 2)
        self.dropout = torch.nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask,start_positions=None,end_positions=None,return_dict=None):
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = distilbert_output[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + distilbert_output[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )


In [6]:
config = AutoConfig.from_pretrained("distilbert-base-uncased")
model = DistillBERTQA(config)

# arguments for Trainer
test_args = TrainingArguments(
    output_dir = "/home/vp.shivasan/interiit/task2/training_dir/squad_test",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
              model = model, 
              args = test_args, 
                )

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

# Metric calculation taken from https://rajpurkar.github.io/SQuAD-explorer/
def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0,0,0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return precision,recall,f1

preds,labels,metrics = trainer.predict(tokenized_data['test'])
start_idxs = np.argmax(preds[0],axis=1)
end_idxs = np.argmax(preds[1],axis=1)
contexts = tokenized_data['test']['context']
answers = tokenized_data['test']['answer']
assert len(contexts) == len(start_idxs) == len(answers) 
F1s = []
Precs = []
Recs = []
for i,(sidx,eidx) in enumerate(zip(start_idxs,end_idxs)):
    context_para = contexts[i]
    pred_answer = context_para[sidx:eidx+1]
    gold_answer = answers[i]['text'][0]
    p,r,f = compute_f1(gold_answer,pred_answer)
    Precs.append(p)
    Recs.append(r)
    F1s.append(f)

print("Average Recall score: ",np.mean(Recs))
print("Average Precision score: ",np.mean(Precs))
print("Average F1 score: ",np.mean(F1s))


In [25]:
import pandas as pd
test_df = pd.read_csv('/home/vp.shivasan/interiit/data/Task2dataSet_test.csv')

In [23]:

class DistillBERTQA(PreTrainedModel):
    def __init__(self,config: PretrainedConfig):
        # super(DistillBERTQA, config).__init__()
        super().__init__(config)
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.qa_outputs = torch.nn.Linear(768, 2)
        self.dropout = torch.nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask,start_positions=None,end_positions=None,return_dict=None):
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = distilbert_output[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + distilbert_output[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )


In [24]:

tokenizer = AutoTokenizer.from_pretrained("/home/vp.shivasan/interiit/task2/training_dir/task2_50epochs_2e-5_ES/checkpoint-5076/")
config = AutoConfig.from_pretrained("/home/vp.shivasan/interiit/task2/training_dir/task2_50epochs_2e-5_ES/checkpoint-5076/")
model = DistillBERTQA(config)
state_dict = torch.load('/home/vp.shivasan/interiit/task2/training_dir/task2_50epochs_2e-5_ES/checkpoint-5076/pytorch_model.bin')
model.load_state_dict(state_dict)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [35]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering,AutoModel
tokenizer = AutoTokenizer.from_pretrained('/home/vp.shivasan/interiit/task2/training_dir/task2_MiniLM_50epochs_2e-5_FINAL_ES/checkpoint-5076')
model = AutoModel.from_pretrained('/home/vp.shivasan/interiit/task2/training_dir/task2_MiniLM_50epochs_2e-5_FINAL_ES/checkpoint-5076',local_files_only = True,return_dict = True)
config = AutoConfig.from_pretrained('/home/vp.shivasan/interiit/task2/training_dir/task2_MiniLM_50epochs_2e-5_FINAL_ES/checkpoint-5076')
# model = AutoModel.from_config('/home/vp.shivasan/interiit/task2/training_dir/task2_MiniLM_50epochs_2e-5_FINAL_ES/checkpoint-5076/config.json')


Some weights of the model checkpoint at /home/vp.shivasan/interiit/task2/training_dir/task2_MiniLM_50epochs_2e-5_FINAL_ES/checkpoint-5076 were not used when initializing BertModel: ['distilbert.encoder.layer.10.output.dense.weight', 'distilbert.encoder.layer.0.attention.self.value.bias', 'distilbert.encoder.layer.8.output.LayerNorm.bias', 'distilbert.encoder.layer.2.output.LayerNorm.bias', 'distilbert.embeddings.position_ids', 'distilbert.encoder.layer.0.output.dense.weight', 'distilbert.encoder.layer.11.attention.self.value.weight', 'distilbert.encoder.layer.1.output.dense.weight', 'distilbert.encoder.layer.3.attention.self.query.bias', 'distilbert.encoder.layer.11.attention.output.dense.bias', 'distilbert.encoder.layer.4.output.dense.bias', 'distilbert.pooler.dense.weight', 'distilbert.encoder.layer.8.intermediate.dense.bias', 'distilbert.encoder.layer.6.attention.output.LayerNorm.bias', 'distilbert.encoder.layer.9.attention.self.value.bias', 'distilbert.encoder.layer.4.attention.sel

In [41]:
from transformers import pipeline
pipelined_QA = pipeline(task = "question-answering",model= model, config = config,tokenizer = tokenizer )

The model 'BertModel' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswering', 'LongformerForQuestionAnswering', 'LukeForQuestionAnswering', 'LxmertForQuestionAnswering', 'MarkupLMForQuestionAnswering', 'MBartForQuestionAnswering', 'MegatronBert

In [38]:
para = "Tennis is a racket sport that is played either individually against a single opponent (singles) or between two teams of two players each (doubles). Each player uses a tennis racket that is strung with cord to strike a hollow rubber ball covered with felt over or around a net and into the opponent's court. The object of the game is to manoeuvre the ball in such a way that the opponent is not able to play a valid return. The player who is unable to return the ball validly will not gain a point, while the opposite player will."
question = "What is the ball made of?"

In [15]:
pipelined_QA(question=question, context=para)['answer']
tokenizer(question,return_tensors='pt')

{'input_ids': tensor([[ 101, 2054, 2003, 1996, 3608, 2081, 1997, 1029,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [22]:
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    outtt = model(**inputs,return_dict=True)

    answer_start_scores, answer_end_scores = model(**inputs)['start_logits'],model(**inputs)['end_logits']

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
    
outtt#['last_hidden_state']

BaseModelOutput(last_hidden_state=tensor([[[ 0.3559,  0.6228,  0.0090,  ..., -0.7749,  0.9461,  0.0315],
         [ 0.6609,  1.2548,  0.4564,  ..., -1.0720,  0.4900,  0.0694],
         [ 0.5250,  0.8569,  0.3972,  ..., -0.5089,  0.6350,  0.1445],
         ...,
         [ 0.1561,  0.3075,  0.5793,  ..., -0.9216,  0.4530,  0.3805],
         [ 0.8335,  0.6102,  0.7760,  ..., -0.7382,  0.5135,  0.1184],
         [ 1.2206,  0.8561,  0.8853,  ..., -0.9149,  0.8376, -0.0383]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [27]:
import ast
ast.literal_eval(test_df['answers'][0])['text'][0]

'Ricoh'

In [30]:
F1s = []
for i in range(len(test_df)):
    context = test_df['context'][i]
    question = test_df['question'][i]
    gold_answer = ast.literal_eval(test_df['answers'][0])['text'][0]
    inputs = tokenizer(question, context, return_tensors="pt",truncation=True)
    with torch.no_grad():
        outputs = model(**inputs,return_dict =True)
    # print(outputs)
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()
    # print(answer_start_index,answer_end_index)
    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    pred_answer = tokenizer.decode(predict_answer_tokens)
    f = compute_f1(gold_answer,pred_answer)
    F1s.append(f)
    # break
print("Average F1 score: ",np.mean(F1s))


Average F1 score:  0.00019948134849391582


In [29]:
def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1