# ABOUT:
- this notebook further trains an already fine-tuned Question Answering model to detect negation spans
    - e.g. "It has a well thought out design and it is clean but it certainly feels a bit hollow and fake."
    - the model is trained to predict "but" as the negation span
    - e.g. "I love food"
    - the model predicts no span when no negation is found

In [1]:
# Pretrained extractive-QA checkpoint; loaded below by both AutoTokenizer and AutoModelForQuestionAnswering.
model_checkpoint  = "deepset/roberta-base-squad2"
# Length (in tokens, including special tokens and padding) of every feature; passed as max_length
# to the tokenizer inside prepare_train_features.
max_length = 30
# Used as both per_device_train_batch_size and per_device_eval_batch_size in TrainingArguments.
batch_size = 10

#### import data

In [2]:
import json

# NOTE(review): absolute, machine-specific location of the SQuAD-format annotation file;
# point this at your own copy of answers.json before running.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 3\Sem 1\CZ4045 Natural Language Processing\Assignment 1\local\data\answers.json"
# Open in binary mode (as read_squad does) so json.load decodes the bytes itself instead of
# relying on the platform's locale-dependent default text encoding.
with open(path, 'rb') as f:
    data = json.load(f)

In [3]:
import json
from pathlib import Path
def read_squad(path):
    """Flatten a SQuAD-format JSON file into four parallel lists.

    Args:
        path: location of the JSON file (str or Path). The file must have the
            layout ``data -> paragraphs -> qas``.

    Returns:
        A tuple ``(contexts, questions, answers, is_impossible)`` of equally
        long lists:
          * an unanswerable question contributes one row whose answer is
            ``{'text': [], 'answer_start': []}``;
          * an answerable question contributes one row per listed answer, with
            that answer's ``'text'`` and ``'answer_start'`` each wrapped in a
            one-element list and any other keys carried over unchanged.

    Raises:
        KeyError: if a required key (``data``, ``paragraphs``, ``context``,
            ``qas``, ``question``, ``answers``, ``text``, ``answer_start``)
            is missing.
    """
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    is_impossible = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # SQuAD v1.1 files have no 'is_impossible' flag; a missing flag
                # means the question is answerable.
                impossible = qa.get('is_impossible', False)
                if impossible:
                    contexts.append(context)
                    questions.append(question)
                    answers.append({'text': [], 'answer_start': []})
                    is_impossible.append(impossible)
                    continue
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    # Build a new dict rather than rewriting the parsed JSON in
                    # place; existing keys keep their original order.
                    answers.append({
                        **answer,
                        'answer_start': [answer['answer_start']],
                        'text': [answer['text']],
                    })
                    is_impossible.append(impossible)
    return contexts, questions, answers, is_impossible

In [4]:
from datasets import Dataset
import pandas as pd
# read_squad returns four parallel lists; name them as columns, go through a
# DataFrame, and convert that into a Hugging Face Dataset.
squad_columns = ["context", "question", "answers", "is_impossible"]
squad_frame = pd.DataFrame(dict(zip(squad_columns, read_squad(path))))
dataset = Dataset.from_pandas(squad_frame)
dataset

Dataset({
    features: ['context', 'question', 'answers', 'is_impossible'],
    num_rows: 466
})

#### some helper functions:
- read_squad: reads squad format data
- show_random_elements: shows random samples from a Dataset object
- prepare_train_features: 
    - tokenizes dataset 
    - prepares data as a Question Answering dataset

In [5]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples = 10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: number of rows to show; must not exceed ``len(dataset)``.

    Returns:
        None. The table is rendered through IPython's ``display``.

    Raises:
        AssertionError: if more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call, replacing the
    # draw-and-retry-on-duplicate loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    # Show class names instead of integer ids for (sequences of) ClassLabel columns.
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [6]:
def prepare_train_features(examples):
    """Tokenize question/context pairs and attach start/end token labels for QA training.

    Reads the notebook-level globals `tokenizer` and `max_length`; they are not
    parameters of this function.

    Args:
        examples: column-indexable object with "question", "context" and
            "answers" entries, where each answer exposes "answer_start" and
            "text" lists (presumably the rows built from read_squad; confirm
            against the caller).

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and with "start_positions" / "end_positions"
        added, one entry per feature. A feature whose example has no answer, or
        whose window does not fully contain the answer, gets the CLS token
        index for both positions.
    """
    # Tokenize with truncation and padding to max_length, keeping the overflowing tokens so that one
    # example with a long context can yield several features.
    # NOTE(review): stride is commented out below, so the tokenizer's default stride applies (presumably
    # no overlap between consecutive windows - confirm); an answer cut by a window boundary is then
    # labeled with the CLS index by the "out of the span" check further down.
    pad_on_right = tokenizer.padding_side == "right"
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        # stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",                                                                                    
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (only the first listed answer is used).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text:
            # scan forward to the first token that belongs to the context sequence.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text:
            # scan backward to the last token that belongs to the context sequence.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # Each loop overshoots by one token, hence the -1 / +1 when the label is stored.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

#### original dataset in SQUAD format

In [7]:
# Preview five random raw (untokenized) SQuAD-format rows.
show_random_elements(dataset, num_examples=5)

Unnamed: 0,context,question,answers,is_impossible
0,I should also say that even though the sale was done about two months ago... Darlene still had her hands in everything helping to ensure we were taken care of!!,negation,"{'answer_category': None, 'answer_id': None, 'answer_start': [], 'document_id': None, 'question_id': None, 'text': []}",True
1,It has a well thought out design and it is clean but it certainly feels a bit hollow and fake.,negation,"{'answer_category': None, 'answer_id': 191675, 'answer_start': [49], 'document_id': 245424, 'question_id': 124928, 'text': ['but']}",False
2,"If these problems remain unacknowledged and unaddressed, the country may lose its predominance and endanger its security.\tCopy \n",negation,"{'answer_category': None, 'answer_id': 192580, 'answer_start': [25], 'document_id': 260367, 'question_id': 124928, 'text': ['unacknowledged']}",False
3,"So pretty much at the cash out, the server was debating why I didn't follow up with him and in my mind I was like ""uhhh, I know your dirty secret of how you cover things up so..."" pretty much it confirmed a lot of my experiences before at this location.",negation,"{'answer_category': None, 'answer_id': 191663, 'answer_start': [62], 'document_id': 245412, 'question_id': 124928, 'text': ['didn't']}",False
4,"Impulsiveness, impatience, senseless rebellion, and extravagance are the traits that so often undermine their work and dreams.\tCopy \n",negation,"{'answer_category': None, 'answer_id': 192654, 'answer_start': [0], 'document_id': 260432, 'question_id': 124928, 'text': ['Impulsiveness']}",False


## AutoTokenizer
- tokenizes sentences into subwords/words

In [8]:
from transformers import AutoTokenizer
# Tokenizer for model_checkpoint; prepare_train_features reads this notebook-level name.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Turn every raw example into fixed-length QA features, then hold out 20% of the
# features (fixed seed) for evaluation.
# NOTE(review): this rebinds `dataset` from the raw rows to the train/test split, so the
# cell cannot be re-run without first re-running the cell that builds the raw dataset.
# NOTE(review): the split happens after windowing, so features cut from the same sentence
# can land in both train and test - TODO confirm whether the raw examples should be split first.
qa_features = Dataset.from_dict(prepare_train_features(dataset))
dataset = qa_features.train_test_split(test_size=0.2, seed=0)
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 480
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 120
    })
})

In [10]:
# Re-display the train/test split produced by the previous cell.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 480
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 120
    })
})

#### Question Answering dataset
- the model takes "input_ids" as input and predicts "start_positions" and "end_positions"
- if there is no negation, the model must predict token index 0 (the `<s>` token that starts every input) for both positions

In [11]:
# Preview five random tokenized training features together with their labels.
show_random_elements(dataset["train"], num_examples=5)

Unnamed: 0,input_ids,attention_mask,start_positions,end_positions
0,"[0, 23156, 1258, 2, 2, 1106, 47, 214, 2600, 42, 47, 214, 1153, 608, 5, 6089, 276, 631, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0
1,"[0, 23156, 1258, 2, 2, 9497, 33, 36939, 196, 4905, 8, 156, 33424, 1635, 14, 32, 12030, 4, 50117, 48233, 1437, 50118, 2, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]",7,8
2,"[0, 23156, 1258, 2, 2, 170, 33, 7154, 450, 143, 3250, 101, 2431, 4, 50118, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7,7
3,"[0, 23156, 1258, 2, 2, 18, 77, 38, 554, 21252, 5, 647, 165, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0
4,"[0, 23156, 1258, 2, 2, 100, 269, 1034, 5, 813, 8, 1945, 1516, 1058, 11, 2111, 544, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0


#### Format of input
- the question asked to the model is "negation" 
- the user's input sentence (that contains the negation) is concatenated thereafter

In [12]:
# Decode one hard-coded list of input ids back to text to inspect how the question and the
# sentence are laid out around the special tokens.
tokenizer.decode([0, 23156, 1258, 2, 2, 19, 5, 10228, 142, 3255, 32, 6, 51, 1153, 18774, 7, 10064, 24, 11, 8, 51, 40, 45, 190, 1137, 47, 14, 4, 2, 1])

'<s>negation</s></s> with the server because chances are, they probably forgot to punch it in and they will not even tell you that.</s><pad>'

## AutoModelForQuestionAnswering
- model architecture:
    - BERT/BERT variants
    - 2 classifier heads:
        1. predict start token classifier
        2. predict end token classifier
    

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
# Load model_checkpoint as a question-answering model (start/end token heads, as outlined in the notes above).
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [14]:
# Training configuration. Evaluation, logging and checkpointing all happen once per epoch
# (evaluating every N steps via eval_steps would be the alternative), and the best
# checkpoint is reloaded when training finishes.
args = TrainingArguments(
    output_dir="test-squad",
    # schedule
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.002,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # reproducibility
    seed=0,
)

In [15]:
from transformers import default_data_collator
# Use the library's default collator; every feature is already padded to max_length by prepare_train_features.
data_collator = default_data_collator

## Trainer
- executes the training

In [16]:
# Wire the model, training configuration, train/test splits, collator and tokenizer together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

In [17]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Epoch,Training Loss,Validation Loss
1,1.3057,0.802366
2,0.5757,0.727961
3,0.3434,0.758608
4,0.2068,0.823224
5,0.1559,0.803756


TrainOutput(global_step=240, training_loss=0.5175093213717142, metrics={'train_runtime': 10399.1977, 'train_samples_per_second': 0.023, 'total_flos': 0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 16035840, 'init_mem_cpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 2048618496, 'train_mem_cpu_peaked_delta': 0})