In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

import collections

import numpy as np
import pandas as pd

import evaluate
from transformers import AutoModelForQuestionAnswering
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from datasets import Dataset


In [None]:
!nvidia-smi

torch.cuda.is_available()

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



False

# Set Experiment Parameters

In [None]:
# Index into `expts` of the experiment to run.
expt_id = 0
#expt_id = 3 #use this to run all 3 Roberta experiments
#expt_id = 6 #use this to run all 3 Scibert experiments

# Each experiment is [model/checkpoint id (int), use_full_context (True/False), merge (True/False)].
# NOTE(review): in every third row the use_full_context flag is True while the
# trailing comment says "shortened context". The flag is not read anywhere in
# this cell, so confirm which of the two is intended before relying on it.
expts=[
    [0, True, False], #bert, full context, not merged
    [0, False, False], #bert, shortened context, not merged
    [0, True, True], #bert, shortened context, merged
    [1, True, False], #roberta, full context, not merged
    [1, False, False], #roberta, shortened context, not merged
    [1, True, True], #roberta, shortened context, merged
    [2, True, False], #scibert, full context, not merged
    [2, False, False], #scibert, shortened context, not merged
    [2, True, True], #scibert, shortened context, merged
]
# Hub ids of the pre-trained checkpoints, indexed by model id.
models = ["bert-base-cased", "roberta-base", "allenai/scibert_scivocab_uncased"]
# Names used for the non-merged runs, indexed by model id.
squad_models = ["bert-finetuned-squad", "roberta-finetuned-squad", "allenai/scibert-finetuned-squad"]

curr_expt = expts[expt_id]
model_id = curr_expt[0]

#use_full_context = curr_expt[1]
merge = curr_expt[2]

model_checkpoint = models[model_id]
squad_checkpoint = squad_models[model_id]
# A plain conditional keeps this a str; np.where() on scalars would return a 0-d array.
trained_model_checkpoint = model_checkpoint if merge else squad_checkpoint

print(f'model name:{model_checkpoint}, training model name: {trained_model_checkpoint}, merged dataset: {merge}')

model name:bert-base-cased, training model name: bert-finetuned-squad, use full context: True, merged dataset: False


# Load CovidQA Dataset

In [None]:
# NON-SHORTENED CONTEXT VERSION
# Flatten the nested COVID-QA JSON into one row per question; the context of each
# row is the complete text of the article's first paragraph entry.
df = pd.read_json('https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/question-answering/COVID-QA.json')
#df = pd.read_json('COVID-QA.json')

qa_list = []
for _, article in df.iterrows():
  paragraph = article['data']['paragraphs'][0]
  for qa in paragraph['qas']:
    # SQuAD-style answer record: parallel lists of answer texts and character offsets.
    answer = {
        'text': [candidate['text'] for candidate in qa['answers']],
        'answer_start': [candidate['answer_start'] for candidate in qa['answers']],
    }
    qa_list.append([paragraph['context'], qa['question'], qa['id'], answer])
long_qa_df = pd.DataFrame(qa_list, columns = ['context', 'question', 'id', 'answers'])
long_qa_df

Unnamed: 0,context,question,id,answers
0,Functional Genetic Variants in DC-SIGNR Are As...,What is the main cause of HIV-1 infection in c...,262,{'text': ['Mother-to-child transmission (MTCT)...
1,Functional Genetic Variants in DC-SIGNR Are As...,What plays the crucial role in the Mother to C...,276,{'text': ['DC-SIGNR plays a crucial role in MT...
2,Functional Genetic Variants in DC-SIGNR Are As...,How many children were infected by HIV-1 in 20...,278,"{'text': ['more than 400,000 children were inf..."
3,Functional Genetic Variants in DC-SIGNR Are As...,What is the role of C-C Motif Chemokine Ligand...,316,"{'text': ['High copy numbers of CCL3L1, a pote..."
4,Functional Genetic Variants in DC-SIGNR Are As...,What is DC-GENR and where is it expressed?,305,{'text': ['Dendritic cell-specific ICAM-grabbi...
...,...,...,...,...
2014,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What is the structure of the Ebolavirus?,5315,"{'text': ['single-strand RNA filoviruses'], 'a..."
2015,"Ebola Virus Maintenance: If Not (Only) Bats, W...",When was the West African Ebolavirus outbreak?,5316,"{'text': ['2013-2016'], 'answer_start': [2546]}"
2016,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What animals are considered to be maintenance ...,5317,"{'text': ['African bats'], 'answer_start': [40..."
2017,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What do circles indicate in Figure 1?,5318,{'text': ['a maintenance function play by the ...


## Shortened Context Version

In [None]:
# SHORTENED CONTEXT VERSION
# The context of each question is reduced to the '\n\n'-separated paragraphs that
# contain the text of its first answer; answer_start is re-anchored accordingly.
df = pd.read_json('https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/question-answering/COVID-QA.json')

qa_list = []
for _, article in df.iterrows():
  paragraph = article['data']['paragraphs'][0]
  paragraph_list = paragraph['context'].split('\n\n')
  for qa in paragraph['qas']:
    first_answer = qa['answers'][0]
    ans_text = first_answer['text']
    short_context = '\n\n'.join([chunk for chunk in paragraph_list if ans_text in chunk])

    # Keep the original offset only when no paragraph matched; such rows are dropped below.
    ans_start = first_answer['answer_start']
    if len(short_context) > 0:
      ans_start = short_context.index(str(ans_text))

    qa_list.append([
        short_context,
        qa['question'],
        qa['id'],
        {'text': [ans_text], 'answer_start': [ans_start]},
    ])
new_qa_df = pd.DataFrame(qa_list, columns = ['context', 'question', 'id', 'answers'])
# Discard questions whose answer was not found inside any single paragraph.
has_context = new_qa_df['context'].str.len() > 0
new_qa_df = new_qa_df[has_context]
short_qa_df = new_qa_df
short_qa_df.head()

Unnamed: 0,context,question,id,answers
0,Abstract: BACKGROUND: Mother-to-child transmis...,What is the main cause of HIV-1 infection in c...,262,{'text': ['Mother-to-child transmission (MTCT)...
1,Abstract: BACKGROUND: Mother-to-child transmis...,What plays the crucial role in the Mother to C...,276,{'text': ['DC-SIGNR plays a crucial role in MT...
2,"Text: Without specific interventions, the rate...",How many children were infected by HIV-1 in 20...,278,"{'text': ['more than 400,000 children were inf..."
3,"Beside DC-SIGNR, other HIV-1 receptors are kno...",What is the role of C-C Motif Chemokine Ligand...,316,"{'text': ['High copy numbers of CCL3L1, a pote..."
4,Dendritic cell-specific ICAM-grabbing non-inte...,What is DC-GENR and where is it expressed?,305,{'text': ['Dendritic cell-specific ICAM-grabbi...


In [None]:
len(short_qa_df['id'].unique())

2010

In [None]:
len(short_qa_df)

2010

## Select context for initial model

In [None]:
def split_qa_df(qa_df):
  """Split a QA dataframe into 'train', 'test' and 'validation' datasets.

  70% of the rows become 'train'. The remaining 30% is split again 80/20, the
  larger part becoming 'test' and the smaller part 'validation' (24% and 6% of
  all rows). Both splits use seed 42.

  Args:
    qa_df: pandas DataFrame with one QA example per row.

  Returns:
    A DatasetDict with the keys 'validation', 'test' and 'train'.
  """
  # preserve_index=False: a non-default index (e.g. after dropping rows) would
  # otherwise be stored as an extra '__index_level_0__' column.
  qa_dataset = Dataset.from_pandas(qa_df, preserve_index = False)
  qa_dataset = qa_dataset.train_test_split(train_size = 0.7, seed = 42)
  # Second split works on the held-out 30%; its 'train'/'test' parts are renamed below.
  split_qa_dataset = qa_dataset['test'].train_test_split(train_size = 0.8, seed = 42)
  split_qa_dataset['validation'] = split_qa_dataset.pop('test')
  split_qa_dataset['test'] = split_qa_dataset.pop('train')
  split_qa_dataset['train'] = qa_dataset['train']
  return split_qa_dataset

In [None]:
qa_dataset = split_qa_df(long_qa_df)
qa_dataset

DatasetDict({
    validation: Dataset({
        features: ['context', 'question', 'id', 'answers'],
        num_rows: 122
    })
    test: Dataset({
        features: ['context', 'question', 'id', 'answers'],
        num_rows: 484
    })
    train: Dataset({
        features: ['context', 'question', 'id', 'answers'],
        num_rows: 1413
    })
})

In [None]:
print('Id: ', qa_dataset['train'][0]['id'])
print('Context: ', qa_dataset['train'][0]['context'])
print('Question: ', qa_dataset['train'][0]['question'])
print('Answer: ', qa_dataset['train'][0]['answers'])

Id:  2512
Context:  Chikungunya: A Potentially Emerging Epidemic?

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2860491/

SHA: f7c3160bef4169d29e2a8bdd79dd6e9056d4774c

Authors: Thiboutot, Michelle M.; Kannan, Senthil; Kawalekar, Omkar U.; Shedlock, Devon J.; Khan, Amir S.; Sarangan, Gopalsamy; Srikanth, Padma; Weiner, David B.; Muthumani, Karuppiah
Date: 2010-04-27
DOI: 10.1371/journal.pntd.0000623
License: cc-by

Abstract: Chikungunya virus is a mosquito-borne emerging pathogen that has a major health impact in humans and causes fever disease, headache, rash, nausea, vomiting, myalgia, and arthralgia. Indigenous to tropical Africa, recent large outbreaks have been reported in parts of South East Asia and several of its neighboring islands in 2005–07 and in Europe in 2007. Furthermore, positive cases have been confirmed in the United States in travelers returning from known outbreak areas. Currently, there is no vaccine or antiviral treatment. With the threat of an emerging global pan

# Import Pre-Trained Models

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/vocab.txt
loading file tokenize

In [None]:
max_length = 384
stride = 128

def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach start/end token labels.

    Each question/context pair is encoded with the module-level ``tokenizer``.
    Only the context is truncated; overflowing context is returned as extra
    features (tokenizer ``stride`` = ``stride``), each padded to ``max_length``.

    Args:
        examples: batch with "question", "context" and "answers" columns. Each
            answer is a dict with "text" and "answer_start" lists, of which only
            the first entry is used.

    Returns:
        The tokenizer output without "offset_mapping" and
        "overflow_to_sample_mapping", plus "start_positions" and
        "end_positions" (one value per feature). A feature is labelled (0, 0)
        when the answer is not fully inside its context window, when the
        example has no answer, or when the feature holds no context tokens.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length = max_length,
        truncation = "only_second",
        stride = stride,
        return_overflowing_tokens = True,
        return_offsets_mapping = True,
        padding = "max_length",
    )

    # Each member of offset_mapping is list of (start, stop) tuples representing span in context that tokens correspond to
    # Sample map is list of indices of samples that each feature is mapped to
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # Iterate through features
    # Multiple features can be mapped to one sample due to truncation
    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        # sequence_ids marks context tokens with 1 (question tokens with 0)
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the start and end of the context, never scanning past the last token
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        has_answer = len(answer["answer_start"]) > 0 and len(answer["text"]) > 0
        has_context = context_start <= context_end
        if not (has_answer and has_context):
            # Nothing to locate: use the same (0, 0) label as for out-of-window answers
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of QA examples for evaluation.

    Uses the same tokenizer settings as the training preprocessing. In addition
    to the tokenizer output, every feature gets an "example_id" (the id of the
    example it was cut from), and its "offset_mapping" entries are replaced by
    None for every token that does not belong to the context.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length = max_length,
        truncation = "only_second",
        stride = stride,
        return_overflowing_tokens = True,
        return_offsets_mapping = True,
        padding = "max_length",
    )

    # Index of the source example for each produced feature
    feature_to_sample = encoded.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx in range(len(encoded["input_ids"])):
        example_ids.append(examples["id"][feature_to_sample[feature_idx]])

        segment_ids = encoded.sequence_ids(feature_idx)
        spans = encoded["offset_mapping"][feature_idx]
        # Only context tokens (segment id 1) keep their character span
        masked_spans = []
        for position, span in enumerate(spans):
            masked_spans.append(span if segment_ids[position] == 1 else None)
        encoded["offset_mapping"][feature_idx] = masked_spans

    encoded["example_id"] = example_ids
    return encoded

In [None]:
metric = evaluate.load("squad")

n_best = 20
max_answer_length = 200

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into text predictions and score them with ``metric``.

    For every example, the ``n_best`` highest start and end logits of each of
    its features are combined. The best-scoring span (start logit + end logit)
    that lies in the context, does not end before it starts and covers at most
    ``max_answer_length`` tokens becomes the prediction. An example with no
    valid span gets an empty prediction. Ids are passed to the metric as strings.

    Args:
        start_logits: per-feature start logits, indexed like ``features``.
        end_logits: per-feature end logits, indexed like ``features``.
        features: tokenized features carrying "example_id" and "offset_mapping".
        examples: original examples carrying "id", "context" and "answers".

    Returns:
        The result of ``metric.compute`` for the predictions and references.
    """
    # Map each example id to the positions of the features cut from it
    features_by_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        features_by_example[feature["example_id"]].append(feature_pos)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for feature_pos in features_by_example[example_id]:
            start_logit = start_logits[feature_pos]
            end_logit = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            # Token positions of the n_best largest logits, best first
            top_starts = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                for end_index in top_ends:
                    # Both ends must be context tokens
                    in_context = offsets[start_index] is not None and offsets[end_index] is not None
                    if not in_context:
                        continue
                    # Span length must be positive and at most max_answer_length
                    span_ok = (
                        start_index <= end_index
                        and end_index - start_index + 1 <= max_answer_length
                    )
                    if not span_ok:
                        continue

                    candidates.append({
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })

        # Best-scoring candidate, or an empty prediction when there is none
        if len(candidates) > 0:
            prediction_text = max(candidates, key=lambda c: c["logit_score"])["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": str(example_id), "prediction_text": prediction_text})

    theoretical_answers = [{"id": str(ex["id"]), "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/pytorch_model.b

## Set Training Arguments (full context)

In [None]:
train_dataset = qa_dataset['train'].map(
    preprocess_training_examples,
    batched = True,
    remove_columns = qa_dataset['train'].column_names
    )

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
validation_dataset = qa_dataset["validation"].map(    
    preprocess_validation_examples,
    batched = True,
    remove_columns = qa_dataset["validation"].column_names,
    )

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
print(trained_model_checkpoint)

bert-finetuned-squad


In [None]:
args = TrainingArguments(
    str(trained_model_checkpoint),
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
#print(trainer)
trainer.train()

***** Running training *****
  Num examples = 45436
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 17040
  Number of trainable parameters = 107721218
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: ignored

In [None]:
trainer.save_model('./content/saved_model'+str(expt_id))

Saving model checkpoint to ./content/saved_model0
Configuration saved in ./content/saved_model0/config.json
Model weights saved in ./content/saved_model0/pytorch_model.bin
tokenizer config file saved in ./content/saved_model0/tokenizer_config.json
Special tokens file saved in ./content/saved_model0/special_tokens_map.json


## Results for Non-shortened context

In [None]:
#using normal context
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, qa_dataset["validation"])

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4154
  Batch size = 8


Step,Training Loss


KeyboardInterrupt: ignored

## Set Training Arguments (shortened context)

In [None]:
expt_id += 1

In [None]:
qa_dataset = split_qa_df(short_qa_df)
qa_dataset

DatasetDict({
    validation: Dataset({
        features: ['context', 'question', 'id', 'answers', '__index_level_0__'],
        num_rows: 121
    })
    test: Dataset({
        features: ['context', 'question', 'id', 'answers', '__index_level_0__'],
        num_rows: 482
    })
    train: Dataset({
        features: ['context', 'question', 'id', 'answers', '__index_level_0__'],
        num_rows: 1407
    })
})

In [None]:
train_dataset = qa_dataset['train'].map(
    preprocess_training_examples,
    batched = True,
    remove_columns = qa_dataset['train'].column_names
    )

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
validation_dataset = qa_dataset["validation"].map(    
    preprocess_validation_examples,
    batched = True,
    remove_columns = qa_dataset["validation"].column_names,
    )

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
print(trained_model_checkpoint)

bert-finetuned-squad


In [None]:
args = TrainingArguments(
    str(trained_model_checkpoint),
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Start this experiment from the pre-trained weights again. Without the reload,
# `model` is the object already fine-tuned by the previous experiment's
# trainer.train() call, so the runs would not be independent.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3031
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1137
  Number of trainable parameters = 107721218


Step,Training Loss


KeyboardInterrupt: ignored

In [None]:
trainer.save_model('./content/saved_model'+str(expt_id))

Saving model checkpoint to ./content/saved_model1
Configuration saved in ./content/saved_model1/config.json
Model weights saved in ./content/saved_model1/pytorch_model.bin
tokenizer config file saved in ./content/saved_model1/tokenizer_config.json
Special tokens file saved in ./content/saved_model1/special_tokens_map.json


## Results for shortened context

In [None]:
#using shortened context
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, qa_dataset["validation"])

# Merge SQuAD and CovidQA Datasets
* Upsample the CovidQA training split (sampling with replacement) to the same size as the SQuAD training split, then concatenate the two

## Set Training Arguments (merged dataset)

In [None]:
expt_id += 1

In [None]:
trained_model_checkpoint = model_checkpoint
print(model_checkpoint)
print(trained_model_checkpoint)

bert-base-cased
bert-base-cased


In [None]:
from datasets import load_dataset
from sklearn.utils import resample

def label_dataset(input, label):
    """Return ``{"type": label}``; ``input`` is ignored.

    NOTE(review): not called anywhere in the visible part of this notebook.
    """
    return {"type": label}

# Columns both datasets share. Restricting to them avoids NaN-filled columns in
# the merged frame for fields that only one of the sources has.
qa_columns = ['id', 'context', 'question', 'answers']

squad_dataset = load_dataset("squad")
# Ids are cast to str so that both sources use the same id dtype.
squad_train_df = pd.DataFrame(squad_dataset['train'])[qa_columns].astype({'id': str})
qa_train_df = pd.DataFrame(qa_dataset['train'])[qa_columns].astype({'id': str})

# Merge train
# Upsample: draw CovidQA rows with replacement until there are as many as SQuAD rows
qa_train_upsample = resample(qa_train_df,
            replace=True,
            n_samples=len(squad_train_df),
            random_state=42)
merged_train = pd.concat([squad_train_df,qa_train_upsample],ignore_index=True)
merged_train_dataset =  Dataset.from_pandas(merged_train)
# # Merge validation
# # Upsample
#  qa_validation_upsample = resample(qa_validation_df,
#              replace=True,
#              n_samples=len(squad_validation_df),
#              random_state=42)

#  merged_validation = pd.concat([squad_validation_df,qa_validation_upsample],ignore_index=True)
#  merged_validation_dataset = Dataset.from_pandas(merged_validation)
#else:
#  merged_train_dataset = qa_dataset['train']
#  merged_validation_dataset = qa_dataset['validation']



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
merged_validation_dataset = qa_dataset['validation']

In [None]:
print(merged_train_dataset.shape)
print(merged_validation_dataset.shape)

(175198, 6)
(121, 5)


In [None]:
len(set(merged_validation_dataset['id']))

121

In [None]:
train_dataset = merged_train_dataset.map(
    preprocess_training_examples,
    batched = True,
    remove_columns = merged_train_dataset.column_names
)

  0%|          | 0/176 [00:00<?, ?ba/s]

In [None]:
validation_dataset = merged_validation_dataset.map(
    preprocess_validation_examples,
    batched = True,
    remove_columns = merged_validation_dataset.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
print(trained_model_checkpoint)

bert-base-cased


In [None]:
args = TrainingArguments(
    str(trained_model_checkpoint),
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Start this experiment from the pre-trained weights again. Without the reload,
# `model` is the object already fine-tuned by the earlier experiments'
# trainer.train() calls, so the merged run would not start from `model_checkpoint`.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 277938
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104229
  Number of trainable parameters = 107721218
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: ignored

In [None]:
trainer.save_model('./content/saved_model'+str(expt_id))

Saving model checkpoint to ./content/saved_model2
Configuration saved in ./content/saved_model2/config.json
Model weights saved in ./content/saved_model2/pytorch_model.bin
tokenizer config file saved in ./content/saved_model2/tokenizer_config.json
Special tokens file saved in ./content/saved_model2/special_tokens_map.json


## Results for merged shortened context

In [None]:
#using merged shortened context
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, qa_dataset["validation"])

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 279
  Batch size = 8


Step,Training Loss


  0%|          | 0/121 [00:00<?, ?it/s]

{'exact_match': 0.0, 'f1': 14.188800610327043}