In [1]:
!pip install transformers==4.28.0
import os
import torch
from datasets import load_dataset
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering


Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manyl

Loading the NewsQA dataset

In [2]:
import pandas as pd
splits = {'train': 'data/train-00000-of-00001-ec54fbe500fc3b5c.parquet', 'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'}
df = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["train"])


In [3]:
df.head()

Unnamed: 0,context,question,answers,key,labels
0,"NEW DELHI, India (CNN) -- A high court in nort...",What was the amount of children murdered?,[19],da0e6b66e04d439fa1ba23c32de07e50,"[{'end': [295], 'start': [294]}]"
1,"NEW DELHI, India (CNN) -- A high court in nort...",When was Pandher sentenced to death?,[February.],724f6eb9a2814e4fb2d7d8e4de846073,"[{'end': [269], 'start': [261]}]"
2,"NEW DELHI, India (CNN) -- A high court in nort...",The court aquitted Moninder Singh Pandher of w...,[rape and murder],d64cbb90e5134081acfa83d3e702408c,"[{'end': [638], 'start': [624]}]"
3,"NEW DELHI, India (CNN) -- A high court in nort...",who was acquitted,[Moninder Singh Pandher],fd7177ee6f1f4d62becd983a0305f503,"[{'end': [216], 'start': [195]}]"
4,"NEW DELHI, India (CNN) -- A high court in nort...",who was sentenced,[Moninder Singh Pandher],cd25c69f631349748ccdeccaace66463,"[{'end': [216], 'start': [195]}]"


In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict

splits = {
    'train': 'data/train-00000-of-00001-ec54fbe500fc3b5c.parquet',
    'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'
}
df_train = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["train"])
df_val   = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["validation"])
df_train = df_train

def df_to_flat_dataset(df):
    flat = []
    for _, row in df.iterrows():
        context = str(row["context"])
        question = str(row["question"])
        answer_text = str(row["answers"][0])
        answer_start = int(row["labels"][0]["start"][0])
        qid = str(row["key"])
        flat.append({
            "context": context,
            "question": question,
            "answers": [{"text": answer_text, "answer_start": answer_start}],
            "id": qid
        })
    return flat

train_dataset = Dataset.from_list(df_to_flat_dataset(df_train))
val_dataset   = Dataset.from_list(df_to_flat_dataset(df_val))
raw_datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

print(raw_datasets)
print(raw_datasets["train"][0])


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 74160
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 4212
    })
})
{'context': 'NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."\n\n\n\nMoninder Singh Pandher was sentenced to death by a lower court in February.\n\n\n\nThe teen was one of 19 victims -- children and young women -- in one of the most gruesome serial killings in India in recent years.\n\n\n\nThe Allahabad high court has acquitted Moninder Singh Pandher, his lawyer Sikandar B. Kochar told CNN.\n\n\n\nPandher and his domestic employee Surinder Koli were sentenced to death in February by a lower court for the rape and murder of the 14-year-old.\n\n\n\nThe high court upheld Koli\'s death sentence, Kochar said.\

In [5]:
from transformers import RobertaTokenizerFast

MODEL_NAME = "deepset/roberta-base-squad2"
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

MAX_LENGTH = 256
DOC_STRIDE = 64
NUM_PROC = None 

def prepare_features(examples):
    tokenized_list = {
        "input_ids": [],
        "attention_mask": [],
        "start_positions": [],
        "end_positions": []
    }

    for i in range(len(examples["context"])):
        context = examples["context"][i]
        question = examples["question"][i]
        answer = examples["answers"][i][0]
        answer_text = answer["text"]
        answer_start = answer["answer_start"]
        answer_end = answer_start + len(answer_text)

        encodings = tokenizer(
            question,
            context,
            truncation="only_second",
            max_length=MAX_LENGTH,
            stride=DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length"
        )

        overflow_sample_mapping = encodings.pop("overflow_to_sample_mapping")
        offset_mapping = encodings.pop("offset_mapping")

        for j, offsets in enumerate(offset_mapping):
            input_ids = encodings["input_ids"][j]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            start_token, end_token = cls_index, cls_index
            for idx, (start_off, end_off) in enumerate(offsets):
                if start_off <= answer_start < end_off:
                    start_token = idx
                if start_off < answer_end <= end_off:
                    end_token = idx

            tokenized_list["input_ids"].append(input_ids)
            tokenized_list["attention_mask"].append(encodings["attention_mask"][j])
            tokenized_list["start_positions"].append(start_token)
            tokenized_list["end_positions"].append(end_token)

    return tokenized_list




tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [6]:

tokenized_datasets = raw_datasets.map(
    prepare_features,
    batched=True,
    num_proc=NUM_PROC, 
    remove_columns=raw_datasets["train"].column_names
)

print(tokenized_datasets)
print(tokenized_datasets["train"][0])


Map:   0%|          | 0/74160 [00:00<?, ? examples/s]

Map:   0%|          | 0/4212 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 287260
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 16232
    })
})
{'input_ids': [0, 2264, 21, 5, 1280, 9, 408, 9694, 116, 2, 2, 5341, 16286, 15473, 6, 666, 36, 16256, 43, 480, 83, 239, 461, 11, 3285, 666, 15, 273, 17871, 10, 8581, 8950, 2114, 5, 744, 3645, 13, 5, 2429, 9, 10, 6066, 11, 10, 403, 9260, 22, 627, 790, 9, 30178, 72, 50140, 50118, 50118, 17312, 7026, 3657, 13163, 1843, 21, 4018, 7, 744, 30, 10, 795, 461, 11, 902, 4, 50140, 50118, 50118, 133, 6066, 21, 65, 9, 753, 1680, 480, 408, 8, 664, 390, 480, 11, 65, 9, 5, 144, 25988, 13603, 8798, 11, 666, 11, 485, 107, 4, 50140, 50118, 50118, 133, 20788, 7826, 239, 461, 34, 17871, 3385, 7026, 3657, 13163, 1843, 6, 39, 2470, 17209, 463, 271, 163, 4, 14296, 271, 174, 3480, 4, 50140, 50118, 50118, 45741, 1843

In [7]:
import torch
from transformers import RobertaForQuestionAnswering

model = RobertaForQuestionAnswering.from_pretrained(MODEL_NAME)


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [10]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./newsqa_fast",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=0.3,
    weight_decay=0.01,
    fp16=True,
    logging_steps=10000,
    report_to="none",
    disable_tqdm=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)


In [11]:
trainer.train()

{'loss': 0.8558, 'learning_rate': 2.1776663881927038e-06, 'epoch': 0.28}
{'eval_loss': 0.800955593585968, 'eval_runtime': 129.5208, 'eval_samples_per_second': 125.323, 'eval_steps_per_second': 15.665, 'epoch': 0.3}
{'train_runtime': 2514.5087, 'train_samples_per_second': 34.272, 'train_steps_per_second': 4.284, 'train_loss': 0.8535968409068824, 'epoch': 0.3}


TrainOutput(global_step=10773, training_loss=0.8535968409068824, metrics={'train_runtime': 2514.5087, 'train_samples_per_second': 34.272, 'train_steps_per_second': 4.284, 'train_loss': 0.8535968409068824, 'epoch': 0.3})

In [12]:
trainer.save_model("./newsqa_roberta_final")  # saves model + config
tokenizer.save_pretrained("./newsqa_roberta_final")

('./newsqa_roberta_final/tokenizer_config.json',
 './newsqa_roberta_final/special_tokens_map.json',
 './newsqa_roberta_final/vocab.json',
 './newsqa_roberta_final/merges.txt',
 './newsqa_roberta_final/added_tokens.json',
 './newsqa_roberta_final/tokenizer.json')

In [13]:
import torch

def answer_question(model, tokenizer, question, context, max_len=256):
    model.eval()
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=max_len
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits)
    end_idx   = torch.argmax(outputs.end_logits)

    answer_ids = inputs["input_ids"][0][start_idx:end_idx+1]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
    return answer


In [15]:
context = """The Amazon rainforest is often called the "lungs of the Earth" because it produces a large portion of the planet’s oxygen. However, deforestation has caused a significant decrease in its size over the past decades."""

question = "Why is the Amazon rainforest called the lungs of the Earth?"

print(answer_question(model, tokenizer, question, context))


 it produces a large portion of the planet’s oxygen.
