In [None]:
!pip uninstall keras
!pip install keras==2.11.0


In [9]:
pip install transformers[torch] -U


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers[torch]
  Obtaining dependency information for transformers[torch] from https://files.pythonhosted.org/packages/75/d5/294a09a62bdd88da9a1007a341d4f8fbfc43be520c101e6afb526000e9f4/transformers-4.46.1-py3-none-any.whl.metadata
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m834.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers[torch])
  Obtaining dependency information for huggingface-hub<1.0,>=0.23.2 from https://files.pythonhosted.org/packages/60/bf

In [1]:
import json
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments, default_data_collator
from datasets import Dataset

with open("spoken_train-v1.1.json", "r") as f:
    data = json.load(f)

# Preparing data for Hugging Face datasets
qa_pairs = []
for article in data["data"]:
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            question = qa["question"]
            answer_text = qa["answers"][0]["text"]
            answer_start = qa["answers"][0]["answer_start"]
            qa_pairs.append({
                "context": context,
                "question": question,
                "answers": {
                    "text": [answer_text],
                    "answer_start": [answer_start]
                }
            })

# converting to Hugging Face Dataset
dataset = Dataset.from_list(qa_pairs)
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# # Initializes tokenizer and model
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

In [2]:
from transformers import BertTokenizerFast, BertForQuestionAnswering

# Initializes the fast tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def preprocess(examples):
    questions = examples["question"]
    contexts = examples["context"]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        if i >= len(examples["answers"]):
            start_positions.append(0)
            end_positions.append(0)
            continue
        
        if len(examples["answers"][i]["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        answer_start = examples["answers"][i]["answer_start"][0]
        answer_text = examples["answers"][i]["text"][0]
        answer_end = answer_start + len(answer_text)

        start_position = end_position = 0
        for j, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                start_position = j
            if start < answer_end <= end:
                end_position = j
                break

        start_positions.append(start_position)
        end_positions.append(end_position)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess, batched=True, remove_columns=test_dataset.column_names)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # learning_rate=3e-5,
    learning_rate=15e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

# Trains the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")


Map:   0%|          | 0/33399 [00:00<?, ? examples/s]

Map:   0%|          | 0/3712 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,2.759,3.16704
2,2.4027,3.067386
3,2.2389,3.138374




('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

## Load model

In [6]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer
import torch
from datasets import Dataset, load_dataset
import json

# Load the trained model and tokenizer
model_path = "model"
tokenizer_path = "tokenizer"

model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)


### Prepares the evaluation data

In [7]:
with open("spoken_test-v1.1.json", "r") as f:
    squad_data = json.load(f)


contexts = []
questions = []
answers = []

for article in squad_data["data"]:
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            question = qa["question"]
            answer = qa["answers"][0]
            contexts.append(context)
            questions.append(question)
            answers.append(answer)

# Converts to Hugging Face Dataset format
dataset_dict = {
    "context": contexts,
    "question": questions,
    "answers": answers
}
dataset = Dataset.from_dict(dataset_dict)


In [8]:
def preprocess_validation(examples):
    questions = examples["question"]
    contexts = examples["context"]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    

    inputs["offset_mapping"] = [
        offset if i < len(inputs["overflow_to_sample_mapping"]) else None
        for i, offset in enumerate(inputs["offset_mapping"])
    ]
    
    return inputs

# Applies the modified preprocessing
tokenized_eval = dataset.map(preprocess_validation, batched=True, remove_columns=dataset.column_names)


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

### Predictions

In [9]:
trainer = Trainer(model=model)

predictions = trainer.predict(tokenized_eval)
start_logits, end_logits = predictions.predictions

def get_predicted_answers(context, offset_mapping, start_logits, end_logits):
    start_index = torch.argmax(torch.tensor(start_logits)).item()
    end_index = torch.argmax(torch.tensor(end_logits)).item()
    
    if start_index < len(offset_mapping) and end_index < len(offset_mapping):
        start_char = offset_mapping[start_index][0]
        end_char = offset_mapping[end_index][1]
        return context[start_char:end_char]
    return ""


for i, example in enumerate(dataset):
    question = example["question"]
    context = example["context"]
    actual_answer = example["answers"]["text"][0]
    
    offset_mapping = tokenized_eval[i]["offset_mapping"]
    
    # gets predicted answer
    predicted_answer = get_predicted_answers(
        context, offset_mapping, start_logits[i], end_logits[i]
    )
    
    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"Actual Answer: {actual_answer}")
    print("\n")


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Question: Which NFL team represented the AFC at Super Bowl 50?
Predicted Answer: 
Actual Answer: d


Question: Which NFL team represented the NFC at Super Bowl 50?
Predicted Answer: carolina panthers
Actual Answer: c


Question: Where did Super Bowl 50 take place?
Predicted Answer: 
Actual Answer: s


Question: Which NFL team won Super Bowl 50?
Predicted Answer: 
Actual Answer: d


Question: What color was used to emphasize the 50th anniversary of the Super Bowl?
Predicted Answer: roman
Actual Answer: g


Question: What was the theme of Super Bowl 50?
Predicted Answer: champion of the national football league nfl
Actual Answer: g


Question: What is the AFC short for?
Predicted Answer: 
Actual Answer: a


Question: What was the theme of Super Bowl 50?
Predicted Answer: champion of the national football league nfl
Actual Answer: g


Question: What does AFC stand for?
Predicted Answer: 
Actual Answer: a


Question: Who won Super Bowl 50?
Predicted Answer: 
Actual Answer: d


Question: Wh

## F1 Score

In [10]:
from sklearn.metrics import f1_score
from collections import Counter

In [11]:
def f1_score_based_on_tokens(predictions, actuals):
    def f1_score_single(pred, actual):
        if pred == "":
            return 0 if actual else 1
        pred_tokens = pred.split()
        actual_tokens = actual.split()
        
        common = Counter(pred_tokens) & Counter(actual_tokens)
        num_same = sum(common.values())
        
        if num_same == 0:
            return 0
        
        precision = num_same / len(pred_tokens)
        recall = num_same / len(actual_tokens)
        
        return (2 * precision * recall) / (precision + recall)
    
    f1_scores = [f1_score_single(pred, actual) for pred, actual in zip(predictions, actuals)]
    return sum(f1_scores) / len(f1_scores)  # Average F1 score


In [12]:
predicted_answers = []
actual_answers = []

for i, example in enumerate(dataset):
    actual_answer = example["answers"]["text"][0] 
    offset_mapping = tokenized_eval[i]["offset_mapping"]
    predicted_answer = get_predicted_answers(
        example["context"], offset_mapping, start_logits[i], end_logits[i]
    )

    predicted_answers.append(predicted_answer)
    actual_answers.append(actual_answer)


In [17]:
f1 = f1_score_based_on_tokens(predicted_answers, actual_answers)
print(f"F1 Score: {f1 * 100:.2f}")


F1 Score: 0.73
