In [55]:
import pandas as pd
import transformers
from evaluate import *

from overlap_evaluate import (
    _print_score,
    get_scores,
    read_references,
    read_annotations,
    ANNOTATIONS,
)
from sklearn.model_selection import train_test_split

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch

import ast

In [56]:
dataset = "nq"
data_df = pd.read_csv(f"data/{dataset}-test-ctxs.qa.csv")
annotations = read_annotations(f"data/{dataset}-annotations.jsonl")
annotation_df = pd.DataFrame(annotations)

In [57]:
data_df = data_df.drop('ctxs', axis=1)
data_df = data_df.drop('question', axis=1)
data_df['labels'] = annotation_df['labels']

In [58]:
data_df["answers"] = data_df["answers"].apply(
            lambda x: ast.literal_eval(x)
        )

In [60]:
data_df.iloc[0]['answers']

['Wilhelm Conrad Röntgen']

In [63]:
# ANSWER and NO ANSWER OVERLAP
results = []
for index, row in data_df.iterrows():
    overlap = None
    # print(row["labels"])
    if "no_answer_overlap" in row['labels']:
        overlap = 0
    elif "answer_overlap" in row['labels']:
        overlap = 1

    if overlap is not None:
        # for answer in row["answers"]:
        results.append({
            "answer": row['answers'][0],
            "overlap": overlap
        })

In [65]:
data_df = pd.DataFrame(results)

In [66]:
data_df['overlap'].value_counts()

1    2297
0    1313
Name: overlap, dtype: int64

In [67]:
train_df, test_df = train_test_split(data_df, train_size=.8, random_state=42)

In [68]:
train_df['overlap'].value_counts()

1    1830
0    1058
Name: overlap, dtype: int64

In [69]:
test_df['overlap'].value_counts()

1    467
0    255
Name: overlap, dtype: int64

In [70]:
train_df = pd.concat([train_df, train_df.loc[train_df['overlap'] == 0].sample(n=772, random_state=42, replace=True)])

In [71]:
train_df['overlap'].value_counts()

1    1830
0    1830
Name: overlap, dtype: int64

In [79]:
test_df['overlap'].value_counts()

1    467
0    255
Name: overlap, dtype: int64

In [74]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [81]:
train_encodings = tokenizer(list(train_df['answer']), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(list(test_df['answer']), padding=True, truncation=True, return_tensors="pt")

In [82]:
class OverlapDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [83]:
train_dataset = OverlapDataset(train_encodings, list(train_df['overlap']))
test_dataset = OverlapDataset(test_encodings, list(test_df['overlap']))

In [84]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args = TrainingArguments("saved_models/overlap_bert_answers", num_train_epochs=2)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
start = time.time()
trainer.train()
print(f"{time.time() - start} seconds")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss
500,0.5624


Saving model checkpoint to saved_models/overlap_bert_answers/checkpoint-500
Configuration saved in saved_models/overlap_bert_answers/checkpoint-500/config.json
Model weights saved in saved_models/overlap_bert_answers/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




2026.3341250419617 seconds


In [89]:
import numpy as np
def bert_predict(model, text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    result = model(**inputs)
    return np.argmax(result.logits.detach().numpy())

bert_predict(model, "Are you an egg?")

0

In [92]:
test_df.shape

(722, 2)

In [90]:
predictions = []
for i in range(len(test_df)):
    predictions.append(bert_predict(model, test_df.iloc[i]['answer']))



In [91]:
from sklearn.metrics import classification_report
print(classification_report(list(test_df['overlap']), predictions, digits=4))

              precision    recall  f1-score   support

           0     0.6208    0.6549    0.6374       255
           1     0.8057    0.7816    0.7935       467

    accuracy                         0.7368       722
   macro avg     0.7133    0.7182    0.7154       722
weighted avg     0.7404    0.7368    0.7384       722



In [88]:
model.save_pretrained("saved_models/bert_answer_overlap_classifier")

Configuration saved in saved_models/bert_answer_overlap_classifier/config.json
Model weights saved in saved_models/bert_answer_overlap_classifier/pytorch_model.bin
