In [99]:
import pandas as pd
import transformers
from evaluate import *

from overlap_evaluate import (
    _print_score,
    get_scores,
    read_references,
    read_annotations,
    ANNOTATIONS,
)
from sklearn.model_selection import train_test_split

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch

In [100]:
dataset = "nq"
data_df = pd.read_csv(f"data/{dataset}-test-ctxs.qa.csv")
annotations = read_annotations(f"data/{dataset}-annotations.jsonl")
annotation_df = pd.DataFrame(annotations)

In [101]:
data_df = data_df.drop('ctxs', axis=1)
data_df = data_df.drop('answers', axis=1)
data_df['labels'] = annotation_df['labels']

In [102]:
data_df.head()

Unnamed: 0,question,labels
0,who got the first nobel prize in physics,"[total, answer_overlap]"
1,when is the next deadpool movie being released,"[total, answer_overlap]"
2,which mode is used for short wave broadcast se...,"[total, answer_overlap]"
3,the south west wind blows across nigeria between,"[total, no_answer_overlap]"
4,what does hp mean in war and order,"[total, no_answer_overlap]"


In [103]:
# ONLY QUESTION OVERLAP
results = []
for index, row in data_df.iterrows():
    overlap = None
    # print(row["labels"])
    if "question_overlap" in row['labels']:
        overlap = 1
    elif "no_question_overlap" in row['labels']:
        overlap = 0

    if overlap is not None:
        results.append({
            "question": row["question"],
            "overlap": overlap
        })

In [104]:
data_df = pd.DataFrame(results)

In [105]:
data_df['overlap'].value_counts()

0    672
1    324
Name: overlap, dtype: int64

In [106]:
train_df, test_df = train_test_split(data_df, train_size=.8, random_state=42)

In [107]:
train_df['overlap'].value_counts()

0    536
1    260
Name: overlap, dtype: int64

In [108]:
test_df['overlap'].value_counts()

0    136
1     64
Name: overlap, dtype: int64

In [120]:
train_df = pd.concat([train_df, train_df.loc[train_df['overlap'] == 1].sample(n=276, random_state=42, replace=True)])

In [121]:
train_df['overlap'].value_counts()

0    536
1    536
Name: overlap, dtype: int64

In [122]:
train_df.head()

Unnamed: 0,question,overlap
626,the temperature of gas is proportional to the,0
307,who missed the plane the day the music died,1
662,where did the allies go after north africa,0
394,who said i will not go quietly into the night,1
644,when does season 5 of ruby come out,0


In [125]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/calebkumar/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncase

In [126]:
train_encodings = tokenizer(list(train_df['question']), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(list(test_df['question']), padding=True, truncation=True, return_tensors="pt")

In [127]:
class OverlapDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [128]:
train_dataset = OverlapDataset(train_encodings, list(train_df['overlap']))
test_dataset = OverlapDataset(test_encodings, list(test_df['overlap']))

In [129]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args = TrainingArguments("saved_models/overlap_bert_2", num_train_epochs=2)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
start = time.time()
trainer.train()
print(f"{time.time() - start} seconds")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/calebkumar/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-bas

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




640.0495181083679 seconds


In [130]:
import numpy as np
def bert_predict(model, text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    result = model(**inputs)
    return np.argmax(result.logits.detach().numpy())

bert_predict(model, "Are you an egg?")

0

In [131]:
predictions = []
for i in range(len(test_df)):
    predictions.append(bert_predict(model, test_df.iloc[i]['question']))



In [132]:
from sklearn.metrics import classification_report
print(classification_report(list(test_df['overlap']), predictions, digits=4))

              precision    recall  f1-score   support

           0     0.7845    0.6691    0.7222       136
           1     0.4643    0.6094    0.5270        64

    accuracy                         0.6500       200
   macro avg     0.6244    0.6392    0.6246       200
weighted avg     0.6820    0.6500    0.6598       200



In [133]:
model.save_pretrained("saved_models/bert_question_overlap_classifier_2")

Configuration saved in saved_models/bert_question_overlap_classifier_2/config.json
Model weights saved in saved_models/bert_question_overlap_classifier_2/pytorch_model.bin
