In [1]:
import json
import numpy as np
import pandas as pd
import re
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pathlib import Path

In [2]:
# Sanity check: report whether PyTorch can see a CUDA device. The inference
# cell below picks "cuda:0" when this is True and falls back to "cpu" otherwise.
torch.cuda.is_available()

True

In [3]:
# Read the JSON-lines results file (one record per line) and keep only the rows
# whose gold label is not the '-' placeholder.
# NOTE(review): '-' presumably marks examples without a consensus gold label — confirm.
df = (
    pd.read_json(
        "./results/multinli_transforms_bertscore_dev_matched.jsonl",
        lines=True,
    )
    .loc[lambda frame: frame['gold_label'].ne('-')]
)

In [4]:
# Classify every (sentence1, sentence2) pair with a pretrained NLI cross-encoder.
# For each side of the pair, the transform with the strictly higher BERTScore is
# used; on a tie (or a NaN score) transform2 is used.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
nli_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-base').to(device)
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-base')

# Number of sentence pairs tokenized and scored per forward pass.
BATCH_SIZE = 100

# Class names indexed by the model's output logit position.
# NOTE(review): this order must match the checkpoint's id2label — confirm against
# the model card before swapping in a different checkpoint.
label_mapping = ['contradiction', 'entailment', 'neutral']

# Pick the better-scoring transform for the whole frame in one vectorized pass
# instead of re-materializing `list(df.iterrows())` for every batch.
premises = df['sentence1_transform1'].where(
    df['sentence1_transform1_bertscore'] > df['sentence1_transform2_bertscore'],
    df['sentence1_transform2'],
).tolist()
hypotheses = df['sentence2_transform1'].where(
    df['sentence2_transform1_bertscore'] > df['sentence2_transform2_bertscore'],
    df['sentence2_transform2'],
).tolist()
all_pairs = [[premise, hypothesis] for premise, hypothesis in zip(premises, hypotheses)]

# Inference mode only needs to be set once, not once per batch.
nli_model.eval()

labels = []

for start in range(0, len(all_pairs), BATCH_SIZE):
    sentence_pairs = all_pairs[start:start + BATCH_SIZE]

    # Pad to the longest pair in the batch; truncate pairs that exceed the
    # tokenizer's maximum length.
    features = tokenizer(sentence_pairs, padding=True, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        scores = nli_model(**features).logits

    # .tolist() moves the predicted class ids to Python ints in a single
    # transfer rather than indexing the list with one tensor element at a time.
    labels.extend(label_mapping[class_id] for class_id in scores.argmax(dim=1).tolist())

In [5]:
# Gold labels as a plain Python list, in the same row order as `df`.
gold_labels = list(df['gold_label'])

In [6]:
# Fraction of predictions that exactly match the gold label.
# zip() would silently truncate to the shorter list, so fail loudly if the
# predictions and gold labels are out of sync (e.g. after a partial re-run of
# the inference cell).
assert len(labels) == len(gold_labels), (
    f"got {len(labels)} predictions for {len(gold_labels)} gold labels"
)
accuracy = sum(prediction == gold_label for prediction, gold_label in zip(labels, gold_labels)) / len(labels)

In [7]:
# Show the accuracy: the fraction of predicted labels that equal the gold labels.
print(accuracy)

0.7374426897605706
