# ImpPres Baseline

This notebook uses the `MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli` model as a baseline for natural language inference (NLI) on the presupposition sections of the ImpPres dataset, and reports classification metrics for each section.

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Pick the fastest available backend: CUDA GPU, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
# torch.backends.mps.is_available() is the documented availability check;
# torch.mps.is_available() is missing from older PyTorch releases and would
# raise AttributeError there on machines without CUDA.
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hugging Face Hub checkpoint id; per its name, a DeBERTa-v3-base model tuned on MNLI, FEVER and ANLI.
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [2]:
# Move the model to the device selected in the first cell.
model = model.to(device)

In [3]:
# Class names in logit order.
# NOTE(review): assumes the checkpoint's class indices follow this order —
# confirm against model.config.id2label.
label_names = ["entailment", "neutral", "contradiction"]


def evaluate(premise, hypothesis):
    """Score a premise/hypothesis pair with the NLI model.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence to test against the premise.

    Returns:
        dict mapping each name in ``label_names`` to the softmax probability
        of that class, expressed as a percentage rounded to one decimal place.
    """
    # Keep the full encoding and move every tensor in it to the model's
    # device, instead of forwarding input_ids alone.
    encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    # Inference only: no_grad avoids building an autograd graph on every call.
    with torch.no_grad():
        output = model(**encoded)
    # A single pair was encoded, so row 0 holds its logits.
    probabilities = torch.softmax(output["logits"][0], -1).tolist()
    return {name: round(float(prob) * 100, 1) for prob, name in zip(probabilities, label_names)}

In [4]:
# Smoke test on a single premise/hypothesis pair.
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [5]:
def get_prediction(pred_dict):
    """Return the label with the highest score in ``pred_dict``.

    When several labels share the top score, the one that appears first in
    the dictionary is returned.
    """
    best_label, _ = max(pred_dict.items(), key=lambda item: item[1])
    return best_label

## Load ImpPres Dataset

In [6]:
from datasets import load_dataset

# ImpPres configuration names to load, one per presupposition section.
sections = [
    'presupposition_all_n_presupposition',
    'presupposition_both_presupposition',
    'presupposition_change_of_state',
    'presupposition_cleft_existence',
    'presupposition_cleft_uniqueness',
    'presupposition_only_presupposition',
    'presupposition_possessed_definites_existence',
    'presupposition_possessed_definites_uniqueness',
    'presupposition_question_presupposition',
]


def _load_section(name):
    """Announce and load a single ImpPres configuration."""
    print(f"Loading dataset for section: {name}")
    return load_dataset("facebook/imppres", name)


# Map each configuration name to its loaded dataset, in the order listed above.
dataset = {name: _load_section(name) for name in sections}


Loading dataset for section: presupposition_all_n_presupposition


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [7]:
# Inspect what was loaded for each section.
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [8]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset, section=None):
    """Run the NLI model over every example and collect the predictions.

    Args:
        dataset: Iterable of mapping-like examples with 'premise',
            'hypothesis' and an integer 'gold_label'. A per-example 'section'
            field is used when present.
        section: Section name recorded for examples that have no 'section'
            field of their own, so a single-section dataset can be evaluated
            directly. Defaults to None.

    Returns:
        list of dicts, one per example, with keys 'premise', 'hypothesis',
        'prediction' (per-label scores), 'pred_label', 'gold_label' (as a
        label name) and 'section'.
    """
    results = []
    # Maps the integer gold_label to its name.
    # NOTE(review): assumes gold_label uses this index order — confirm
    # against the dataset's label feature.
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
            # Fall back to the caller-supplied name instead of raising
            # KeyError when the example has no 'section' field.
            'section': example['section'] if 'section' in example else section
        })
    return results

## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to measure the performance of the baseline.


In [9]:
from evaluate import load

# Load the four classification metrics by name.
accuracy, precision, recall, f1 = (
    load(metric_id) for metric_id in ("accuracy", "precision", "recall", "f1")
)


In [10]:
from evaluate import combine

# Bundle the four metrics so that one compute() call reports all of them.
clf_metrics = combine(["accuracy", "f1", "precision", "recall"])

In [11]:
# Toy example: one of the two positive references is missed, so recall is 0.5 while precision stays at 1.0.
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics for the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Hugging Face `evaluate` library.

In [12]:
from datasets import load_from_disk

# NOTE(review): "unified_presupposition.hf" is read from disk, but nothing in the visible notebook creates it.
# Presumably it concatenates the nine sections loaded earlier with an added 'section' column — confirm, and
# document or include the step that builds it so the notebook can run on a fresh checkout.
unified_pres = load_from_disk("unified_presupposition.hf")
unified_pres

Dataset({
    features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section'],
    num_rows: 17100
})

In [13]:
# Runs the model once per example; the recorded run took about ten minutes for 17,100 rows.
results = evaluate_on_dataset(unified_pres)

100%|██████████| 17100/17100 [10:01<00:00, 28.41it/s]


In [14]:
from evaluate import load
from collections import defaultdict

accuracy = load("accuracy")
macro_f1 = load("f1")
macro_precision = load("precision")
macro_recall = load("recall")


def _score(predictions, references):
    """Return accuracy plus macro-averaged F1, precision and recall as one dict.

    Args:
        predictions: Predicted class indices.
        references: Gold class indices, aligned with ``predictions``.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall', in that order.
    """
    scores = dict(accuracy.compute(predictions=predictions, references=references))
    # The metric order fixes the column order of the results table.
    for metric in (macro_f1, macro_precision, macro_recall):
        scores.update(metric.compute(predictions=predictions, references=references, average='macro'))
    return scores


# Group predicted and gold class indices by section.
preds = defaultdict(list)
refs = defaultdict(list)
for res in results:
    preds[res['section']].append(label_names.index(res['pred_label']))
    refs[res['section']].append(label_names.index(res['gold_label']))

# One row of metrics per section.
classification_results = {name: _score(preds[name], refs[name]) for name in preds}

# Flatten once for the overall score; preds and refs are filled in the same
# loop, so their sections and items line up.
all_preds = [p for section_preds in preds.values() for p in section_preds]
all_refs = [r for section_refs in refs.values() for r in section_refs]
classification_results['total'] = _score(all_preds, all_refs)

In [15]:
import pandas as pd
# Sections become rows and metrics become columns; the section name is then
# exposed as an ordinary 'Section' column.
df = (
    pd.DataFrame(classification_results)
    .T
    .rename(columns={
        'accuracy': 'Accuracy',
        'f1': 'Macro F1',
        'precision': 'Macro Precision',
        'recall': 'Macro Recall',
    })
    .rename_axis('Section')
    .reset_index()
)
df

Unnamed: 0,Section,Accuracy,Macro F1,Macro Precision,Macro Recall
0,all_n_presupposition,0.540526,0.505641,0.516881,0.529861
1,both_presupposition,0.360526,0.304465,0.294527,0.326889
2,change_of_state,0.413158,0.407841,0.417169,0.404778
3,cleft_existence,0.686842,0.683611,0.716375,0.729889
4,cleft_uniqueness,0.223158,0.204172,0.211307,0.205556
5,only_presupposition,0.677895,0.676407,0.701898,0.714444
6,possessed_definites_existence,0.768947,0.776102,0.831856,0.814667
7,possessed_definites_uniqueness,0.399474,0.315871,0.281366,0.362778
8,question_presupposition,0.715263,0.711251,0.765572,0.765917
9,total,0.531754,0.534101,0.545635,0.53942


In [16]:
import json

# Keep only the examples where the predicted label differs from the gold label.
mistakes = [
    entry for entry in results
    if entry['pred_label'] != entry['gold_label']
]

# Save the misclassified examples for later error analysis.
with open('imppres_deberta_mistakes.json', 'w') as out_file:
    json.dump(mistakes, out_file, indent=2)