## Install Dependencies

In [1]:
!pip install accelerate==0.23.0 datasets evaluate transformers seqeval ipywidgets peft



In [2]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device != "cpu":
  torch.cuda.set_device(device)

In [3]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

## Preprocess CONLL data input

In [4]:
# labels
label_names = ['O', 'B-MethodName', 'I-MethodName', 'B-HyperparameterName', 'I-HyperparameterName', 'B-HyperparameterValue', 'I-HyperparameterValue',
               'B-MetricName', 'I-MetricName', 'B-MetricValue', 'I-MetricValue', 'B-TaskName', 'I-TaskName', 'B-DatasetName', 'I-DatasetName']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

from datasets import Dataset
import os
def generate_dataset(data_dir):
  dataset = []

  conll_files = os.listdir(data_dir)
  for filename in conll_files:
    if "conll" in filename:
      filename = os.path.join(data_dir, filename)
      dataset += parse_conll(filename, label2id)
  return dataset

def parse_conll(filename, label2id):
    with open(filename) as f:
      lines = f.readlines()
      lines_info_list = []
      line_info = {"tokens": [], "tags": []}
      for line in lines:
        if line == "\n":
          line_info["tags"] = [label2id[tag] for tag in line_info["tags"]]
          lines_info_list.append(line_info)
          line_info = {"tokens": [], "tags": []}
        else:
          if "DOCSTART" in line:
            continue
          try:
            token, _, _, tag = line.rstrip().split()

            line_info["tokens"].append(token)
            line_info["tags"].append(tag)
          except:
            continue
    return lines_info_list

data_list = generate_dataset(".")
raw_dataset = Dataset.from_list(data_list)

In [5]:
raw_dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 6027
})

## Tokenize Dataset

In [100]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

"""
Model choices:
- bert-base-cased
- bert-base-uncased
- dslim/bert-base-NER
- dbmdz/bert-large-cased-finetuned-conll03-english
- dslim/bert-large-NER
- QCRI/bert-base-multilingual-cased-pos-english
- Jean-Baptiste/roberta-large-ner-english
- sileod/deberta-v3-base-tasksource-nli
"""
model_checkpoint = "/Users/liuyinjia/Desktop/QCRI_bert-base-multilingual-cased-pos-english.ckpt"
lr = 1e-5
batch_size = 8
num_epochs = 1

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# build tokenized dataset
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    # remove_columns=raw_dataset.column_names,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

shuffle_seed = None

if shuffle_seed:
  tokenized_dataset = tokenized_dataset.shuffle(seed=shuffle_seed)

train_dataset, test_dataset = tokenized_dataset.train_test_split(test_size=0.1).values()

Map:   0%|          | 0/6027 [00:00<?, ? examples/s]

In [48]:
cnt = 0
for i in range(len(train_dataset["tokens"])):
    if 10 in train_dataset["labels"][i]:
        text = " ".join(train_dataset["tokens"][i])
        target = train_dataset["labels"][i]
        print(len(train_dataset["input_ids"][i]))
        print(len(target))
        print(text)
        print(target)
        cnt += 1
        if cnt == 3:
            break

87
87
All LR / BERT scores ( rows 3–8 ) have standard deviations between 0.1 and 1.1 , signiﬁcantly outperforming “ Length”.yThe average bootstrap accuracy after resampling 100 K times with sample size 200 — the standard deviations of P @ 1 and A @ 3 range between 2.1 and 3.5 .
[-100, 0, 1, 2, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 3, 4, 4, 4, 5, 6, 0, 0, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 8, 8, 0, 0, 9, 10, 10, 0, 9, 10, 10, 0, -100]
53
53
Nevertheless , in our task , the humans and the BERT model seem to make similar decisions ; the association between their choices of sentences is high , with odds ratios ranging between 3.43 ( top 1 ) and 3.33 ( top 3 ) .
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 9, 10, 10, 0, 0, 0, 0, 0, 9, 10, 10, 0, 0, 0, 0, 0, -100]
48
48
By directly leveraging a pre - trained model 

In [101]:
# Get example outputs
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
texts = ["TrufLL ( Task - LM ) what shape is the big thing that is to the right of the big cyan thing ?",
"Figures 4a and 4b plot the impact of the number of epochs on the F 1scores .",
"Methods are tested whether the similarity between the given two words in the embedding space is consistent with the ground truth , in terms of Spear-35",
"By directly leveraging a pre - trained model , GPT2 outperforms the previous models by a large margin , reaching 33.9 % on BLEU-4 and a micro accuracy of 60.4 % ."]

for text in texts:
    inputs = tokenizer(text, return_tensors="pt")

    # Predict token labels using the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted token labels (output from the model)
    token_labels = torch.argmax(outputs.logits, dim=2)
    labels = [model.config.id2label[label_id.item()] for label_id in token_labels[0]]

    # Print the input text and predicted labels
    print("Input Text:", len(text.split()), text)
    print("Predicted Labels:", len(labels), labels)

Input Text: 23 TrufLL ( Task - LM ) what shape is the big thing that is to the right of the big cyan thing ?
Predicted Labels: 28 ['O', 'B-MethodName', 'I-MethodName', 'I-MethodName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Input Text: 17 Figures 4a and 4b plot the impact of the number of epochs on the F 1scores .
Predicted Labels: 24 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-HyperparameterName', 'I-HyperparameterName', 'I-HyperparameterName', 'I-HyperparameterName', 'I-HyperparameterName', 'O', 'O', 'B-MetricName', 'I-MetricName', 'I-MetricName', 'I-MetricName', 'O', 'O']
Input Text: 26 Methods are tested whether the similarity between the given two words in the embedding space is consistent with the ground truth , in terms of Spear-35
Predicted Labels: 34 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

## Define Metrics

In [102]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # TODO: create per-class F1 score / binary cross entropy
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
        "true_labels": true_labels, 
        "true_predictions": true_predictions,
    }

# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=-1)
#     precision, recall, fscore, support = precision_recall_fscore_support(labels, predictions, average=None)
#     return {
#         "precision": precision.tolist(),
#         "recall": recall.tolist(),
#         "fscore": fscore.tolist(),
#         "support": support.tolist()
#     }

## Initialize Model

In [15]:
from transformers import AutoConfig, AutoModelForTokenClassification

configuration = AutoConfig.from_pretrained(model_checkpoint)
configuration.update({"_num_labels": 15, 'label2id': label2id, 'id2label':id2label})
model = AutoModelForTokenClassification.from_config(configuration)


In [9]:
# model._init_weights(model.classifier)

# for param in model.parameters():
#   param.require_grad = False
# for param in model.classifier.parameters():
#   param.require_grad = True
# model = AutoModelForTokenClassification.from_pretrained(
#     model_checkpoint, num_labels=15, id2label=id2label, label2id=label2id
# )

In [16]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
# model.classifier = torch.nn.Linear(model.classifier.in_features, 15, bias=model.classifier.bias is not None)
# model.config.update({'_num_labels': 15, 'label2id': label2id, 'id2label': id2label})
# model.num_labels = 15
# model.init_weights()

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [11]:
# from peft import TaskType, LoraConfig, get_peft_model


# peft_config = LoraConfig(
#     task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
# )
# model = get_peft_model(model, peft_config)

# # model.bert.print_trainable_parameters()

## Setup Trainer

In [103]:
from transformers import TrainingArguments, Trainer

model_output_dir = f"{model_checkpoint}-finetuned-ner"

args = TrainingArguments(
    output_dir=model_output_dir,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # learning_rate=2e-5,
    # num_train_epochs=10,
    # weight_decay=0.01,
    # push_to_hub=False,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard"
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [58]:
# evaluation
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # TODO: create per-class F1 score / binary cross entropy
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
        "true_labels": true_labels, 
        "true_predictions": true_predictions,
    }
    
results = trainer.evaluate()
true_labels = results['eval_true_labels']
true_labels = [item for sublist in true_labels for item in sublist]
true_predictions = results['eval_true_predictions']
true_predictions = [item for sublist in true_predictions for item in sublist]

from sklearn.metrics import classification_report

# Assuming 'y_true' is the true labels and 'y_pred' is the predicted labels
report = classification_report(true_labels, true_predictions)
print(report)
print(results)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/76 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-MethodName', 'I-MethodName', 'I-MethodName', 'I-MethodName', 'O', 'B-MethodName', 'I-MethodName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MethodNam

                       precision    recall  f1-score   support

        B-DatasetName       0.74      0.97      0.84        32
 B-HyperparameterName       0.90      0.56      0.69        16
B-HyperparameterValue       0.78      0.44      0.56        16
         B-MethodName       0.90      0.96      0.93       117
         B-MetricName       0.77      1.00      0.87        37
        B-MetricValue       0.30      1.00      0.46         3
           B-TaskName       0.81      0.94      0.87        31
        I-DatasetName       0.87      0.94      0.90        62
 I-HyperparameterName       0.94      0.68      0.79        25
I-HyperparameterValue       0.67      0.67      0.67         9
         I-MethodName       0.93      0.97      0.95       211
         I-MetricName       0.72      1.00      0.84        42
        I-MetricValue       0.48      1.00      0.65        10
           I-TaskName       0.80      0.93      0.86        43
                    O       1.00      0.99      0.99  

## Training

In [13]:
from transformers import TrainerCallback
import torch

class ProfCallback(TrainerCallback):
    def __init__(self, prof):
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        self.prof.step()


with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU,
                                        torch.profiler.ProfilerActivity.CUDA],
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler('hf-training-trainer'),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:

    trainer.add_callback(ProfCallback(prof=prof))
    trainer.train()

# TODO: Record Eval

# TODO: Performance Analysis using torch profiler (tensorboardX)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2628,0.155386,0.397196,0.313653,0.350515,0.963613


[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
INFO:2023-10-27 13:15:16 606808:606808 init.cpp:149] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:321] Completed Stage: Post Processing
STAGE:2023-10-27 13:15:18 606808:606808 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-10-27 13:15:18 606808:606808 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-10-27 13:15:18 606808:606808 Acti

## Inference

In [104]:
def conll_format(results):
    conll_output = ""
    for result in results:
      for word, label in result:
        conll_output += f"{word}\t{label}\n"
      conll_output += "\n"
    return conll_output

def split_text(text, chunk_size):
    return [text[i:i+chunk_size]
          for i in range(0, len(text), chunk_size)]

def inference(input_file, output_file):
    text = []
    paragraph = []
    labels = []
    labels_tmp = []
    with open(input_file, "r") as file:
        for line in file:
            # text.append(line.split())
            line = line.split()
            if not line:
                text.append(paragraph)
                labels.append(labels_tmp)
                paragraph = []
                labels_tmp = []
            elif len(line) == 4:
                paragraph.append(line[0])
                labels_tmp.append(line[3])
        if paragraph:
            text.append(paragraph)
            labels.append(labels_tmp)
    results = []
    for sentence in text:
        result = []
            
        inputs = tokenizer(sentence, return_tensors='pt', is_split_into_words=True).to(device)
        word_ids = inputs.word_ids()
        # print(len(word_ids))
        
        s = inputs['input_ids'].shape[1]
        with torch.no_grad():
            logits = model(**inputs).logits
        tokens = inputs.tokens()
        
        predictions = torch.argmax(logits, dim=2)
        predictions = predictions[0].cpu().numpy()
        
        prev_word_id = None
        for token, prediction, word_id in zip(tokens, predictions, word_ids):
            if word_id != None and word_id != prev_word_id:
                prev_word_id = word_id
                result.append((sentence[word_id], model.config.id2label[prediction]))
        results.append(result)
    conll_predictions = conll_format(results)
    return labels, results

    # with open(output_file, "w") as f:
    #     f.write(conll_predictions)

model_name = model_checkpoint.replace('/', '_')
# inference(f"../data/bert.txt", f"../data/bert_output_{model_name}.conll")
# inference(f"../data/test.txt", f"../data/test_output_{model_name}.conll")
# inference(f"../data/test2.txt", f"../data/test2_output_{model_name}.conll")
true_labels1, predictions_multibert = inference("../src/all.conll", f"../data/all_output_{model_name}.conll")

In [111]:
trues, debertas, multiberts = [], [], []
for i in range(len(true_labels)):
    if len(true_labels[i]) == len(predictions_multibert[i]) == len(predictions_deberta[i]):
        trues += true_labels[i]
        multiberts += [item[1] for item in predictions_multibert[i]]
        debertas += [item[1] for item in predictions_deberta[i]]
print(len(trues), len(debertas), len(multiberts))

132011 132011 132011


In [110]:
print(trues)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MethodName', 'O', 'O', 'O', 'B-TaskName', 'O', 'B-TaskName', 'I-MethodName', 'I-TaskName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MethodName', 'I-MethodName', 'O', 'O', 'O', 'O', 'O'

In [124]:
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm

# Your ground truth and predictions for both models
ground_truth = trues[:10000]
predictions_model_A = debertas[:10000]
predictions_model_B = multiberts[:10000]

# Observed difference
observed_difference = f1_score(ground_truth, predictions_model_A, average="macro") - f1_score(ground_truth, predictions_model_B, average="macro")

# Number of bootstrap iterations
num_iterations = 100

# Create an array to store bootstrapped differences
bootstrapped_differences = np.zeros(num_iterations)

for i in tqdm(range(num_iterations)):
    # Resample with replacement
    sample_indices = np.random.choice(len(ground_truth), len(ground_truth), replace=True)
    
    # Calculate F1-scores for the resampled data
    resampled_A = [predictions_model_A[idx] for idx in sample_indices]
    resampled_B = [predictions_model_B[idx] for idx in sample_indices]
    
    f1_A = f1_score(ground_truth, resampled_A, average="macro")
    f1_B = f1_score(ground_truth, resampled_B, average="macro")
    
    # Store the difference
    bootstrapped_differences[i] = f1_A - f1_B
    print(bootstrapped_differences[i])

# Calculate the p-value
p_value = (np.abs(bootstrapped_differences) >= np.abs(observed_difference)).mean()

# Make a statistical inference
if p_value < 0.05:
    print("There is a statistically significant difference between the two models.", f"{p_value:.{5}f}")
else:
    print("There is no statistically significant difference between the two models.", f"{p_value:.{5}f}")


  4%|▍         | 4/100 [00:00<00:05, 16.60it/s]

-0.0014319490688451675
-0.0013474459227909436
-0.007060547249812851
-0.00918008584130886


  8%|▊         | 8/100 [00:00<00:05, 16.75it/s]

0.0009330974697460692
-0.0013530065894556131
-0.0008543677038066883
0.0007513250005231342


 12%|█▏        | 12/100 [00:00<00:05, 16.75it/s]

-0.006213663368699107
0.007283062781154218
-0.006476545120869703
-0.0003401517540334992


 16%|█▌        | 16/100 [00:00<00:05, 16.70it/s]

-8.505435057973298e-05
0.000602084944292508
-0.0012552185345412609
0.006000024537853116


 20%|██        | 20/100 [00:01<00:04, 16.56it/s]

0.004459709681419219
-0.0064884728346792375
-0.00018335957814873127
-0.006297105824124863


 24%|██▍       | 24/100 [00:01<00:04, 16.57it/s]

-0.0052993402558697
-0.006487889037492756
-0.00029255470967674835
0.0001139656309202125


 26%|██▌       | 26/100 [00:01<00:04, 15.00it/s]

-8.700672695899359e-05
-0.0003845549938506637
-0.001285692199508151


 30%|███       | 30/100 [00:01<00:04, 14.39it/s]

-0.0002464376101567328
-0.01007334406525541
-0.0006983859652499591
-0.0011703898593119627


 34%|███▍      | 34/100 [00:02<00:04, 15.52it/s]

-0.000290072388816523
0.004152528378651821
-0.002396149083004284
0.004164080022990221


 38%|███▊      | 38/100 [00:02<00:03, 16.13it/s]

-2.720498047383002e-05
-0.008430983448233703
-0.003394015567552866
0.0006388379550714973


 42%|████▏     | 42/100 [00:02<00:03, 16.55it/s]

-0.0008638614729498117
1.2091224036780979e-05
-0.0005352402390284144
-0.00031708383105190363


 46%|████▌     | 46/100 [00:02<00:03, 16.54it/s]

-0.00013056546597525753
8.099745004251646e-05
-0.006556816900542717
0.0049984278955248485


 50%|█████     | 50/100 [00:03<00:03, 16.47it/s]

-0.006321278920694048
-0.0002469557335677136
-0.001860462596437712
-0.00024569725555975697


 54%|█████▍    | 54/100 [00:03<00:02, 16.74it/s]

-0.006466746268101439
-0.0066117661891980095
-0.0002387986810729048
0.0031101999579969952


 58%|█████▊    | 58/100 [00:03<00:02, 16.85it/s]

-0.007558167293884463
-0.006355070688791195
-0.0014589787459594605
0.005977809494682296


 62%|██████▏   | 62/100 [00:03<00:02, 16.89it/s]

0.0071831024834735635
-7.992750841866181e-05
-0.0013352599219308148
-0.00032751297730555795


 66%|██████▌   | 66/100 [00:04<00:02, 16.89it/s]

-0.0003235115908492153
-0.000893471575969626
0.006013662668006486
-5.468511874774651e-05


 70%|███████   | 70/100 [00:04<00:01, 16.91it/s]

-0.0064153110388071866
-5.020788514827068e-05
-0.006413225149688964
-0.001063767674243904


 74%|███████▍  | 74/100 [00:04<00:01, 16.73it/s]

-0.001011475267526693
0.0008199094050399258
-0.00022707822632563046
-0.0015847526023808345


 78%|███████▊  | 78/100 [00:04<00:01, 16.86it/s]

-0.006412104768955387
-0.0054904759027678
0.0017688684255839637
0.006641107232994631


 82%|████████▏ | 82/100 [00:04<00:01, 16.61it/s]

-0.0016092020820726166
-0.006499828504156438
0.0008725272547576435
-0.0019793970009098355


 86%|████████▌ | 86/100 [00:05<00:00, 16.76it/s]

-0.0070782344574605305
-0.008677807203131091
-0.0006590267800597044
-0.00772522736147889


 90%|█████████ | 90/100 [00:05<00:00, 16.89it/s]

-0.007047687314833476
-0.005534979399222803
-0.0007639483334103275
0.0031008153245769693


 94%|█████████▍| 94/100 [00:05<00:00, 16.94it/s]

0.0060944567412363065
-0.00020595238302530128
-0.00523443021597339
-0.005394165368062656


 98%|█████████▊| 98/100 [00:05<00:00, 16.99it/s]

-0.0002758903791412609
-0.007710022748781187
-0.006720754217262953
-0.0007817964347815315


100%|██████████| 100/100 [00:06<00:00, 16.52it/s]

-0.0002746203050561363
There is a statistically significant difference between the two models. 0.00000





In [128]:
import numpy as np

rng = np.random.default_rng()
def eval_with_paired_bootstrap(gold, sys1, sys2, num_samples=10000, sample_ratio=0.5):
    """Evaluate with paired boostrap
    This compares two systems, performing a significance tests with
    paired bootstrap resampling to compare the accuracy of the two systems.

    Parameters
    ----------
    gold
      The correct labels
    sys1
      The output of system 1
    sys2
      The output of system 2
    num_samples
      The number of bootstrap samples to take
    sample_ratio
      The ratio of samples to take every time

    """
    assert len(gold) == len(sys1)
    assert len(gold) == len(sys2)

    gold = np.array(gold)
    sys1 = np.array(sys1)
    sys2 = np.array(sys2)

    sys1_scores = []
    sys2_scores = []
    wins = [0, 0, 0]
    n = len(gold)

    for _ in tqdm(range(num_samples)):
        # Subsample the gold and system outputs
        subset_idxs = rng.choice(n, int(n * sample_ratio), replace=True)
        sys1_score = (sys1[subset_idxs] == gold[subset_idxs]).mean()
        sys2_score = (sys2[subset_idxs] == gold[subset_idxs]).mean()

        if sys1_score > sys2_score:
            wins[0] += 1
        elif sys1_score < sys2_score:
            wins[1] += 1
        else:
            wins[2] += 1

        sys1_scores.append(sys1_score)
        sys2_scores.append(sys2_score)

    # Print win stats
    wins = [x / float(num_samples) for x in wins]
    print("Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f" % (wins[0], wins[1], wins[2]))
    if wins[0] > wins[1]:
        print("(sys1 is superior with p value p=%.3f)\n" % (1 - wins[0]))
    elif wins[1] > wins[0]:
        print("(sys2 is superior with p value p=%.3f)\n" % (1 - wins[1]))

    # Print system stats
    sys1_scores.sort()
    sys2_scores.sort()
    print(
        "sys1 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]"
        % (
            np.mean(sys1_scores),
            np.median(sys1_scores),
            sys1_scores[int(num_samples * 0.025)],
            sys1_scores[int(num_samples * 0.975)],
        )
    )
    print(
        "sys2 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]"
        % (
            np.mean(sys2_scores),
            np.median(sys2_scores),
            sys2_scores[int(num_samples * 0.025)],
            sys2_scores[int(num_samples * 0.975)],
        )
    )
eval_with_paired_bootstrap(trues, debertas, multiberts, num_samples=1000)

100%|██████████| 1000/1000 [00:09<00:00, 109.04it/s]

Win ratio: sys1=1.000, sys2=0.000, tie=0.000
(sys1 is superior with p value p=0.000)

sys1 mean=0.990, median=0.990, 95% confidence interval=[0.990, 0.991]
sys2 mean=0.989, median=0.989, 95% confidence interval=[0.988, 0.990]





In [87]:
# predictions_deberta = []
# with open("../data/all_output_sileod_deberta-v3-base-tasksource-nli.ckpt.conll", "r") as f:
#     for line in f:
#         line = line.split()
#         if len(line) == 2:
#             predictions_deberta.append(line[1])
# predictions_multibert = []
# with open("../data/all_output_QCRI_bert-base-multilingual-cased-pos-english.ckpt.conll", "r") as f:
#     for line in f:
#         line = line.split()
#         if len(line) == 2:
#             predictions_multibert.append(line[1])
# cnt = 0
# with open("../src/labels.txt", "r") as f:
#     for line in f:
#         cnt += 1
# print(len(predictions_deberta))
# print(len(predictions_multibert))
# print(cnt)

132486
132509
132502


## PEFT

In [None]:
# !pip install torch_tb_profiler

In [None]:
# !tensorboard --logdir=./hf-training-trainer

In [None]:
# from numba import cuda
# device = cuda.get_current_device()
# device.reset()