In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install -U adapter-transformers
!pip install -U datasets
!pip install pytorch-pretrained-bert
!pip install nlp

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from torch import nn
import torch
from torch.nn import functional as F
import codecs
import json
import spacy
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, \
    f1_score, precision_score, recall_score, average_precision_score, roc_auc_score, confusion_matrix, \
    brier_score_loss
import numpy as np

# Evaluation helper: runs binary_eval and stores the metrics as a one-row CSV.
def evaluate_and_save_to_csv(predy=None, testy=None, predscore=None,
                             out_path='/content/drive/MyDrive/Modelle/results.csv'):
    """Evaluate binary predictions and write the metrics to a CSV file.

    Args:
        predy: Predicted labels. When omitted, the notebook-level variable
            ``predy`` is used (the behaviour of the original zero-argument call).
        testy: Gold labels. When omitted, the notebook-level variable ``testy``
            is used.
        predscore: Optional scores for the positive class, forwarded to
            ``binary_eval`` for the Brier score and ROC-AUC columns.
        out_path: Destination CSV path; defaults to the previously hard-coded
            location.

    Raises:
        NameError: if ``predy``/``testy`` are neither passed nor defined globally.
    """
    # Local import: the cell defining this function does not import pandas.
    import pandas as pd

    # Backward compatibility with the old signature, which read these names
    # straight from the notebook namespace.
    scope = globals()
    if predy is None:
        if 'predy' not in scope:
            raise NameError("name 'predy' is not defined")
        predy = scope['predy']
    if testy is None:
        if 'testy' not in scope:
            raise NameError("name 'testy' is not defined")
        testy = scope['testy']

    acc, f1, precision, recall, gmean, bss, roc_auc, info = binary_eval(
        predy, testy, verbose=True, return_f1=True, predscore=predscore)

    # Build a single-row DataFrame from the results; index 0/1 are the two classes.
    results_df = pd.DataFrame({
        'Accuracy': [acc],
        'F1_Score_0': [f1[0]],
        'F1_Score_1': [f1[1]],
        'Precision_0': [precision[0]],
        'Precision_1': [precision[1]],
        'Recall_0': [recall[0]],
        'Recall_1': [recall[1]],
        'G-Mean': [gmean],
        'Brier_Score': [bss],
        'ROC_AUC': [roc_auc]
    })

    # Save the DataFrame to a CSV file.
    results_df.to_csv(out_path, index=False)

def binary_eval(predy, testy, verbose=True, return_f1=False, predscore=None):
    """Compute binary classification metrics for predictions against gold labels.

    Args:
        predy: Predicted labels.
        testy: Gold labels.
        verbose: If true, print the formatted metric summary.
        return_f1: Selects the return shape (see Returns).
        predscore: Optional scores for the positive class. When given, the
            Brier score and ROC-AUC are computed as well.

    Returns:
        ``(acc, f1, precision, recall, gmean, bss, roc_auc, info)`` when
        ``return_f1`` is true, otherwise ``(acc, info)``. ``f1``, ``precision``
        and ``recall`` are per-class arrays (``average=None``). ``bss`` and
        ``roc_auc`` are ``None`` when ``predscore`` is not supplied.
    """
    acc = accuracy_score(testy, predy)
    f1 = f1_score(testy, predy, average=None)
    precision = precision_score(testy, predy, average=None)
    recall = recall_score(testy, predy, average=None)
    epsilon = 1e-8  # avoids division by zero when a class has no samples

    # Fixing the label set keeps the matrix 2x2 even when one class is absent
    # from both testy and predy, so the four-way unpacking cannot fail.
    htn, hfp, hfn, htp = confusion_matrix(testy, predy, labels=[0, 1]).ravel()
    hsensi = htp / (htp + hfn + epsilon)
    hspec = htn / (hfp + htn + epsilon)
    gmean = np.sqrt(hsensi*hspec)

    info = "Acc : {}\nf1 : {}\nprecision : {}\nrecall : {}\nG-mean : {}".format(acc,
            " ".join([str(x) for x in f1]), " ".join([str(x) for x in precision]),
            " ".join([str(x) for x in recall]), gmean)

    # Defined up front so the return below works without predscore
    # (previously this raised UnboundLocalError).
    bss, roc_auc = None, None
    if predscore is not None:
        bss = brier_score_loss(testy, predscore)
        roc_auc = roc_auc_score(testy, predscore)
        info += "\nbss : {}\nROC-AUC : {}".format(bss, roc_auc)

    if verbose:
        print(info)

    if return_f1:
        return acc, f1, precision, recall, gmean, bss, roc_auc, info
    else:
        return acc, info


def subsets(nums):
    """Enumerate every non-empty subset of ``nums`` in depth-first order.

    Each subset keeps the elements in their original order; a subset is
    emitted before any of its extensions.

    :type nums: List[int]
    :rtype: List[List[int]]
    """
    def _grow(start, prefix):
        # Extend the current prefix with each remaining element in turn.
        for pos in range(start, len(nums)):
            chosen = prefix + [nums[pos]]
            yield chosen
            yield from _grow(pos + 1, chosen)

    return list(_grow(0, []))


def sent_ner_bounds(sen, nlp=None):
    """Tokenize a sentence and return the token spans defined by its NER tags.

    Args:
        sen: The sentence text.
        nlp: A loaded spaCy pipeline. When ``None`` an English pipeline is
            loaded on every call, so pass one in when calling repeatedly.

    Returns:
        ``(text, rep_pos)`` where ``text`` is the tokens joined by single
        spaces and ``rep_pos`` is a list of ``[start, end]`` token index pairs
        (inclusive): one pair per token tagged ``O`` and one pair per entity
        (a ``B`` tag followed by its ``I`` tags). Tokens with any other tag
        value are not reported.
    """
    if nlp is None:
        try:
            nlp = spacy.load('en')
        except OSError:
            # The 'en' shortcut link no longer exists in spaCy 3; fall back to
            # the full package name of the small English pipeline.
            nlp = spacy.load('en_core_web_sm')
    tokens, tags = [], []
    print(sen)
    for doc in nlp.pipe([sen]):
        for token in doc:
            tags.append(token.ent_iob_)
            tokens.append(str(token))

    rep_pos = []
    for idx, tag in enumerate(tags):
        if tag == 'O':
            rep_pos.append([idx, idx])
        elif tag == 'B':
            # Extend the span over the run of 'I' tags that follows.
            end = idx
            for j in range(idx + 1, len(tags)):
                if tags[j] != 'I':
                    break
                end = j
            rep_pos.append([idx, end])
        # 'I' tokens are covered by the span of their preceding 'B'.

    return ' '.join(tokens), rep_pos


def remove_marked_sen(sen, start_id, end_id):
    """Strip the ``===`` markers around the marked span of a sentence.

    Args:
        sen: Either a list of tokens or a whitespace-separated string.
        start_id: Index of the first marked token.
        end_id: Index of the last marked token (inclusive).

    Returns:
        A new token list. The markers are removed only when the start token
        begins with ``===`` and the end token ends with ``===``; otherwise the
        tokens are returned unchanged. The caller's list is never modified.
    """
    # Copy list input so repeated calls on the same example do not alter it.
    tokens = list(sen) if isinstance(sen, list) else sen.strip().split()
    if tokens[start_id].startswith("===") and tokens[end_id].endswith("==="):
        tokens[start_id] = tokens[start_id][3:]
        tokens[end_id] = tokens[end_id][:-3]
    return tokens

In [None]:
from torch.utils import data
import torch
import numpy as np
from tqdm import tqdm, trange
import collections
import codecs
import json


class InputExample:
    """One raw example: a sentence, the marked span indexes and a label."""

    def __init__(self, guid, sen, idxs, label):
        # guid: example identifier; sen: sentence; idxs: span indexes; label: class label
        self.guid, self.sen = guid, sen
        self.idxs, self.label = idxs, label


class InputFeatures:
    """Model-ready features of one example (ids and masks plus its label)."""

    def __init__(self, guid, input_ids, input_mask, segment_ids, predict_mask, label_id):
        self.guid, self.label_id = guid, label_id
        # Sequence-valued fields; stored exactly as passed in.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.predict_mask = segment_ids, predict_mask


class DataProcessor(object):
    """Reads JSON-lines data files and turns them into ``InputExample`` objects.

    The instance also counts how many examples with label 0 and label 1 it has
    created; the counters accumulate across calls to ``get_examples``.
    """

    def __init__(self):
        self.num_consist = 0  # examples created with label 0
        self.num_hallu = 0    # examples created with label 1

    def _read_data(self, input_file, require_uidx=False):
        """Parse a JSON-lines file into a list of raw records.

        Each non-blank line must be a JSON object with the keys ``replaced``
        and ``replaced_ids`` (and ``idx`` when ``require_uidx`` is true). The
        label is taken from ``hallucination``; entries without that key get
        label -1, and entries whose label is not 0 or 1 are skipped.

        Returns:
            A list of ``[sen, token_ids, label]`` records, or
            ``[sen, token_ids, label, uidx]`` when ``require_uidx`` is true.
        """
        out_lists = []
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(input_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate empty files and stray blank lines.
                    continue
                example = json.loads(line)
                if "hallucination" not in example:
                    label = -1
                else:
                    label = example["hallucination"]
                    if label not in [0, 1]:
                        continue
                record = [example["replaced"], example["replaced_ids"], label]
                if require_uidx:
                    record.append(example["idx"])
                out_lists.append(record)
        return out_lists

    def _create_examples(self, all_lists):
        """Wrap raw records in ``InputExample`` objects and update the label counters.

        The example id is the record's position unless the record carries its
        own id as a fourth element.
        """
        examples = []
        for (i, one_lists) in enumerate(all_lists):
            guid = i
            if len(one_lists) == 3:  # Don't contain key "idx" in json file
                sen, token_ids, label = one_lists
            elif len(one_lists) == 4:  # Contain key "idx" in json file
                sen, token_ids, label, guid = one_lists
            else:
                # Raised explicitly so the check also holds under ``python -O``.
                raise AssertionError(
                    "expected a record of length 3 or 4, got {}".format(len(one_lists)))

            if label == 0:
                self.num_consist += 1
            elif label == 1:
                self.num_hallu += 1

            examples.append(InputExample(
                guid=guid, sen=sen, idxs=token_ids, label=label))
        return examples

    def get_examples(self, path, require_uidx=False):
        """Read ``path`` and return its entries as ``InputExample`` objects."""
        return self._create_examples(
            self._read_data(path, require_uidx))

    def get_label_dist(self):
        """Return ``[count of label 0, count of label 1]`` seen so far."""
        return [self.num_consist, self.num_hallu]

def truncate(rep_subtokens, predict_mask, max_seq_length, rep_start_id, rep_end_id, mode="online"):
    '''
    Truncate the sequence if given a fixed context window. For example, given the following input sentence:
    "he signed a professional contract and promoted to the ===senior team=== where he managed to play for almost 3 years ."
    if the context window length is set as 4, the function will truncate the input as follows:

    online mode: "and promoted to the ===senior team==="
    offline mode: "to the ===senior team=== where he"

    Args:
        rep_subtokens: Sub-token sequence of the sentence.
        predict_mask: Parallel 0/1 list; 1 marks sub-tokens of the marked span.
        max_seq_length: Window size used for the truncation arithmetic.
        rep_start_id, rep_end_id: Fallback span bounds, used only when
            ``predict_mask`` contains no 1. Otherwise the bounds are re-derived
            from the first run of 1s in ``predict_mask``.
        mode: "offline" keeps context on both sides of the span; any other
            value is treated as "online" (left context only, nothing after the span).

    Returns:
        ``(rep_subtokens, predict_mask)`` after truncation.
    '''
    # Locate the first contiguous run of 1s in the mask.
    first = next((i for i, flag in enumerate(predict_mask) if flag == 1), None)
    if first is not None:
        last = first
        # Walking to the end of the run also covers a span that reaches the
        # last sub-token; previously rep_end_id was left stale in that case.
        while last + 1 < len(predict_mask) and predict_mask[last + 1] == 1:
            last += 1
        rep_start_id, rep_end_id = first, last

    if mode == "offline":
        if len(rep_subtokens) > max_seq_length - 2:
            mid_pt = int((rep_start_id + rep_end_id) / 2)
            left_seq_length = int(max_seq_length / 2)
            right_seq_length = max_seq_length - left_seq_length
            fits_left = mid_pt - left_seq_length >= 0
            fits_right = mid_pt + right_seq_length < len(rep_subtokens)
            if fits_left and fits_right:
                left_pt = mid_pt - left_seq_length
                right_pt = mid_pt + right_seq_length
            elif fits_right:
                left_pt = 0
                right_pt = max_seq_length
            elif fits_left:
                right_pt = len(rep_subtokens)
                # Clamp: a negative start would wrap around and yield an empty window.
                left_pt = max(0, len(rep_subtokens) - max_seq_length)
            else:
                left_pt = 0
                right_pt = len(rep_subtokens)
            # NOTE(review): the "- 1" drops the last sub-token of the window,
            # including when the window ends at the sequence end. Kept as-is;
            # confirm whether this is intended.
            rep_subtokens = rep_subtokens[left_pt:right_pt - 1]
            predict_mask = predict_mask[left_pt:right_pt - 1]
    else: # online
        # Keep everything up to the end of the span, trimming from the left so
        # that at most max_seq_length - 2 sub-tokens remain.
        left_pt, right_pt = 0, rep_end_id + 1
        if right_pt > max_seq_length - 2:
            left_pt = right_pt - (max_seq_length - 2)
        rep_subtokens = rep_subtokens[left_pt:right_pt]
        predict_mask = predict_mask[left_pt:right_pt]
    return rep_subtokens, predict_mask


def example2feature(example, tokenizer, max_seq_length, model_name, mode="online"):
    """Convert one ``InputExample`` into ``InputFeatures`` for the given model family.

    The sentence is split into sub-tokens word by word; sub-tokens of words
    inside the marked span get ``predict_mask`` 1, all others 0. After
    truncation, special tokens are added depending on ``model_name``:
    names containing "xlnet" get "<sep>", "<cls>" appended; names containing
    "gpt" get none (and every word but the first is tokenized with a leading
    space); everything else gets "[CLS]" prepended and "[SEP]" appended.
    """
    rep_start_id, rep_end_id = example.idxs
    words = remove_marked_sen(example.sen, rep_start_id, rep_end_id)

    lowered_name = model_name.lower()
    is_xlnet = 'xlnet' in lowered_name
    is_gpt = (not is_xlnet) and 'gpt' in lowered_name

    rep_subtokens, predict_mask = [], []
    for word_pos, word in enumerate(words):
        if is_gpt and word_pos != 0:
            word = " " + word
        pieces = tokenizer.tokenize(word)
        flag = 1 if rep_start_id <= word_pos <= rep_end_id else 0
        rep_subtokens.extend(pieces)
        predict_mask.extend([flag] * len(pieces))

    rep_subtokens, predict_mask = truncate(
        rep_subtokens, predict_mask, max_seq_length, rep_start_id, rep_end_id, mode=mode)

    if is_xlnet:
        rep_subtokens += ["<sep>", "<cls>"]
        predict_mask += [0, 0]
    elif not is_gpt:
        rep_subtokens[:0] = ["[CLS]"]
        predict_mask[:0] = [0]
        rep_subtokens.append('[SEP]')
        predict_mask.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(rep_subtokens)
    seq_len = len(input_ids)

    return InputFeatures(
        guid=example.guid,
        input_ids=input_ids,
        input_mask=[1] * seq_len,
        segment_ids=[0] * seq_len,
        predict_mask=predict_mask,
        label_id=example.label)

def get_examples_from_sen_tuple(sen, rep_pos):
    """Build one ``InputExample`` (label 0) per span in ``rep_pos`` for sentence ``sen``.

    The example id is the position of the span within ``rep_pos``.
    """
    return [
        InputExample(guid=uid, sen=sen, idxs=pos, label=0)
        for uid, pos in enumerate(rep_pos)
    ]

class HalluDataset(data.Dataset):
    """Dataset that converts examples to features lazily, one item at a time."""

    def __init__(self, examples, tokenizer, max_seq_length, model_name, task_mode="online"):
        self.examples, self.tokenizer = examples, tokenizer
        self.max_seq_length = max_seq_length
        self.model_name, self.task_mode = model_name, task_mode

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(input_ids, input_mask, segment_ids, predict_mask, label_id, guid)``."""
        feat = example2feature(
            self.examples[idx],
            self.tokenizer,
            self.max_seq_length,
            self.model_name,
            self.task_mode,
        )
        return (feat.input_ids, feat.input_mask, feat.segment_ids,
                feat.predict_mask, feat.label_id, feat.guid)

    @classmethod
    def pad(cls, batch):
        """Collate function: right-pad every sequence field with 0 to the batch maximum.

        Returns five tensors (ids, attention mask, segment ids, predict mask,
        labels) followed by the plain list of guids.
        """
        target_len = np.array([len(sample[0]) for sample in batch]).max()

        def padded(field):
            # 0 is the padding value for every sequence field.
            return [sample[field] + [0] * (target_len - len(sample[field])) for sample in batch]

        input_ids_list = torch.LongTensor(padded(0))
        input_mask_list = torch.LongTensor(padded(1))
        segment_ids_list = torch.LongTensor(padded(2))
        predict_mask_list = torch.ByteTensor(padded(3))
        label_id = torch.LongTensor([sample[4] for sample in batch])
        guids = [sample[5] for sample in batch]

        return input_ids_list, input_mask_list, segment_ids_list, predict_mask_list, label_id, guids

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, AutoModelWithLMHead, AutoAdapterModel, AdapterConfig, AdapterType, AutoTokenizer, BertConfig, AutoConfig, \
    XLNetLMHeadModel, DebertaTokenizer, DebertaModel
from pytorch_pretrained_bert.optimization import BertAdam
from transformers import AdapterConfig
from transformers.adapters.composition import Stack
import torch
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import random, os, sys
import torch.nn as nn
import torch.nn.functional as F
import codecs
import argparse
import spacy
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as color
import matplotlib.pyplot as plt
from nlp import load_dataset
from collections import defaultdict
import json
import csv
import pandas as pd
import seaborn as sns
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from torch.utils import data
from torch.utils.tensorboard import SummaryWriter
import warnings
warnings.filterwarnings("ignore")

class ClfModel(nn.Module):
    """Hallucination classifier: a pretrained encoder with adapters plus an MLP head.

    The encoder's hidden states at layer ``args.bert_layer`` are masked with
    the predict mask, max-pooled over the sequence and fed to ``hidden2label``,
    which outputs two logits (index 1 is treated as the hallucination class).
    """

    def __init__(self, args):
        """Load tokenizer and encoder for ``args.load_model`` and set up the adapters.

        Reads ``load_model``, ``device``, ``dropout`` and ``bert_layer`` from ``args``.
        """
        super().__init__()

        self.load_model = args.load_model

        if "xlnet" in args.load_model:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            self.model = XLNetLMHeadModel.from_pretrained(self.load_model, mem_len=1024).to(args.device)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            config = AutoConfig.from_pretrained(self.load_model)
            config.output_hidden_states = True
            self.model = AutoAdapterModel.from_pretrained(self.load_model, config=config)

        # English language adapter stacked under the task adapter "halluClass".
        lang_adapter_config = AdapterConfig.load("pfeiffer", non_linearity="gelu", reduction_factor=2)
        self.model.load_adapter("en/wiki@ukp", config=lang_adapter_config)

        self.model.add_adapter("halluClass")
        self.model.add_classification_head("halluClass")
        self.model.train_adapter(["halluClass"])
        self.model.active_adapters = Stack("en", "halluClass")
        self.model = self.model.to(args.device)

        hidden_size = 1024 if "large" in self.load_model or self.load_model=="gpt2-medium" else 768

        self.hidden2label = nn.Sequential(
                                nn.Linear(hidden_size, hidden_size//2),
                                nn.Sigmoid(),
                                nn.Linear(hidden_size//2, 2)).to(args.device)

        self.dropout = torch.nn.Dropout(args.dropout)
        self.layer = args.bert_layer

        self.eval()
        self.device = args.device
        self.args = args

    def replace_adapter(self):
        """Load the German language adapter and activate it under "halluClass"."""
        lang_adapter_config = AdapterConfig.load("pfeiffer", non_linearity="gelu", reduction_factor=2)
        self.model.load_adapter("de/wiki@ukp", config=lang_adapter_config)
        print("deutscher Sprachadapter")
        self.model.active_adapters = Stack("de", "halluClass")

    def english_again(self):
        """Load the English language adapter and activate it under "halluClass"."""
        lang_adapter_config = AdapterConfig.load("pfeiffer", non_linearity="gelu", reduction_factor=2)
        self.model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
        print("englischer Sprachadapter")
        self.model.active_adapters = Stack("en", "halluClass")

    def save_model(self, path, optim=None):
        """Save model weights, optimizer state (if known) and ``args`` to ``path``.

        Args:
            path: Destination file.
            optim: Optimizer whose state should be stored. When omitted, the
                optimizer last passed to ``model_run`` is used, then a
                notebook-level variable named ``optimizer`` (the previous
                behaviour). If none is found the optimizer state is left out.
        """
        if optim is None:
            optim = getattr(self, "_last_optim", None)
        if optim is None:
            optim = globals().get("optimizer")

        checkpoint = {
            "model_state_dict": self.state_dict(),
            "args": self.args,
        }
        if optim is not None:
            checkpoint["optim_state_dict"] = optim.state_dict()
        torch.save(checkpoint, path)

    def _run_batches(self, dataloader, loss_func=None, optim=None, epoch=None):
        """Run the forward pass over every batch of ``dataloader``.

        With ``optim`` set, each batch is also used for one optimization step
        and progress is printed every 10 batches. Without it, gradients are
        disabled. The loss is accumulated only when ``loss_func`` is given.

        Returns:
            ``(predy, goldy, hallu_sm_score, total_loss)``: predicted labels,
            gold labels, softmax score of class 1, and the summed batch loss.
        """
        predy, goldy, hallu_sm_score = [], [], []
        total_loss = 0
        training = optim is not None
        with torch.set_grad_enabled(training):
            for cnt, batch in enumerate(dataloader, start=1):
                # The last batch element is the guid list, which is not a tensor.
                batch = tuple(t.to(self.device) for t in batch[:-1])
                input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
                score = self.model_train(input_ids, input_mask, segment_ids, predict_mask)
                hallu_sm = F.softmax(score, dim=1)[:, 1]
                _, pred = torch.max(score, dim=1)
                goldy.extend(label_ids.tolist())
                predy.extend(pred.tolist())
                hallu_sm_score.extend(hallu_sm.tolist())
                if loss_func is None:
                    continue
                loss = loss_func(score, label_ids)
                total_loss += loss.item()
                if training:
                    optim.zero_grad()
                    loss.backward()
                    optim.step()
                    if cnt % 10 == 0:
                        print("Training Epoch {} - {:.2f}% - Loss : {}".format(
                            epoch, 100.0 * cnt/len(dataloader), total_loss/cnt))
        return predy, goldy, hallu_sm_score, total_loss

    def model_run(self, optim):
        """Train on ``noisyT4.txt`` and validate on ``noisyV4.txt`` after every epoch.

        Both files are read from ``args.data_path``. TensorBoard scalars, a
        ``valid_log.csv`` and checkpoints are written below a ``runs/...``
        directory derived from the hyper-parameters. The checkpoint with the
        highest sum of per-class validation F1 is stored as ``best_model.pt``;
        every fourth epoch is saved additionally. When ``args.continue_train``
        is set and ``best_model.pt`` exists, training resumes after the epoch
        recorded in it.

        Args:
            optim: Optimizer used for the parameter updates.
        """
        # Remembered so save_model() can serialize the optimizer state later.
        self._last_optim = optim

        trainpath = os.path.join(self.args.data_path, "noisyT4.txt")
        validpath = os.path.join(self.args.data_path, "noisyV4.txt")

        prefix = "runs/{}_lr_{}_dp_{}_{}_clen{}/".format(self.load_model, self.args.lr,
                                        self.args.dropout, self.args.task_mode, self.args.context_len)
        bestmodelpath = prefix + "best_model.pt"
        epoch, epoch_start = self.args.train_epoch, 1
        best_f1_score = -1
        if os.path.exists(bestmodelpath) and self.args.continue_train:
            checkpoint = torch.load(bestmodelpath, map_location=self.args.device)
            self.load_state_dict(checkpoint["model_state_dict"])
            epoch_start = checkpoint["epoch"] + 1
            if "valid_f1" in checkpoint:
                # Keep the stored best score so a worse epoch cannot overwrite it.
                best_f1_score = float(sum(checkpoint["valid_f1"]))

        os.makedirs(prefix, exist_ok=True)
        writer = SummaryWriter(prefix)
        csvlogger = prefix + "valid_log.csv"
        write_header = not os.path.exists(csvlogger)
        csvfile = open(csvlogger, 'a', newline='')

        try:
            csvwriter = csv.writer(csvfile)
            if write_header:
                csvwriter.writerow(["epoch", "H_p", "H_r", "H_f1", "C_p", "C_r", "C_f1", "Gmean",
                                    "Acc", "BSS", "ROC_AUC"])

            dp = DataProcessor()
            train_examples = dp.get_examples(trainpath)
            train_dataset = HalluDataset(train_examples, self.tokenizer, self.args.context_len,
                                         self.load_model, self.args.task_mode)
            train_dataloader = data.DataLoader(dataset=train_dataset,
                                               batch_size=self.args.batch_size,
                                               shuffle=True,
                                               num_workers=4,
                                               collate_fn=HalluDataset.pad)

            # Class weights: each class is weighted by the share of the other class.
            # Must be read before the validation file is loaded, because the
            # processor's counters accumulate.
            nSamples = dp.get_label_dist()
            print("====Train label : {}".format(nSamples))
            normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
            normedWeights = torch.FloatTensor(normedWeights).to(self.args.device)
            loss_func = nn.CrossEntropyLoss(weight=normedWeights).to(self.args.device)

            # The validation data does not change between epochs; build it once.
            valid_examples = dp.get_examples(validpath)
            valid_dataset = HalluDataset(valid_examples, self.tokenizer, self.args.context_len,
                                         self.load_model, self.args.task_mode)
            valid_dataloader = data.DataLoader(dataset=valid_dataset,
                                               # never 0, even for batch_size == 1
                                               batch_size=max(1, self.args.batch_size // 2),
                                               shuffle=False,
                                               num_workers=4,
                                               collate_fn=HalluDataset.pad)

            for ei in range(epoch_start, epoch + 1):
                # ---- training ----
                self.train()
                predy, trainy, hallu_sm_score, train_loss = self._run_batches(
                    train_dataloader, loss_func=loss_func, optim=optim, epoch=ei)
                print("Training Epoch {} ...".format(ei))
                acc, f1, precision, recall, _, _, _, _ = \
                    binary_eval(predy, trainy, return_f1=True, predscore=hallu_sm_score)
                writer.add_scalar('Loss/train_epoch', train_loss, ei)
                writer.add_scalar('F1/train_consistent_epoch', f1[0], ei)
                writer.add_scalar('Precision/train_consistent_epoch', precision[0], ei)
                writer.add_scalar('Recall/train_consistent_epoch', recall[0], ei)
                writer.add_scalar('F1/train_hallucination_epoch', f1[1], ei)
                writer.add_scalar('Precision/train_hallucination_epoch', precision[1], ei)
                writer.add_scalar('Recall/train_hallucination_epoch', recall[1], ei)
                writer.add_scalar('Acc/train_epoch', acc, ei)
                print("Train Epoch {} end ! Loss : {}".format(ei, train_loss))

                if ei % 4 == 0:
                    savemodel_path = prefix + "model_{}_{}_{}.pt".format(ei, f1[0], f1[1])
                    torch.save(
                        {"model_state_dict": self.state_dict(),
                         "optim_state_dict": optim.state_dict(),
                         "train_f1": f1,
                         "train_precision": precision,
                         "train_recall": recall,
                         "train_acc": acc,
                         # current epoch, so resuming continues from here
                         "epoch": ei},
                        savemodel_path)

                # ---- validation ----
                self.eval()
                predy, validy, hallu_sm_score, valid_loss = self._run_batches(
                    valid_dataloader, loss_func=loss_func)
                print("Valid Epoch {} ...".format(ei))

                acc, f1, precision, recall, gmean, bss, roc_auc, info = \
                    binary_eval(predy, validy, return_f1=True, predscore=hallu_sm_score)

                writer.add_scalar('Loss/valid_epoch', valid_loss, ei)
                writer.add_scalar('F1/valid_consistent_epoch', f1[0], ei)
                writer.add_scalar('Precision/valid_consistent_epoch', precision[0], ei)
                writer.add_scalar('Recall/valid_consistent_epoch', recall[0], ei)
                writer.add_scalar('F1/valid_hallucination_epoch', f1[1], ei)
                writer.add_scalar('Precision/valid_hallucination_epoch', precision[1], ei)
                writer.add_scalar('Recall/valid_hallucination_epoch', recall[1], ei)
                writer.add_scalar('Acc/valid_epoch', acc, ei)

                rowdata = [ei, precision[1], recall[1], f1[1], precision[0], recall[0], f1[0], gmean,
                           acc, bss, roc_auc]
                csvwriter.writerow([str(f) for f in rowdata])
                csvfile.flush()  # keep the log usable if the run is interrupted

                f1_sum = f1[0] + f1[1]
                if f1_sum > best_f1_score:
                    best_f1_score = f1_sum
                    torch.save({"model_state_dict": self.state_dict(),
                                "optim_state_dict": optim.state_dict(),
                                "valid_f1": f1,
                                "valid_precision": precision,
                                "valid_recall": recall,
                                "valid_acc": acc,
                                # current epoch, so resuming continues from here
                                "epoch": ei},
                               prefix + "best_model.pt")
        finally:
            # Also reached on KeyboardInterrupt, which the notebook catches.
            csvfile.close()
            writer.close()

    def model_train(self, input_ids, input_mask, segment_ids, predict_mask):
        """Forward pass: return the two class logits for every sequence in the batch.

        ``segment_ids`` is accepted for interface symmetry but not passed to
        the encoder.
        """
        if "xlnet" in self.load_model:
            # NOTE(review): assumes the model returns a 2-tuple ending in the
            # hidden states - confirm for the installed transformers version.
            _, hidden_states = self.model(input_ids=input_ids, attention_mask=input_mask)
            hidden_states = [h.transpose(0, 1) for h in hidden_states]
        elif "gpt" in self.load_model:
            # NOTE(review): assumes a 3-tuple ending in the hidden states - confirm.
            _, _, hidden_states = self.model(input_ids=input_ids, attention_mask=input_mask)
        else:
            model_output = self.model(input_ids=input_ids, attention_mask=input_mask)
            hidden_states = model_output.hidden_states

        features = hidden_states[self.layer]
        # Zero out every position outside the marked span, then max-pool over
        # the sequence dimension.
        state = features * predict_mask.unsqueeze(-1)
        maxpool_state = 1.0 * torch.max(state, dim=1)[0]
        maxpool_state = self.dropout(maxpool_state)
        score = self.hidden2label(maxpool_state)

        return score

    def model_eval(self, model_path, data_path):
        """Load the checkpoint at ``model_path`` and evaluate it on ``data_path``.

        Only checkpoint entries whose keys exist in the current model are loaded.

        Returns:
            The tuple returned by ``binary_eval(..., return_f1=True)``, or
            ``None`` when ``model_path`` does not exist.
        """
        dp = DataProcessor()
        test_examples = dp.get_examples(data_path)
        test_dataset = HalluDataset(test_examples, self.tokenizer, self.args.context_len,
                                    self.load_model, self.args.task_mode)
        test_dataloader = data.DataLoader(dataset=test_dataset,
                                          batch_size=self.args.batch_size,
                                          shuffle=False,
                                          num_workers=4,
                                          collate_fn=HalluDataset.pad)

        if not os.path.exists(model_path):
            print("Invaild model path ...")
            return None

        checkpoint = torch.load(model_path, map_location=self.args.device)["model_state_dict"]
        model_dict = self.state_dict()
        checkpoint = {k: v for k, v in checkpoint.items() if k in model_dict}
        model_dict.update(checkpoint)
        self.load_state_dict(model_dict)

        self.eval()
        predy, testy, hallu_sm_score, _ = self._run_batches(test_dataloader)
        print("Test ...")

        return binary_eval(predy, testy, return_f1=True, predscore=hallu_sm_score)

# Experiment configuration, collected in one namespace that is handed to ClfModel.
args = argparse.Namespace(
    load_model="bert-base-multilingual-cased",
    data_path="/content/drive/MyDrive/Wiki-Hades",
    train_epoch=10,
    batch_size=8,
    lr=5e-3,
    dropout=0.2,
    continue_train=False,
    task_mode="online",
    context_len=200,
    bert_layer=-1,
    num_epoch=20,
    params="",
    device="cuda",  # Set to "cpu" if you want to use CPU instead
    inf_model="frozen",
)
#args.inf_data = "/content/drive/MyDrive/Wiki-Hades/validDe.jsonl"

In [None]:
# Build the classifier and switch its language adapter to German before training.
model = ClfModel(args)
model.replace_adapter()
model = model.to(args.device)

learning_rate0 = args.lr
weight_decay_finetune = 1e-5

# Only the classification head is optimized unless args.params contains "all",
# in which case the encoder's parameters are added as well.
if "all" in args.params:
    named_params = list(model.hidden2label.named_parameters()) + \
                   list(model.model.named_parameters())
else:
    named_params = list(model.hidden2label.named_parameters())

# Two parameter groups: weight decay is applied to everything except
# parameters whose name contains one of the no_decay fragments.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# Plain Adam for GPT-style models, BertAdam otherwise.
optim_func = torch.optim.Adam if "gpt" in args.load_model else BertAdam
optimizer = optim_func(optimizer_grouped_parameters, lr=learning_rate0)

# Allow a manual interrupt; the model is still saved afterwards.
try:
    model.model_run(optimizer)
except KeyboardInterrupt:
    print("Stop by Ctrl-C ...")

model.save_model("/content/drive/MyDrive/Modelle/savedModel.pt")

#model.replace_adapter()

#model.save_model("/content/drive/MyDrive/Modelle/savedModelDe.pt")

In [None]:
import pandas as pd

# Define a list of model paths
model_paths = [
    "/content/drive/MyDrive/Modelle/savedModel.pt"
]

# One dict per evaluated checkpoint; turned into a DataFrame at the end.
results = []

for model_path in model_paths:
    # Build a fresh model, switch to the German adapter, then load the saved
    # weights (the adapter must exist before the state dict is applied).
    loaded_model = ClfModel(args)
    loaded_model.replace_adapter()
    loaded_model.load_state_dict(torch.load(model_path)["model_state_dict"])

    # Load the test data
    inf_data_path = "/content/drive/MyDrive/Wiki-Hades/validDe.txt"  # Replace this with the actual path

    loaded_model = loaded_model.to(args.device)
    # model_eval loads the checkpoint at model_path itself before evaluating.
    result = loaded_model.model_eval(model_path, inf_data_path)

    # Add to list; 'Result' holds whatever model_eval returned.
    results.append({'Model_Path': model_path, 'Result': result})

results_df = pd.DataFrame(results)