In [None]:
from transformers import (
    AutoTokenizer,
)
from adapters import (
    AutoAdapterModel,
    AdapterTrainer,
)
from datasets import (
    Dataset,
    DatasetDict
)

import numpy as np
import pandas as pd
import torch
import gc
import os

In [None]:
# --- Folder roots (relative to the notebook's working directory) -------------
DATA_PATH = "../../data/processed"
MODEL_PATH = "../../models"

# --- Experiment grid ----------------------------------------------------------
# Test-set variants to score; "perturbed" swaps the `perturbed_text` column in
# for `text` in the prediction loop.
testset_state = ["unperturbed", "perturbed"]

# Dimension names; each one is used as a sub-folder name for both the
# processed data and the trained models.
dimensions = ["gender", "race", "religion", "nationality", "country", "merged"]

# Base model identifiers handed to `from_pretrained`.
models_name = [
    "roberta-base", "facebook/FairBERTa", "microsoft/deberta-base",
    "bert-base-cased", "distilbert-base-cased",
]

# Task names; each one selects `<task>_test.csv` and the adapter of that name.
tasks = [
    "clef22", "fingerprints", "shadesoftruth", "basil",
    "webis", "pheme", "clickbait", "propaganda",
    "politifact", "buzzfeed", "twittercovidq2",
]

In [None]:
# When True, rows containing missing values are dropped before prediction and
# the outputs are written under "predictions/" instead of "tot_predictions/".
only_ent_preds = False

for model_name in models_name:
    # The tokenizer depends only on the base model, so load it once per model
    # instead of once per (dimension, task, state) combination.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for dimension in dimensions:
        for task in tasks:
            model_folder_path = os.path.join(MODEL_PATH, model_name, dimension, task)

            # os.listdir() returns entries in arbitrary order and may include
            # hidden entries (e.g. ".ipynb_checkpoints"); pick the first
            # non-hidden entry in sorted order so the choice is reproducible.
            run_dirs = sorted(
                entry for entry in os.listdir(model_folder_path)
                if not entry.startswith(".")
            )
            if not run_dirs:
                raise FileNotFoundError(f"No trained adapter found in: {model_folder_path}")

            CONFIG = {
                "task_name": task,
                "model_name": model_name,
                "model_path": os.path.join(model_folder_path, run_dirs[0], task),
                "max_length": 128,
            }

            def tokenize_function(examples):
                """Tokenize a batch of examples, padding/truncating to CONFIG['max_length']."""
                return tokenizer(
                    examples["text"], padding="max_length", truncation=True, max_length=CONFIG["max_length"]
                )

            # The CSV is shared by both test-set states, so read it once per task.
            source_df = pd.read_csv(
                os.path.join(DATA_PATH, dimension, f"{CONFIG['task_name']}_test.csv")
            )
            if only_ent_preds:
                source_df = source_df.dropna()

            # The base model and adapter are also independent of the state, so
            # load them once and reuse the same trainer for every state.
            model = AutoAdapterModel.from_pretrained(CONFIG["model_name"])
            model.load_adapter(CONFIG["model_path"])
            model.set_active_adapters(task)

            trainer = AdapterTrainer(model=model)
            # Only move to the GPU when one exists; an unconditional .cuda()
            # raises on CPU-only machines.
            if torch.cuda.is_available():
                trainer.model = model.cuda()

            for state in testset_state:
                if state == "perturbed":
                    # Score the perturbed sentences: they replace the original
                    # `text` column. Chained (non in-place) so source_df is untouched.
                    test_df = source_df.drop(columns=["text"]).rename(
                        columns={"perturbed_text": "text"}
                    )
                else:
                    test_df = source_df

                test = Dataset.from_pandas(test_df)

                dataset = DatasetDict({"test": test})
                dataset = dataset.class_encode_column("labels")

                tokenized_datasets = dataset.map(tokenize_function, batched=True)
                test_dataset = tokenized_datasets["test"]

                y = trainer.predict(test_dataset)
                preds = y.predictions[0] if isinstance(y.predictions, tuple) else y.predictions

                output_folder = "predictions" if only_ent_preds else "tot_predictions"
                filename = os.path.join(
                    "../../data/interim", output_folder, model_name, dimension, task, f"{state}.npy"
                )

                if preds is not None:
                    # Reduce the per-class scores to the predicted class index.
                    preds = np.argmax(preds, axis=1)
                    os.makedirs(os.path.dirname(filename), exist_ok=True)
                    with open(filename, 'wb') as f:
                        np.save(f, preds)
                else:
                    print(f"No examples for: {filename}")

            # The trainer keeps its own reference to the model, so both names
            # must be dropped before the memory can actually be reclaimed.
            del trainer, model
            gc.collect()
            torch.cuda.empty_cache()