Import libraries

In [2]:
import os
import torch
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from torch.optim import Adam, RMSprop
from transformers  import  get_scheduler
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import KFold
import wandb
from datetime import datetime
import os
import shutil
import random
import numpy as np
from itertools import product


  from .autonotebook import tqdm as notebook_tqdm


utils

In [4]:
# utils.py
def set_seed(seed):
    """
    Seeds every random number generator used here so experiments are reproducible.

    Parameters:
    seed: the integer used to seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device)
    Return: None
    """
    # Exported for completeness. Python reads PYTHONHASHSEED at interpreter start-up, so this
    # mainly affects subprocesses launched afterwards rather than the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # Torch generators: CPU plus all visible GPUs (not only the current one)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: force deterministic kernels and disable the auto-tuner. With benchmark enabled
    # cuDNN may select a different algorithm on each run, which defeats reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def remove_previous_model(folder):
    """Recursively delete every sub-directory found directly inside `folder`; plain files are kept."""
    # Collect the targets first so the listing is complete before anything is removed.
    stale_dirs = []
    for entry in os.listdir(folder):
        candidate = folder + os.sep + entry
        if os.path.isdir(candidate):
            stale_dirs.append(candidate)

    for candidate in stale_dirs:
        # Errors are not ignored: a failed removal raises.
        shutil.rmtree(candidate)
        
        
def product_dict(**kwargs):
    """
    Lazily yield one dict per element of the Cartesian product of the keyword values.

    Each keyword maps to an iterable of candidate values; every yielded dict assigns
    one candidate to each keyword, in the order produced by `itertools.product`.
    """
    names = kwargs.keys()
    for combination in product(*kwargs.values()):
        yield {name: value for name, value in zip(names, combination)}
        
# mydataset.py
class MyDataset(torch.utils.data.Dataset):
    """
    Torch dataset pairing tokenizer encodings with labels.

    Parameters:
    encodings: mapping of field name -> indexable batch of values (one entry per example)
    labels: per-example labels; only used when mode == 'train'
    mode: 'train' keeps the given labels; any other value substitutes a dummy label 0
          for every example (prediction on unlabelled data)
    """

    def __init__(self, encodings, labels, mode='train'):
        self.encodings = encodings
        if mode != "train":
            # len(encodings) would count the fields (dict keys), not the examples,
            # so the example count is taken from the first encoded field instead.
            first_field = next(iter(encodings.values()), None)
            n_examples = 0 if first_field is None else len(first_field)
            self.labels = [0] * n_examples
        else:
            self.labels = labels

    def __getitem__(self, idx):
        """Return the idx-th example as a dict of its encoding fields plus a 'labels' tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)
    
# datareader.py
import pandas as pd
import json

# Category name -> integer label for the binary task. The positive class (1) is
# CRITICAL in the first mapping and CONSPIRACY in the second.
BINARY_MAPPING_CRITICAL_POS = {'CONSPIRACY': 0, 'CRITICAL': 1}
BINARY_MAPPING_CONSPIRACY_POS = {'CRITICAL': 0, 'CONSPIRACY': 1}

# Inverse lookups (integer label -> category name) for the two mappings above.
CATEGORY_MAPPING_CRITICAL_POS_INVERSE = {0: 'CONSPIRACY', 1: 'CRITICAL'}
CATEGORY_MAPPING_CONSPIRACY_POS_INVERSE = {0: 'CRITICAL', 1: 'CONSPIRACY'}

# Spanish / English training JSON files; the paths are relative, so they resolve
# against the working directory the notebook is run from.
TRAIN_DATASET_ES="../dataset_oppositional/training/dataset_oppositional/dataset_es_train.json"
TRAIN_DATASET_EN="../dataset_oppositional/training/dataset_oppositional/dataset_en_train.json"
# Test-set paths, currently disabled.
# NOTE(review): the disabled TEST_DATASET_ES below points at the *English* test file
# (dataset_en_...) - looks like a copy-paste slip; confirm the path before enabling it.
#TEST_DATASET_EN ="./dataset_oppositional/test/dataset_oppositional_test_nolabels/dataset_en_official_test_nolabels.json"
#TEST_DATASET_ES ="./dataset_oppositional/test/dataset_oppositional_test_nolabels/dataset_en_official_test_nolabels.json"


class PAN24Reader:
    """Reader that turns a PAN24 oppositional-thinking JSON file into a classification DataFrame."""

    def __init__(self):
        pass

    def read_json_file(self, path):
        """Announce the load on stdout, then parse the UTF-8 JSON file at `path` and return its content."""
        print(f'Loading official JSON {path} dataset')
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    def load_dataset_classification(self, path, string_labels=False, positive_class='conspiracy'):
        """
        Load the documents at `path` into a DataFrame with columns "text", "id" and "label".

        Parameters:
        path: location of the JSON file; every document needs 'text', 'category' and 'id'
        string_labels: when True the label column holds the raw category strings
        positive_class: 'conspiracy' or 'critical'; selects which category is encoded as 1
                        (only consulted when string_labels is False)
        Raises: ValueError for any other positive_class when integer labels are requested
        """
        records = self.read_json_file(path)

        text_column = pd.Series([record['text'] for record in records])

        if string_labels:
            label_values = [record['category'] for record in records]
        elif positive_class == 'conspiracy':
            label_values = [BINARY_MAPPING_CONSPIRACY_POS[record['category']] for record in records]
        elif positive_class == 'critical':
            label_values = [BINARY_MAPPING_CRITICAL_POS[record['category']] for record in records]
        else:
            raise ValueError(f'Unknown positive class: {positive_class}')
        label_column = pd.Series(label_values)

        id_column = pd.Series([record['id'] for record in records])

        return pd.DataFrame({"text": text_column, "id": id_column, "label": label_column})


# Load both training sets once at module level; the cells below read es_train_df / en_train_df.
# Labels are integer-encoded with CONSPIRACY as the positive class (1).
myReader=PAN24Reader()
es_train_df = myReader.load_dataset_classification(TRAIN_DATASET_ES, string_labels=False, positive_class='conspiracy')
en_train_df = myReader.load_dataset_classification(TRAIN_DATASET_EN, string_labels=False, positive_class='conspiracy')



Loading official JSON ../dataset_oppositional/training/dataset_oppositional/dataset_es_train.json dataset
Loading official JSON ../dataset_oppositional/training/dataset_oppositional/dataset_en_train.json dataset


fine_tunning.py

In [5]:
def training(_wandb, _model, _train_data, _val_data, _learning_rate, _optimizer_name, _schedule, _epochs,
             _tokenizer, _batch_size=32, _padding="max_length", _max_length=512, _truncation=True,
             _patience=10, _measure="accuracy", _out=None):
    """
    Fine-tune `_model` on `_train_data`, keep the checkpoint with the best validation score
    and return the model reloaded from that checkpoint.

    Parameters:
    _wandb: object exposing `log(dict)` (e.g. a wandb run) used for all logging
    _model: sequence-classification model whose forward pass returns `.loss` and `.logits`
    _train_data, _val_data: DataFrames with a "text" and a "label" column
    _learning_rate: learning rate handed to the optimizer
    _optimizer_name: "adam" or "rmsprop"
    _schedule: scheduler name understood by `transformers.get_scheduler`
               (e.g. "linear", "cosine"), or None to keep the learning rate constant
    _epochs: maximum number of epochs
    _tokenizer: tokenizer callable returning PyTorch tensors
    _batch_size, _padding, _max_length, _truncation: batching / tokenization settings
    _patience: number of consecutive epochs without improvement before stopping early
    _measure: metric name; the metric is loaded as `Yeshwant123/<_measure>` and its
              compute() result is expected to contain the key `_measure`
    _out: output directory (required); checkpoints go to `<_out>/models`, statistics
          and the curves figure go directly into `_out`
    Return: the model loaded from the best checkpoint, or the trained model itself when
            no checkpoint was written
    Raises: ValueError when `_out` is None or `_optimizer_name` is unknown
    """
    if _out is None:
        raise ValueError("_out must be a directory path: checkpoints and statistics are written there")

    train_encodings = _tokenizer(_train_data["text"].tolist(), max_length=_max_length, truncation=_truncation, padding=_padding, return_tensors="pt")
    val_encodings = _tokenizer(_val_data["text"].tolist(), max_length=_max_length, truncation=_truncation, padding=_padding, return_tensors="pt")

    train = MyDataset(train_encodings, _train_data["label"].tolist())
    val = MyDataset(val_encodings, _val_data["label"].tolist())

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=_batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=_batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    _wandb.log({"divice": str(device)})
    # Always bind `model`, so the function also works on CPU-only machines.
    model = _model.to(device)

    best_measure, best_model_name, patience = None, None, 0
    training_stats = []
    # Load each metric once; compute() resets the accumulated batches, so they are reusable per epoch.
    train_eval = evaluate.load(f"Yeshwant123/{_measure}")
    eval_metric = evaluate.load(f"Yeshwant123/{_measure}")

    # Here we can specify different methods to optimize the parameters; initially Adam and RMSprop.
    _wandb.log({"info": "Creating the Optimizer and Schedule "})

    if _optimizer_name == "adam":
        optimizer = Adam(model.parameters(), lr=_learning_rate)
    elif _optimizer_name == "rmsprop":
        optimizer = RMSprop(model.parameters(), lr=_learning_rate)
    else:
        raise ValueError(f"Unknown optimizer: {_optimizer_name}")

    # The scheduler is stepped once per batch, so the horizon is the number of batches
    # (not the number of rows) times the number of epochs.
    num_training_steps = _epochs * len(train_dataloader)
    lr_scheduler = None
    if _schedule is not None:
        lr_scheduler = get_scheduler(_schedule, optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    models_dir = os.path.join(_out, 'models')
    try:
        os.makedirs(models_dir, exist_ok=True)
    except OSError as error:
        _wandb.log({"info": "Directory '%s' can not be created" % models_dir})
        raise

    for epoch in range(_epochs):
        if patience >= _patience:
            break

        # ---- training pass ----
        # Re-enable dropout etc.: the validation pass of the previous epoch switched to eval mode.
        model.train()
        total_loss_train = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            if lr_scheduler is not None:
                lr_scheduler.step()
            optimizer.zero_grad()

            total_loss_train += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            train_eval.add_batch(predictions=predictions, references=batch["labels"])
        total_acc_train = train_eval.compute()

        # ---- validation pass ----
        total_loss_val = 0
        model.eval()
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            total_loss_val += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            eval_metric.add_batch(predictions=predictions, references=batch["labels"])
        total_acc_val = eval_metric.compute()

        current_measure = total_acc_val[_measure]
        if best_measure is None or best_measure < current_measure:
            if best_measure is None:
                _wandb.log({"info": "It's the first time (epoch) ******************"})
            else:
                _wandb.log({"info": "In this epoch an improvement was achieved. (epoch) ******************"})

            best_measure = current_measure
            # Keep a single checkpoint on disk: drop the previous best before saving the new one.
            remove_previous_model(models_dir)
            best_model_name = os.path.join(models_dir, 'bestmodel_epoch_{}'.format(epoch + 1))
            _wandb.log({"info": f"The current best model is {best_model_name} {best_measure}"})

            os.makedirs(best_model_name, exist_ok=True)
            model.save_pretrained(best_model_name)
            patience = 0
        else:
            patience += 1

        avg_train_loss = total_loss_train / len(train_dataloader)
        avg_val_loss = total_loss_val / len(val_dataloader)
        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                f'Valid.{_measure}': total_acc_val[_measure],
                f'Training.{_measure}': total_acc_train[_measure]
            }
        )

        _wandb.log({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            f'train_{_measure}': total_acc_train[_measure],
            'val_loss': avg_val_loss,
            f'val_{_measure}': total_acc_val[_measure]
        })

    if best_model_name is not None:
        # from_pretrained is a classmethod: this builds a fresh model from the best checkpoint.
        model = type(model).from_pretrained(best_model_name)
        _wandb.log({"info": "The final model used to predict the labels of the testing datasets is " + best_model_name})

    # With zero completed epochs there is nothing to tabulate or plot.
    if training_stats:
        df_stats = pd.DataFrame(data=training_stats)
        df_stats = df_stats.set_index('epoch')
        df_stats.to_csv(os.path.join(_out, "training_stats.csv"))

        _wandb.log({"info": df_stats})
        myplot = sns.lineplot(data=df_stats, palette="tab10", linewidth=2.5)
        fig = myplot.get_figure()
        fig.savefig(os.path.join(_out, 'loss-figue.png'))
        # Close this figure explicitly so repeated runs do not accumulate open figures.
        plt.close(fig)
    return model

############################################################################################################################################################################
# VALIDATION ON THE TEST SET

def validate(_wandb, _model, _test_data, _tokenizer, _batch_size=32, _padding="max_length", _max_length=512, _truncation=True, _measure="accuracy", evaltype=True):
    """
    Predict the class of every row of `_test_data` and, when labels are available,
    log the evaluation metrics.

    Parameters:
    _wandb: object exposing `log(dict)` (e.g. a wandb run)
    _model: sequence-classification model whose forward pass returns `.loss` and `.logits`
    _test_data: DataFrame with a "text" column, plus a "label" column when evaltype is True
    _tokenizer: tokenizer callable returning PyTorch tensors
    _batch_size, _padding, _max_length, _truncation: batching / tokenization settings
    _measure: metric name; the metric is loaded as `Yeshwant123/<_measure>` and its
              compute() result is expected to contain the key `_measure`
    evaltype: True -> labelled data, metrics are computed and logged;
              False -> unlabelled data, only predictions are produced
    Return: 1-D tensor with the predicted class index of every example (on the device
            the model ran on), or None when `_test_data` is empty
    """
    test_encodings = _tokenizer(_test_data['text'].tolist(), max_length=_max_length, truncation=_truncation, padding=_padding, return_tensors="pt")
    _mode = "train" if evaltype else "test"
    # Unlabelled data has no "label" column; MyDataset ignores labels outside "train" mode.
    labels = _test_data["label"].tolist() if evaltype else None
    test = MyDataset(test_encodings, labels, mode=_mode)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=_batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Always bind `model`, so the function also works on CPU-only machines.
    model = _model.to(device)

    eval_metric = evaluate.load(f"Yeshwant123/{_measure}") if evaltype else None

    model.eval()
    batch_predictions = []
    total_loss, n_correct, n_seen = 0.0, 0, 0
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The forward pass must run for every batch, so it lives inside the loop.
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        batch_predictions.append(predictions)

        if evaltype:
            total_loss += outputs.loss.item()
            n_correct += (predictions == batch["labels"]).sum().item()
            n_seen += predictions.numel()
            eval_metric.add_batch(predictions=predictions, references=batch["labels"])

    out = torch.cat(batch_predictions, 0) if batch_predictions else None

    if evaltype and n_seen > 0:
        total_acc_test = eval_metric.compute()
        # Accuracy is computed directly from the predictions: the result of the loaded
        # metric is only guaranteed to carry the `_measure` key, not an "accuracy" key.
        _wandb.log({
            f'test_{_measure}': total_acc_test[_measure],
            'test_accuracy': n_correct / n_seen,
            'test_avg_loss': total_loss / len(test_dataloader)
        })
    return out


main.py

In [6]:
# Timestamp (date + hours, minutes, seconds) used to give each sweep its own wandb project
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")


wandb.login()

SEED=1234
set_seed(SEED)

# Language / pretrained-model combinations to evaluate
preconfig = {
    0: {
        "lang": "english",
        "model_name": "roberta-base",
    },
    1: {
        "lang": "english",
        "model_name": "microsoft/deberta-base",
    },
    # 2: {
    #     "lang": "spanish",
    #     "model_name": "dccuchile/bert-base-spanish-wwm-uncased",
    # },
    # 3: {
    #     "lang": "spanish",
    #     "model_name": "PlanTL-GOB-ES/roberta-base-bne",
    # },
    # 4: {
    #     "lang": "spanish",
    #     "model_name": "bert-base-multilingual-uncased"
    # }
}

# Hyperparameter grid; every combination is evaluated with cross-validation
hyperparams = {
    "optimizer_name": ["adam", "rmsprop"], # ["adam", "rmsprop", "sgd"]
    "learning": [0.5e-5, 1e-6], # [0.5e-5, 1e-5, 0.5e-6, 1e-6]
    "schedule": ["linear", "cosine"], # ["linear", "cosine", "constant"]
    "patience": [5, 10], # [3, 5, 10]
    "epochs": [5, 20], # [5, 10, 20]
    "measure": ["mcc"],
    "batch_size": [32], # [16, 32, 64, 128]
    "max_length": [128] # this value can be estimated on the training set
}

# KFold cross-validation. Shuffling with a fixed seed keeps the folds reproducible
# while guarding against any ordering present in the dataset file.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# For each preconfiguration. The loop variable deliberately has its own name:
# rebinding `preconfig` here would leave it pointing at a single entry and break
# a re-run of this cell.
for i, model_config in preconfig.items():
    lang = model_config["lang"]
    model_name = model_config["model_name"]

    if lang == "spanish":
        X= es_train_df
    elif lang == "english":
        X= en_train_df
    else:
        raise ValueError(f"Unknown language: {lang}")

    print("Loading Tokenizer " + model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Initialize a counter for the runs
    run_counter = 0

    # For each hyperparameter configuration
    for config in product_dict(**hyperparams):
        run_counter += 1

        # For each fold
        for fold, (train_index, val_index) in enumerate(kf.split(X)):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]

            # Every fold starts from the pretrained weights. Reusing one model object would
            # keep fine-tuning it across folds and configurations, so each validation fold
            # would already have been seen during training. Re-seeding makes the freshly
            # initialised classification head identical for every run.
            set_seed(SEED)
            print("Loading Transformer Model " + model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

            # One wandb run per fold, grouped by language / model
            with wandb.init(project=f'lnr_oppositional_thinking_{formatted_datetime}',
                            entity='davidandreuroqueta',
                            group=f'{lang}_{model_name}',
                            job_type=f'hyperparam-tuning-{run_counter}',
                            name=f'{lang}_{model_name}_{run_counter}_fold_{fold}'
                            ) as fold_run:
                fold_run.config.update(model_config)
                fold_run.config.update(config)
                fold_run.config.update({"SEED":SEED})

                # Log the fold number
                fold_run.config.update({"fold": fold + 1})

                # FINE-TUNING the model and obtaining the best model across all epochs
                fineTmodel = training(_wandb=fold_run, _model=model, _train_data=X_train, _val_data=X_val,
                                    _learning_rate=config["learning"], _optimizer_name=config["optimizer_name"],
                                    _schedule=config["schedule"], _epochs=config["epochs"], _tokenizer=tokenizer,
                                    _batch_size=config["batch_size"], _padding="max_length", _max_length=config["max_length"],
                                    _truncation=True, _patience=config["patience"], _measure=config["measure"], _out="./out")

                # VALIDATING OR PREDICTING on the test partition, this time I'm using the validation set, but you have to use the test set.
                preds = validate(_wandb=fold_run, _model=fineTmodel, _test_data=X_val, _tokenizer=tokenizer,
                                _batch_size=config["batch_size"], _padding="max_length", _max_length=config["max_length"],
                                _truncation=True, _measure=config["measure"], evaltype=True)

            # Release this fold's models before the next one is loaded
            del model, fineTmodel
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdavidandreuroqueta[0m ([33mdavidandreu-org[0m). Use [1m`wandb login --relogin`[0m to force relogin


Loading Tokenizer roberta-base
Loading Transformer Model roberta-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mdavidandreuroqueta[0m. Use [1m`wandb login --relogin`[0m to force relogin
