# Initialization

## Import libraries and the datasets

In [None]:
pip install transformers datasets torch

In [None]:
## Import required libraries

import torch
import random
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import set_seed
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, precision_recall_fscore_support

In [None]:
## Define function to set the seed

def set_seed(seed):
    """
    Seeds the Python, NumPy and PyTorch random number generators and puts cuDNN
    into deterministic mode.

    NOTE: this definition shadows the `set_seed` imported from `transformers`
    at the top of the notebook; every later `set_seed(...)` call uses this one.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Deterministic kernels alone are not enough: with benchmark mode on, cuDNN
    # may auto-tune and pick a different algorithm from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

#set_seed(42)

In [None]:
## Import the data and preprocess them

# Text columns are cast to str, annotation columns to int
_TEXT_COLUMNS = ['hate_speech', 'counter_speech']
_SCORE_COLUMNS = ['clarity', 'evidence', 'rebuttal', 'fairness']


def _clean_annotations(df):
    """
    Returns a copy of `df` without rows that miss any required column, with the
    text columns cast to str and the annotation columns cast to int.
    """
    cleaned = df.dropna(subset=_TEXT_COLUMNS + _SCORE_COLUMNS)
    dtypes = {col: str for col in _TEXT_COLUMNS}
    dtypes.update({col: int for col in _SCORE_COLUMNS})
    # A single astype call returns a new frame, so nothing is assigned column by
    # column into the result of dropna
    return cleaned.astype(dtypes)


conan = pd.read_csv('conan.csv')
twitter = pd.read_csv('twitter.csv')

## Create the combined dataset (built from the raw frames, then cleaned)
data = _clean_annotations(pd.concat([conan, twitter], ignore_index=True))

## Preprocess data for the two separate datasets
conan = _clean_annotations(conan)
twitter = _clean_annotations(twitter)

# Models with only CS embeddings (Bert_CS)

## Binary variables

In [None]:
## Define the required functions for preprocessing, splitting data, computing metrics, training and testing

def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encodes the counter-speech texts and builds the label tensor for `target_col`.

    Returns a tuple (tokenizer output, label tensor).
    """
    # Only the counter_speech column is fed to the tokenizer
    counter_speech_texts = data['counter_speech'].tolist()
    tokenizer_options = dict(
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    encoded = tokenizer(counter_speech_texts, **tokenizer_options)

    targets = torch.tensor(data[target_col].values)
    return encoded, targets

def prepare_datasets(df, target_col, tokenizer, max_length=128, test_size=0.2, val_size=0.1, random_state=3, is_binary=False):
    """
    Splits `df` into train, validation and test sets and turns each split into a
    tokenized Hugging Face Dataset in PyTorch format.

    Args:
        df: DataFrame holding a 'counter_speech' column and the target column.
        target_col: Name of the label column; it is renamed to "labels".
        tokenizer: Callable tokenizer applied to the counter_speech texts.
        max_length: Length every sequence is truncated/padded to.
        test_size: Fraction of `df` held out for testing.
        val_size: Fraction of `df` (not of the remainder) held out for validation.
        random_state: Seed used for both splits.
        is_binary: If True, labels become float tensors of shape (1,);
            otherwise they become integer scalar tensors.

    Returns:
        Tuple (train_dataset, val_dataset, test_dataset).

    Raises:
        ValueError: If 'counter_speech' or `target_col` is missing from `df`.
    """
    # Report missing columns in a stable order (a set difference has no fixed order)
    required_columns = ['counter_speech', target_col]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(map(str, missing_columns))}")

    # Split data into train+val and test sets
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    # val_size is rescaled because it is now taken from the remaining (1 - test_size) share
    train_df, val_df = train_test_split(train_val_df, test_size=val_size / (1 - test_size), random_state=random_state)

    # Tokenize inputs using only counter_speech
    def tokenize_function(examples):
        return tokenizer(
            examples['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    def format_label(example):
        if is_binary:
            return {"labels": torch.tensor([float(example["labels"])])}
        return {"labels": torch.tensor(int(example["labels"]))}

    # One pipeline shared by all three splits so they cannot drift apart
    def build_split(split_df):
        dataset = Dataset.from_pandas(split_df)
        dataset = dataset.map(tokenize_function, batched=True, load_from_cache_file=False)
        dataset = dataset.rename_column(target_col, "labels")
        dataset = dataset.map(format_label)
        dataset = dataset.remove_columns(['counter_speech'])
        dataset.set_format("torch")
        return dataset

    return build_split(train_df), build_split(val_df), build_split(test_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when `is_binary` is True.
        is_binary: If True, the model gets a single output logit.

    Returns:
        A BertForSequenceClassification loaded from "bert-base-uncased".
    """
    if is_binary:
        # Without an explicit problem_type, transformers treats num_labels == 1 as
        # regression and trains with MSELoss. The metrics threshold sigmoid(logit)
        # at 0.5, so the model must be trained with BCEWithLogitsLoss instead,
        # which is what "multi_label_classification" selects.
        return BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=1,
            problem_type="multi_label_classification",
        )
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1, precision and recall plus a per-class report.

    Args:
        eval_pred: Pair (logits, labels). Logits are 2-D; a single column is
            read as one sigmoid output, several columns as class scores.

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall" and "clf_report".
    """
    logits, labels = eval_pred
    # Binary labels arrive as a column vector; sklearn expects 1-D targets
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class: softmax is monotonic, so argmax of the logits is enough
        preds = torch.argmax(logits, dim=1)
    preds = preds.reshape(-1).numpy()

    # zero_division=0 keeps the same scores as the default but without a warning
    # every time a class is never predicted
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        'clf_report': clf_report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes `model` for the dimension `dim`, stores model, tokenizer and the
    validation metrics on disk, and returns the trained model.

    Evaluation and checkpointing both happen once per epoch, and the Trainer is
    asked to reload the checkpoint with the best validation accuracy at the end.
    `num_labels` is accepted but not used here.
    """
    set_seed(seed)

    gpu_available = torch.cuda.is_available()
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=gpu_available,  # mixed precision only when a GPU is present
    )

    # Put the model on the GPU when there is one, otherwise keep it on the CPU
    model.to(torch.device("cuda" if gpu_available else "cpu"))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Model and tokenizer share one directory per dimension
    save_dir = f"./models/{dim}"
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Validation metrics, one "key: value" line each
    results = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model

# Dimensions with binary (0/1) labels and their number of classes
# NOTE(review): `data` is only NaN-filtered on clarity/evidence/rebuttal/fairness;
# confirm these two columns contain no missing values.
effectiveness_dimensions = {
    "emotional_appeal": 2,
    "audience_adaptation": 2,
}

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Iterate through dimensions and train models
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Two classes are modelled with a single sigmoid output, anything else as multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(data, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)

    # Seed before building the model: the classification head is freshly
    # initialised, and seeding only inside train_model would come too late
    set_seed(3)
    model = initialize_model(num_labels, is_binary=is_binary)

    # Train the model
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Evaluate on the test set
    print("\nEvaluating on the test set...")
    test_args = TrainingArguments(
        output_dir=f"./results/{dim}_test",
        per_device_eval_batch_size=16,
        do_eval=True,
    )

    test_trainer = Trainer(
        model=trained_model,
        args=test_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Save test set results
    with open(f"./results/{dim}_bert_cs_short.txt", "w") as f:
        for key, value in test_results.items():
            f.write(f"{key}: {value}\n")


## Multi-class variables

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Tokenizes the counter-speech column and pairs it with the labels of `target_col`.

    Returns (tokenizer output, label tensor).
    """
    encoding_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    # Classification uses the counter_speech text only
    batch_encoding = tokenizer(data['counter_speech'].tolist(), **encoding_options)
    label_tensor = torch.tensor(data[target_col].values)
    return batch_encoding, label_tensor

def prepare_datasets(df, target_col, tokenizer, max_length=128, test_size=0.2, val_size=0.1, random_state=3, is_binary=False):
    """
    Splits `df` into train, validation and test sets and turns each split into a
    tokenized Hugging Face Dataset in PyTorch format.

    Args:
        df: DataFrame holding a 'counter_speech' column and the target column.
        target_col: Name of the label column; it is renamed to "labels".
        tokenizer: Callable tokenizer applied to the counter_speech texts.
        max_length: Length every sequence is truncated/padded to.
        test_size: Fraction of `df` held out for testing.
        val_size: Fraction of `df` (not of the remainder) held out for validation.
        random_state: Seed used for both splits.
        is_binary: If True, labels become float tensors of shape (1,);
            otherwise each label is shifted down by one and stored as an
            integer scalar tensor.

    Returns:
        Tuple (train_dataset, val_dataset, test_dataset).

    Raises:
        ValueError: If 'counter_speech' or `target_col` is missing from `df`.
    """
    # Report missing columns in a stable order (a set difference has no fixed order)
    required_columns = ['counter_speech', target_col]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(map(str, missing_columns))}")

    # Split data into train+val and test sets
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    # val_size is rescaled because it is now taken from the remaining (1 - test_size) share
    train_df, val_df = train_test_split(train_val_df, test_size=val_size / (1 - test_size), random_state=random_state)

    # Tokenize inputs: only counter_speech
    def tokenize_function(examples):
        return tokenizer(
            examples['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    def format_label(example):
        if is_binary:
            return {"labels": torch.tensor([float(example["labels"])])}
        # Subtract 1 so that labels start at 0, as class indices must
        return {"labels": torch.tensor(int(example["labels"]) - 1)}

    # One pipeline shared by all three splits so they cannot drift apart
    def build_split(split_df):
        dataset = Dataset.from_pandas(split_df)
        dataset = dataset.map(tokenize_function, batched=True, load_from_cache_file=False)
        dataset = dataset.rename_column(target_col, "labels")
        dataset = dataset.map(format_label)
        dataset = dataset.remove_columns(['counter_speech'])
        dataset.set_format("torch")
        return dataset

    return build_split(train_df), build_split(val_df), build_split(test_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when `is_binary` is True.
        is_binary: If True, the model gets a single output logit.

    Returns:
        A BertForSequenceClassification loaded from "bert-base-uncased".
    """
    if not is_binary:
        return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    # A single logit with no problem_type is handled by transformers as regression
    # (MSELoss). Selecting "multi_label_classification" switches the loss to
    # BCEWithLogitsLoss, matching the sigmoid threshold used by the metrics.
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=1,
        problem_type="multi_label_classification",
    )

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1, precision and recall plus a per-class report.

    Args:
        eval_pred: Pair (logits, labels). Logits are 2-D; a single column is
            read as one sigmoid output, several columns as class scores.

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall" and "clf_report".
    """
    logits, labels = eval_pred
    # Flatten so that column-vector labels are accepted by sklearn as 1-D targets
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class: the largest logit is also the largest softmax probability
        preds = torch.argmax(logits, dim=1)
    preds = preds.reshape(-1).numpy()

    # zero_division=0 yields the same scores as the default, minus the warning
    # raised whenever a class is never predicted
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "clf_report": clf_report
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes `model` for the dimension `dim`, writes model, tokenizer and the
    validation metrics to disk, and returns the trained model.

    Evaluation and checkpointing run once per epoch; the Trainer is asked to
    reload the checkpoint with the best validation accuracy when training ends.
    `num_labels` is accepted but not used here.
    """
    set_seed(seed)

    # Mixed precision (fp16) is not enabled in this variant
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Model and tokenizer go into the same per-dimension directory
    export_dir = f"./models/{dim}"
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    # Validation metrics, one "key: value" line each
    results = trainer.evaluate()
    with open(f"./results/{dim}_results_cs.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model

# Dimensions for multi-class classification: three classes, annotated as 1-3
effectiveness_dimensions = {
    "clarity": 3,
    "evidence": 3,
    'rebuttal': 3,
    'fairness': 3,
}

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Iterate through dimensions and train models
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Two classes are modelled with a single sigmoid output, anything else as multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(data, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)

    # Seed before building the model: the classification head is freshly
    # initialised, and seeding only inside train_model would come too late
    set_seed(3)
    model = initialize_model(num_labels, is_binary=is_binary)

    # Train the model
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Evaluate on the test set
    print("\nEvaluating on the test set...")
    test_results = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    ).evaluate(test_dataset)
    print(test_results)

    # Save test set results
    with open(f"./results/{dim}_bert_cs_short.txt", "w") as f:
        for key, value in test_results.items():
            f.write(f"{key}: {value}\n")


# Models with CS + HS embeddings (Bert_CS_HS)

## Binary variables

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encodes (hate speech, counter speech) text pairs and builds the label tensor
    for `target_col`.

    Returns (tokenizer output, label tensor).
    """
    # The two columns are handed to the tokenizer as first and second sequence
    hate_texts = data['hate_speech'].tolist()
    counter_texts = data['counter_speech'].tolist()
    encoded_pairs = tokenizer(
        hate_texts,
        counter_texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    targets = torch.tensor(data[target_col].values)
    return encoded_pairs, targets

def prepare_datasets(df, target_col, tokenizer, max_length=128, test_size=0.2, val_size=0.1, random_state=3, is_binary=False):
    """
    Splits `df` into train, validation and test sets and turns each split into a
    tokenized Hugging Face Dataset in PyTorch format. Each example is encoded
    as the pair (hate_speech, counter_speech).

    Args:
        df: DataFrame with 'hate_speech', 'counter_speech' and the target column.
        target_col: Name of the label column; it is renamed to "labels".
        tokenizer: Callable tokenizer applied to the text pairs.
        max_length: Length every sequence is truncated/padded to.
        test_size: Fraction of `df` held out for testing.
        val_size: Fraction of `df` (not of the remainder) held out for validation.
        random_state: Seed used for both splits.
        is_binary: If True, labels become float tensors of shape (1,);
            otherwise they become integer scalar tensors.

    Returns:
        Tuple (train_dataset, val_dataset, test_dataset).

    Raises:
        ValueError: If a text column or `target_col` is missing from `df`.
    """
    # Report missing columns in a stable order (a set difference has no fixed order)
    required_columns = ['hate_speech', 'counter_speech', target_col]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(map(str, missing_columns))}")

    # Split data into train+val and test sets
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    # val_size is rescaled because it is now taken from the remaining (1 - test_size) share
    train_df, val_df = train_test_split(train_val_df, test_size=val_size / (1 - test_size), random_state=random_state)

    # Tokenize inputs as (hate_speech, counter_speech) pairs
    def tokenize_function(examples):
        return tokenizer(
            examples['hate_speech'],
            examples['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    def format_label(example):
        if is_binary:
            return {"labels": torch.tensor([float(example["labels"])])}
        return {"labels": torch.tensor(int(example["labels"]))}

    # One pipeline shared by all three splits so they cannot drift apart
    def build_split(split_df):
        dataset = Dataset.from_pandas(split_df)
        dataset = dataset.map(tokenize_function, batched=True, load_from_cache_file=False)
        dataset = dataset.rename_column(target_col, "labels")
        dataset = dataset.map(format_label)
        dataset = dataset.remove_columns(['hate_speech', 'counter_speech'])
        dataset.set_format("torch")
        return dataset

    return build_split(train_df), build_split(val_df), build_split(test_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when `is_binary` is True.
        is_binary: If True, the model gets a single output logit.

    Returns:
        A BertForSequenceClassification loaded from "bert-base-uncased".
    """
    model_kwargs = {"num_labels": num_labels}
    if is_binary:
        # One logit alone would make transformers fall back to regression (MSELoss).
        # The metrics apply a sigmoid threshold, so request the matching
        # BCEWithLogitsLoss through the problem type.
        model_kwargs = {"num_labels": 1, "problem_type": "multi_label_classification"}
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", **model_kwargs)

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1, precision and recall plus a per-class report.

    Args:
        eval_pred: Pair (logits, labels). Logits are 2-D; a single column is
            read as one sigmoid output, several columns as class scores.

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall" and "clf_report".
    """
    logits, labels = eval_pred
    # Binary labels arrive as a column vector; sklearn expects 1-D targets
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class: softmax is monotonic, so argmax of the logits is enough
        preds = torch.argmax(logits, dim=1)
    preds = preds.reshape(-1).numpy()

    # zero_division=0 keeps the same scores as the default but without a warning
    # every time a class is never predicted
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        'clf_report': clf_report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes `model` for the dimension `dim`, stores model, tokenizer and the
    validation metrics on disk, and returns the trained model.

    Evaluation and checkpointing both happen once per epoch, and the Trainer is
    asked to reload the checkpoint with the best validation accuracy at the end.
    `num_labels` is accepted but not used here.
    """
    set_seed(seed)

    has_cuda = torch.cuda.is_available()
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=has_cuda,  # mixed precision only when a GPU is present
    )

    # Put the model on the GPU when there is one, otherwise keep it on the CPU
    target_device = torch.device("cuda") if has_cuda else torch.device("cpu")
    model.to(target_device)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Model and tokenizer share one directory per dimension
    for artifact in (model, tokenizer):
        artifact.save_pretrained(f"./models/{dim}")

    # Validation metrics, one "key: value" line each
    results = trainer.evaluate()
    report_lines = [f"{key}: {value}\n" for key, value in results.items()]
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(report_lines)

    return trainer.model

# Dimensions with binary (0/1) labels and their number of classes
# NOTE(review): `data` is only NaN-filtered on clarity/evidence/rebuttal/fairness;
# confirm these two columns contain no missing values.
effectiveness_dimensions = {
    "emotional_appeal": 2,
    "audience_adaptation": 2,
}

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Iterate through dimensions and train models
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Two classes are modelled with a single sigmoid output, anything else as multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(data, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)

    # Seed before building the model: the classification head is freshly
    # initialised, and seeding only inside train_model would come too late
    set_seed(3)
    model = initialize_model(num_labels, is_binary=is_binary)

    # Train the model
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Evaluate on the test set
    print("\nEvaluating on the test set...")
    test_results = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    ).evaluate(test_dataset)
    print(test_results)

    # Save test set results
    with open(f"./results/{dim}_short.txt", "w") as f:
        for key, value in test_results.items():
            f.write(f"{key}: {value}\n")

## Multi-class variables

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Tokenizes hate-speech / counter-speech pairs and pairs them with the labels
    of `target_col`.

    Returns (tokenizer output, label tensor).
    """
    text_pair = (data['hate_speech'].tolist(), data['counter_speech'].tolist())
    encoding_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    # First sequence: hate speech, second sequence: counter speech
    batch_encoding = tokenizer(*text_pair, **encoding_options)
    return batch_encoding, torch.tensor(data[target_col].values)

def prepare_datasets(df, target_col, tokenizer, max_length=128, test_size=0.2, val_size=0.1, random_state=3, is_binary=False):
    """
    Splits `df` into train, validation and test sets and turns each split into a
    tokenized Hugging Face Dataset in PyTorch format. Each example is encoded
    as the pair (hate_speech, counter_speech).

    Args:
        df: DataFrame with 'hate_speech', 'counter_speech' and the target column.
        target_col: Name of the label column; it is renamed to "labels".
        tokenizer: Callable tokenizer applied to the text pairs.
        max_length: Length every sequence is truncated/padded to.
        test_size: Fraction of `df` held out for testing.
        val_size: Fraction of `df` (not of the remainder) held out for validation.
        random_state: Seed used for both splits.
        is_binary: If True, labels become float tensors of shape (1,);
            otherwise each label is shifted down by one and stored as an
            integer scalar tensor.

    Returns:
        Tuple (train_dataset, val_dataset, test_dataset).

    Raises:
        ValueError: If a text column or `target_col` is missing from `df`.
    """
    # Report missing columns in a stable order (a set difference has no fixed order)
    required_columns = ['hate_speech', 'counter_speech', target_col]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(map(str, missing_columns))}")

    # Split data into train+val and test sets
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    # val_size is rescaled because it is now taken from the remaining (1 - test_size) share
    train_df, val_df = train_test_split(train_val_df, test_size=val_size / (1 - test_size), random_state=random_state)

    # Tokenize inputs as (hate_speech, counter_speech) pairs
    def tokenize_function(examples):
        return tokenizer(
            examples['hate_speech'],
            examples['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    def format_label(example):
        if is_binary:
            return {"labels": torch.tensor([float(example["labels"])])}
        # Subtract 1 so that labels start at 0, as class indices must
        return {"labels": torch.tensor(int(example["labels"]) - 1)}

    # One pipeline shared by all three splits so they cannot drift apart
    def build_split(split_df):
        dataset = Dataset.from_pandas(split_df)
        dataset = dataset.map(tokenize_function, batched=True, load_from_cache_file=False)
        dataset = dataset.rename_column(target_col, "labels")
        dataset = dataset.map(format_label)
        dataset = dataset.remove_columns(['hate_speech', 'counter_speech'])
        dataset.set_format("torch")
        return dataset

    return build_split(train_df), build_split(val_df), build_split(test_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when `is_binary` is True.
        is_binary: If True, the model gets a single output logit.

    Returns:
        A BertForSequenceClassification loaded from "bert-base-uncased".
    """
    if is_binary:
        # Without an explicit problem_type, transformers treats num_labels == 1 as
        # regression and trains with MSELoss. The metrics threshold sigmoid(logit)
        # at 0.5, so the model must be trained with BCEWithLogitsLoss instead,
        # which is what "multi_label_classification" selects.
        return BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=1,
            problem_type="multi_label_classification",
        )
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1, precision and recall plus a per-class report.

    Args:
        eval_pred: Pair (logits, labels). Logits are 2-D; a single column is
            read as one sigmoid output, several columns as class scores.

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall" and "clf_report".
    """
    logits, labels = eval_pred
    # Flatten so that column-vector labels are accepted by sklearn as 1-D targets
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class: the largest logit is also the largest softmax probability
        preds = torch.argmax(logits, dim=1)
    preds = preds.reshape(-1).numpy()

    # zero_division=0 yields the same scores as the default, minus the warning
    # raised whenever a class is never predicted
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "clf_report": clf_report
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes ``model`` with ``Trainer`` and saves the model, tokenizer and validation metrics.

    Args:
        dim: Name of the target dimension; used to build the output paths.
        num_labels: Not used by this function; kept so existing calls keep working.
        train_dataset: Dataset used for training.
        val_dataset: Dataset evaluated after every epoch and once more after training.
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to ``Trainer`` and saved next to the model.
        seed: Random seed for the run.

    Returns:
        ``trainer.model`` after training (``load_best_model_at_end=True`` with
        ``metric_for_best_model="accuracy"`` is requested).
    """
    set_seed(seed)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",  # Evaluate once per epoch
        save_strategy="epoch",  # Must match the evaluation strategy to reload the best model
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        no_cuda=False,
        # Trainer re-seeds with ``args.seed`` (default 42) when it is constructed,
        # which would override the set_seed() call above; pass the seed through.
        seed=seed,
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(f"./models/{dim}")
    tokenizer.save_pretrained(f"./models/{dim}")

    # Save evaluation results (one "key: value" line per metric)
    results = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model

# Target dimensions mapped to their number of classes (three classes each, labelled 1-3)
effectiveness_dimensions = {
    "clarity": 3,
    "evidence": 3,
    'rebuttal': 3,
    'fairness': 3,
}

# Tokenizer shared by every model trained below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train and evaluate one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two classes means the binary setup; anything else is multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(data, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the training split
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Score the trained model on the held-out test split
    print("\nEvaluating on the test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "key: value" line each
    with open(f"./results/{dim}_short.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in test_results.items())


# Cross-validation setting

## Binary CS cross-validation

**Train on Twitter & test on CONAN**

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Tokenizes the ``counter_speech`` column and returns it with the labels from ``target_col``.

    Returns:
        (inputs, labels): the tokenizer output and a tensor built from ``data[target_col]``.
    """
    # The counter speech text is the only model input here
    texts = data['counter_speech'].tolist()
    tokenizer_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **tokenizer_options)
    targets = torch.tensor(data[target_col].values)
    return encoded, targets

def prepare_datasets(twitter_df, conan_df, target_col, tokenizer, max_length=128, val_size=0.1, random_state=1, is_binary=False):
    """
    Builds train/validation datasets from the Twitter dataframe and a test dataset from the Conan dataframe.

    Only ``counter_speech`` is tokenized. ``target_col`` becomes the ``labels``
    column: a one-element float tensor when ``is_binary``, an int tensor otherwise.

    Returns:
        (train_dataset, val_dataset, test_dataset), each set to the "torch" format.

    Raises:
        ValueError: If ``counter_speech`` or ``target_col`` is missing from either dataframe.
    """
    needed = ['counter_speech', target_col]
    if any(col not in twitter_df.columns for col in needed):
        raise ValueError(f"Missing required columns in Twitter dataset: {', '.join(set(needed) - set(twitter_df.columns))}")
    if any(col not in conan_df.columns for col in needed):
        raise ValueError(f"Missing required columns in Conan dataset: {', '.join(set(needed) - set(conan_df.columns))}")

    # Hold out part of the Twitter data for validation
    twitter_train, twitter_val = train_test_split(twitter_df, test_size=val_size, random_state=random_state)

    def encode(batch):
        # Fixed-length encoding of the counter speech text
        return tokenizer(
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def to_label(row):
            return {"labels": torch.tensor([float(row["labels"])])}
    else:
        def to_label(row):
            return {"labels": torch.tensor(int(row["labels"]))}

    def build(frame):
        # Same pipeline for every split: tokenize, rename target, cast labels, drop raw text
        ds = Dataset.from_pandas(frame)
        ds = ds.map(encode, batched=True, load_from_cache_file=False)
        ds = ds.rename_column(target_col, "labels")
        ds = ds.map(to_label)
        ds = ds.remove_columns(['counter_speech'])
        ds.set_format("torch")
        return ds

    return build(twitter_train), build(twitter_val), build(conan_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when ``is_binary`` is True.
        is_binary: If True, build a single-logit head meant to be read through a sigmoid.

    Returns:
        A ``BertForSequenceClassification`` loaded from ``bert-base-uncased``.
    """
    extra_config = {}
    if is_binary:
        num_labels = 1  # Single output for sigmoid activation
        # With one label and no explicit problem type the model is trained as a
        # regression (MSE loss), which does not match the sigmoid / 0.5 threshold
        # applied to the logit at evaluation time. The multi-label problem type
        # makes the loss BCE-with-logits on the single output.
        extra_config["problem_type"] = "multi_label_classification"
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, **extra_config
    )

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1/precision/recall and a per-class report.

    Args:
        eval_pred: ``(logits, labels)`` pair; logits must be 2-D
            ``(n_samples, n_outputs)``.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``clf_report`` (``classification_report`` as a nested dict).
    """
    logits, labels = eval_pred
    # Flatten so that single-output (n, 1) labels reach sklearn as 1-D vectors.
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class classification
        # Softmax is monotonic, so the arg-max of the logits is the predicted
        # class; use torch.argmax rather than a NumPy reduction on a tensor.
        preds = torch.argmax(logits, dim=-1)
    preds = preds.numpy().reshape(-1)

    # zero_division=0 keeps the default value for classes that are never
    # predicted but does not emit a warning for each of them.
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        'clf_report': clf_report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=1):
    """
    Fine-tunes ``model`` with ``Trainer`` and saves the model, tokenizer and validation metrics.

    Args:
        dim: Name of the target dimension; used to build the output paths.
        num_labels: Not used by this function; kept so existing calls keep working.
        train_dataset: Dataset used for training.
        val_dataset: Dataset evaluated after every epoch and once more after training.
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to ``Trainer`` and saved next to the model.
        seed: Random seed for the run.

    Returns:
        ``trainer.model`` after training (``load_best_model_at_end=True`` with
        ``metric_for_best_model="accuracy"`` is requested).
    """
    set_seed(seed)
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",  # Evaluate once per epoch
        save_strategy="epoch",  # Must match the evaluation strategy to reload the best model
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
        # Trainer re-seeds with ``args.seed`` (default 42) when it is constructed,
        # which would override the set_seed() call above; pass the seed through.
        seed=seed,
    )

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(f"./models/{dim}")
    tokenizer.save_pretrained(f"./models/{dim}")

    # Save evaluation results (one "key: value" line per metric)
    results = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model

# Target dimensions mapped to their number of classes (two classes each, labelled 0-1)
effectiveness_dimensions = {
    "emotional_appeal": 2,
    "audience_adaptation": 2,
}

# Tokenizer shared by every model trained below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train on Twitter and evaluate on CONAN, one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two classes means the binary setup; anything else is multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(
        twitter_df=twitter,
        conan_df=conan,
        target_col=dim,
        tokenizer=tokenizer,
        is_binary=is_binary
    )
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the training split
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Score the trained model on the test dataset
    print("\nEvaluating on the test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "key: value" line each
    with open(f"./results/{dim}_new.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in test_results.items())


**Train on CONAN & test on Twitter**

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encodes the ``counter_speech`` texts and collects the labels stored in ``target_col``.

    Returns:
        (inputs, labels): the tokenizer output and a tensor built from ``data[target_col]``.
    """
    label_tensor = torch.tensor(data[target_col].values)
    # Counter speech is the sole input text
    counter_texts = data['counter_speech'].tolist()
    model_inputs = tokenizer(
        counter_texts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    return model_inputs, label_tensor

def prepare_datasets(twitter, conan, target_col, tokenizer, max_length=128, val_size=0.1, random_state=3, is_binary=False):
    """
    Prepares datasets for training and validation (from Conan) and testing (from Twitter), with handling for binary and multi-class labels.

    Only ``counter_speech`` is tokenized. ``target_col`` becomes the ``labels``
    column: a one-element float tensor when ``is_binary``, an int tensor otherwise.

    Returns:
        (train_dataset, val_dataset, test_dataset), each set to the "torch" format.

    Raises:
        ValueError: If ``counter_speech`` or ``target_col`` is missing from either dataframe.
    """
    # Check required columns in both datasets
    required_columns = ['counter_speech', target_col]
    if not all(col in twitter.columns for col in required_columns):
        raise ValueError(f"Missing required columns in Twitter dataset: {', '.join(set(required_columns) - set(twitter.columns))}")
    if not all(col in conan.columns for col in required_columns):
        raise ValueError(f"Missing required columns in Conan dataset: {', '.join(set(required_columns) - set(conan.columns))}")

    # Split the Conan dataset into train and validation sets; Twitter is the test set
    train_df, val_df = train_test_split(conan, test_size=val_size, random_state=random_state)

    # Tokenize inputs using only counter_speech
    def tokenize_function(examples):
        return tokenizer(
            examples['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    # Binary labels are one-element float vectors; multi-class labels are ints
    if is_binary:
        def cast_label(example):
            return {"labels": torch.tensor([float(example["labels"])])}
    else:
        def cast_label(example):
            return {"labels": torch.tensor(int(example["labels"]))}

    def build_split(frame):
        # Same pipeline for every split: tokenize, rename target, cast labels, drop raw text
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.map(tokenize_function, batched=True, load_from_cache_file=False)
        dataset = dataset.rename_column(target_col, "labels")
        dataset = dataset.map(cast_label)
        dataset = dataset.remove_columns(['counter_speech'])
        dataset.set_format("torch")
        return dataset

    return build_split(train_df), build_split(val_df), build_split(twitter)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when ``is_binary`` is True.
        is_binary: If True, build a single-logit head meant to be read through a sigmoid.

    Returns:
        A ``BertForSequenceClassification`` loaded from ``bert-base-uncased``.
    """
    extra_config = {}
    if is_binary:
        num_labels = 1  # Single output for sigmoid activation
        # With one label and no explicit problem type the model is trained as a
        # regression (MSE loss), which does not match the sigmoid / 0.5 threshold
        # applied to the logit at evaluation time. The multi-label problem type
        # makes the loss BCE-with-logits on the single output.
        extra_config["problem_type"] = "multi_label_classification"
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, **extra_config
    )

def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1/precision/recall and a per-class report.

    Args:
        eval_pred: ``(logits, labels)`` pair; logits must be 2-D
            ``(n_samples, n_outputs)``.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``clf_report`` (``classification_report`` as a nested dict).
    """
    logits, labels = eval_pred
    # Flatten so that single-output (n, 1) labels reach sklearn as 1-D vectors.
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class classification
        # Softmax is monotonic, so the arg-max of the logits is the predicted
        # class; use torch.argmax rather than a NumPy reduction on a tensor.
        preds = torch.argmax(logits, dim=-1)
    preds = preds.numpy().reshape(-1)

    # zero_division=0 keeps the default value for classes that are never
    # predicted but does not emit a warning for each of them.
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        'clf_report': clf_report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes ``model`` with ``Trainer`` and saves the model, tokenizer and validation metrics.

    Args:
        dim: Name of the target dimension; used to build the output paths.
        num_labels: Not used by this function; kept so existing calls keep working.
        train_dataset: Dataset used for training.
        val_dataset: Dataset evaluated after every epoch and once more after training.
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to ``Trainer`` and saved next to the model.
        seed: Random seed for the run.

    Returns:
        ``trainer.model`` after training (``load_best_model_at_end=True`` with
        ``metric_for_best_model="accuracy"`` is requested).
    """
    set_seed(seed)
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",  # Evaluate once per epoch
        save_strategy="epoch",  # Must match the evaluation strategy to reload the best model
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
        # Trainer re-seeds with ``args.seed`` (default 42) when it is constructed,
        # which would override the set_seed() call above; pass the seed through.
        seed=seed,
    )

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(f"./models/{dim}")
    tokenizer.save_pretrained(f"./models/{dim}")

    # Save evaluation results (one "key: value" line per metric)
    results = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model

# Target dimensions mapped to their number of classes (two classes each, labelled 0-1)
effectiveness_dimensions = {
    "emotional_appeal": 2,
    "audience_adaptation": 2,
}

# Tokenizer shared by every model trained below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train on CONAN and evaluate on Twitter, one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two classes means the binary setup; anything else is multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(
        twitter=twitter,
        conan=conan,
        target_col=dim,
        tokenizer=tokenizer,
        is_binary=is_binary
    )
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the training split
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Score the trained model on the test dataset
    print("\nEvaluating on the test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "key: value" line each
    with open(f"./results/{dim}_short.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in test_results.items())


## Binary HS + CS cross-validation

In the training loop at the end of the cell below, set the names of the datasets on which you want to train and test.

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Tokenizes (hate_speech, counter_speech) text pairs and returns them with the labels from ``target_col``.

    Returns:
        (inputs, labels): the tokenizer output and a tensor built from ``data[target_col]``.
    """
    # The two columns are passed as the first and second sequence of each pair
    hate_texts = data['hate_speech'].tolist()
    counter_texts = data['counter_speech'].tolist()
    tokenizer_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    encoded_pairs = tokenizer(hate_texts, counter_texts, **tokenizer_options)
    targets = torch.tensor(data[target_col].values)
    return encoded_pairs, targets


def prepare_datasets(df, target_col, tokenizer, max_length=128, val_size=0.1, random_state=3, is_binary=False):
    """
    Splits one dataframe into tokenized training and validation datasets.

    ``hate_speech`` and ``counter_speech`` are tokenized as a text pair.
    ``target_col`` becomes the ``labels`` column: a one-element float tensor
    when ``is_binary``, an int tensor otherwise.

    Returns:
        (train_dataset, val_dataset), each set to the "torch" format.

    Raises:
        ValueError: If any of the three required columns is missing from ``df``.
    """
    needed = ['hate_speech', 'counter_speech', target_col]
    if any(col not in df.columns for col in needed):
        raise ValueError(f"Missing required columns: {', '.join(set(needed) - set(df.columns))}")

    # Hold out part of the data for validation
    train_frame, val_frame = train_test_split(df, test_size=val_size, random_state=random_state)

    def encode_pair(batch):
        # Fixed-length encoding of the (hate speech, counter speech) pair
        return tokenizer(
            batch['hate_speech'],
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def to_label(row):
            return {"labels": torch.tensor([float(row["labels"])])}
    else:
        def to_label(row):
            return {"labels": torch.tensor(int(row["labels"]))}

    def build(frame):
        # Same pipeline for both splits: tokenize, rename target, cast labels, drop raw text
        ds = Dataset.from_pandas(frame)
        ds = ds.map(encode_pair, batched=True, load_from_cache_file=False)
        ds = ds.rename_column(target_col, "labels")
        ds = ds.map(to_label)
        ds = ds.remove_columns(['hate_speech', 'counter_speech'])
        ds.set_format("torch")
        return ds

    return build(train_frame), build(val_frame)


def prepare_test_dataset(df, target_col, tokenizer, max_length=128, is_binary=False):
    """
    Turns a whole dataframe into a tokenized test dataset (no splitting).

    ``hate_speech`` and ``counter_speech`` are tokenized as a text pair.
    ``target_col`` becomes the ``labels`` column: a one-element float tensor
    when ``is_binary``, an int tensor otherwise.

    Returns:
        The test dataset, set to the "torch" format.

    Raises:
        ValueError: If any of the three required columns is missing from ``df``.
    """
    needed = ['hate_speech', 'counter_speech', target_col]
    if any(col not in df.columns for col in needed):
        raise ValueError(f"Missing required columns: {', '.join(set(needed) - set(df.columns))}")

    def encode_pair(batch):
        # Fixed-length encoding of the (hate speech, counter speech) pair
        return tokenizer(
            batch['hate_speech'],
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def to_label(row):
            return {"labels": torch.tensor([float(row["labels"])])}
    else:
        def to_label(row):
            return {"labels": torch.tensor(int(row["labels"]))}

    # Tokenize, expose the target as "labels", cast it, then drop the raw text
    encoded = Dataset.from_pandas(df).map(encode_pair, batched=True, load_from_cache_file=False)
    labelled = encoded.rename_column(target_col, "labels").map(to_label)
    test_dataset = labelled.remove_columns(['hate_speech', 'counter_speech'])
    test_dataset.set_format("torch")

    return test_dataset


def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when ``is_binary`` is True.
        is_binary: If True, build a single-logit head meant to be read through a sigmoid.

    Returns:
        A ``BertForSequenceClassification`` loaded from ``bert-base-uncased``.
    """
    extra_config = {}
    if is_binary:
        num_labels = 1  # Single output for sigmoid activation
        # With one label and no explicit problem type the model is trained as a
        # regression (MSE loss), which does not match the sigmoid / 0.5 threshold
        # applied to the logit at evaluation time. The multi-label problem type
        # makes the loss BCE-with-logits on the single output.
        extra_config["problem_type"] = "multi_label_classification"
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, **extra_config
    )


def compute_metrics(eval_pred):
    """
    Computes accuracy, weighted F1/precision/recall and a per-class report.

    Args:
        eval_pred: ``(logits, labels)`` pair; logits must be 2-D
            ``(n_samples, n_outputs)``.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``clf_report`` (``classification_report`` as a nested dict).
    """
    logits, labels = eval_pred
    # Flatten so that single-output (n, 1) labels reach sklearn as 1-D vectors.
    labels = np.asarray(labels).astype(int).reshape(-1)
    logits = torch.as_tensor(np.asarray(logits), dtype=torch.float32)

    if logits.shape[1] == 1:  # Binary classification with sigmoid
        preds = (torch.sigmoid(logits) > 0.5).long()
    else:  # Multi-class classification
        # Softmax is monotonic, so the arg-max of the logits is the predicted
        # class; use torch.argmax rather than a NumPy reduction on a tensor.
        preds = torch.argmax(logits, dim=-1)
    preds = preds.numpy().reshape(-1)

    # zero_division=0 keeps the default value for classes that are never
    # predicted but does not emit a warning for each of them.
    clf_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        'clf_report': clf_report,
    }


def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tunes ``model`` with ``Trainer`` and saves the model, tokenizer and validation metrics.

    Args:
        dim: Name of the target dimension; used to build the output paths.
        num_labels: Not used by this function; kept so existing calls keep working.
        train_dataset: Dataset used for training.
        val_dataset: Dataset evaluated after every epoch and once more after training.
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to ``Trainer`` and saved next to the model.
        seed: Random seed for the run.

    Returns:
        ``trainer.model`` after training (``load_best_model_at_end=True`` with
        ``metric_for_best_model="accuracy"`` is requested).
    """
    set_seed(seed)
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",  # Evaluate once per epoch
        save_strategy="epoch",  # Must match the evaluation strategy to reload the best model
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
        # Trainer re-seeds with ``args.seed`` (default 42) when it is constructed,
        # which would override the set_seed() call above; pass the seed through.
        seed=seed,
    )

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(f"./models/{dim}")
    tokenizer.save_pretrained(f"./models/{dim}")

    # Save evaluation results (one "key: value" line per metric)
    results = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as f:
        f.writelines(f"{key}: {value}\n" for key, value in results.items())

    return trainer.model


# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example dimensions for binary classification
effectiveness_dimensions = {
    "emotional_appeal": 2,  # Binary classification (0-1)
    "audience_adaptation": 2,  # Binary classification (0-1)
}

# Choose the direction of the cross-dataset experiment here. The log message and
# the results file name are derived from these names, so they always match the
# dataset that is actually evaluated.
train_name, test_name = "conan", "twitter"
cross_datasets = {"conan": conan, "twitter": twitter}

# Iterate through dimensions and train models
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    is_binary = num_labels == 2

    # Prepare training and validation datasets from the training corpus
    train_dataset, val_dataset = prepare_datasets(cross_datasets[train_name], target_col=dim, tokenizer=tokenizer, is_binary=is_binary)

    # Prepare test dataset from the other corpus
    test_dataset = prepare_test_dataset(cross_datasets[test_name], target_col=dim, tokenizer=tokenizer, is_binary=is_binary)

    # Initialize the model
    model = initialize_model(num_labels, is_binary=is_binary)

    # Train the model
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Evaluate on the test corpus
    print(f"\nEvaluating on the {test_name} test set...")
    test_results = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    ).evaluate(test_dataset)
    print(test_results)

    # Save test set results under the name of the corpus they were computed on
    with open(f"./results/{dim}_{test_name}_test_results.txt", "w") as f:
        for key, value in test_results.items():
            f.write(f"{key}: {value}\n")


## Multi-label CS cross-validation

**Train on Twitter & test on CONAN**

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encodes the ``counter_speech`` column and pairs it with the labels found in ``target_col``.

    Returns:
        (inputs, labels): the tokenizer output and a tensor built from ``data[target_col]``.
    """
    sentences = data['counter_speech'].tolist()
    label_values = data[target_col].values
    # Pad or truncate every text to max_length
    batch_encoding = tokenizer(
        sentences,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return batch_encoding, torch.tensor(label_values)

def prepare_datasets(twitter_df, conan_df, target_col, tokenizer, max_length=128, val_size=0.1, random_state=3, is_binary=False):
    """
    Builds train/validation datasets from the Twitter dataframe and a test dataset from the Conan dataframe.

    Only ``counter_speech`` is tokenized. ``target_col`` becomes the ``labels``
    column: a one-element float tensor when ``is_binary``; otherwise the integer
    label minus one.

    Returns:
        (train_dataset, val_dataset, test_dataset), each set to the "torch" format.

    Raises:
        ValueError: If ``counter_speech`` or ``target_col`` is missing from either dataframe.
    """
    needed = ['counter_speech', target_col]
    if any(col not in twitter_df.columns for col in needed):
        raise ValueError(f"Missing required columns in Twitter dataset: {', '.join(set(needed) - set(twitter_df.columns))}")
    if any(col not in conan_df.columns for col in needed):
        raise ValueError(f"Missing required columns in Conan dataset: {', '.join(set(needed) - set(conan_df.columns))}")

    # Hold out part of the Twitter data for validation
    twitter_train, twitter_val = train_test_split(twitter_df, test_size=val_size, random_state=random_state)

    def encode(batch):
        # Fixed-length encoding of the counter speech text
        return tokenizer(
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def to_label(row):
            return {"labels": torch.tensor([float(row["labels"])])}
    else:
        def to_label(row):
            # Shift the labels down by one
            row['labels'] = torch.tensor(int(row['labels']) - 1)
            return row

    def build(frame):
        # Same pipeline for every split: tokenize, rename target, cast labels, drop raw text
        ds = Dataset.from_pandas(frame)
        ds = ds.map(encode, batched=True, load_from_cache_file=False)
        ds = ds.rename_column(target_col, "labels")
        ds = ds.map(to_label)
        ds = ds.remove_columns(['counter_speech'])
        ds.set_format("torch")
        return ds

    return build(twitter_train), build(twitter_val), build(conan_df)

def initialize_model(num_labels, is_binary=False):
    """
    Initializes a BERT model for binary or multi-class classification.

    Args:
        num_labels: Number of classes; ignored when ``is_binary`` is True.
        is_binary: If True, build a single-logit head meant to be read through a sigmoid.

    Returns:
        A ``BertForSequenceClassification`` loaded from ``bert-base-uncased``.
    """
    extra_config = {}
    if is_binary:
        num_labels = 1  # Single output for sigmoid activation
        # With one label and no explicit problem type the model is trained as a
        # regression (MSE loss), which does not match the sigmoid / 0.5 threshold
        # applied to the logit at evaluation time. The multi-label problem type
        # makes the loss BCE-with-logits on the single output.
        extra_config["problem_type"] = "multi_label_classification"
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, **extra_config
    )

def compute_metrics(eval_pred):
    """
    Score evaluation predictions against their reference labels.

    `eval_pred` unpacks into `(logits, labels)`. A single logit column is
    handled as a binary task (sigmoid followed by a 0.5 threshold); otherwise
    the class with the highest softmax probability is the prediction.

    Returns a dict with accuracy, weighted F1 / precision / recall, and the
    per-class classification report as a nested dict under "clf_report".
    """
    raw_scores, references = eval_pred
    references = references.astype(int)

    is_single_logit = raw_scores.shape[1] == 1
    score_tensor = torch.FloatTensor(raw_scores)
    if is_single_logit:
        # Binary: threshold the sigmoid probability
        predicted = (torch.sigmoid(score_tensor) > 0.5).long()
    else:
        # Multi-class: most probable class per row
        predicted = torch.argmax(torch.softmax(score_tensor, dim=1), dim=-1)
    predicted = predicted.cpu().detach().numpy()

    weighted = {"average": "weighted"}
    report = classification_report(references, predicted, output_dict=True)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, **weighted),
        "precision": precision_score(references, predicted, **weighted),
        "recall": recall_score(references, predicted, **weighted),
        "clf_report": report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tune `model` for the dimension `dim` and persist the outcome.

    Seeds the RNGs, trains for three epochs with per-epoch evaluation and
    checkpointing, saves the model and tokenizer under `./models/{dim}`, and
    writes the validation metrics to `./results/{dim}_results_cs.txt`.

    `num_labels` is accepted for interface symmetry but is not used here.

    Returns the trainer's model after training.
    """
    set_seed(seed)

    # Hyper-parameters and bookkeeping for this run
    run_options = dict(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        # metric_for_best_model="accuracy" is currently disabled
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Persist weights and tokenizer side by side
    export_dir = f"./models/{dim}"
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    # Record the validation metrics, one "name: value" line each
    eval_metrics = trainer.evaluate()
    with open(f"./results/{dim}_results_cs.txt", "w") as report_file:
        report_file.writelines(f"{name}: {score}\n" for name, score in eval_metrics.items())

    return trainer.model

# Target dimensions mapped to their number of classes (multi-class, labels 1-3)
effectiveness_dimensions = {
    "clarity": 3,
    "evidence": 3,
    "rebuttal": 3,
    "fairness": 3,
}

# One tokenizer is shared by every dimension
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train and test one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two labels selects the binary setup; anything else is multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(
        twitter_df=twitter,
        conan_df=conan,
        target_col=dim,
        tokenizer=tokenizer,
        is_binary=is_binary,
    )
    # NOTE(review): the model is created before train_model() calls set_seed;
    # confirm whether its initialization should be seeded as well.
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the train/validation splits
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Score the trained model on the held-out test split
    print("\nEvaluating on the test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "name: value" line each
    with open(f"./results/{dim}_cs_short.txt", "w") as out_file:
        out_file.writelines(f"{key}: {value}\n" for key, value in test_results.items())


**Train on CONAN & test on Twitter**

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encode the counter-speech texts and collect the matching labels.

    Calls `tokenizer` on the `counter_speech` column (as a list), truncating
    and padding to `max_length` and requesting PyTorch tensors ("pt").

    Returns `(inputs, labels)`, where `labels` is a tensor built from the
    values of `data[target_col]`.
    """
    texts = data['counter_speech'].tolist()
    encoding_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **encoding_options)
    label_tensor = torch.tensor(data[target_col].values)
    return encoded, label_tensor

def prepare_datasets(twitter_df, conan_df, target_col, tokenizer, max_length=128, val_size=0.1, random_state=3, is_binary=False):
    """
    Build train/validation datasets from `conan_df` and a test dataset from `twitter_df`.

    `conan_df` is split into train and validation parts (`val_size` is the
    validation fraction); the whole of `twitter_df` becomes the test set.
    Each split is tokenized on `counter_speech`, its `target_col` is renamed
    to "labels", the labels are converted (a one-element float tensor for the
    binary case, otherwise the integer label minus 1), the raw text column is
    dropped, and the format is set to "torch".

    Raises:
        ValueError: if either dataframe lacks `counter_speech` or `target_col`.

    Returns:
        `(train_dataset, val_dataset, test_dataset)`.
    """
    # Validate both inputs up front
    required_columns = ['counter_speech', target_col]
    missing_twitter = set(required_columns) - set(twitter_df.columns)
    if missing_twitter:
        raise ValueError(f"Missing required columns in Twitter dataset: {', '.join(missing_twitter)}")
    missing_conan = set(required_columns) - set(conan_df.columns)
    if missing_conan:
        raise ValueError(f"Missing required columns in Conan dataset: {', '.join(missing_conan)}")

    # CONAN supplies both the training and the validation examples
    train_df, val_df = train_test_split(conan_df, test_size=val_size, random_state=random_state)

    def tokenize_batch(batch):
        return tokenizer(
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def encode_label(example):
            # One float per example, wrapped in a length-1 tensor
            return {"labels": torch.tensor([float(example["labels"])])}
    else:
        def encode_label(example):
            # Shift labels down by one so they start at 0
            example['labels'] = torch.tensor(int(example['labels']) - 1)
            return example

    def build_split(frame):
        # Same pipeline for every split: tokenize, rename, encode, prune, format
        split = Dataset.from_pandas(frame)
        split = split.map(tokenize_batch, batched=True, load_from_cache_file=False)
        split = split.rename_column(target_col, "labels")
        split = split.map(encode_label)
        split = split.remove_columns(['counter_speech'])
        split.set_format("torch")
        return split

    train_dataset = build_split(train_df)
    val_dataset = build_split(val_df)
    test_dataset = build_split(twitter_df)
    return train_dataset, val_dataset, test_dataset

def initialize_model(num_labels, is_binary=False):
    """
    Load a pretrained `bert-base-uncased` sequence classifier.

    Args:
        num_labels: Number of output classes for the multi-class case.
            Ignored when `is_binary` is True.
        is_binary: When True, build a single-logit head meant to be read
            through a sigmoid.

    Returns:
        A `BertForSequenceClassification` instance.
    """
    if not is_binary:
        return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    # With one output and no explicit problem type, transformers treats the
    # head as a regression target (MSE loss). Declaring the problem type makes
    # the model use BCE-with-logits, which matches the float labels of shape
    # (1,) built for the binary case and the sigmoid threshold applied in
    # compute_metrics.
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=1,
        problem_type="multi_label_classification",
    )

def compute_metrics(eval_pred):
    """
    Score evaluation predictions against their reference labels.

    `eval_pred` unpacks into `(logits, labels)`. A single logit column is
    handled as a binary task (sigmoid followed by a 0.5 threshold); otherwise
    the class with the highest softmax probability is the prediction.

    Returns a dict with accuracy, weighted F1 / precision / recall, and the
    per-class classification report as a nested dict under "clf_report".
    """
    raw_scores, references = eval_pred
    references = references.astype(int)

    is_single_logit = raw_scores.shape[1] == 1
    score_tensor = torch.FloatTensor(raw_scores)
    if is_single_logit:
        # Binary: threshold the sigmoid probability
        predicted = (torch.sigmoid(score_tensor) > 0.5).long()
    else:
        # Multi-class: most probable class per row
        predicted = torch.argmax(torch.softmax(score_tensor, dim=1), dim=-1)
    predicted = predicted.cpu().detach().numpy()

    weighted = {"average": "weighted"}
    report = classification_report(references, predicted, output_dict=True)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, **weighted),
        "precision": precision_score(references, predicted, **weighted),
        "recall": recall_score(references, predicted, **weighted),
        "clf_report": report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=3):
    """
    Fine-tune `model` for the dimension `dim` and persist the outcome.

    Seeds the RNGs, trains for three epochs (batch size 8) with per-epoch
    evaluation and checkpointing, saves the model and tokenizer under
    `./models/{dim}`, and writes the validation metrics to
    `./results/{dim}_results_cs.txt`.

    `num_labels` is accepted for interface symmetry but is not used here.

    Returns the trainer's model after training.
    """
    set_seed(seed)

    # Hyper-parameters and bookkeeping for this run
    run_options = dict(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        # metric_for_best_model="accuracy" is currently disabled
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Persist weights and tokenizer side by side
    export_dir = f"./models/{dim}"
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    # Record the validation metrics, one "name: value" line each
    eval_metrics = trainer.evaluate()
    with open(f"./results/{dim}_results_cs.txt", "w") as report_file:
        report_file.writelines(f"{name}: {score}\n" for name, score in eval_metrics.items())

    return trainer.model

# Target dimensions mapped to their number of classes (multi-class, labels 1-3)
effectiveness_dimensions = {
    "clarity": 3,
    "evidence": 3,
    "rebuttal": 3,
    "fairness": 3,
}

# One tokenizer is shared by every dimension
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train and test one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two labels selects the binary setup; anything else is multi-class
    is_binary = num_labels == 2
    train_dataset, val_dataset, test_dataset = prepare_datasets(
        twitter_df=twitter,
        conan_df=conan,
        target_col=dim,
        tokenizer=tokenizer,
        is_binary=is_binary,
    )
    # NOTE(review): the model is created before train_model() calls set_seed;
    # confirm whether its initialization should be seeded as well.
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the train/validation splits
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Score the trained model on the held-out test split
    print("\nEvaluating on the test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "name: value" line each
    with open(f"./results/{dim}_cs_short.txt", "w") as out_file:
        out_file.writelines(f"{key}: {value}\n" for key, value in test_results.items())


## Multi-label HS + CS cross-validation

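In the cell below (lines 184–191 of the cell, i.e. the `prepare_datasets` calls inside the training loop), set which dataset is used for training and which for testing.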

In [None]:
def preprocess_data(data, tokenizer, max_length, target_col):
    """
    Encode hate-speech / counter-speech pairs and collect the matching labels.

    Calls `tokenizer` with the `hate_speech` column as the first sequence and
    the `counter_speech` column as the second (both as lists), truncating and
    padding to `max_length` and requesting PyTorch tensors ("pt").

    Returns `(inputs, labels)`, where `labels` is a tensor built from the
    values of `data[target_col]`.
    """
    first_segments = data['hate_speech'].tolist()
    second_segments = data['counter_speech'].tolist()
    encoding_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    encoded = tokenizer(first_segments, second_segments, **encoding_options)
    label_tensor = torch.tensor(data[target_col].values)
    return encoded, label_tensor

def prepare_datasets(df, target_col, tokenizer, max_length=128, test_size=0.2, val_size=0.1, random_state=0, is_binary=False):
    """
    Split `df` into train/validation/test datasets ready for training.

    A `test_size` fraction is held out first; the remainder is then split
    with `val_size / (1 - test_size)` going to validation, so `val_size` is
    expressed relative to the full dataframe. Each split is tokenized as a
    (`hate_speech`, `counter_speech`) pair, its `target_col` is renamed to
    "labels", the labels are converted (a one-element float tensor for the
    binary case, otherwise the integer label minus 1), the raw text columns
    are dropped, and the format is set to "torch".

    Raises:
        ValueError: if `df` lacks `hate_speech`, `counter_speech` or `target_col`.

    Returns:
        `(train_dataset, val_dataset, test_dataset)`.
    """
    # Validate the input up front
    required_columns = ['hate_speech', 'counter_speech', target_col]
    missing = set(required_columns) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {', '.join(missing)}")

    # Hold out the test set, then carve validation out of what is left
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    relative_val_size = val_size / (1 - test_size)
    train_df, val_df = train_test_split(train_val_df, test_size=relative_val_size, random_state=random_state)

    text_columns = ['hate_speech', 'counter_speech']

    def tokenize_batch(batch):
        return tokenizer(
            batch['hate_speech'],
            batch['counter_speech'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    if is_binary:
        def encode_label(example):
            # One float per example, wrapped in a length-1 tensor
            return {"labels": torch.tensor([float(example["labels"])])}
    else:
        def encode_label(example):
            # Shift labels down by one so they start at 0
            example['labels'] = torch.tensor(int(example['labels']) - 1)
            return example

    def build_split(frame):
        # Same pipeline for every split: tokenize, rename, encode, prune, format
        split = Dataset.from_pandas(frame)
        split = split.map(tokenize_batch, batched=True, load_from_cache_file=False)
        split = split.rename_column(target_col, "labels")
        split = split.map(encode_label)
        split = split.remove_columns(text_columns)
        split.set_format("torch")
        return split

    train_dataset = build_split(train_df)
    val_dataset = build_split(val_df)
    test_dataset = build_split(test_df)
    return train_dataset, val_dataset, test_dataset

def initialize_model(num_labels, is_binary=False):
    """
    Load a pretrained `bert-base-uncased` sequence classifier.

    Args:
        num_labels: Number of output classes for the multi-class case.
            Ignored when `is_binary` is True.
        is_binary: When True, build a single-logit head meant to be read
            through a sigmoid.

    Returns:
        A `BertForSequenceClassification` instance.
    """
    if not is_binary:
        return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    # With one output and no explicit problem type, transformers treats the
    # head as a regression target (MSE loss). Declaring the problem type makes
    # the model use BCE-with-logits, which matches the float labels of shape
    # (1,) built for the binary case and the sigmoid threshold applied in
    # compute_metrics.
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=1,
        problem_type="multi_label_classification",
    )

def compute_metrics(eval_pred):
    """
    Score evaluation predictions against their reference labels.

    `eval_pred` unpacks into `(logits, labels)`. A single logit column is
    handled as a binary task (sigmoid followed by a 0.5 threshold); otherwise
    the class with the highest softmax probability is the prediction.

    Returns a dict with accuracy, weighted F1 / precision / recall, and the
    per-class classification report as a nested dict under "clf_report".
    """
    raw_scores, references = eval_pred
    references = references.astype(int)

    is_single_logit = raw_scores.shape[1] == 1
    score_tensor = torch.FloatTensor(raw_scores)
    if is_single_logit:
        # Binary: threshold the sigmoid probability
        predicted = (torch.sigmoid(score_tensor) > 0.5).long()
    else:
        # Multi-class: most probable class per row
        predicted = torch.argmax(torch.softmax(score_tensor, dim=1), dim=-1)
    predicted = predicted.cpu().detach().numpy()

    weighted = {"average": "weighted"}
    report = classification_report(references, predicted, output_dict=True)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, **weighted),
        "precision": precision_score(references, predicted, **weighted),
        "recall": recall_score(references, predicted, **weighted),
        "clf_report": report,
    }

def train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer, seed=0):
    """
    Fine-tune `model` for the dimension `dim` and persist the outcome.

    Seeds the RNGs, trains for three epochs with per-epoch evaluation and
    checkpointing (best checkpoint chosen by "accuracy"), saves the model and
    tokenizer under `./models/{dim}`, and writes the validation metrics to
    `./results/{dim}_eval_results.txt`.

    `num_labels` is accepted for interface symmetry but is not used here.

    Returns the trainer's model after training.
    """
    set_seed(seed)

    # Hyper-parameters and bookkeeping for this run
    run_options = dict(
        output_dir=f"./results/{dim}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",  # evaluate once per epoch
        save_strategy="epoch",  # checkpoint once per epoch
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Persist weights and tokenizer side by side
    export_dir = f"./models/{dim}"
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    # Record the validation metrics, one "name: value" line each
    eval_metrics = trainer.evaluate()
    with open(f"./results/{dim}_eval_results.txt", "w") as report_file:
        report_file.writelines(f"{name}: {score}\n" for name, score in eval_metrics.items())

    return trainer.model


# Target dimensions mapped to their number of classes (multi-class, labels 1-3)
effectiveness_dimensions = {
    "clarity": 3,
    "evidence": 3,
    "rebuttal": 3,
    "fairness": 3,
}

# One tokenizer is shared by every dimension
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train and test one model per dimension
for dim, num_labels in effectiveness_dimensions.items():
    print(f"\nTraining model for {dim} with {num_labels} labels...\n")

    # Exactly two labels selects the binary setup; anything else is multi-class
    is_binary = num_labels == 2

    # Train/validation splits come from conan; the test split comes from
    # twitter. Each call returns all three splits, so the unused ones are
    # discarded.
    train_dataset, val_dataset, _ = prepare_datasets(conan, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)
    _, _, test_dataset = prepare_datasets(twitter, target_col=dim, tokenizer=tokenizer, is_binary=is_binary)
    # NOTE(review): the model is created before train_model() calls set_seed;
    # confirm whether its initialization should be seeded as well.
    model = initialize_model(num_labels, is_binary=is_binary)

    # Fine-tune on the conan train/validation splits
    trained_model = train_model(dim, num_labels, train_dataset, val_dataset, model, tokenizer)

    # Evaluate on the twitter test split (the message previously said "conan",
    # which did not match the dataset actually evaluated)
    print("\nEvaluating on the twitter test set...")
    test_trainer = Trainer(
        model=trained_model,
        args=TrainingArguments(output_dir=f"./results/{dim}_test"),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    test_results = test_trainer.evaluate(test_dataset)
    print(test_results)

    # Persist the test metrics, one "name: value" line each
    with open(f"./results/{dim}_t_new.txt", "w") as out_file:
        out_file.writelines(f"{key}: {value}\n" for key, value in test_results.items())
