In [None]:
# To choose which GPU to use

import os

# Runtime settings passed through environment variables. They are set in the
# first cell so they are already in place when torch is imported further down.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '0',  # index of the GPU made visible to this process
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    "TOKENIZERS_PARALLELISM": "false",
})

#os.environ['TF_CUDNN_RESET_RND_GEN_STATE'] = '1'

In [None]:
# Check CUDA availability and GPU details in PyTorch

import torch
# Print, in this order: whether CUDA is usable from PyTorch, then the number of visible GPUs.
for cuda_probe in (torch.cuda.is_available, torch.cuda.device_count):
    print(cuda_probe())

True
1


In [None]:
# Paste your access token. For more information you can go: https://huggingface.co/docs/hub/en/security-tokens

from huggingface_hub import login

# Log in with the access token taken from the HF_TOKEN environment variable,
# so that the secret never has to be written into (and saved with) the notebook.
# When HF_TOKEN is not set, token=None is passed and login() asks for the token
# interactively instead.
import os

login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Load the datasets from the Hugging Face Hub

from datasets import load_dataset
from datasets import concatenate_datasets
import pandas as pd

import os

# Location of the local WINO training file (JSON Lines). It can be overridden
# with the WINO_TRAIN_PATH environment variable so the notebook also runs on
# machines where the original absolute path does not exist.
WINO_TRAIN_PATH = os.environ.get("WINO_TRAIN_PATH", "/work/dpotosku/WINO dataset/train_xl.jsonl")

# Short task name -> loaded dataset. Every entry is fetched with load_dataset,
# except 'wino', which is read from the local file into a pandas DataFrame.
datasets = {
    'rte': load_dataset("super_glue", "rte", trust_remote_code=True),
    'wnli': load_dataset("nyu-mll/glue", "wnli"),
    'qnli': load_dataset("nyu-mll/glue", "qnli"),
    'mnli': load_dataset("nyu-mll/glue", "mnli"),
    'snli': load_dataset("stanfordnlp/snli"),
    'cb': load_dataset("super_glue", "cb", trust_remote_code=True),
    'sst2': load_dataset("nyu-mll/glue", "sst2"),
    'rt': load_dataset("cornell-movie-review-data/rotten_tomatoes"),
    'qqp': load_dataset("nyu-mll/glue", "qqp"),
    'mrpc': load_dataset("nyu-mll/glue", "mrpc"),
    'pawsx': load_dataset("google-research-datasets/paws-x", "en"),
    'copa': load_dataset("super_glue", "copa", trust_remote_code=True),
    'piqa': load_dataset("ybisk/piqa", trust_remote_code=True),
    'agn': load_dataset("fancyzhx/ag_news"),
    'trec': load_dataset("SetFit/TREC-QC"),
    'wsc': load_dataset("super_glue", "wsc", trust_remote_code=True),
    'teo': load_dataset("christophsonntag/OLID"),
    'tei': load_dataset("Parth1612/pp_distilbert_ft_tweet_irony"),
    'wic': load_dataset("super_glue", "wic", trust_remote_code=True),
    'cola': load_dataset("nyu-mll/glue", "cola"),
    'wino': pd.read_json(WINO_TRAIN_PATH, lines=True)
}

In [None]:
# Select the correct split for each dataset. Each loaded dataset contains train, test and, possibly, validation splits, so we have to pick the one we need.

def unpacking_datsets(dataset_name, dataset, split):
    """
    Return the part of `dataset` that should be used for the requested split.

    Rules, in order:
    - 'wino' is returned unchanged, whatever `split` is.
    - 'mnli' with split 'validation' uses the 'validation_matched' subset.
    - 'snli', 'rt', 'pawsx', 'agn', 'trec', 'teo' and 'tei' with split
      'validation' use their 'test' subset.
    - Every other combination returns `dataset[split]`.
    """
    if dataset_name == 'wino':
        return dataset

    # Datasets whose 'validation' request is served from the 'test' subset.
    validation_from_test = ('snli', 'rt', 'pawsx', 'agn', 'trec', 'teo', 'tei')

    subset_key = split
    if split == 'validation':
        if dataset_name == 'mnli':
            subset_key = 'validation_matched'
        elif dataset_name in validation_from_test:
            subset_key = 'test'

    return dataset[subset_key]

In [None]:
# Split from which the training examples are taken.
train_split = 'train'

# Task name -> the part of each loaded dataset selected for `train_split`.
train_split_datasets = {
    task_name: unpacking_datsets(dataset_name=task_name, dataset=raw_dataset, split=train_split)
    for task_name, raw_dataset in datasets.items()
}

In [None]:
# This block was used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run this block.

#test_split = 'validation'

#test_split_dataset = {
#    name: unpacking_datsets(name, dataset, test_split)
#    for name, dataset in datasets.items()
#}

In [6]:
def remove_columns(dataset_name, dataset):
    """
    Return the dataset as a pandas DataFrame without the auxiliary columns.

    Parameters:
        dataset_name (str): Name of the dataset. For 'wino' the input is used
                            directly; for any other name `dataset.to_pandas()`
                            is called first.
        dataset (Dataset or pd.DataFrame): A pandas DataFrame when
                            `dataset_name` is 'wino', otherwise an object that
                            provides `to_pandas()`.

    Returns:
        pd.DataFrame: A new DataFrame in which every listed auxiliary column
                      that is present has been dropped; listed columns that are
                      absent are ignored.
    """
    # Index, id, span-position and other bookkeeping columns that are not part
    # of the model input.
    unwanted_columns = [
        'idx', 'processed_input', 'id', 'label_nw', 'label_original',
        'label_coarse', 'label_coarse_original', 'span1_index', 'span2_index',
        'input_ids', 'attention_mask', 'cleaned_tweet', 'subtask_b', 'subtask_c',
        'phrase1', 'qID', 'start1', 'start2', 'end1', 'end2', 'ID', 'Organization name',
        'Target'
    ]

    frame = dataset if dataset_name == 'wino' else dataset.to_pandas()

    # errors='ignore' lets one shared list serve datasets with different schemas.
    return frame.drop(columns=unwanted_columns, errors='ignore')

In [None]:
# Removing unnecessary columns from the datasets and creating a dictionary with the modified datasets

# Task name -> training DataFrame with the auxiliary columns removed.
train_without_columns = {
    task_name: remove_columns(dataset_name=task_name, dataset=split_dataset)
    for task_name, split_dataset in train_split_datasets.items()
}

In [None]:
# This block was used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run this block.

#test_without_columns = {
#    name: remove_columns(name, dataset)
#    for name, dataset in test_split_dataset.items()
#}

In [None]:
# Preparing the datasets to have a unified structure: 'text', 'dataset_name', and 'label',
# where 'text' contains all the values related to 'label'. For example, the WIC dataset has the main columns:
# 'sentence1', 'sentence2', 'word', and 'label'.
# The 'text' column will contain a concatenation of the values of the 'sentence1', 'sentence2', and 'word' columns,
# as their combination corresponds to the value in the 'label' column.

In [None]:
# The TREC dataset contains the 'label' and 'label_text' columns, which are unnecessary, and
# the 'label_coarse_text' column, which holds the text labels. The 'label' and 'label_text' columns
# will be dropped, and the 'label_coarse_text' column will be renamed to 'label'.

# Columns removed from TREC before 'label_coarse_text' takes over the name 'label'.
drop_columns = ['label', 'label_text']

cleaned_trec_train = (
    train_without_columns['trec']
    .drop(columns=drop_columns)
    .rename(columns={'label_coarse_text': 'label'})
)

# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#cleaned_trec_test = test_without_columns['trec'].drop(columns = drop_columns)
#cleaned_trec_test = cleaned_trec_test.rename(columns = {'label_coarse_text': 'label'})

In [None]:
# Store the cleaned TREC frame back under the same key. After this assignment the
# stored frame no longer has a 'label_text' column, so the cleaning cell above
# should not be re-run without first rebuilding train_without_columns.
train_without_columns['trec'] = cleaned_trec_train

# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#test_without_columns['trec'] = cleaned_trec_test

In [None]:
# The TEO dataset stores its labels in the 'subtask_a' column, which does not follow the column naming used here.
# The 'subtask_a' column will be renamed to 'label'.

# rename() returns a new frame; the entry in train_without_columns is replaced in the next cell.
cleaned_teo_train = train_without_columns['teo'].rename(columns = {'subtask_a':'label'})


# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#cleaned_teo_test = test_without_columns['teo'].rename(columns = {'subtask_a':'label'})

In [None]:
# Replace the TEO entry with the frame whose label column is named 'label'.
train_without_columns['teo'] = cleaned_teo_train

# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#test_without_columns['teo'] = cleaned_teo_test

In [None]:
# The WIC dataset has an incorrect column order (according to the paper's examples it is supposed to have the following order:
# 'sentence1', 'sentence2', 'phrase2', 'label'; the third position is the 'word' column in this dataset). It will be changed.

# Target column order for WIC: both sentences first, then the word, then the label.
new_order = ['sentence1', 'sentence2', 'word', 'label']

# Copy first so the stored frame is left untouched, then select the columns in the new order.
cleared_wic_train = train_without_columns['wic'].copy()[new_order]


# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#cleared_wic_test = test_without_columns['wic'].copy()
#cleared_wic_test = cleared_wic_test[new_order]

In [None]:
# Replace the WIC entry with the re-ordered frame.
train_without_columns['wic'] = cleared_wic_train


# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#test_without_columns['wic'] = cleared_wic_test

In [None]:
# The WINO dataset has the 'answer' column, which represents the labels. It will be renamed to 'label'.

# rename() returns a new frame; the entry in train_without_columns is replaced in the next cell.
cleared_wino = train_without_columns['wino'].rename(columns = {'answer':'label'})


# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#cleared_wino = test_without_columns['wino'].rename(columns = {'answer':'label'})

In [None]:
# Replace the WINO entry with the frame whose label column is named 'label'.
train_without_columns['wino'] = cleared_wino

# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#test_without_columns['wino'] = cleared_wino

In [7]:
import pandas as pd
from typing import Dict

def change_labels_in_datasets_with_optional_mappings(
    mappings: Dict[str, dict],
    datasets: Dict[str, pd.DataFrame]
) -> Dict[str, pd.DataFrame]:
    """
    Translate the 'label' values of every dataset with its own mapping.

    Each dataset is looked up by name in `mappings`. When a mapping exists, the
    values of the 'label' column that appear as keys are replaced by the mapped
    values; values without a key stay as they are. Datasets without a mapping
    are copied unchanged. The input DataFrames are never modified.

    Args:
        mappings (Dict[str, dict]): Dataset name -> {original label: new label}.
        datasets (Dict[str, pd.DataFrame]): Dataset name -> DataFrame that has
                                            a 'label' column.

    Returns:
        Dict[str, pd.DataFrame]: Dataset name -> relabelled copy.

    Raises:
        ValueError: If a DataFrame has no 'label' column.
    """
    relabelled = {}

    for name, frame in datasets.items():
        if 'label' not in frame.columns:
            raise ValueError(f"Dataset '{name}' does not contain a 'label' column.")

        frame_copy = frame.copy()
        label_map = mappings.get(name)
        if label_map is not None:
            frame_copy['label'] = frame_copy['label'].replace(label_map)
        relabelled[name] = frame_copy

    return relabelled


In [None]:
# For TREC, WSC, and SUBJ datasets, there is no need to change values in the 'label' columns, 
# because they already contain text labels.

# Per-dataset translation of the raw label values (keys) into the natural-language
# labels (values) used in the prompts. Keys are integers for most datasets and
# strings for 'trec', 'teo' and the stance datasets ('teab', 'teat', 'tefe', 'tehi').
# The table is consumed by change_labels_in_datasets_with_optional_mappings; label
# values that have no key in a dataset's mapping are left unchanged.
# NOTE(review): the comment above this table says TREC and WSC need no mapping,
# yet both have entries here - confirm which statement is intended.
mappings = {
    'rte': {0: 'entailment', 1: 'not entailment'},
    'wnli': {0: 'not entailment', 1: 'entailment'},
    'qnli': {0: 'entailment', 1: 'not entailment'},
    'mnli': {0: 'entailment', 1: 'neutral', 2: 'contradiction'},
    'snli': {-1: 'unknown', 0: 'entailment', 1: 'neutral', 2: 'contradiction'},
    'cb': {0: 'entailment', 1: 'contradiction', 2: 'neutral'},
    'sst2': {0: 'negative', 1: 'positive'},
    'rt': {0: 'negative', 1: 'positive'},
    'qqp': {0: 'not duplicate', 1: 'duplicate'},
    'mrpc': {0: 'not equivalent', 1: 'equivalent'},
    'pawsx': {0: 'not paraphrase', 1: 'paraphrase'},
    'copa': {0: 'choice 1', 1: 'choice 2'},
    'piqa': {0: 'choice 1', 1: 'choice 2'},
    'agn': {0:'world', 1:'sports', 2:'business', 3:'science/technology'},
    # TREC labels are already text; the mapping only turns the plural names into singular ones.
    'trec': {'entities': 'entity', 'description and abstract concepts': 'description and abstract concept',
             'human beings': 'human being', 'numeric values': 'numeric value', 'locations': 'location'},
    'wsc': {0:'false', 1:'true'},
    'teo': {'OFF':'offensive', 'NOT':'not offensive'},
    'tei': {0:'not irony', 1:'irony'},
    'wic': {0:'false', 1:'true'},
    'cola': {0:'unacceptable', 1:'acceptable'},
    'wino': {1:'choice 1', 2:'choice 2'},
    # evaluation datasets
    'teh' : {1:'hate', 0: 'not hate'},
    'teab' : {'NONE': 'none', 'FAVOR': 'favor', 'AGAINST': 'against'},
    'teat' : {'NONE': 'none', 'FAVOR': 'favor', 'AGAINST': 'against'},
    'tefe' : {'NONE': 'none', 'FAVOR': 'favor', 'AGAINST': 'against'},
    'tehi' : {'NONE': 'none', 'FAVOR': 'favor', 'AGAINST': 'against'},
    'adec' : {1: 'adverse drug event', 2: 'not adverse drug event'},
    'or' : {1: 'not overruling', 2: 'overruling'},
    'sot' : {1: 'company', 2: 'research institute', 3: 'university'},
    'tos' : {1: 'not potentially unfair', 2: 'potentially unfair'},
    'tc' : {1: 'complaint', 2: 'no complaint'}
}

In [None]:
# Relabelled copies of the training frames (raw label values -> natural-language labels).
train_datsets = change_labels_in_datasets_with_optional_mappings(
    mappings=mappings,
    datasets=train_without_columns,
)

In [None]:
# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#test_datsets = change_labels_in_datasets_with_optional_mappings(mappings, test_without_columns)

In [9]:
import pandas as pd

def structuring_dataset(dataset, dataset_name, label_col='label'):
    """
    Convert a dataset into the unified 'text' / 'dataset_name' / 'label' layout.

    For every row, the values of all columns except `label_col` are converted
    to strings and joined with ' \\n ' (in column order) to form 'text'.

    The input DataFrame is not modified: the helper columns are added to a
    copy. This keeps the function idempotent, so calling it again on the same
    frame (for example when the notebook cell is re-run) produces the same
    result instead of folding previously added 'text' / 'dataset_name'
    columns into the new text.

    Args:
        dataset (pd.DataFrame): Input dataset.
        dataset_name (str): Name of the dataset, stored in 'dataset_name'.
        label_col (str): Name of the label column. Default is 'label'.

    Returns:
        pd.DataFrame: New DataFrame with the columns 'text', 'dataset_name'
                      and 'label'.

    Raises:
        ValueError: If `dataset` is not a pandas DataFrame.
    """
    if not isinstance(dataset, pd.DataFrame):
        raise ValueError("The input dataset must be a pandas DataFrame.")

    def concatenate_with_custom_logic(row):
        # Join every non-label value of the row, in column order.
        combined_values = [f"{value}" for col, value in row.items() if col != label_col]
        return ' \n '.join(combined_values).strip()

    # Work on a copy so the caller's frame keeps its original columns.
    structured = dataset.copy()
    structured['text'] = structured.apply(concatenate_with_custom_logic, axis=1)
    structured['dataset_name'] = dataset_name

    # Expose the label under the fixed name 'label', whatever label_col is called.
    structured['label'] = structured[label_col]

    # Return only the necessary columns
    return structured[['text', 'dataset_name', 'label']]


In [None]:
# Quick visual check of one relabelled dataset (WIC) before it is restructured.
test = train_datsets['wic']
print(test)

In [None]:
# Task name -> frame in the unified 'text' / 'dataset_name' / 'label' layout.
pre_train_dataset = {
    task_name: structuring_dataset(dataset=labelled_frame, dataset_name=task_name)
    for task_name, labelled_frame in train_datsets.items()
}

In [None]:
# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.

#pre_test_dataset = {
#    name: structuring_dataset(dataset, name)
#    for name, dataset in test_datsets.items()
#}

In [None]:
# Stack all unified frames into one DataFrame with a fresh 0..n-1 index.
train_dataset = pd.concat(list(pre_train_dataset.values()), ignore_index=True)

In [None]:
# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
# test_dataset = pd.concat(pre_test_dataset.values(), ignore_index=True)

In [None]:
# Sanity check: list the dataset names present in the combined frame.
train_dataset['dataset_name'].unique()

In [None]:
# Make sure that each dataset contains no more than 25k examples.

def creating_balanced_dataset(dataset, max_examples=25000, random_state=42):
    """
    Cap the number of examples that each dataset contributes.

    Rows are grouped by 'dataset_name'. A group with more than `max_examples`
    rows is down-sampled to exactly `max_examples` rows; smaller groups are
    kept in full. The groups are iterated explicitly instead of going through
    `groupby(...).apply(...)`, so the result does not depend on pandas passing
    the grouping column to the applied function (newer pandas versions
    deprecate that) and 'dataset_name' is always present in the output.

    Args:
        dataset (pd.DataFrame): Frame that contains a 'dataset_name' column.
        max_examples (int): Maximum number of rows kept per dataset.
            Defaults to 25000.
        random_state (int): Seed used when a group is down-sampled.
            Defaults to 42.

    Returns:
        pd.DataFrame: The capped groups stacked in sorted 'dataset_name' order,
        with a fresh 0..n-1 index.
    """
    capped_groups = [
        group.sample(n=max_examples, random_state=random_state)
        if group.shape[0] > max_examples
        else group
        for _, group in dataset.groupby('dataset_name')
    ]

    # pd.concat raises on an empty list, so an input without any group
    # yields an empty frame with the same columns instead.
    if not capped_groups:
        return dataset.iloc[0:0].reset_index(drop=True)

    # Reset the index of the resulting dataset
    return pd.concat(capped_groups).reset_index(drop=True)


In [None]:
# Limit every dataset to the maximum number of examples allowed by creating_balanced_dataset.
balanced_train_dataset = creating_balanced_dataset(train_dataset)

In [None]:
# Sanity check: number of examples kept per dataset after balancing.
balanced_train_dataset['dataset_name'].value_counts()

In [None]:
# Sanity check: number of distinct label values across all datasets.
len(balanced_train_dataset['label'].unique())

In [None]:
# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#balanced_test_dataset = creating_balanced_dataset(test_dataset)

In [None]:
# The function below constructs training or evaluating datasets that will be given to a pre-trained model during training or
#  to the symbol-tuned model during evaluation, respectively.

# The output for the training dataset will be a pandas DataFrame with the following columns: 'text', 'labels', 'num_exemplars', 'original_labels', and 'remappings'.
# Where 'text' corresponds to the inputs of prompts, 'labels' the outputs of the prompts, 'num_exemplars' the number of in-context examples per class,
# 'original_labels' is the list of original labels per dataset, and 'remappings' corresponds to the original labels' remappings.
# This structure makes it easier to construct full few-shot prompts. These are the elements of the final training dataset.
# Note that we use two terms, 'training dataset' and 'final training dataset' in this work.
# The reason is that with the function below, we create a training dataset, but it is not the last stage of it, as its elements need to be formatted into the final template,
# but the formatting happens inside SFTTrainer because we provide the trainer with the final template and the training dataset as one of the arguments.

# For evaluation dataset we follow the same idea: the function below constructs the evaluation dataset with the columns: 'text', 'labels', 'task', 'instruction',
# 'relevant label', and 'prompt'. Where 'text' corresponds to the inputs of prompts, 'labels' the outputs of the prompts, 'task' the dataset name, 'instruction'
# the boolean variable that specifies whether this element of the evaluation dataset contains instruction or not,
#  'relevant label' the boolean variable that specifies whether this element of the evaluation dataset contains relevant labels or not,
# 'prompt' contains an instruction prompt.
# With this structure, constructing the final evaluation dataset will be more convenient. It will be constructed using the .generate() function from HuggingFace
# using the evaluation dataset and the corresponding final template.

In [None]:
# Connect to the function that generates random arbitrary symbols. The source: https://github.com/JerryWeiAI/symbol-tuning
# The function was downloaded and added to the folder with this file for convenience.

from generate_symbols.generate_symbols import generate

In [None]:
def process_row(row, random_prompt_number, is_instruction_prompt = False, dataset_name = None):
    """
    Format one example as a prompt fragment.

    Args:
        row: Either a plain string (an evaluation example: the string is the
            input text and the label part of the prompt stays empty) or a
            mapping-like row providing 'text' and 'label'.
        random_prompt_number (int): Index of the input/output template to use
            when no instruction prompt is built.
        is_instruction_prompt (bool): Build the instruction-style prompt. Only
            has an effect together with a truthy `dataset_name`.
        dataset_name (str): Key of the instruction to place before the input.

    Returns:
        str: The formatted prompt.
    """
    # A bare string is an evaluation example: its label is kept in a separate
    # column of the final dataset and must not appear in the prompt.
    if isinstance(row, str):
        input_text = row
        label_text = ""
    else:
        input_text = row['text']
        label_text = row['label']

    if not (is_instruction_prompt and dataset_name):
        # Input/output templates; the caller selects one by index.
        templates = (
            "Input: {input_text} \n Output: {label_text}",
            "Input: {input_text} \n Target: {label_text}",
            "Input: {input_text} \n Symbol: {label_text}",
            "Input: {input_text} \n Label: {label_text}",
            "Question: {input_text} \n Answer: {label_text}",
            "Student: {input_text} \n Teacher: {label_text}",
            "X = {input_text} \n Y = {label_text}",
            "Q: {input_text} \n A: {label_text}",
            "{input_text} -> {label_text}",
            "Sentences: {input_text} \n Mapped To: {label_text}",
        )
        return templates[random_prompt_number].format(input_text=input_text, label_text=label_text)

    # Task instructions used for the evaluation datasets.
    instructions = {
        'subj' : 'Is the following sentence subjective or objective?',
        'teh' : 'Label the following tweet based on whether it contains hate speech.',
        'teab' : 'Read the following tweet and determine its stance on abortion.',
        'teat' : 'Read the following tweet and determine its stance on atheism.',
        'tefe' : 'Read the following tweet and determine its stance on feminism.',
        'tehi' : 'Read the following tweet and determine its stance on Hillary Clinton.',
        'adec' : 'Label the following sentence based on whether it is related to an adverse drug event.',
        'or' : 'Label the following sentence based on whether it is overruling or not.',
        'sot' : 'Read the following paper title and institution name and classify the institution as a university, company, or research institute.',
        'tos' : 'Label the following sentence from a Terms of Service based on whether it is potentially unfair.',
        'tc' : 'Label the following tweet text based on whether it contains a complaint.'
    }
    return f"Question: {instructions[dataset_name]} \n {input_text} \n Answer: {label_text}"


def dataset_with_remapped_labels(dataset, remapp_dic):
    """
    Replace the values of the 'label' column through `remapp_dic`.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. A TypeError is raised when `remapp_dic` is not
    a dictionary.
    """
    if isinstance(remapp_dic, dict):
        dataset['label'] = dataset['label'].map(remapp_dic)
        return dataset
    raise TypeError(f"Expected remapping variable needs to be a dictionary, got {type(remapp_dic)}")

In [None]:
import random
import pandas as pd
import re
import gc
    

def creating_final_datasets(dataset, prompts_number, test_dataset, groupedby,
                                         is_train_dataset):
    """
    Build the prompt-level training or evaluation dataset for one task.

    Each generated element consists of in-context examples sampled per class
    from `dataset`, followed by one example sampled from `test_dataset` whose
    label is left out of the prompt and stored in a separate column.

    Args:
        dataset (pd.DataFrame): Pool of in-context examples. Must consist of
            exactly the columns 'text', 'dataset_name' and 'label'.
        prompts_number (int): Number of elements to generate.
        test_dataset (pd.DataFrame): Pool from which the final example of each
            element is drawn; same three columns as `dataset`. The example is
            sampled independently of the in-context examples, so when this is
            the same frame as `dataset` it may also occur among them.
        groupedby (str): Task name. Stored in the 'task' column and used to
            look up the instruction text in evaluation mode.
        is_train_dataset (bool): True builds training elements (2-10 exemplars
            per class, labels always remapped to generated symbols). False
            builds evaluation elements (4 exemplars per class) and walks
            through four settings of `prompts_number // 4` elements each:
            instructions + relevant labels, relevant labels only, neither,
            instructions only. Leftover elements stay in the last setting.

    Returns:
        pd.DataFrame: Training mode - columns 'text', 'labels',
        'num_exemplars', 'original_labels', 'remappings'. Evaluation mode -
        columns 'text', 'labels', 'task', 'instruction', 'relevant label',
        'prompt'. An empty DataFrame is returned (after printing a message)
        when an input frame does not have the required structure or
        `test_dataset` is empty, so callers can always check `.empty`.
    """
    required_columns = ['text', 'dataset_name', 'label']

    # Catch the cases when one of the datasets (the one for in-context examples
    # or the one for evaluation examples) does not have the proper structure:
    # exactly the three required columns.
    for frame in (dataset, test_dataset):
        if len(frame.columns) != 3 or not all(col in frame.columns for col in required_columns):
            print("The dataset for in-context examples or the dataset for evaluation examples doesn't have a proper structure.")
            return pd.DataFrame()

    final_dataset = []  # List to store the final dataset

    # Creating a dictionary to map the labels from the current dataset to arbitrary symbols
    current_labels = dataset['label'].unique().tolist()
    num_labels = len(current_labels)
    is_eval = not is_train_dataset
    random_labels = generate(num_labels, is_eval)
    remapping = dict(zip(current_labels, random_labels))

    if not is_train_dataset:
        # Number of prompts for each setting:
        # with instructions/without instructions, with relevant labels/without relevant labels.
        # At least 1, so that fewer than four prompts do not cause a division by zero below.
        promt_num_per_set = max(1, prompts_number // 4)

        # Initial settings
        instruct = True
        rel_labels = True

    # Prepare the test dataset for evaluation examples
    if test_dataset.empty:
        print("The test dataset for evaluation example creation is empty.")
        return pd.DataFrame()

    # Split the in-context pool by label once. Iterating over the groups keeps
    # the 'label' column in every group frame and avoids re-grouping the whole
    # pool for each generated prompt.
    label_groups = [group for _, group in dataset.groupby('label')]

    # Creation of final dataset with prompts and remapped labels
    for i in range(prompts_number):

        # Set examples per class
        if is_train_dataset:
            exemplars_per_class = random.randint(2, 10)
        else:
            exemplars_per_class = 4

        # Sample examples from each class (all of them when a class is smaller
        # than the requested number), each class with its own random seed.
        sampled_parts = [
            group.sample(n=min(len(group), exemplars_per_class), random_state=random.randint(0, 9999))
            for group in label_groups
        ]
        if sampled_parts:
            sampled_dataset = pd.concat(sampled_parts).reset_index(drop=True)
        else:
            sampled_dataset = dataset.iloc[0:0].reset_index(drop=True)

        # Switch for constructing the elements of the evaluation dataset within 4 settings:
        # (instructions, relevant labels) -> (no instructions, relevant labels)
        # -> (no instructions, no relevant labels) -> (instructions, no relevant labels)
        if not is_train_dataset:
            if i % promt_num_per_set == 0 and i != 0:
                if instruct and rel_labels:
                    instruct = not instruct
                elif not instruct and rel_labels:
                    rel_labels = not rel_labels
                elif not instruct and not rel_labels:
                    instruct = not instruct

        # Remap labels if needed
        sampled_dataset_new = (
            dataset_with_remapped_labels(sampled_dataset, remapping)
            if (is_train_dataset or not rel_labels)
            else sampled_dataset
        )

        # Generate a random prompt number
        random_prompt_number = random.randint(0, 9)

        # Process rows to generate prompts
        processed_rows = sampled_dataset_new.apply(
            lambda row: process_row(row, random_prompt_number, instruct, groupedby) if not is_train_dataset
            else process_row(row, random_prompt_number),
            axis=1
        ).dropna()

        if processed_rows.empty:
            print('There is not an in-context example.')

        cell = " \n ".join(processed_rows).strip()

        # Create an evaluation example
        sample_test = test_dataset.sample(n=1, random_state=random.randint(0, 9999))  # Ensure one sample
        result_text = sample_test.iloc[0]['text']  # Extract only 'text' since 'label' will be in a separate column

        # Process the evaluation example
        evaluation_ex = process_row(
            row=result_text,
            random_prompt_number=random_prompt_number,
            is_instruction_prompt=instruct if not is_train_dataset else False,
            dataset_name=groupedby if not is_train_dataset else None
        )

        if evaluation_ex is None:
            continue

        # Combine the cell and evaluation_ex into the final cell
        final_cell = cell + ' \n ' + evaluation_ex

        label_value = sample_test['label'].iloc[0]  # Extract label safely

        if is_train_dataset or not rel_labels:
            label = remapping[label_value]  # Apply remapping
        else:
            label = label_value  # Use the original label

        if is_train_dataset:
            final_dataset.append({
                'text': final_cell,
                'labels': label,
                'num_exemplars': exemplars_per_class,
                'original_labels': current_labels,
                'remappings': random_labels
            })
        else:
            # Human-readable description of the setting this element belongs to.
            original_labels = str(current_labels)
            remapped_labels = str(random_labels)
            if instruct and rel_labels:
                prompt = f"This prompt contains relevant labels and instructions. The original natural language labels are {original_labels}."
            elif not instruct and rel_labels:
                prompt = f"This prompt contains relevant labels but no instructions. The natural language labels are {original_labels}."
            elif not instruct and not rel_labels:
                prompt = f"This prompt contains no relevant labels and no instructions. The original natural language labels {original_labels} have been remapped to {remapped_labels}, respectively."
            else:
                prompt = f"This prompt contains no relevant labels but has instructions. The original natural language labels {original_labels} have been remapped to {remapped_labels}, respectively."

            final_dataset.append({
                'text': final_cell,
                'labels': label,
                'task': groupedby,
                'instruction': instruct,
                'relevant label': rel_labels,
                'prompt': prompt
            })

        if i % 1000 == 0:
            print(f"Processed {i} prompts...")

    return pd.DataFrame(final_dataset)


In [None]:
# Number of prompts generated for every dataset.
PROMPTS_PER_DATASET = 25000

final_train_dataset = {}
grouped_datasets = balanced_train_dataset.groupby('dataset_name')

# These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
# Since the model's performance did not differ, there is no need to run these lines.
#grouped_datasets = balanced_test_dataset.groupby('dataset_name')

for group_key, dataset in grouped_datasets:

    # These lines were used for a comparison: whether taking the in-context and evaluation examples from different splits affects the performance of the resulting model.
    # Since the model's performance did not differ, there is no need to run these lines.
    #corres_test_dataset = balanced_test_dataset.loc[balanced_test_dataset['dataset_name'] == group_key].copy()

    # Call creating_final_datasets for each group; the same frame serves as the
    # pool for the in-context examples and for the final example of each prompt.
    processed_dataset = creating_final_datasets(dataset=dataset, prompts_number = PROMPTS_PER_DATASET,
                                                test_dataset = dataset,
                                                groupedby=group_key, is_train_dataset = True)

    # Ensure that the processed dataset is valid before adding to the dictionary.
    # Both None and an empty frame are treated as invalid, so a rejected dataset
    # produces the warning below instead of an AttributeError on `.empty`.
    if processed_dataset is not None and not processed_dataset.empty:
        final_train_dataset[group_key] = processed_dataset
    else:
        print(f"Warning: Empty or invalid dataset returned for {group_key}")


# Concatenate the results into a final DataFrame
final_train_df = pd.concat(final_train_dataset.values(), ignore_index=True)

In [None]:
# Load the pre-trained model and the corresponding tokenizer

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, AutoTokenizer, AutoConfig
import torch

In [None]:
# !!! Make sure to get access to the models on their corresponding pages on HuggingFace.
# "meta-llama/Llama-3.1-8B-Instruct": https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
# "meta-llama/Llama-3.2-3B-Instruct": https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
# "google/gemma-7b-it": https://huggingface.co/google/gemma-7b-it

model_checkpoint = "meta-llama/Llama-3.1-8B-Instruct"
#model_checkpoint = "meta-llama/Llama-3.2-3B-Instruct"
#model_checkpoint = "google/gemma-7b-it"

In [None]:
# Load the model configuration
config = AutoConfig.from_pretrained(model_checkpoint)
# The Llama and Gemma configurations expose attention dropout under the name
# `attention_dropout`. The BERT-style attributes that were set here before
# (`hidden_dropout_prob`, `attention_probs_dropout_prob`) are not read by these
# architectures, so the requested dropout was never applied. These models do
# not define a separate hidden-state dropout, so only attention dropout is set.
config.attention_dropout = 0.05  # Dropout rate for attention probabilities

# Load the model with the modified configuration
model = AutoModelForCausalLM.from_pretrained(model_checkpoint, torch_dtype=torch.bfloat16,
                        attn_implementation="flash_attention_2", config=config)

In [None]:
# Tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Pad with the end-of-sequence token.
# NOTE(review): presumably needed because the checkpoint ships without a
# dedicated pad token - confirm for each of the listed models.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model.to('cuda')

In [None]:
# The function that creates elements for the final training dataset

def formatting_prompts_func(examples):
    """Render one training record as the complete symbol-tuning prompt.

    Args:
        examples: Mapping with the keys 'text' (few-shot prompt body),
            'labels' (expected answer), 'num_exemplars', 'original_labels'
            and 'remappings'.

    Returns:
        str: An overview sentence describing the label remapping, followed by
        the '### Prompt:' and '### Answer:' sections.
    """
    prompt_body = examples["text"]
    answer = examples["labels"]
    shots = str(examples['num_exemplars'])
    source_labels = str(examples['original_labels'])
    target_labels = str(examples['remappings'])

    overview = (
        f"### Overview. This prompt contains k = {shots} in-context exemplars per class. "
        f"The original natural language labels {source_labels} have been remapped to {target_labels}, respectively."
    )

    # The whitespace-only lines and the four-space indents are part of the
    # template text itself, so they are spelled out explicitly here.
    sections = [
        overview,
        "    ",
        "    ### Prompt:",
        f"    {prompt_body}",
        "            ",
        "    ### Answer:",
        f"    {answer}",
    ]
    return "\n".join(sections)

In [None]:
# SFTTrainer expects a Hugging Face `Dataset`, so convert the pandas DataFrame first

from datasets import Dataset

final_train_ds = Dataset.from_pandas(final_train_df)

In [None]:
train_val_split = final_train_ds.train_test_split(test_size=0.05, seed=42)

In [None]:
train_val_split

In [None]:
# Configuration for LoRA

from peft import LoraConfig

# LoRA adapter settings handed to SFTTrainer through `peft_config`.
peft_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias = 'none',  # bias terms are not trained
    task_type="CAUSAL_LM",  # the adapter targets a causal language model
)

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import Trainer, TrainingArguments

# Supervised fine-tuning configuration. With the values below, one optimizer
# step accumulates per_device_train_batch_size * gradient_accumulation_steps
# = 4 * 8 = 32 sequences per device.
# NOTE(review): argument names such as `evaluation_strategy`, `max_seq_length`
# and the trainer's `tokenizer=` are version-dependent in transformers/TRL -
# confirm them against the installed releases.
training_args = SFTConfig(
    output_dir="./results",  # the evaluation cells later read the adapter from this path
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=2000,  # training length is given in optimizer steps
    evaluation_strategy="steps",
    eval_steps=500,  # evaluate on the held-out split every 500 steps
    save_strategy="steps",
    save_steps=500,  # checkpoint at the same cadence as evaluation
    bf16=True,  # matches the torch.bfloat16 dtype the model was loaded with
    gradient_accumulation_steps=8,  # Reduced gradient accumulation steps
    gradient_checkpointing=True,  # Reduce memory usage
    dataloader_num_workers=4,  # Use multiple workers for data loading
    dataloader_pin_memory=True,  # Speed up data transfer to GPU
    packing=True, # Responsible for chunking dataset to the context length and packing it
    max_seq_length=512, # Context length
    save_safetensors=False,
    weight_decay=0.1,
    learning_rate=2e-05,
)

# Trainer that wraps the base model with the LoRA adapter described by
# `peft_config` and renders every record through `formatting_prompts_func`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split['train'],
    eval_dataset=train_val_split['test'],  # the 5% split held out above
    tokenizer=tokenizer,
    peft_config=peft_config,
    formatting_func = formatting_prompts_func, # Creation of full few-shot prompts

)

# Run fine-tuning for `max_steps` optimizer steps.
trainer.train()

In [None]:
trainer.save_model()

In [None]:
import matplotlib.pyplot as plt

# Extract training and validation losses from the Trainer's log history,
# together with the optimizer step at which each value was logged. Plotting
# against the step (rather than the list index) keeps the two curves aligned
# even when logging and evaluation run at different intervals.
train_log = [(log['step'], log['loss']) for log in trainer.state.log_history if 'loss' in log]
eval_log = [(log['step'], log['eval_loss']) for log in trainer.state.log_history if 'eval_loss' in log]

train_steps = [step for step, _ in train_log]
train_losses = [loss for _, loss in train_log]
eval_steps_logged = [step for step, _ in eval_log]
eval_losses = [loss for _, loss in eval_log]

# Plot the loss curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_steps, train_losses, marker="o", label="Training Loss")
ax.plot(eval_steps_logged, eval_losses, marker="o", label="Validation Loss")
ax.set_xlabel("Training step")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Curves")
ax.legend()
ax.grid()
plt.show()

In [None]:
### EVALUATION

In [None]:
from datasets import load_dataset

# Evaluation datasets

# TEAB, TEAT, TEFE and TEHI all come from the same SemEval-2016 Task 6
# validation split and are separated by target afterwards, so the split is
# loaded once and shared instead of being requested four times.
semeval_stance_validation = load_dataset("krishnagarg09/SemEval2016Task6", split = 'validation')

eval_datasets = {
    'subj' : load_dataset("SetFit/subj", split = 'test'),
    'teh' : load_dataset("christinacdl/HatEval_2019_Test_Set_Task5", split = 'train'),
    'teab' : semeval_stance_validation,
    'teat' : semeval_stance_validation,
    'tefe' : semeval_stance_validation,
    'tehi' : semeval_stance_validation,
    'adec' : load_dataset("ought/raft", "ade_corpus_v2", split = 'train'),
    'or' : load_dataset("ought/raft", "overruling", split = 'train'),
    'sot' : load_dataset("ought/raft", "semiconductor_org_types", split = 'train'),
    'tos' : load_dataset("ought/raft", "terms_of_service", split = 'train'),
    'tc' : load_dataset("ought/raft", "twitter_complaints", split = 'train')
}


Repo card metadata block was not found. Setting CardData to empty.


In [None]:
# Format the datasets to have one structure: ['text', 'label']. Where 'label' column contains text values.

In [None]:
# TEAB, TEAT, TEFE, TEHI datasets are combined in the source. We need to separate them.

# Value of the 'Target' column that identifies each stance subset.
stance_targets = {
    'teab': 'Legalization of Abortion',
    'teat': 'Atheism',
    'tefe': 'Feminist Movement',
    'tehi': 'Hillary Clinton',
}


def _has_target(target):
    """Return a predicate that keeps examples whose 'Target' equals `target`."""
    return lambda example: example['Target'] == target


for stance_name, stance_target in stance_targets.items():
    eval_datasets[stance_name] = eval_datasets[stance_name].filter(_has_target(stance_target))

In [None]:
# Remove unnecessary columns from the datasets.

# Apply the notebook's `remove_columns` helper to every evaluation dataset,
# keyed by the same short dataset name.
without_columns_eval = {}
for name, dataset in eval_datasets.items():
    without_columns_eval[name] = remove_columns(name, dataset)

In [None]:
# SUBJ dataset has 'label_text' column that represents labels' text values, the 'label' column isn't needed.

# Drop the numeric 'label' column, then promote 'label_text' to 'label'.
subj_frame = without_columns_eval['subj']
without_columns_eval['subj'] = (
    subj_frame
    .drop(columns = 'label')
    .rename(columns = {'label_text': 'label'})
)

In [None]:
# TEH dataset contains unnecessary column 'text_label'.

without_columns_eval['teh'] = without_columns_eval['teh'].drop(columns = 'text_label')

In [None]:
# TEAB, TEAT, TEFE, TEHI datasets' columns need to be renamed.

# The four stance subsets share one schema, so they are renamed in one loop.
for stance_name in ('teab', 'teat', 'tefe', 'tehi'):
    without_columns_eval[stance_name] = without_columns_eval[stance_name].rename(
        columns = {'Tweet': 'text', 'Stance': 'label'}
    )

In [None]:
# ADEC, OR, SOT, TOS and TC datasets' columns need to be renamed.

change_label_name = ['adec', 'or', 'sot', 'tos', 'tc']

# Source column -> unified column. Every listed text column maps to 'text';
# columns that a given frame does not have are ignored by `rename`.
raft_column_names = {
    'Sentence': 'text',
    'Label': 'label',
    'Paper title': 'text',
    'Tweet text': 'text',
}

for name in list(without_columns_eval):
    if name not in change_label_name:
        continue
    without_columns_eval[name] = without_columns_eval[name].rename(columns = raft_column_names)
                                                                                   

In [None]:
# Reduce number of examples per dataset to 100 random ones.

for name in list(without_columns_eval):
    frame = without_columns_eval[name]
    # Frames with at most 100 rows are kept exactly as they are.
    if frame.shape[0] <= 100:
        continue
    # Fixed seed so the same 100 rows are drawn on every run.
    without_columns_eval[name] = frame.sample(n=100, random_state=42).reset_index(drop=True)

In [None]:
# Change labels' values to text labels.

eval_datasets = change_labels_in_datasets_with_optional_mappings(mappings, without_columns_eval)

In [None]:
# Structure the datasets according to the necessary structure.

# Pass every dataset, together with its short name, through the notebook's
# `structuring_dataset` helper and rebind `eval_datasets` to the result.
structured_eval_datasets = {}
for name, dataset in eval_datasets.items():
    structured_eval_datasets[name] = structuring_dataset(dataset, name)
eval_datasets = structured_eval_datasets

In [24]:
eval_dataset = pd.concat(eval_datasets.values(), ignore_index=True)

In [25]:
eval_dataset

Unnamed: 0,text,dataset_name,label
0,an astute teenager has a major problem that mi...,subj,objective
1,this riveting world war ii moral suspense stor...,subj,subjective
2,the ring just left me cold and wet like i was ...,subj,subjective
3,"a sly female empowerment movie , although not ...",subj,subjective
4,ex-special forces operator frank martin ( jaso...,subj,objective
...,...,...,...
804,@asblough Yep! It should send you a notificati...,tc,no complaint
805,@Wavy2Timez for real,tc,no complaint
806,@KenyaPower_Care no power in south b area... ...,tc,complaint
807,Honda won't do anything about water leaking in...,tc,complaint


In [None]:
# Construct the evaluation dataset.

grouped_datasets = eval_dataset.groupby('dataset_name')

final_eval_dataset = {}
for task_name, task_frame in grouped_datasets:

    # Build 100 prompts for this task; the same frame provides both the
    # in-context exemplars and the evaluation examples.
    task_prompts = creating_final_datasets(task_frame, 100,
                                           task_frame, task_name, False)

    # Skip groups for which nothing usable came back.
    if task_prompts is None or task_prompts.empty:
        print(f"Warning: Empty or invalid dataset returned for {task_name}")
        continue
    final_eval_dataset[task_name] = task_prompts


# Stack the per-task frames into the final evaluation DataFrame.
final_eval = pd.concat(list(final_eval_dataset.values()), ignore_index=True)

In [27]:
final_eval

Unnamed: 0,text,labels,task,instruction,relevant label,prompt
0,Question: Label the following sentence based o...,not adverse drug event,adec,True,True,This prompt contains relevant labels and instr...
1,Question: Label the following sentence based o...,not adverse drug event,adec,True,True,This prompt contains relevant labels and instr...
2,Question: Label the following sentence based o...,adverse drug event,adec,True,True,This prompt contains relevant labels and instr...
3,Question: Label the following sentence based o...,not adverse drug event,adec,True,True,This prompt contains relevant labels and instr...
4,Question: Label the following sentence based o...,not adverse drug event,adec,True,True,This prompt contains relevant labels and instr...
...,...,...,...,...,...,...
1095,Question: Label the following sentence from a ...,64266,tos,True,False,This prompt contains no relevant labels but ha...
1096,Question: Label the following sentence from a ...,64266,tos,True,False,This prompt contains no relevant labels but ha...
1097,Question: Label the following sentence from a ...,thema,tos,True,False,This prompt contains no relevant labels but ha...
1098,Question: Label the following sentence from a ...,64266,tos,True,False,This prompt contains no relevant labels but ha...


In [None]:
# Evaluation for models with adapters

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Paths
# The base checkpoint must be the one the adapter in `lora_path` was trained on.
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
#base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
#base_model_name = "google/gemma-7b-it"

# Same directory as the `output_dir` used by the training configuration.
lora_path = "./results"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(lora_path)

# Load LoRA adapter config
peft_config = PeftConfig.from_pretrained(lora_path)
print("LoRA Config:", peft_config)

# Load base model
# NOTE(review): no `torch_dtype` is passed here, whereas the training cell
# loads the model in torch.bfloat16 - confirm the default precision is
# intended and fits in GPU memory.
model = AutoModelForCausalLM.from_pretrained(base_model_name)
model.to('cuda')

# Load LoRA adapters
model = PeftModel.from_pretrained(model, lora_path)

# Check if LoRA layers are correctly applied
print(model)  # Should list LoRA layers

# Merge for faster inference
model = model.merge_and_unload()

# NOTE(review): the base model was already moved to 'cuda' above, so this
# second move looks redundant - confirm before removing.
model.to('cuda')

In [None]:
model.eval()

In [None]:
# Evaluation for model without adapters

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint evaluated as-is, without any LoRA adapter.
# This cell rebinds the same `tokenizer` and `model` names as the adapter
# cell, so whichever of the two cells ran last decides what gets evaluated.
model_path = "meta-llama/Llama-3.1-8B-Instruct"
#model_path = "meta-llama/Llama-3.2-3B-Instruct"
#model_path = "google/gemma-7b-it"


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load base model
# NOTE(review): no `torch_dtype` is passed, unlike the training cell which
# uses torch.bfloat16 - confirm the default precision is intended.
model = AutoModelForCausalLM.from_pretrained(model_path)

model.to('cuda')

# Print the module tree for a visual check of the loaded architecture.
print(model)

In [None]:
model.eval()

In [None]:
# The function that contains the final template.

def formatting_func_eval(example):
    """Render one evaluation record as the prompt the model has to complete.

    Args:
        example: Mapping (for instance a DataFrame row) with the keys
            'prompt' (overview sentence for the setting) and 'text'
            (few-shot prompt body).

    Returns:
        str: The overview, the prompt body and an open '### Answer:' section.
    """
    overview_sentence = example['prompt']
    prompt_body = example['text']

    # The whitespace-only lines and the four-space indents are part of the
    # template text itself, so they are spelled out explicitly here.
    sections = [
        f"### Overview. {overview_sentence}",
        "    ### Prompt:",
        f"    {prompt_body}",
        "            ",
        "    ### Answer:",
        "    ",
    ]
    return "\n".join(sections)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
from collections import defaultdict
import re


context_length = 512

# task name -> list of accuracies, appended in the order the settings are evaluated
accuracies = defaultdict(list)

def calculate_accuracy(dataset, model, tokenizer, task, instruction, relevant_labels):
    """Evaluate `model` on one task/setting, plot the confusion matrix and
    record the accuracy.

    Args:
        dataset: pandas DataFrame with at least the columns 'text', 'labels'
            and 'prompt'. Rows without a label are ignored.
        model: Model exposing `generate`.
        tokenizer: Tokenizer matching `model`.
        task: Name of the task; used in the output and as the key under which
            the accuracy is stored.
        instruction: Setting flag, used for display only.
        relevant_labels: Setting flag, used for display only.

    Returns:
        None. The accuracy is appended to the module-level
        `accuracies[task]`. Nothing is recorded when the frame lacks a
        required column or contains no labelled rows.
    """
    required_columns = ['text', 'labels', 'prompt']

    # Validate the structure before any column is accessed, so that a
    # malformed frame yields the message below instead of a KeyError.
    if not all(col in dataset.columns for col in required_columns):
        print("The dataset doesn't have a proper structure.")
        return

    # Rows without a gold label cannot be scored. Dropping them here keeps the
    # gold and predicted lists the same length for the confusion matrix.
    dataset = dataset.dropna(subset=['labels'])
    if dataset.empty:
        print(f"No labelled examples for {task}; nothing to evaluate.")
        return

    # Normalised gold labels; str() guards against non-string label values.
    gold_labels = [str(label).strip().lower() for label in dataset['labels'].tolist()]

    # Unique labels, longest first, so that a label which is a prefix of
    # another one cannot shadow the longer label in the prefix match below.
    candidate_labels = sorted(set(gold_labels), key=len, reverse=True)

    class_labels = dataset['labels'].unique().tolist()
    # NOTE(review): the Hugging Face generation docs describe a flat
    # List[List[int]] as a list of phrases that must ALL appear, and a nested
    # List[List[List[int]]] as the form for "one of these" constraints. Also,
    # add_special_tokens=True may prepend special tokens to every label. Both
    # are left unchanged because altering them changes the reported numbers -
    # confirm the intended constraint.
    class_token_ids = [tokenizer.encode(label, add_special_tokens=True) for label in class_labels]

    # Follow the model's own device instead of assuming 'cuda'.
    device = getattr(model, 'device', 'cuda')

    predicted_labels = []

    # Generate a prediction for every row and map it onto a known label
    for _, row in dataset.iterrows():

        # Tokenize the input text
        # NOTE(review): with truncation=False, `max_length` is presumably not
        # enforced, so prompts longer than `context_length` are passed through
        # in full - confirm that this is intended.
        input_encodings = tokenizer(
            formatting_func_eval(row),
            truncation=False,
            max_length=context_length,
            return_tensors="pt",
            return_attention_mask=True,
            add_special_tokens=True
        )
        input_ids = input_encodings['input_ids'].to(device)

        # Generate output conditioned by the provided inputs
        generated_ids = model.generate(
            input_ids,
            attention_mask=input_encodings['attention_mask'].to(device),
            max_new_tokens=10,
            do_sample=False,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            force_words_ids=class_token_ids,  # see the review note above
            num_beams=2, # beam-search decoding, because num_beams > 1 and do_sample=False
        )

        # Keep only the newly generated tokens, without the provided input
        new_tokens = generated_ids[0, input_ids.shape[1]:]

        # Decode the generated token IDs into text
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip().lower()

        # First known label the generation starts with; 'other' if none does
        final_label = next(
            (label for label in candidate_labels if generated_text.startswith(label)),
            'other',
        )
        predicted_labels.append(final_label)

    correct_predictions = sum(
        gold == predicted for gold, predicted in zip(gold_labels, predicted_labels)
    )
    setting_accuracy = correct_predictions / len(gold_labels)

    # Sorted for a reproducible axis order; 'other' is added only when it was
    # predicted and is not already a real label.
    labels = sorted(set(gold_labels))
    if 'other' in predicted_labels and 'other' not in labels:
        labels.append('other')

    confusion_matrix = metrics.confusion_matrix(gold_labels, predicted_labels, labels = labels)

    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels = labels)
    cm_display.plot()
    plt.title(f"Confusion Matrix for {task} with settings:\n Instructions={instruction} and original labels={relevant_labels}")
    plt.show()

    print(
        f"Accuracy for {task} with settings:\n Instructions={instruction} \n"
        f"          and original labels={relevant_labels} is {setting_accuracy:.4f}"
    )

    accuracies[task].append(setting_accuracy)
    

In [None]:
# Evaluation of the symbol-tuned model in 4 settings

from pandas.api.types import CategoricalDtype

# Define the custom sorting order for 'instruction' and 'relevant label'
instruction_order = [False, True]  # False comes first
relevant_label_order = [False, True]  # False comes first

# Convert columns to categorical types with the specified order
final_eval['instruction'] = final_eval['instruction'].astype(CategoricalDtype(instruction_order, ordered=True))
final_eval['relevant label'] = final_eval['relevant label'].astype(CategoricalDtype(relevant_label_order, ordered=True))

# Sort the DataFrame according to the custom order
final_eval = final_eval.sort_values(by=['task', 'instruction', 'relevant label'])

group_columns = ['task', 'instruction', 'relevant label']
# observed=True: iterate only over setting combinations that actually occur,
# so no empty group is built for an unused category combination.
grouped_datasets = final_eval.groupby(group_columns, observed=True)

# Start from an empty record. The averages computed afterwards read the
# per-task lists by position, so results left over from an earlier run
# (for example of another model) would silently be used instead of these.
accuracies.clear()

# Per task the groups arrive in the order (instruction, relevant label) =
# (False, False), (False, True), (True, False), (True, True).
for group_keys, group_dataset in grouped_datasets:
    struct_dataset = group_dataset.drop(columns = group_columns).reset_index(drop=True)
    calculate_accuracy(struct_dataset, model, tokenizer, group_keys[0], group_keys[1], group_keys[2])

In [None]:
# Dictionary with the calculated accuracies

accuracies

In [None]:
# Calculate average accuracy among settings and tasks

In [None]:
# Collect the accuracies per setting. The position inside each task's list
# identifies the setting:
#   0 - no instructions, remapped labels
#   1 - no instructions, original labels
#   2 - instructions, remapped labels
#   3 - instructions, original labels
setting_scores = [[], [], [], []]

for task_scores in accuracies.values():
    for position, score in enumerate(task_scores[:len(setting_scores)]):
        if score is not None:
            setting_scores[position].append(score)

# Sums per setting
no_inst_irrel_lb, no_inst_rel_lb, instr_irrel_lb, inst_rel_lb = (
    sum(scores) for scores in setting_scores
)


def _mean_or_nan(scores):
    """Average of `scores`, or NaN when no task contributed a value."""
    return sum(scores) / len(scores) if scores else float('nan')


# Each setting is averaged over the tasks that actually reported it, so a
# task with a missing setting no longer lowers the average, and an empty
# `accuracies` dictionary no longer raises ZeroDivisionError.
acc_no_inst_irrel_lb, acc_no_inst_rel_lb, acc_instr_irrel_lb, acc_inst_rel_lb = (
    _mean_or_nan(scores) for scores in setting_scores
)


print(f"""
      Average Model Accuracy for settings: without instructions and with remapped labels is {acc_no_inst_irrel_lb:.4f}\n 
      Average Model Accuracy for settings: without instructions and with original labels is {acc_no_inst_rel_lb:.4f}\n
      Average Model Accuracy for settings: with instructions and with remapped labels is {acc_instr_irrel_lb:.4f}\n
      Average Model Accuracy for settings: with instructions and with original labels is {acc_inst_rel_lb:.4f}
      """)


In [None]:
# Additional blocks to evaluate models on 5-shot MMLU if needed.

In [None]:
def format_fun_mmlu(example):
    """Render one MMLU record as an overview followed by the prompt section.

    Args:
        example: Mapping with the key 'text' holding the question text.

    Returns:
        str: The fixed overview naming the answer letters, then a
        '### Prompt:' section containing `example['text']`.
    """
    question = example['text']

    # Line breaks and indents are part of the template text itself.
    sections = (
        "### Overview. This prompt contains relevant labels and instructions. The original natural language labels",
        "are [“A”, “B”, “C”, “D”].",
        "    ### Prompt:",
        f"    {question}",
        "    ",
    )
    return "\n".join(sections)

In [None]:

from datasets import load_dataset

mmlu_5shot = load_dataset("FuryMartin/Ianvs-MMLU-5-shot")
mmlu_5shot = mmlu_5shot['test']
mmlu_5shot_fin = mmlu_5shot.remove_columns(["prompt", 'explanation', 'level_1_dim', 'level_2_dim', 'level_3_dim', 'level_4_dim'])
mmlu = mmlu_5shot_fin.to_pandas()
mmlu = mmlu.rename(columns = {'query': 'text', 'response': 'labels'})

# `calculate_accuracy` only accepts frames that have a 'prompt' column and
# puts its content on the "### Overview." line. The dataset's own 'prompt'
# column is removed above, so without this column the evaluation call would
# stop at the structure check and score nothing.
# NOTE(review): the sentence mirrors the overview used in `format_fun_mmlu` -
# confirm this is the overview wanted for MMLU.
mmlu['prompt'] = (
    "This prompt contains relevant labels and instructions. "
    "The original natural language labels are [“A”, “B”, “C”, “D”]."
)

mmlu.head()

In [None]:
calculate_accuracy(mmlu, model, tokenizer, 'MMLU 5-shot', True, True)