# 1. Load data

Loads the data into the notebook

In [None]:
from IPython.display import Audio, display
def notify():
    audio_url = 'https://upload.wikimedia.org/wikipedia/commons/5/58/Zip_tone.ogg'
    display(Audio(audio_url, autoplay=True))

#!pip install --upgrade transformers

#!pip install transformers[torch]

# !pip install --upgrade scikit-learn
# !pip install umap-learn
# !pip install scikit-learn
# !pip install datasets
# !pip install catboost
# !pip install shap
# !pip install --user prince
# !pip install tensorflow scikeras scikit-learn
# !pip install optuna

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

# load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# modelling
import regex as re
import torch
from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import TensorDataset

# load custom functions
# import sys
# function_path = '/content/drive/MyDrive/Functions/Dissertation/'
# sys.path.append(function_path)

Mounted at /content/drive


In [None]:
# english dataset
folder_path = '/content/drive/MyDrive/Data/SemEval/Task 11/Track A'
train = pd.read_csv(folder_path + '/Training Data/eng.csv', delimiter = ',')
dev = pd.read_csv(folder_path + '/Dev Data/eng_a.csv', delimiter = ',')
train.head(3)

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0


In [None]:
# multilingual dataset
train_path = '/content/drive/MyDrive/Data/SemEval/Task 11/Track A/Training Data'
desired_emotions = ['text', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
dfs = []

for file_name in os.listdir(train_path):
    if file_name.endswith('.csv'):

        file_path = os.path.join(train_path, file_name)
        df = pd.read_csv(file_path, delimiter=',')

        lang_code = file_name[:-4]
        df['lang'] = lang_code

        if file_name != 'eng.csv':
            df = df.sample(frac=0.5, random_state=2001)

        if all(emotion in df.columns for emotion in desired_emotions):
            dfs.append(df[desired_emotions + ['lang']])


multi_df = pd.concat(dfs, ignore_index=True)
print(multi_df.shape)
multi_df.head()

(17118, 7)


Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise,lang
0,But not very happy.,0,0,1,1,0,eng
1,Well she's not gon na last the whole song like...,0,0,1,0,0,eng
2,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0,eng
3,"Yes, the Oklahoma city bombing.",1,1,0,1,1,eng
4,They were dancing to Bolero.,0,0,1,0,0,eng


In [None]:
# translated dataset
folder_path = '/content/drive/MyDrive/Data/SemEval/Task 11/Track A'
trans_df = pd.read_csv(folder_path + '/Training Data/translated.csv', delimiter = ',')
trans_df = trans_df.drop(columns=['raw_text'])
print(trans_df.shape)
trans_df.head(3)

(6203, 7)


Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise,lang
0,The thought of death is scarier than death #Re...,0,1,0,0,0,ru
1,"In this right, you can understand the connecti...",0,0,0,0,0,ru
2,"As I woke up, all the face, terrible kidney pa...",0,0,0,1,0,ru


In [None]:
train = train.drop(columns=['id'])
no_labels = train.shape[1] - 1

dev_ids = dev['id'].tolist()
dev = dev.drop(columns=['id'])

print(train.shape, dev.shape)
train.head()

(2768, 6) (116, 6)


Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise
0,But not very happy.,0,0,1,1,0
1,Well she's not gon na last the whole song like...,0,0,1,0,0
2,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0
3,"Yes, the Oklahoma city bombing.",1,1,0,1,1
4,They were dancing to Bolero.,0,0,1,0,0


In [None]:
notify()

# 2. Functions

Contains the functions used in modelling the data

## 2.1 Data Processor

In [None]:
#!pip install datasets

In [None]:
from datasets import Dataset, DatasetDict

class TextClassificationProcessor:
    def __init__(self, data, dev, no_labels, test_size=0.1, random_state=2001, max_length=64, tokenizer_name='bert-base-uncased', folder_path=None, tokenizer_save_name=None):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.data = data
        self.dev = dev
        self.test_size = test_size
        self.random_state = random_state
        self.max_length = max_length
        self.folder_path = folder_path
        self.tokenizer_save_name = tokenizer_save_name
        self.label_cols = self.data.drop(columns=['text']).columns.tolist()

        self._prepare_splits()
        self._extract_texts_and_labels()
        self._tokenize()

    def _convert_labels_to_float(self, df):
        return [[float(label) for label in labels] for labels in df[self.label_cols].values.tolist()]

    def _prepare_splits(self):
        temp_df, self.val_df = train_test_split(self.data, test_size=self.test_size, random_state=self.random_state)
        self.train_df, self.test_df = train_test_split(temp_df, test_size=self.test_size, random_state=self.random_state)

    def _extract_texts_and_labels(self):
        self.train_texts_list = self.train_df['text'].tolist()
        self.val_texts_list = self.val_df['text'].tolist()
        self.test_texts_list = self.test_df['text'].tolist()
        self.dev_texts_list = self.dev['text'].tolist()

        self.train_labels_list = self._convert_labels_to_float(self.train_df)
        self.val_labels_list = self._convert_labels_to_float(self.val_df)
        self.test_labels_list = self._convert_labels_to_float(self.test_df)
        self.dev_labels_list = self._convert_labels_to_float(self.dev)

    def _tokenize(self):

        # Convert DataFrames to Hugging Face Datasets
        self.train_dataset = Dataset.from_dict({'text': self.train_texts_list, 'labels': self.train_labels_list})
        self.val_dataset = Dataset.from_dict({'text': self.val_texts_list, 'labels': self.val_labels_list})
        self.test_dataset = Dataset.from_dict({'text': self.test_texts_list, 'labels': self.test_labels_list})
        self.dev_dataset = Dataset.from_dict({'text': self.dev_texts_list, 'labels': self.dev_labels_list})

        # Tokenize datasets
        def tokenize_function(examples):
            return self.tokenizer(examples['text'], padding="max_length", truncation=True, max_length=self.max_length)

        self.train_dataset = self.train_dataset.map(tokenize_function, batched=True)
        self.val_dataset = self.val_dataset.map(tokenize_function, batched=True)
        self.test_dataset = self.test_dataset.map(tokenize_function, batched=True)
        self.dev_dataset = self.dev_dataset.map(tokenize_function, batched=True)

    def get_dataset(self, split):
        if split == 'train':
            return self.train_dataset
        elif split == 'val':
            return self.val_dataset
        elif split == 'test':
            return self.test_dataset
        elif split == 'dev':
            return self.dev_dataset

    def save_tokenizer(self, folder_path):
        self.tokenizer.save_pretrained(folder_path)
        print('Tokenizer saved!')

## 2.2 Model Training

In [None]:
class TextClassifierTrainer:
    def __init__(self, model_name, no_labels, training_args, folder_path, model_save_name, problem_type="multi_label_classification"):
        self.model_name = model_name
        self.problem_type = problem_type
        self.no_labels = no_labels
        self.training_args = training_args
        self.folder_path = folder_path
        self.model_save_name = model_save_name

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.no_labels,
            problem_type=self.problem_type)

    def create_trainer(self, train_dataset, val_dataset):
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset)
        return trainer

    def train_model(self, train_dataset, val_dataset, checkpoint=None):
        trainer = self.create_trainer(train_dataset, val_dataset)
        if checkpoint is None:
            trainer.train()
        else:
            trainer.train(resume_from_checkpoint=checkpoint)

        trainer.save_model(self.folder_path + self.model_save_name)
        print("Model saved!")

## 2.3 Model Evalution

In [None]:
class ModelPredictionEvaluation:
    def __init__(self, trained_model, dataset, processor, model_name=None, dev_ids=None, folder_path=None):
        self.trained_model = trained_model
        self.dataset = dataset
        self.processor = processor
        self.model_name = model_name
        self.dev_ids = dev_ids
        self.folder_path = folder_path
        self.preds = None

    def evaluate_model(self):
        results = self.trained_model.evaluate(self.dataset)
        print("Evaluation results:", results)

    def predict(self):
        logits = self.trained_model.predict(self.dataset)
        probs = torch.sigmoid(torch.tensor(logits.predictions))
        self.preds = (probs > 0.5).int().numpy()

    def evaluate_predictions(self):
        actuals = self.processor.test_labels_list
        accuracy = accuracy_score(actuals, self.preds)
        micro = f1_score(actuals, self.preds, average='micro')
        macro = f1_score(actuals, self.preds, average='macro')
        print(f"Accuracy: {accuracy}")
        print(f"Micro F1 Score: {micro}")
        print(f"Macro F1 Score: {macro}")
        return macro

    def save_predictions(self):
        pred_eng_a = pd.DataFrame(self.preds, columns=self.processor.label_cols)
        pred_eng_a.insert(0, 'id', self.dev_ids)
        pred_eng_a = pred_eng_a[["id"] + self.processor.label_cols]
        pred_eng_a.to_csv(f'{self.folder_path}/Dev Data/Predictions/{self.model_name}_pred_eng_a.csv', index=False)
        print("Predictions saved!")

    def run(self):
        if self.dev_ids is None:
            self.evaluate_model()
            self.predict()
            return self.evaluate_predictions() # return macro score
        else:
            self.predict()
            self.save_predictions()
            return None

In [None]:
notify()

# 3. Translation

Takes the multilingual dataset and translates it

In [None]:
# missing text samples (none)
multi_df[multi_df['text'].str.len() < 5]

Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise,lang
4053,воу,0,0,0,0,1,rus
4521,вау!,0,0,1,0,1,rus


In [None]:
# different languages
print(multi_df['lang'].unique())

lang_map = {
    'eng': 'en',  # English
    'ptbr': 'pt',  # Portuguese
    'rus': 'ru',  # Russian
    'sun': 'su',  # Sundanese
    'oro': 'om',  # Oromo
    'som': 'so',  # Somali
    'tir': 'ti',  # Tigrinya
    'deu': 'de',  # German
    'amh': 'am'}  # Amharic

multi_df['lang'] = multi_df['lang'].map(lang_map)

multi_df.iloc[2767:2769]

['eng' 'ptbr' 'rus' 'sun' 'oro' 'som' 'tir' 'deu' 'amh']


Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise,lang
2767,He had 4 inches and 40 pounds on me and I stil...,0,0,1,0,1,en
2768,A esquerda precisa se organizar e lutar juntos...,0,0,0,0,0,pt


In [None]:
# missing emotions

#deu = multi_df[multi_df.lang == 'rus']
zero_rows = multi_df.drop(['text', 'lang'], axis=1).sum(axis=1) == 0
multi_df[zero_rows].shape

(5370, 7)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-2dmjxgqa
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-2dmjxgqa
  Resolved https://github.com/huggingface/transformers to commit 5cabc75b4bdb2e67935f7195f901afd150746eb3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10328720 sha256=1aae9164e00a2cf6687d0c0bb4f9dbb6157f2ec87efa770e393b3d0c49ef690a
  Stored in directory: /tmp/pip-ephem-wheel-cache-ww7y1li5/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

In [None]:
# drop languages
drop_langs = ['am', 'so', 'pt', 'su']
df = multi_df.drop(multi_df[multi_df['lang'].isin(drop_langs)].index)
df.lang.unique(), df.shape

(array(['en', 'ru', 'om', 'ti', 'de'], dtype=object), (8971, 7))

In [None]:
#!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/897.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import pandas as pd
from transformers import pipeline
from transformers import logging

logging.set_verbosity_error()

class Translator:
    def __init__(self, df, folder_path, drop_langs=['en']):
        self.df = df[~df['lang'].isin(drop_langs)].reset_index(drop=True)  # Reset index after filtering
        self.translators = self.get_translators()
        self.folder_path = folder_path
        self.counter = 0

    def get_translator(self, current_language):
        model_name = f"Helsinki-NLP/opus-mt-{current_language}-en"
        try:
            translator = pipeline(f'translation_en_to_{current_language}', model=model_name, tokenizer=model_name)
            print(f"Model found for {current_language}")
            return translator
        except Exception as e:
            print(f"Model for {current_language} not found. Error: {e}")
            return None

    def get_translators(self):
        unique_langs = self.df['lang'].unique()
        return {lang: self.get_translator(lang) for lang in unique_langs}

    def translate_text(self, text, current_language):
        translator = self.translators.get(current_language)
        if translator:
            try:
                translated_text = translator(text)
                return translated_text[0]['translation_text']
            except Exception as e:
                print(f"Error translating text in language {current_language}: {e}")
                return text
        return text

    def save_checkpoint(self):
        """Saves the DataFrame to a CSV checkpoint."""
        print("Saving checkpoint.")
        self.df.to_csv(self.folder_path + '/Training Data/translated.csv', index=False)

    def translate_df(self):
        self.df['raw_text'] = self.df['text']

        for idx, row in self.df.iterrows():
            self.df.at[idx, 'text'] = self.translate_text(row['text'], row['lang'])
            self.counter += 1

            if self.counter % 1000 == 0: # save every 1000 rows
                self.save_checkpoint()

        self.save_checkpoint()
        return self.df

In [None]:
translator = Translator(df, folder_path)
trans_df = translator.translate_df()

Model found for ru
Model found for om
Model found for ti
Model found for de
Saving checkpoint.
Error translating text in language om: index out of range in self
Saving checkpoint.
Error translating text in language om: index out of range in self
Saving checkpoint.
Saving checkpoint.
Saving checkpoint.
Saving checkpoint.
Saving checkpoint.


In [None]:
trans_df

Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise,lang,raw_text
3881,The thought of death is scarier than death #Re...,0,1,0,0,0,ru,Мысль о смерти страшнее смерти #читаювзаимно
3882,"In this right, you can understand the connecti...",0,0,0,0,0,ru,вот в этом праве можно хорошо понять проводку ...
3883,"As I woke up, all the face, terrible kidney pa...",0,0,0,1,0,ru,"Как проснулась,отекло все лицо,жуткие боли в п..."
3884,We're just consumers.,0,0,0,0,0,ru,Мы просто потребители.
3885,Get the fuck out of here),1,0,0,0,0,ru,иди нахер))
...,...,...,...,...,...,...,...,...
12237,Our policy has managed to be more afraid of ex...,0,0,0,0,0,de,Das schlimme ist. Unsere Politik hat es gescha...
12238,Shows me somehow that there are also with the ...,0,0,1,0,0,de,"Zeigt mir irgendwie, dass es auch bei den Nörg..."
12239,"Poor guy, I'm sure he's sad now",0,0,0,1,0,de,"Der arme , ist jetzt bestimmt traurig"
12240,We decided against it. If something had been n...,0,0,0,0,0,de,Wir haben uns dagegen entschieden. Wenn etwas ...


# 4. EDA

Visualises the data

In [None]:
for i in train['text'].head(10):
    print(i)


But not very happy.
Well she's not gon na last the whole song like that, so since I'm behind her and the audience can't see below my torso pretty much, I use my hand to push down on the lid and support her weight.
She sat at her Papa's recliner sofa only to move next to me and start clinging to my arms.
Yes, the Oklahoma city bombing.
They were dancing to Bolero.
Still had sex with her, though.
But I am exhausted-my eyes feel like they are about to pop out of my head-I need some soothing music and images to help my mediate, even for a few minutes, just take me away $ # 365 yoga Monday's Musical Meditation: 0 minutes ago early night-first day of work after mat.
We ordered some food at Mcdonalds instead of buying food at the theatre because of the ridiculous prices the theatre has.
Now my parents live in the foothills, and the college is in a large valley.
We get to the porch and my dog starts *growling*, like a big boy growl, like shits going down growl.


# 5. Modelling (English Dataset)



Models the english dataset alone

Ideas

1. BERT (2018) - standard, original model
2. DistilBERT/ALBERT (2019) - faster & lighter
3. RoBERTa/XLNet (2019) - improvements
4. XLM-RoBERTa (2019) - multilingual datasets
5. Classical ML - Multi-word embeddings (TF-IDF, unigrams, digrams, FastText)

## 5.1 BERT (73%)

### Training

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='bert-base-uncased',
    folder_path=folder_path,
    tokenizer_save_name='/bert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')

# train model
model_name = "bert-base-uncased"
problem_type = "multi_label_classification"
model_save_name = '/bert_model'
tokenizer_save_name = '/bert_tokenizer'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# api key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, problem_type, no_labels, training_args, folder_path)
trainer.train_model(train_dataset, val_dataset)

Train set: 2241
Val set: 277
Test set: 250


In [None]:
# model_name = "bert-base-uncased"
# problem_type = "multi_label_classification" # sigmoid (not softmax)

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     problem_type=problem_type,
#     num_labels=no_labels)

# training_args = TrainingArguments(
#     output_dir=folder_path,                         # training logs saved to directory
#     eval_strategy="epoch",
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3)

# # training_args = TrainingArguments(
# #     output_dir='./results',
# #     num_train_epochs=10,
# #     per_device_train_batch_size=32,
# #     per_device_eval_batch_size=16,
# #     learning_rate=1e-7,
# #     evaluation_strategy="epoch",
# #     save_strategy="epoch",
# #     optimizers=(optimizer, None))

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset)

# trainer.train() # d178de1a7c092db9f84071c0f557e6950f376aa9

# # Save model & tokenizer
# model_save_name = '/bert_model'
# tokenizer_save_name = '/bert_tokenizer'
# trainer.save_model(folder_path + model_save_name)
# model.save_pretrained(folder_path + tokenizer_save_name)
# notify()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.401008
2,No log,0.347052
3,No log,0.328963


In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # Load the model
# model = BertForSequenceClassification.from_pretrained(folder_path + '/bert_model')

# # Load the tokenizer
# #tokenizer = BertTokenizer.from_pretrained(folder_path + '/bert_tokeniser')

# training_args = TrainingArguments(folder_path + '/bert_model/training_args')

# # Reinitialize the trainer with the loaded model and tokenizer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     processing_class=processor.tokenizer)

### Test model

In [None]:
# Model evaluation
results = trainer.evaluate(test_dataset)
print(results)

# Predictions
logits = trainer.predict(test_dataset)
probs = torch.sigmoid(torch.tensor(logits.predictions)) # logits to probs
preds = (probs > 0.5).int() # to binary

actuals = processor.test_labels_list
preds = preds.numpy() # tensor to np.array

# Evaluation metrics
accuracy = accuracy_score(actuals, preds)
f1 = f1_score(actuals, preds, average='micro') # inbalanced
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

notify()

{'eval_loss': 0.3763325810432434, 'eval_model_preparation_time': 0.0067, 'eval_runtime': 65.276, 'eval_samples_per_second': 3.83, 'eval_steps_per_second': 0.49}
Accuracy: 0.428
F1 Score: 0.7275132275132276


### Dev Test


In [None]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])  # Use input_ids length as the dataset size

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


In [None]:
tokenizer = processor.tokenizer
dev_texts_list = dev['text'].tolist()
dev_encodings = tokenizer(dev_texts_list, padding="max_length", truncation=True, max_length=processor.max_length)
dev_dataset = TextClassifierDataset(dev_encodings, labels=None)  # No labels for dev data

logits = trainer.predict(dev_dataset)
probs = torch.sigmoid(torch.tensor(logits.predictions))     # Convert logits to probabilities
preds = (probs > 0.5).int()                                 # Apply threshold to convert to binary
preds = preds.numpy()                                       # Convert tensor to numpy array

pred_eng_a = pd.DataFrame(preds, columns=processor.label_cols)
pred_eng_a.insert(0, 'id', dev_ids)
pred_eng_a = pred_eng_a[["id"] + processor.label_cols]
pred_eng_a.to_csv(f'{folder_path}/Dev Data/pred_eng_a.csv', index=False)

notify()

## 5.2 DistilBERT (68%)

### Optuna (Failed)

In [None]:
#!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
#!pip install optuna
import optuna

# model vars
model = 'distilbert'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8])
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    #adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        #adam_epsilon=adam_epsilon,
        #warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)

    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.create_trainer(train_dataset, val_dataset)

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-30 23:12:52,048] A new study created in memory with name: no-name-a1e5d963-00c7-4c9a-aece-ccdf0abfaf27
  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 51


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.409794
2,0.413400,0.404838
3,0.413400,0.418991


Model saved!


Evaluation results: {'eval_loss': 0.46969011425971985, 'eval_model_preparation_time': 0.002, 'eval_runtime': 29.4809, 'eval_samples_per_second': 8.48, 'eval_steps_per_second': 1.085}


[I 2024-12-31 00:11:22,304] Trial 0 finished with value: 0.5976023064883 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 9.652377461356108e-05, 'weight_decay': 2.5414622568999556e-06, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.38
Micro F1 Score: 0.6873315363881402
Macro F1 Score: 0.5976023064883


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.433145
2,0.470700,0.38496
3,0.470700,0.368299


Model saved!


Evaluation results: {'eval_loss': 0.415951669216156, 'eval_model_preparation_time': 0.0033, 'eval_runtime': 29.5291, 'eval_samples_per_second': 8.466, 'eval_steps_per_second': 1.084}


[I 2024-12-31 01:09:01,132] Trial 1 finished with value: 0.5831474410257099 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.6799913365790037e-05, 'weight_decay': 4.304707260536617e-06, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.376
Micro F1 Score: 0.6766304347826086
Macro F1 Score: 0.5831474410257099


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.44941
2,0.486000,0.401632
3,0.486000,0.392395


Model saved!


Evaluation results: {'eval_loss': 0.43033212423324585, 'eval_model_preparation_time': 0.002, 'eval_runtime': 31.1753, 'eval_samples_per_second': 8.019, 'eval_steps_per_second': 1.026}


[I 2024-12-31 02:05:54,174] Trial 2 finished with value: 0.5402973614165518 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.2175110059947478e-05, 'weight_decay': 0.00011251995578702072, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.348
Micro F1 Score: 0.6610407876230661
Macro F1 Score: 0.5402973614165518


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.415926
2,0.456900,0.371731
3,0.456900,0.361203


Model saved!


Evaluation results: {'eval_loss': 0.4116642475128174, 'eval_model_preparation_time': 0.002, 'eval_runtime': 31.4958, 'eval_samples_per_second': 7.938, 'eval_steps_per_second': 1.016}


[I 2024-12-31 03:02:05,862] Trial 3 finished with value: 0.5959015042441693 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.931687076353232e-05, 'weight_decay': 0.00017209570242521777, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.392
Micro F1 Score: 0.6901217861975643
Macro F1 Score: 0.5959015042441693


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.413361
2,0.420200,0.399584
3,0.420200,0.407481


Model saved!


Evaluation results: {'eval_loss': 0.4656156301498413, 'eval_model_preparation_time': 0.0022, 'eval_runtime': 31.2026, 'eval_samples_per_second': 8.012, 'eval_steps_per_second': 1.026}


[I 2024-12-31 03:58:20,105] Trial 4 finished with value: 0.5918357066274686 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.153811881114878e-05, 'weight_decay': 0.008593591543374426, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.4
Micro F1 Score: 0.680161943319838
Macro F1 Score: 0.5918357066274686


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.416404
2,0.455200,0.371147
3,0.455200,0.354028


Model saved!


Evaluation results: {'eval_loss': 0.4082444906234741, 'eval_model_preparation_time': 0.002, 'eval_runtime': 31.2418, 'eval_samples_per_second': 8.002, 'eval_steps_per_second': 1.024}


[I 2024-12-31 04:54:59,715] Trial 5 finished with value: 0.5973729989939253 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 2.1602599088147256e-05, 'weight_decay': 1.4122805184391041e-05, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.408
Micro F1 Score: 0.691156462585034
Macro F1 Score: 0.5973729989939253


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.42026
2,0.459600,0.374265
3,0.459600,0.363982


Model saved!


Evaluation results: {'eval_loss': 0.41386231780052185, 'eval_model_preparation_time': 0.002, 'eval_runtime': 31.2012, 'eval_samples_per_second': 8.013, 'eval_steps_per_second': 1.026}


[I 2024-12-31 05:50:35,349] Trial 6 finished with value: 0.5925969762143993 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.8559392974712937e-05, 'weight_decay': 2.366108520883202e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.5976023064883.


Accuracy: 0.396
Micro F1 Score: 0.6865671641791045
Macro F1 Score: 0.5925969762143993


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


### Model B

In [None]:
# vars
model = 'distilbert'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    save_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    run_name=f"{model}_classification_run")

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.478698
2,0.510100,0.42529
3,0.510100,0.390254
4,0.375300,0.378907
5,0.375300,0.375755


Model saved!


In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4190017580986023, 'eval_model_preparation_time': 0.0048, 'eval_runtime': 50.7807, 'eval_samples_per_second': 4.923, 'eval_steps_per_second': 0.63}
Accuracy: 0.384
Micro F1 Score: 0.6685159500693482
Macro F1 Score: 0.5756590699940454


In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.5297229290008545, 'eval_model_preparation_time': 0.0079, 'eval_runtime': 73.9997, 'eval_samples_per_second': 3.378, 'eval_steps_per_second': 0.432}
Accuracy: 0.432
Micro F1 Score: 0.7034482758620689
Macro F1 Score: 0.639699356581238


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name=model,
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


### Model A

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='distilbert-base-uncased',
    folder_path=folder_path,
    tokenizer_save_name='/distilbert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "distilbert-base-uncased"
model_save_name = '/distilbert_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.433419
2,No log,0.379081


Epoch,Training Loss,Validation Loss
1,No log,0.433419
2,No log,0.379081
3,No log,0.36534


Model saved!


In [None]:
test_evaluator = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluator.run()

Evaluation results: {'eval_loss': 0.41824111342430115, 'eval_model_preparation_time': 0.0036, 'eval_runtime': 53.2716, 'eval_samples_per_second': 4.693, 'eval_steps_per_second': 0.3}
Predictions complete!
Accuracy: 0.396
F1 Score: 0.6971736204576043


In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='distilbert-base-uncased',
    folder_path=folder_path,
    tokenizer_save_name='/distilbert_tokenizer')

dev_dataset = processor.get_dataset(split='dev')

Train set: 2241
Val set: 277
Test set: 250


In [None]:
dev_preds = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='distilbert',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_preds.run()

Predictions saved!


## 5.3 ModernBERT (72%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-08qm9rhu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-08qm9rhu
  Resolved https://github.com/huggingface/transformers to commit 82fcac0a7e40dc6cc5e3121d714b9b16775293ad
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10306363 sha256=70c3e0cd90f3cce5d699ded17a7fa3676482160d3283511bd73d5ff59c718ae5
  Stored in directory: /tmp/pip-ephem-wheel-cache-28zsi_l7/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

### Train

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='answerdotai/ModernBERT-base',
    folder_path=folder_path,
    tokenizer_save_name='/modernbert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "answerdotai/ModernBERT-base"
model_save_name = '/modernbert_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.415591
2,No log,0.396208
3,No log,0.454367


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4347096085548401, 'eval_model_preparation_time': 0.0195, 'eval_runtime': 82.254, 'eval_samples_per_second': 3.039, 'eval_steps_per_second': 0.195}
Accuracy: 0.4
F1 Score: 0.6290099815195178


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='modernbert',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 5.4 DistilRoBERTa (68%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

!pip install optuna

Collecting optuna
  Using cached optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Using cached optuna-4.1.0-py3-none-any.whl (364 kB)
Using cached alembic-1.14.0-py3-none-any.whl (233 kB)
Using cached colorlog-6.9.0-py3-none-any.whl (11 kB)
Using cached Mako-1.3.8-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.8 alembic-1.14.0 colorlog-6.9.0 optuna-4.1.0


### Optuna

In [None]:
import optuna

# vars
model = 'distilroberta'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8, 16, 32])
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-5)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
    adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 200)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        adam_epsilon=adam_epsilon,
        warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)
    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.model

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-28 13:52:42,751] A new study created in memory with name: no-name-32a77e0e-dc60-4073-89d1-0ad54914736b
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
  adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.613905
2,No log,0.580811
3,No log,0.576961


[W 2024-12-28 14:35:58,224] Trial 0 failed with parameters: {'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3, 'learning_rate': 1.852898841985992e-06, 'weight_decay': 0.00020258808333802481, 'adam_epsilon': 3.269667958708342e-07, 'warmup_steps': 75, 'lr_scheduler_type': 'linear'} because of the following error: AttributeError("'DistilBertForSequenceClassification' object has no attribute 'evaluate'").
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-11-a77f7a0c9863>", line 60, in objective
    macro = test_evaluation.run()
  File "<ipython-input-8-84c09fb6b097>", line 39, in run
    self.evaluate_model()
  File "<ipython-input-8-84c09fb6b097>", line 12, in evaluate_model
    results = self.trained_model.evaluate(self.dataset)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", li

Model saved!


AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'evaluate'

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='distilroberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/distilroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "distilroberta-base"
model_save_name = '/distilroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.410592
2,No log,0.377999
3,No log,0.360245


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4016924500465393, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 28.2722, 'eval_samples_per_second': 8.843, 'eval_steps_per_second': 0.566}
Accuracy: 0.392
F1 Score: 0.6052793238507524


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='distilroberta',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 5.5 XLM-RoBERTa (63%)

### Train

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=multi_df,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='xlm-roberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/xlmroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "xlm-roberta-base"
model_save_name = '/xlmroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Train set: 11352
Val set: 1402
Test set: 1262


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.380756
2,0.419400,0.342036


Epoch,Training Loss,Validation Loss
1,No log,0.380756
2,0.419400,0.342036


In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=multi_df,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='xlm-roberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/xlmroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')


# train model
#model_name = "xlm-roberta-base"
model_name = "/content/drive/MyDrive/Data/SemEval/Task 11/Track A/checkpoint-1000"
problem_type = "multi_label_classification"
model_save_name = '/xlmroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, problem_type, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset, checkpoint=model_name)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 11352
Val set: 1402
Test set: 1262


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
3,0.3327,0.323458


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.31981056928634644, 'eval_model_preparation_time': 0.0064, 'eval_runtime': 298.138, 'eval_samples_per_second': 4.233, 'eval_steps_per_second': 0.265}
Accuracy: 0.49128367670364503
F1 Score: 0.5635067827073423


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='xlmroberta',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 5.6 TinyBERT

In [None]:
#!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

### Optuna

In [None]:
import optuna

# model vars
model = 'tinybert'
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8])
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    #adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        #adam_epsilon=adam_epsilon,
        #warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)

    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.create_trainer(train_dataset, val_dataset)

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")


Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-27 11:41:51,304] A new study created in memory with name: no-name-b170e388-dc85-4a63-8ad2-64119fa859f2
  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.574586
2,0.582000,0.532229
3,0.582000,0.525854


Model saved!


Evaluation results: {'eval_loss': 0.523144543170929, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 5.5424, 'eval_samples_per_second': 45.107, 'eval_steps_per_second': 5.774}


[I 2024-12-27 11:51:11,587] Trial 0 finished with value: 0.20490241003061516 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.457331761906483e-05, 'weight_decay': 8.760578572569119e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.20490241003061516.


Accuracy: 0.18
Micro F1 Score: 0.46440677966101696
Macro F1 Score: 0.20490241003061516


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.522441
2,0.547500,0.488648
3,0.547500,0.486133


Model saved!


Evaluation results: {'eval_loss': 0.5081325769424438, 'eval_model_preparation_time': 0.0028, 'eval_runtime': 7.128, 'eval_samples_per_second': 35.073, 'eval_steps_per_second': 4.489}


[I 2024-12-27 12:00:05,043] Trial 1 finished with value: 0.3416666666666667 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 2.491975690390402e-05, 'weight_decay': 1.0162266557249161e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.3416666666666667.


Accuracy: 0.244
Micro F1 Score: 0.5351681957186545
Macro F1 Score: 0.3416666666666667


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.571894
2,0.582900,0.530898
3,0.582900,0.525663


Model saved!


Evaluation results: {'eval_loss': 0.5256613492965698, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 3.8439, 'eval_samples_per_second': 65.038, 'eval_steps_per_second': 8.325}


[I 2024-12-27 12:08:48,510] Trial 2 finished with value: 0.2424456166425053 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.2784138212697802e-05, 'weight_decay': 0.001124327008237993, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.3416666666666667.


Accuracy: 0.204
Micro F1 Score: 0.48986486486486486
Macro F1 Score: 0.2424456166425053


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.519516
2,0.514200,0.453216
3,0.514200,0.446025


Model saved!


Evaluation results: {'eval_loss': 0.4904323220252991, 'eval_model_preparation_time': 0.0034, 'eval_runtime': 4.117, 'eval_samples_per_second': 60.724, 'eval_steps_per_second': 7.773}


[I 2024-12-27 12:18:06,589] Trial 3 finished with value: 0.45108259030160563 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.514566935871922e-05, 'weight_decay': 0.009333959259625914, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.45108259030160563.


Accuracy: 0.284
Micro F1 Score: 0.5919778699861687
Macro F1 Score: 0.45108259030160563


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.570974
2,0.581500,0.527668
3,0.581500,0.517589


Model saved!


Evaluation results: {'eval_loss': 0.5208468437194824, 'eval_model_preparation_time': 0.0045, 'eval_runtime': 5.0586, 'eval_samples_per_second': 49.421, 'eval_steps_per_second': 6.326}


[I 2024-12-27 12:27:02,413] Trial 4 finished with value: 0.2805446644133269 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.4181686271546608e-05, 'weight_decay': 6.825571709506599e-06, 'lr_scheduler_type': 'linear'}. Best is trial 3 with value: 0.45108259030160563.


Accuracy: 0.224
Micro F1 Score: 0.5083056478405316
Macro F1 Score: 0.2805446644133269


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.488702
2,0.511300,0.448612
3,0.511300,0.438056


Model saved!


Evaluation results: {'eval_loss': 0.48242810368537903, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 6.5359, 'eval_samples_per_second': 38.251, 'eval_steps_per_second': 4.896}


[I 2024-12-27 12:35:52,017] Trial 5 finished with value: 0.48676070472701205 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 9.274343484913477e-05, 'weight_decay': 1.1150368638505584e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.292
Micro F1 Score: 0.6038251366120219
Macro F1 Score: 0.48676070472701205


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.584097
2,0.593800,0.546284
3,0.593800,0.534914


Model saved!


Evaluation results: {'eval_loss': 0.5320590734481812, 'eval_model_preparation_time': 0.0185, 'eval_runtime': 4.3543, 'eval_samples_per_second': 57.414, 'eval_steps_per_second': 7.349}


[I 2024-12-27 12:44:40,912] Trial 6 finished with value: 0.18965451422027105 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.1331126813317836e-05, 'weight_decay': 6.476256691791227e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.184
Micro F1 Score: 0.4717948717948718
Macro F1 Score: 0.18965451422027105


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.49722
2,0.513600,0.44465
3,0.513600,0.440456


Model saved!


Evaluation results: {'eval_loss': 0.48869356513023376, 'eval_model_preparation_time': 0.0072, 'eval_runtime': 6.3812, 'eval_samples_per_second': 39.178, 'eval_steps_per_second': 5.015}


[I 2024-12-27 12:53:26,879] Trial 7 finished with value: 0.4763526989008893 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.551834733844088e-05, 'weight_decay': 1.7226835631098667e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.276
Micro F1 Score: 0.6021798365122616
Macro F1 Score: 0.4763526989008893


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.505621
2,0.530600,0.470881
3,0.530600,0.462845


Model saved!


Evaluation results: {'eval_loss': 0.5019523501396179, 'eval_model_preparation_time': 0.0017, 'eval_runtime': 4.2994, 'eval_samples_per_second': 58.148, 'eval_steps_per_second': 7.443}


[I 2024-12-27 13:02:15,688] Trial 8 finished with value: 0.3631839564989222 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 4.276933281974512e-05, 'weight_decay': 0.0003094642342714543, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.244
Micro F1 Score: 0.5427728613569321
Macro F1 Score: 0.3631839564989222


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.552931
2,0.569100,0.513342
3,0.569100,0.509026


Model saved!


Evaluation results: {'eval_loss': 0.5152259469032288, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.3112, 'eval_samples_per_second': 57.988, 'eval_steps_per_second': 7.422}


[I 2024-12-27 13:11:06,058] Trial 9 finished with value: 0.3052648250337596 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.635468538155532e-05, 'weight_decay': 0.0004637676471172026, 'lr_scheduler_type': 'cosine'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.228
Micro F1 Score: 0.5192307692307693
Macro F1 Score: 0.3052648250337596
Best trial: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 9.274343484913477e-05, 'weight_decay': 1.1150368638505584e-05, 'lr_scheduler_type': 'linear'}


### Train

In [None]:
# model vars
model = 'tinybert'
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model_save_name = '/tinybert_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)

# test model
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()
notify()

Map:   0%|          | 0/2241 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.499875
2,0.526000,0.456318
3,0.526000,0.447474


Model saved!


Evaluation results: {'eval_loss': 0.49065423011779785, 'eval_model_preparation_time': 0.0016, 'eval_runtime': 3.9421, 'eval_samples_per_second': 63.419, 'eval_steps_per_second': 8.118}
Accuracy: 0.28
Micro F1 Score: 0.5722460658082976
Macro F1 Score: 0.41544694885799566


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.49065423011779785, 'eval_model_preparation_time': 0.0016, 'eval_runtime': 5.332, 'eval_samples_per_second': 46.887, 'eval_steps_per_second': 6.001}
Accuracy: 0.28
Micro F1 Score: 0.5722460658082976
Macro F1 Score: 0.41544694885799566


0.41544694885799566

In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='tinybert',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 5.7 ModernBERT - large (74%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ltg4h7be
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ltg4h7be
  Resolved https://github.com/huggingface/transformers to commit 24c91f095fec4d90fa6901ef17146b4f4c21d0a3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10328608 sha256=bc1772a81dfe5be58977402cab9401b756c3437b2b27a5e1de5e9f666521cb36
  Stored in directory: /tmp/pip-ephem-wheel-cache-66qawilc/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

### Train

In [None]:
# vars
model = 'modernbertlarge'
model_name = "answerdotai/ModernBERT-large"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    save_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    run_name=f"{model}_classification_run")

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.341837


Epoch,Training Loss,Validation Loss
1,No log,0.341837
2,No log,0.349972
3,No log,0.457487


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.47418248653411865, 'eval_model_preparation_time': 0.0081, 'eval_runtime': 203.8987, 'eval_samples_per_second': 1.226, 'eval_steps_per_second': 0.078}
Accuracy: 0.496
F1 Score: 0.6938939098333786


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name=model_name,
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


# 6. Modelling (Translated Dataset)

Models the larger, translated dataset

In [None]:
# delete language indicator
trans_df = trans_df.drop(columns=['lang'])

# delete non-emotion rows
zero_rows = trans_df.drop('text', axis=1).sum(axis=1) == 0
trans_df = trans_df[~zero_rows]

trans_df.head(3)

Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise
0,The thought of death is scarier than death #Re...,0,1,0,0,0
1,"In this right, you can understand the connecti...",0,0,0,0,0
2,"As I woke up, all the face, terrible kidney pa...",0,0,0,1,0


## 6.1 BERT (73%)

### Training

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='bert-base-uncased',
    folder_path=folder_path,
    tokenizer_save_name='/bert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')

# train model
model_name = "bert-base-uncased"
problem_type = "multi_label_classification"
model_save_name = '/bert_model'
tokenizer_save_name = '/bert_tokenizer'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# api key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, problem_type, no_labels, training_args, folder_path)
trainer.train_model(train_dataset, val_dataset)

Train set: 2241
Val set: 277
Test set: 250


In [None]:
# model_name = "bert-base-uncased"
# problem_type = "multi_label_classification" # sigmoid (not softmax)

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     problem_type=problem_type,
#     num_labels=no_labels)

# training_args = TrainingArguments(
#     output_dir=folder_path,                         # training logs saved to directory
#     eval_strategy="epoch",
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3)

# # training_args = TrainingArguments(
# #     output_dir='./results',
# #     num_train_epochs=10,
# #     per_device_train_batch_size=32,
# #     per_device_eval_batch_size=16,
# #     learning_rate=1e-7,
# #     evaluation_strategy="epoch",
# #     save_strategy="epoch",
# #     optimizers=(optimizer, None))

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset)

# trainer.train() # d178de1a7c092db9f84071c0f557e6950f376aa9

# # Save model & tokenizer
# model_save_name = '/bert_model'
# tokenizer_save_name = '/bert_tokenizer'
# trainer.save_model(folder_path + model_save_name)
# model.save_pretrained(folder_path + tokenizer_save_name)
# notify()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.401008
2,No log,0.347052
3,No log,0.328963


In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # Load the model
# model = BertForSequenceClassification.from_pretrained(folder_path + '/bert_model')

# # Load the tokenizer
# #tokenizer = BertTokenizer.from_pretrained(folder_path + '/bert_tokeniser')

# training_args = TrainingArguments(folder_path + '/bert_model/training_args')

# # Reinitialize the trainer with the loaded model and tokenizer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     processing_class=processor.tokenizer)

### Test model

In [None]:
# Model evaluation
results = trainer.evaluate(test_dataset)
print(results)

# Predictions
logits = trainer.predict(test_dataset)
probs = torch.sigmoid(torch.tensor(logits.predictions)) # logits to probs
preds = (probs > 0.5).int() # to binary

actuals = processor.test_labels_list
preds = preds.numpy() # tensor to np.array

# Evaluation metrics
accuracy = accuracy_score(actuals, preds)
f1 = f1_score(actuals, preds, average='micro') # inbalanced
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

notify()

{'eval_loss': 0.3763325810432434, 'eval_model_preparation_time': 0.0067, 'eval_runtime': 65.276, 'eval_samples_per_second': 3.83, 'eval_steps_per_second': 0.49}
Accuracy: 0.428
F1 Score: 0.7275132275132276


### Dev Test


In [None]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])  # Use input_ids length as the dataset size

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


In [None]:
tokenizer = processor.tokenizer
dev_texts_list = dev['text'].tolist()
dev_encodings = tokenizer(dev_texts_list, padding="max_length", truncation=True, max_length=processor.max_length)
dev_dataset = TextClassifierDataset(dev_encodings, labels=None)  # No labels for dev data

logits = trainer.predict(dev_dataset)
probs = torch.sigmoid(torch.tensor(logits.predictions))     # Convert logits to probabilities
preds = (probs > 0.5).int()                                 # Apply threshold to convert to binary
preds = preds.numpy()                                       # Convert tensor to numpy array

pred_eng_a = pd.DataFrame(preds, columns=processor.label_cols)
pred_eng_a.insert(0, 'id', dev_ids)
pred_eng_a = pred_eng_a[["id"] + processor.label_cols]
pred_eng_a.to_csv(f'{folder_path}/Dev Data/pred_eng_a.csv', index=False)

notify()

## 6.2 DistilBERT (68%)

### Optuna (Failed)

In [None]:
#!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
#!pip install optuna
import optuna

# vars
model = 'distilbert'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8])
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    #adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        #adam_epsilon=adam_epsilon,
        #warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)

    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.create_trainer(train_dataset, val_dataset)

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-27 22:56:10,540] A new study created in memory with name: no-name-fa8aa4dd-2a20-45bd-8583-ed089be5d904
  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.418382
2,0.457300,0.371014
3,0.457300,0.360616


Model saved!


Evaluation results: {'eval_loss': 0.411830872297287, 'eval_model_preparation_time': 0.0036, 'eval_runtime': 26.9451, 'eval_samples_per_second': 9.278, 'eval_steps_per_second': 1.188}


[I 2024-12-27 23:46:52,239] Trial 0 finished with value: 0.5757727725114227 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.986767963102854e-05, 'weight_decay': 0.005926740996339108, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.5757727725114227.


Accuracy: 0.4
Micro F1 Score: 0.6854082998661312
Macro F1 Score: 0.5757727725114227


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.409353
2,0.448100,0.36584
3,0.448100,0.354211


Model saved!


Evaluation results: {'eval_loss': 0.407562792301178, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 26.1019, 'eval_samples_per_second': 9.578, 'eval_steps_per_second': 1.226}


[I 2024-12-28 00:36:16,816] Trial 1 finished with value: 0.6050674580533382 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 2.226102505311579e-05, 'weight_decay': 0.0039607407233010665, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.6050674580533382.


Accuracy: 0.412
Micro F1 Score: 0.6984993178717599
Macro F1 Score: 0.6050674580533382


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.405636
2,0.443600,0.363212
3,0.443600,0.351113


Model saved!


Evaluation results: {'eval_loss': 0.40454769134521484, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 25.2861, 'eval_samples_per_second': 9.887, 'eval_steps_per_second': 1.266}


[I 2024-12-28 01:25:22,704] Trial 2 finished with value: 0.6025164378105554 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 2.3707773837261727e-05, 'weight_decay': 2.31459948301011e-05, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.6050674580533382.


Accuracy: 0.408
Micro F1 Score: 0.6964769647696477
Macro F1 Score: 0.6025164378105554


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.380373
2,0.418000,0.357782
3,0.418000,0.351032


Model saved!


Evaluation results: {'eval_loss': 0.3983914852142334, 'eval_model_preparation_time': 0.0021, 'eval_runtime': 24.9026, 'eval_samples_per_second': 10.039, 'eval_steps_per_second': 1.285}


[I 2024-12-28 02:14:12,228] Trial 3 finished with value: 0.6319607787647804 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 4.3867631268394856e-05, 'weight_decay': 0.006948840834385908, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.6319607787647804.


Accuracy: 0.4
Micro F1 Score: 0.7092391304347826
Macro F1 Score: 0.6319607787647804


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.446925
2,0.484000,0.399473
3,0.484000,0.390168


Model saved!


Evaluation results: {'eval_loss': 0.42863357067108154, 'eval_model_preparation_time': 0.0033, 'eval_runtime': 31.3546, 'eval_samples_per_second': 7.973, 'eval_steps_per_second': 1.021}


[I 2024-12-28 03:03:38,167] Trial 4 finished with value: 0.5399674930866833 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.2540199498110377e-05, 'weight_decay': 0.00014916641527113923, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.6319607787647804.


Accuracy: 0.344
Micro F1 Score: 0.6610644257703081
Macro F1 Score: 0.5399674930866833


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.447132
2,0.482700,0.397778
3,0.482700,0.382245


Model saved!


Evaluation results: {'eval_loss': 0.4245350956916809, 'eval_model_preparation_time': 0.0093, 'eval_runtime': 28.7988, 'eval_samples_per_second': 8.681, 'eval_steps_per_second': 1.111}


[I 2024-12-28 03:55:06,517] Trial 5 finished with value: 0.5712309983316082 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.3842396064983672e-05, 'weight_decay': 3.0726386471754925e-05, 'lr_scheduler_type': 'linear'}. Best is trial 3 with value: 0.6319607787647804.


Accuracy: 0.368
Micro F1 Score: 0.6739427012278308
Macro F1 Score: 0.5712309983316082


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.397277
2,0.419200,0.369085
3,0.419200,0.365795


Model saved!


Evaluation results: {'eval_loss': 0.4159008264541626, 'eval_model_preparation_time': 0.0021, 'eval_runtime': 26.9383, 'eval_samples_per_second': 9.28, 'eval_steps_per_second': 1.188}


[I 2024-12-28 04:46:53,838] Trial 6 finished with value: 0.630207386598364 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 5.856203482704892e-05, 'weight_decay': 1.0135118013977243e-05, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.6319607787647804.


Accuracy: 0.396
Micro F1 Score: 0.6955345060893099
Macro F1 Score: 0.630207386598364


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.410476
2,0.420100,0.395679
3,0.420100,0.40366


Model saved!


Evaluation results: {'eval_loss': 0.4506837725639343, 'eval_model_preparation_time': 0.0025, 'eval_runtime': 28.2605, 'eval_samples_per_second': 8.846, 'eval_steps_per_second': 1.132}


[I 2024-12-28 05:39:03,578] Trial 7 finished with value: 0.5966762541795756 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.130565524488611e-05, 'weight_decay': 1.3209714283231154e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.6319607787647804.


Accuracy: 0.388
Micro F1 Score: 0.6829268292682927
Macro F1 Score: 0.5966762541795756


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


### Model B

In [None]:
# vars
model = 'distilbert'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    save_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    run_name=f"{model}_classification_run")

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.478698
2,0.510100,0.42529
3,0.510100,0.390254
4,0.375300,0.378907
5,0.375300,0.375755


Model saved!


In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4190017580986023, 'eval_model_preparation_time': 0.0048, 'eval_runtime': 50.7807, 'eval_samples_per_second': 4.923, 'eval_steps_per_second': 0.63}
Accuracy: 0.384
Micro F1 Score: 0.6685159500693482
Macro F1 Score: 0.5756590699940454


In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.5297229290008545, 'eval_model_preparation_time': 0.0079, 'eval_runtime': 73.9997, 'eval_samples_per_second': 3.378, 'eval_steps_per_second': 0.432}
Accuracy: 0.432
Micro F1 Score: 0.7034482758620689
Macro F1 Score: 0.639699356581238


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name=model,
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


### Model A

In [None]:
# model vars
model = 'distilbert'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=trans_df,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='distilbert-base-uncased',
    folder_path=folder_path,
    tokenizer_save_name='/distilbert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 3164
Val set: 391
Test set: 352


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.401425
2,0.416900,0.417397
3,0.263400,0.450613


Model saved!


In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4068620502948761, 'eval_model_preparation_time': 0.0021, 'eval_runtime': 40.1259, 'eval_samples_per_second': 8.772, 'eval_steps_per_second': 1.097}
Accuracy: 0.5284090909090909
Micro F1 Score: 0.6299862448418156
Macro F1 Score: 0.6096879874322835


0.6096879874322835

In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name=model,
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


In [None]:
zero_rows = trans_df.drop('text', axis=1).sum(axis=1) == 0
trans_df[zero_rows]

Unnamed: 0,text,Anger,Fear,Joy,Sadness,Surprise
1,"In this right, you can understand the connecti...",0,0,0,0,0
3,We're just consumers.,0,0,0,0,0
7,Please recommend a chunk or a show.,0,0,0,0,0
15,It's probably a period when you want to be loved.,0,0,0,0,0
17,So all possible things were described and the ...,0,0,0,0,0
...,...,...,...,...,...,...
6195,So either you'll be blessed like Saitama. In t...,0,0,0,0,0
6196,"If you pay your garbage fees by weight, that's...",0,0,0,0,0
6198,Our policy has managed to be more afraid of ex...,0,0,0,0,0
6201,We decided against it. If something had been n...,0,0,0,0,0


## 6.3 ModernBERT (72%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-08qm9rhu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-08qm9rhu
  Resolved https://github.com/huggingface/transformers to commit 82fcac0a7e40dc6cc5e3121d714b9b16775293ad
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10306363 sha256=70c3e0cd90f3cce5d699ded17a7fa3676482160d3283511bd73d5ff59c718ae5
  Stored in directory: /tmp/pip-ephem-wheel-cache-28zsi_l7/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

### Train

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='answerdotai/ModernBERT-base',
    folder_path=folder_path,
    tokenizer_save_name='/modernbert_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "answerdotai/ModernBERT-base"
model_save_name = '/modernbert_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.415591
2,No log,0.396208
3,No log,0.454367


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4347096085548401, 'eval_model_preparation_time': 0.0195, 'eval_runtime': 82.254, 'eval_samples_per_second': 3.039, 'eval_steps_per_second': 0.195}
Accuracy: 0.4
F1 Score: 0.6290099815195178


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='modernbert',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 6.4 DistilRoBERTa (68%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

!pip install optuna

Collecting optuna
  Using cached optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Using cached optuna-4.1.0-py3-none-any.whl (364 kB)
Using cached alembic-1.14.0-py3-none-any.whl (233 kB)
Using cached colorlog-6.9.0-py3-none-any.whl (11 kB)
Using cached Mako-1.3.8-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.8 alembic-1.14.0 colorlog-6.9.0 optuna-4.1.0


### Optuna

In [None]:
import optuna

# vars
model = 'distilroberta'
model_name = "distilbert-base-uncased"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8, 16, 32])
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-5)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
    adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 200)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        adam_epsilon=adam_epsilon,
        warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)
    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.model

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-28 13:52:42,751] A new study created in memory with name: no-name-32a77e0e-dc60-4073-89d1-0ad54914736b
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
  adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.613905
2,No log,0.580811
3,No log,0.576961


[W 2024-12-28 14:35:58,224] Trial 0 failed with parameters: {'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3, 'learning_rate': 1.852898841985992e-06, 'weight_decay': 0.00020258808333802481, 'adam_epsilon': 3.269667958708342e-07, 'warmup_steps': 75, 'lr_scheduler_type': 'linear'} because of the following error: AttributeError("'DistilBertForSequenceClassification' object has no attribute 'evaluate'").
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-11-a77f7a0c9863>", line 60, in objective
    macro = test_evaluation.run()
  File "<ipython-input-8-84c09fb6b097>", line 39, in run
    self.evaluate_model()
  File "<ipython-input-8-84c09fb6b097>", line 12, in evaluate_model
    results = self.trained_model.evaluate(self.dataset)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", li

Model saved!


AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'evaluate'

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='distilroberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/distilroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "distilroberta-base"
model_save_name = '/distilroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.410592
2,No log,0.377999
3,No log,0.360245


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4016924500465393, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 28.2722, 'eval_samples_per_second': 8.843, 'eval_steps_per_second': 0.566}
Accuracy: 0.392
F1 Score: 0.6052793238507524


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='distilroberta',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 6.5 XLM-RoBERTa (63%)

### Train

In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=multi_df,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='xlm-roberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/xlmroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
model_name = "xlm-roberta-base"
model_save_name = '/xlmroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Train set: 11352
Val set: 1402
Test set: 1262


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.380756
2,0.419400,0.342036


Epoch,Training Loss,Validation Loss
1,No log,0.380756
2,0.419400,0.342036


In [None]:
# process datasets
processor = TextClassificationProcessor(
    data=multi_df,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name='xlm-roberta-base',
    folder_path=folder_path,
    tokenizer_save_name='/xlmroberta_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')


# train model
#model_name = "xlm-roberta-base"
model_name = "/content/drive/MyDrive/Data/SemEval/Task 11/Track A/checkpoint-1000"
problem_type = "multi_label_classification"
model_save_name = '/xlmroberta_model'

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, problem_type, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset, checkpoint=model_name)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 11352
Val set: 1402
Test set: 1262


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
3,0.3327,0.323458


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.31981056928634644, 'eval_model_preparation_time': 0.0064, 'eval_runtime': 298.138, 'eval_samples_per_second': 4.233, 'eval_steps_per_second': 0.265}
Accuracy: 0.49128367670364503
F1 Score: 0.5635067827073423


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='xlmroberta',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 6.6 TinyBERT

In [None]:
#!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

### Optuna

In [None]:
import optuna

# model vars
model = 'tinybert'
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# Define objective function for Optuna
def objective(trial):
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8])
    per_device_eval_batch_size = trial.suggest_categorical('per_device_eval_batch_size', [8])
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    #adam_epsilon = trial.suggest_loguniform('adam_epsilon', 1e-8, 1e-6)
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine'])

    # Update training args with Optuna suggestions
    training_args = TrainingArguments(
        output_dir=folder_path,
        eval_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        #adam_epsilon=adam_epsilon,
        #warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type)

    # Train model
    trainer = TextClassifierTrainer(model_name=model_name, no_labels=no_labels, training_args=training_args, folder_path=folder_path, model_save_name=model_save_name)

    trainer.train_model(train_dataset, val_dataset)
    trained_model = trainer.create_trainer(train_dataset, val_dataset)

    # Evaluate the model on test set
    test_evaluation = ModelPredictionEvaluation(
        trained_model=trained_model,
        dataset=test_dataset,
        processor=processor)

    macro = test_evaluation.run()
    return macro  # Return loss for optimization

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Best trial results
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")


Train set: 2241
Val set: 277
Test set: 250


[I 2024-12-27 11:41:51,304] A new study created in memory with name: no-name-b170e388-dc85-4a63-8ad2-64119fa859f2
  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.574586
2,0.582000,0.532229
3,0.582000,0.525854


Model saved!


Evaluation results: {'eval_loss': 0.523144543170929, 'eval_model_preparation_time': 0.0026, 'eval_runtime': 5.5424, 'eval_samples_per_second': 45.107, 'eval_steps_per_second': 5.774}


[I 2024-12-27 11:51:11,587] Trial 0 finished with value: 0.20490241003061516 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.457331761906483e-05, 'weight_decay': 8.760578572569119e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.20490241003061516.


Accuracy: 0.18
Micro F1 Score: 0.46440677966101696
Macro F1 Score: 0.20490241003061516


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.522441
2,0.547500,0.488648
3,0.547500,0.486133


Model saved!


Evaluation results: {'eval_loss': 0.5081325769424438, 'eval_model_preparation_time': 0.0028, 'eval_runtime': 7.128, 'eval_samples_per_second': 35.073, 'eval_steps_per_second': 4.489}


[I 2024-12-27 12:00:05,043] Trial 1 finished with value: 0.3416666666666667 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 2.491975690390402e-05, 'weight_decay': 1.0162266557249161e-06, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.3416666666666667.


Accuracy: 0.244
Micro F1 Score: 0.5351681957186545
Macro F1 Score: 0.3416666666666667


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.571894
2,0.582900,0.530898
3,0.582900,0.525663


Model saved!


Evaluation results: {'eval_loss': 0.5256613492965698, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 3.8439, 'eval_samples_per_second': 65.038, 'eval_steps_per_second': 8.325}


[I 2024-12-27 12:08:48,510] Trial 2 finished with value: 0.2424456166425053 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.2784138212697802e-05, 'weight_decay': 0.001124327008237993, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 0.3416666666666667.


Accuracy: 0.204
Micro F1 Score: 0.48986486486486486
Macro F1 Score: 0.2424456166425053


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.519516
2,0.514200,0.453216
3,0.514200,0.446025


Model saved!


Evaluation results: {'eval_loss': 0.4904323220252991, 'eval_model_preparation_time': 0.0034, 'eval_runtime': 4.117, 'eval_samples_per_second': 60.724, 'eval_steps_per_second': 7.773}


[I 2024-12-27 12:18:06,589] Trial 3 finished with value: 0.45108259030160563 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.514566935871922e-05, 'weight_decay': 0.009333959259625914, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.45108259030160563.


Accuracy: 0.284
Micro F1 Score: 0.5919778699861687
Macro F1 Score: 0.45108259030160563


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.570974
2,0.581500,0.527668
3,0.581500,0.517589


Model saved!


Evaluation results: {'eval_loss': 0.5208468437194824, 'eval_model_preparation_time': 0.0045, 'eval_runtime': 5.0586, 'eval_samples_per_second': 49.421, 'eval_steps_per_second': 6.326}


[I 2024-12-27 12:27:02,413] Trial 4 finished with value: 0.2805446644133269 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.4181686271546608e-05, 'weight_decay': 6.825571709506599e-06, 'lr_scheduler_type': 'linear'}. Best is trial 3 with value: 0.45108259030160563.


Accuracy: 0.224
Micro F1 Score: 0.5083056478405316
Macro F1 Score: 0.2805446644133269


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.488702
2,0.511300,0.448612
3,0.511300,0.438056


Model saved!


Evaluation results: {'eval_loss': 0.48242810368537903, 'eval_model_preparation_time': 0.0032, 'eval_runtime': 6.5359, 'eval_samples_per_second': 38.251, 'eval_steps_per_second': 4.896}


[I 2024-12-27 12:35:52,017] Trial 5 finished with value: 0.48676070472701205 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 9.274343484913477e-05, 'weight_decay': 1.1150368638505584e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.292
Micro F1 Score: 0.6038251366120219
Macro F1 Score: 0.48676070472701205


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.584097
2,0.593800,0.546284
3,0.593800,0.534914


Model saved!


Evaluation results: {'eval_loss': 0.5320590734481812, 'eval_model_preparation_time': 0.0185, 'eval_runtime': 4.3543, 'eval_samples_per_second': 57.414, 'eval_steps_per_second': 7.349}


[I 2024-12-27 12:44:40,912] Trial 6 finished with value: 0.18965451422027105 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.1331126813317836e-05, 'weight_decay': 6.476256691791227e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.184
Micro F1 Score: 0.4717948717948718
Macro F1 Score: 0.18965451422027105


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.49722
2,0.513600,0.44465
3,0.513600,0.440456


Model saved!


Evaluation results: {'eval_loss': 0.48869356513023376, 'eval_model_preparation_time': 0.0072, 'eval_runtime': 6.3812, 'eval_samples_per_second': 39.178, 'eval_steps_per_second': 5.015}


[I 2024-12-27 12:53:26,879] Trial 7 finished with value: 0.4763526989008893 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 8.551834733844088e-05, 'weight_decay': 1.7226835631098667e-05, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.276
Micro F1 Score: 0.6021798365122616
Macro F1 Score: 0.4763526989008893


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.505621
2,0.530600,0.470881
3,0.530600,0.462845


Model saved!


Evaluation results: {'eval_loss': 0.5019523501396179, 'eval_model_preparation_time': 0.0017, 'eval_runtime': 4.2994, 'eval_samples_per_second': 58.148, 'eval_steps_per_second': 7.443}


[I 2024-12-27 13:02:15,688] Trial 8 finished with value: 0.3631839564989222 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 4.276933281974512e-05, 'weight_decay': 0.0003094642342714543, 'lr_scheduler_type': 'linear'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.244
Micro F1 Score: 0.5427728613569321
Macro F1 Score: 0.3631839564989222


  num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5, 10)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.552931
2,0.569100,0.513342
3,0.569100,0.509026


Model saved!


Evaluation results: {'eval_loss': 0.5152259469032288, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.3112, 'eval_samples_per_second': 57.988, 'eval_steps_per_second': 7.422}


[I 2024-12-27 13:11:06,058] Trial 9 finished with value: 0.3052648250337596 and parameters: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 1.635468538155532e-05, 'weight_decay': 0.0004637676471172026, 'lr_scheduler_type': 'cosine'}. Best is trial 5 with value: 0.48676070472701205.


Accuracy: 0.228
Micro F1 Score: 0.5192307692307693
Macro F1 Score: 0.3052648250337596
Best trial: {'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3, 'learning_rate': 9.274343484913477e-05, 'weight_decay': 1.1150368638505584e-05, 'lr_scheduler_type': 'linear'}


### Train

In [None]:
# model vars
model = 'tinybert'
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model_save_name = '/tinybert_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5)

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)

# test model
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()
notify()

Train set: 2241
Val set: 277
Test set: 250


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.494806
2,0.523500,0.457503
3,0.523500,0.452516
4,0.416800,0.441182
5,0.416800,0.433662


Model saved!


Evaluation results: {'eval_loss': 0.4928179979324341, 'eval_model_preparation_time': 0.003, 'eval_runtime': 3.7962, 'eval_samples_per_second': 65.856, 'eval_steps_per_second': 8.43}
Accuracy: 0.28
Micro F1 Score: 0.5953991880920162
Macro F1 Score: 0.4966850302277807


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.4928179979324341, 'eval_model_preparation_time': 0.003, 'eval_runtime': 4.8512, 'eval_samples_per_second': 51.533, 'eval_steps_per_second': 6.596}
Accuracy: 0.28
Micro F1 Score: 0.5953991880920162
Macro F1 Score: 0.4966850302277807


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name='tinybert',
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!


## 6.7 ModernBERT - large (74%)

In [None]:
#!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ltg4h7be
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ltg4h7be
  Resolved https://github.com/huggingface/transformers to commit 24c91f095fec4d90fa6901ef17146b4f4c21d0a3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10328608 sha256=bc1772a81dfe5be58977402cab9401b756c3437b2b27a5e1de5e9f666521cb36
  Stored in directory: /tmp/pip-ephem-wheel-cache-66qawilc/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

### Train

In [None]:
# vars
model = 'modernbertlarge'
model_name = "answerdotai/ModernBERT-large"
model_save_name = f'/{model}_model'

# process datasets
processor = TextClassificationProcessor(
    data=train,
    dev=dev,
    no_labels=no_labels,
    test_size=0.1,
    random_state=2001,
    max_length=64,
    tokenizer_name=model_name,
    folder_path=folder_path,
    tokenizer_save_name=f'/{model}_tokenizer')

train_dataset = processor.get_dataset(split='train')
val_dataset = processor.get_dataset(split='val')
test_dataset = processor.get_dataset(split='test')
dev_dataset = processor.get_dataset(split='dev')

# train model
training_args = TrainingArguments(
    output_dir=folder_path,
    eval_strategy="epoch",
    save_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    run_name=f"{model}_classification_run")

# API key: d178de1a7c092db9f84071c0f557e6950f376aa9
trainer = TextClassifierTrainer(model_name, no_labels, training_args, folder_path, model_save_name)
trainer.train_model(train_dataset, val_dataset)
trained_model = trainer.create_trainer(train_dataset, val_dataset)
notify()

Train set: 2241
Val set: 277
Test set: 250


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.341837


Epoch,Training Loss,Validation Loss
1,No log,0.341837
2,No log,0.349972
3,No log,0.457487


Model saved!


### Test

In [None]:
test_evaluation = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=test_dataset,
    processor=processor)

test_evaluation.run()

Evaluation results: {'eval_loss': 0.47418248653411865, 'eval_model_preparation_time': 0.0081, 'eval_runtime': 203.8987, 'eval_samples_per_second': 1.226, 'eval_steps_per_second': 0.078}
Accuracy: 0.496
F1 Score: 0.6938939098333786


In [None]:
dev_predictions = ModelPredictionEvaluation(
    trained_model=trained_model,
    dataset=dev_dataset,
    processor=processor,
    model_name=model_name,
    dev_ids=dev_ids,
    folder_path=folder_path)

dev_predictions.run()

Predictions saved!
