In [1]:
import json
import pandas as pd

def process_absa_dataset(input_path: str, output_path: str):
    """
    Converts an ABSA dataset from raw JSONL format to a flat format with text, aspect, and sentiment columns.

    Args:
        input_path (str): Path to the input .jsonl file
        output_path (str): Path to the output .csv file
    """
    processed_data = []

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line.strip())
            text = entry.get("text", "")
            labels = entry.get("labels", [])

            for label in labels:
                start, end, aspect_sentiment = label
                if "_" in aspect_sentiment:
                    *aspect_parts, sentiment = aspect_sentiment.split("_")
                    aspect = "_".join(aspect_parts)
                else:
                    aspect, sentiment = aspect_sentiment, "neutral"


                processed_data.append({
                    "text": text,
                    "aspect": aspect,
                    "sentiment": sentiment
                })

    # Save to CSV
    df = pd.DataFrame(processed_data)
    df.to_csv(output_path, index=False)
    print(f"Processed dataset saved to: {output_path}")



In [2]:

input_file = "review_with_aspect.jsonl"
output_file = "aspect_based_sentiment.csv"
process_absa_dataset(input_file, output_file)

Processed dataset saved to: aspect_based_sentiment.csv


In [3]:
from transformers import AutoTokenizer, AutoModel

model_name = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd

# Load your preprocessed dataset
df = pd.read_csv("aspect_based_sentiment.csv")

# Optional: label encoding
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["sentiment"].map(label_map)



In [5]:
from transformers import AutoTokenizer


# Encode text + aspect as sentence pairs
def encode_pair(row):
    return tokenizer(
        row["text"],
        row["aspect"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Example encoding (batching will come later)
sample = df.iloc[0]
encoded = encode_pair(sample)

print(encoded.keys())  # Shows input_ids, attention_mask, etc.


dict_keys(['input_ids', 'attention_mask'])


In [6]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

class ABSADataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(row["label"])
        }


In [7]:
import torch.nn as nn
import torch.optim as optim

# Model
class ModernBertClassifier(nn.Module):
    def __init__(self, model_name, num_labels=3, dropout=0.3, classifier_layers=1, hidden_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        in_features = self.bert.config.hidden_size

        if(classifier_layers == 1):
            self.classifier = nn.Linear(in_features, num_labels)
        elif classifier_layers == 2:
            self.classifier = nn.Sequential(
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, num_labels)
            )
        else:
            raise ValueError("classifier_layers must be 1 or 2")

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = output.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_output))

In [8]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

train_dataset = ABSADataset(train_df, tokenizer)
val_dataset = ABSADataset(val_df, tokenizer)
test_dataset = ABSADataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=8)
val_loader = DataLoader(val_dataset, batch_size=128, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=128, num_workers=8)


In [9]:
print(f"Train size: {len(train_dataset)}")


Train size: 103660


In [10]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader, compute_loss=False):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

            if compute_loss:
                loss = criterion(outputs, labels)
                total_loss += loss.item()

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="macro")
    avg_loss = total_loss / len(dataloader) if compute_loss else None
    return acc, f1, avg_loss



In [11]:
#Baseline 1 - Majority class

from sklearn.dummy import DummyClassifier

# Prepare labels
y_train = train_df["label"]
y_test = test_df["label"]

# Create and train dummy classifier
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit([[0]] * len(y_train), y_train)  # Fake features, labels only matter

# Predict and evaluate
dummy_preds = dummy.predict([[0]] * len(y_test))
acc_dummy = accuracy_score(y_test, dummy_preds)
f1_dummy = f1_score(y_test, dummy_preds, average="macro")

print(f"Majority Class Baseline - Accuracy: {acc_dummy:.4f} | F1 Score: {f1_dummy:.4f}")


Majority Class Baseline - Accuracy: 0.4667 | F1 Score: 0.2121


In [12]:
#Baseline 2 - Logictic regression with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Combine text + aspect for TF-IDF baseline (mimicking ABSA input)
train_texts = (train_df["text"] + " [ASPECT] " + train_df["aspect"]).tolist()
test_texts = (test_df["text"] + " [ASPECT] " + test_df["aspect"]).tolist()

# Vectorize
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

y_train = train_df["label"]
y_test = test_df["label"]

# Train logistic regression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict and evaluate
baseline_preds = clf.predict(X_test)
acc = accuracy_score(y_test, baseline_preds)
f1 = f1_score(y_test, baseline_preds, average="macro")

print(f"Logistic Regression Baseline - Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")


Logistic Regression Baseline - Accuracy: 0.7165 | F1 Score: 0.7387


In [13]:
import wandb

#Login wandb
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33merdemerturk[0m ([33merdemerturk-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [14]:
# Define sweep config: Random search for LR, Dropout, and Architecture
sweep_config = {
    'method': 'random',  # grid search is tooo expensive.
    'metric': {
      'name': 'val_f1',
      'goal': 'maximize'   
    },
    'parameters': {
        'learning_rate': {
            'values': [1e-5, 2e-5, 3e-5]
        },
        'dropout': {
            'values': [0.1, 0.3, 0.5]
        },
        'classifier_layers': {
            'values': [1, 2]   # 1 for single linear, 2 for hidden+output
        },
        'batch_size': {
            'values': [32, 64, 128] #256 does not fit in memory
        }
    }
}


In [15]:
import wandb
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm  # <--- import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_sweep(config=None):
    with wandb.init(config=config, name="initial-runs-with-5-epoch"):
        config = wandb.config

        # ----- Model -----
        model = ModernBertClassifier(
            model_name="answerdotai/ModernBERT-base",
            num_labels=3,
            dropout=config.dropout,
            classifier_layers=config.classifier_layers
        )

        model = model.to(device)
        
        if torch.cuda.device_count() > 1:
            print("Using", torch.cuda.device_count(), "GPUs!")
            model = torch.nn.DataParallel(model)

        # ----- Data Loaders -----
        train_loader = DataLoader(
            train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=8
        )
        val_loader = DataLoader(
            val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=8
        )

        # ----- Optimizer -----
        optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
        criterion = torch.nn.CrossEntropyLoss()

        num_epochs = 5  # Keep it short due to complexity. 
        #Previous experiments showed that 5 epochs is enough for convergence.

        best_macro_f1 = 0.0
        for epoch in range(num_epochs):
            # ----- TRAINING -----
            model.train()
            train_losses, train_labels, train_preds = [], [], []
            train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
            for batch in train_pbar:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_losses.append(loss.item())
                preds = torch.argmax(outputs, dim=1)
                train_labels.extend(labels.cpu().numpy())
                train_preds.extend(preds.cpu().numpy())

            train_acc = accuracy_score(train_labels, train_preds)
            train_f1 = f1_score(train_labels, train_preds, average='macro')

            # ----- VALIDATION -----
            model.eval()
            val_losses, val_labels, val_preds = [], [], []
            val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
            with torch.no_grad():
                for batch in val_pbar:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)

                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs, labels)

                    val_losses.append(loss.item())
                    preds = torch.argmax(outputs, dim=1)
                    val_labels.extend(labels.cpu().numpy())
                    val_preds.extend(preds.cpu().numpy())

            val_acc = accuracy_score(val_labels, val_preds)
            val_f1 = f1_score(val_labels, val_preds, average='macro')
            
            # Print summary for your monitoring
            print(f"Epoch {epoch+1}/{num_epochs} - "
                  f"Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f} | "
                  f"Train Loss: {sum(train_losses)/len(train_losses):.4f} | "
                  f"Val Loss: {sum(val_losses)/len(val_losses):.4f}")

            # Log to WANDB
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": sum(train_losses) / len(train_losses),
                "train_acc": train_acc,
                "train_f1": train_f1,
                "val_loss": sum(val_losses) / len(val_losses),
                "val_acc": val_acc,
                "val_f1": val_f1
            })

            # Save best
            if val_f1 > best_macro_f1:
                best_macro_f1 = val_f1

        # Log best macro-F1 at the end
        wandb.log({"best_val_macro_f1": best_macro_f1})


In [16]:
sweep_id = wandb.sweep(sweep_config, project="aspect-sentiment-modernbert")
print(f"Sweep ID: {sweep_id}")

wandb.agent(sweep_id, function=train_sweep, count=3)  # Change count as needed


Create sweep with ID: yyehoplr
Sweep URL: https://wandb.ai/erdemerturk-middle-east-technical-university/aspect-sentiment-modernbert/sweeps/yyehoplr
Sweep ID: yyehoplr


[34m[1mwandb[0m: Agent Starting Run: 6rxt2pf7 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	classifier_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	learning_rate: 1e-05


Using 2 GPUs!


Epoch 1/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

[34m[1mwandb[0m: [32m[41mERROR[0m Run 6rxt2pf7 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/ubuntu/.local/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_14016/2281082461.py", line 55, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     outputs = model(input_ids=input_ids, attention_mask=attention_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/ubuntu/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[34m[1mwandb[0m: [32m[41mERROR[0m     return self._call_impl(*args, **kwargs)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Using 2 GPUs!


Epoch 1/5 [Train]:   0%|          | 0/3240 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 1/5 - Train F1: 0.7539 | Val F1: 0.7942 | Train Loss: 0.5292 | Val Loss: 0.4960


Epoch 2/5 [Train]:   0%|          | 0/3240 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 2/5 - Train F1: 0.8030 | Val F1: 0.7961 | Train Loss: 0.4693 | Val Loss: 0.4756


Epoch 3/5 [Train]:   0%|          | 0/3240 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 3/5 - Train F1: 0.8262 | Val F1: 0.8050 | Train Loss: 0.4335 | Val Loss: 0.4692


Epoch 4/5 [Train]:   0%|          | 0/3240 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 4/5 - Train F1: 0.8519 | Val F1: 0.8028 | Train Loss: 0.3851 | Val Loss: 0.4914


Epoch 5/5 [Train]:   0%|          | 0/3240 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 5/5 - Train F1: 0.8733 | Val F1: 0.7983 | Train Loss: 0.3348 | Val Loss: 0.5135


0,1
best_val_macro_f1,▁
epoch,▁▃▅▆█
train_acc,▁▄▅▇█
train_f1,▁▄▅▇█
train_loss,█▆▅▃▁
val_acc,▁▁█▆▄
val_f1,▁▂█▇▄
val_loss,▅▂▁▅█

0,1
best_val_macro_f1,0.80497
epoch,5.0
train_acc,0.8426
train_f1,0.87331
train_loss,0.33485
val_acc,0.74902
val_f1,0.79829
val_loss,0.51346


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: hcj340ld with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	classifier_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	learning_rate: 3e-05


Using 2 GPUs!


Epoch 1/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 1/5 - Train F1: 0.7832 | Val F1: 0.8186 | Train Loss: 0.4939 | Val Loss: 0.4369


Epoch 2/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 2/5 - Train F1: 0.8239 | Val F1: 0.8247 | Train Loss: 0.4293 | Val Loss: 0.4249


Epoch 3/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 3/5 - Train F1: 0.8409 | Val F1: 0.8283 | Train Loss: 0.3990 | Val Loss: 0.4273


Epoch 4/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 4/5 - Train F1: 0.8609 | Val F1: 0.8231 | Train Loss: 0.3607 | Val Loss: 0.4472


Epoch 5/5 [Train]:   0%|          | 0/1620 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already b

Epoch 5/5 - Train F1: 0.8796 | Val F1: 0.8150 | Train Loss: 0.3159 | Val Loss: 0.4781


0,1
best_val_macro_f1,▁
epoch,▁▃▅▆█
train_acc,▁▄▅▇█
train_f1,▁▄▅▇█
train_loss,█▅▄▃▁
val_acc,▃▇█▄▁
val_f1,▃▆█▅▁
val_loss,▃▁▁▄█

0,1
best_val_macro_f1,0.82833
epoch,5.0
train_acc,0.85017
train_f1,0.87962
train_loss,0.31591
val_acc,0.77162
val_f1,0.81495
val_loss,0.47815


In [17]:
torch.cuda.empty_cache()