In [None]:
! pip install transformers[torch] datasets evaluate swifter accelerate wandb

In [None]:
import transformers, datasets
import torch
import matplotlib.pyplot as plt
import pandas as pd
import sys
import numpy as np
from glob import glob
import seaborn as sns
import plotly.express as px
# import swifter
import json
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

# Dataset

In [None]:
# Print the kernel's working directory; the relative data paths used below resolve against it.
!pwd

In [None]:
# Directory holding the pre-split CSV files (relative to the kernel's working directory).
DATA_DIR = "../input/balanced-goal-full"
# String labels found in the CSVs -> integer class ids expected by the classifier.
LABEL_TO_ID = {"negative": 0, "positive": 1}


def load_split(split_name):
    """
    Load one data split and prepare it for sequence classification.

    Parameters:
    - split_name (str): File name of the split without extension ("train", "valid" or "test").

    Returns:
    - pandas.DataFrame: The CSV contents where
        - 'source_text' is renamed to 'text' and 'target_text' to 'label',
        - labels listed in LABEL_TO_ID are replaced by their integer id
          (any other value is left untouched),
        - 'text' is prefixed with "goal is <goal>, " built from the integer-cast 'goal' column.
    """
    frame = pd.read_csv(f"{DATA_DIR}/{split_name}.csv").rename(
        columns={"source_text": "text", "target_text": "label"}
    )
    # Assign the converted column back instead of mutating `frame["label"]` in place:
    # in-place methods on a column selection are chained assignment, which warns in
    # pandas 2.x and silently stops updating the frame under Copy-on-Write.
    frame["label"] = frame["label"].map(lambda value: LABEL_TO_ID.get(value, value))
    # Expose the numeric goal to the text-only model by writing it into the input string.
    frame["text"] = "goal is " + frame["goal"].astype(int).astype(str) + ", " + frame["text"]
    return frame


train_df = load_split("train")
valid_df = load_split("valid")
test_df = load_split("test")

# Preview a few rows rather than rendering the whole frame.
train_df.head()

In [None]:
# Share of each class in the validation split (fractions summing to 1).
valid_df["label"].value_counts(normalize=True)

# Create Dataset

In [None]:
import torch
from torch.utils.data import Dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Wrap each pandas split in a Hugging Face `datasets.Dataset`.
# (`Dataset` is the `datasets` class here: its import comes after, and therefore
# shadows, the `torch.utils.data.Dataset` import above.)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, valid_df, test_df)
)

In [None]:
# Sanity check: show the first training example (prints nothing for an empty dataset).
example = next(iter(train_dataset), None)
if example is not None:
    print(f"Train Text: {example['text']}, Label: {example['label']}")

# Preprocessing

## Tokenization

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint that is loaded as the classifier further down,
# so that token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("textattack/albert-base-v2-SST-2")

def preprocess_function(examples, max_length=512):
    """
    Tokenize a batch of examples for the classifier.

    Parameters:
    - examples (dict): Batch (or single row) holding the raw strings under the "text" key.
    - max_length (int): Maximum number of tokens kept per example; longer inputs are
      truncated. Defaults to 512, the value used so far.

    Returns:
    - The tokenizer output for `examples["text"]`, truncated but *not* padded.
    """
    # Deliberately no `padding='max_length'`: padding every example to 512 tokens makes
    # each batch as expensive as the longest possible input. The DataCollatorWithPadding
    # handed to the Trainer pads each batch on the fly instead.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Run the tokenizer over every split; `batched=True` passes whole batches of rows
# to `preprocess_function` instead of one row at a time.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)
tokenized_train

## Padding

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below to assemble batches; it uses the tokenizer
# loaded above to pad the tokenized features of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from torch import nn 
from transformers import Trainer, TrainingArguments
import torch

def compute_metrics(eval_pred):
    """
    Compute evaluation metrics based on model predictions and labels.

    Parameters:
    - eval_pred (transformers.EvalPrediction): Evaluation prediction object exposing
      `predictions` (logits of shape (n_samples, n_classes), or a tuple whose first
      element is that array) and `label_ids` (the integer labels).

    Returns:
    - dict: A dictionary containing the following evaluation metrics:
        - 'accuracy' (float): Classification accuracy.
        - 'f1_score' (float): Macro-averaged F1 score.
        - 'auc' (float): Area under the ROC curve, computed from the probability of
          class 1; NaN when `label_ids` contains a single class (AUC is undefined then).
        - 'recall' (float): Recall score of the positive class.
        - 'precision' (float): Precision score of the positive class.
    """
    predictions = eval_pred.predictions
    # Some models return (logits, extra outputs...); the logits always come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    labels = eval_pred.label_ids

    # Softmax over the class axis. Subtracting the row-wise maximum first leaves the
    # result unchanged mathematically but keeps np.exp from overflowing to inf (and
    # the ratio from becoming NaN) when the logits are large.
    logits = np.asarray(predictions, dtype=np.float64)
    exp_shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
    probabilities = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
    predicted_labels = np.argmax(probabilities, axis=1)

    # Compute different metrics
    acc = accuracy_score(labels, predicted_labels)
    f1_val = f1_score(labels, predicted_labels, average="macro")
    # roc_auc_score raises when only one class is present; report NaN instead of
    # aborting the whole evaluation loop.
    if np.unique(labels).size < 2:
        auc_val = float("nan")
    else:
        auc_val = roc_auc_score(labels, probabilities[:, 1])
    recall_val = recall_score(labels, predicted_labels)
    precision_val = precision_score(labels, predicted_labels)

    return {
        'accuracy': acc,
        'f1_score': f1_val,
        'auc': auc_val,
        'recall': recall_val,
        'precision': precision_val
    }

# class WeightedCrossEntropyLoss(torch.nn.Module):
#     def __init__(self, class_weights):
#         super(WeightedCrossEntropyLoss, self).__init__()
#         self.class_weights = class_weights

#     def forward(self, logits, labels):
#         loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
#         loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
#         return loss

# class CustomTrainer(Trainer):
#     def __init__(self, *args, **kwargs):
#         self.class_weights = kwargs.get("class_weights")
#         del kwargs["class_weights"]
#         super().__init__(*args, **kwargs)

#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss = WeightedCrossEntropyLoss(class_weights=self.class_weights)(logits, labels)
#         return (loss, outputs) if return_outputs else loss


# Train

In [None]:
# Class id <-> human-readable class name, passed to the model when it is loaded.
# The reverse mapping is derived from the forward one so the two cannot drift apart.
id2label = dict(enumerate(("UNSUCCESSFUL", "SUCCESSFUL")))
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the sequence-classification model from the same checkpoint as the tokenizer.
# `num_labels=2` matches the two classes declared in `id2label` / `label2id`, which are
# passed along so the model carries readable class names.
# NOTE(review): judging by its name the checkpoint was already fine-tuned on SST-2
# sentiment; this notebook fine-tunes it further on the goal data — confirm intended.
model = AutoModelForSequenceClassification.from_pretrained(
    "textattack/albert-base-v2-SST-2", num_labels=2, id2label=id2label, label2id=label2id
)

In [None]:
from transformers import Trainer, TrainingArguments, get_cosine_schedule_with_warmup
import wandb

# Open a Weights & Biases run; the Trainer reports to it (see `report_to` below).
wandb.init()

# Optimisation hyper-parameters.
optimization_kwargs = dict(
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
)

# How often (in steps) to evaluate, log and checkpoint.
# `save_steps` is kept a multiple of `eval_steps`, as required when the best model
# is reloaded at the end of training.
cadence_kwargs = dict(
    evaluation_strategy="steps",
    eval_steps=300,
    logging_steps=100,
    save_strategy="steps",
    save_steps=900,
    save_total_limit=1,
)

# Model selection: reload the checkpoint with the highest 'accuracy'
# (one of the keys returned by `compute_metrics`) once training finishes.
selection_kwargs = dict(
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

training_args = TrainingArguments(
    output_dir="finetuned-albert-base-v2-sst2-best-val/",
    report_to="wandb",
    **optimization_kwargs,
    **cadence_kwargs,
    **selection_kwargs,
)

# Train on the tokenized training split and evaluate on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()

In [None]:
# Score the validation split with the trained model.
# `metric_key_prefix="val"` keeps these metrics apart from the test-set ones, which
# would otherwise share the default "test" prefix when they are logged.
# The full output (logits, label ids, metrics) stays available in `val_output`;
# only the metrics are displayed rather than the raw arrays.
val_output = trainer.predict(tokenized_val, metric_key_prefix="val")
val_output.metrics

In [None]:
# Score the held-out test split with the trained model.
# The full output (logits, label ids, metrics) stays available in `test_output`;
# only the metrics are displayed rather than the raw arrays.
test_output = trainer.predict(tokenized_test)
test_output.metrics