# Init — `wandb login` is required before running

In [None]:
%%capture
!pip install datasets transformers pandas wandb matplotlib tqdm --upgrade --quiet
!pip install accelerate -U

In [None]:
%%capture
!pip install transformers[torch]

In [None]:
%%capture
!pip install peft
!pip install accelerate bitsandbytes --upgrade --quiet


In [None]:
import datasets
from transformers import pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

## wandb part

In [None]:
import wandb
# from transformers import TrainerCallback

# class WandbCallback(TrainerCallback):
#     def __init__(self, run_name):
#         self.run_name = run_name

#     def on_log(self, args, state, control, logs=None, **kwargs):
#         logs = {} if logs is None else logs
#         wandb.log(logs, step=state.global_step)

In [None]:
# Authenticate against Weights & Biases; the recorded output shows this prompts
# for an API key when none is stored yet.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Initialise the wandb run and record the hyperparameters of this experiment.
wandb.init(
    project="NLP",
    name="reberta_try_3_b64",
    config={
        "epochs": 3,
        # Must match the batch size the DataLoaders in this notebook actually
        # use (64, as the "b64" run-name suffix also indicates); it was logged as 8.
        "batch_size": 64,
        "learning_rate": 5e-5,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33minseop0918[0m ([33minseop_09_18[0m). Use [1m`wandb login --relogin`[0m to force relogin


# load dataset

In [None]:
# Download the "raw" configuration of GoEmotions from the Hugging Face Hub.
dataset_name = "go_emotions"
dataset_config_name = "raw"
dataset_dict = datasets.load_dataset(path=dataset_name, name=dataset_config_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

In [None]:
# Display the loaded splits, their columns and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})

In [None]:
# Names of the 28 label columns, in the order used to build the multi-hot
# `labels` vector (this order defines the meaning of each output index).
emotions = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

In [None]:
# Print every label on a single line, each followed by ", ".
print("".join(f"{emo}, " for emo in emotions), end="")

admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise, neutral, 

In [None]:
# 80/20 train/validation split of the single "train" split shipped with the
# raw configuration. The boundary is derived from the dataset's actual length
# instead of a hard-coded row count, so it stays correct if the data changes.
# NOTE(review): the split is positional (first 80% / last 20%), not shuffled —
# confirm the rows are not ordered in a way that biases validation.
full_train = dataset_dict["train"]
num_rows = len(full_train)
split_at = int(num_rows * 0.8)

dataset_dict_train = full_train.select(range(split_at))
dataset_dict_vali = full_train.select(range(split_at, num_rows))


def _add_labels(example):
    """Collect the per-emotion columns of one row into a single `labels` list.

    The list follows the order of `emotions`, so index i of `labels` is the
    value of the column named `emotions[i]`.
    """
    return {"labels": [example[c] for c in emotions]}


ds_train = dataset_dict_train.map(_add_labels)
ds_vali = dataset_dict_vali.map(_add_labels)

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

# Model setting

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaConfig, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW

class Classifier(nn.Module):
    """Multi-label classifier around a quantised, PEFT-adapted sequence-classification model.

    Construction only builds the model configuration; the underlying network is
    attached later by `init()` (fresh adapter) or `load()` (adapter restored
    from disk). `forward()` returns per-label probabilities (sigmoid of the
    logits), so callers should pair it with `F.binary_cross_entropy`, not with
    a `*_with_logits` loss.

    Args:
        model_name: Hub id or path passed to `RobertaConfig.from_pretrained`.
        bnb_config: quantisation config forwarded as `quantization_config`.
        peft_config: PEFT config used by `get_peft_model` in `init()`.
        num_labels: number of output labels.
    """

    def __init__(self, model_name, bnb_config, peft_config, num_labels):
        super().__init__()

        self.bnb_config = bnb_config
        self.peft_config = peft_config

        self.config = RobertaConfig.from_pretrained(
            model_name, num_labels=num_labels, problem_type="multi_label_classification"
        )
        # Attached by init() or load(); forward() refuses to run before that.
        self.model = None

        # NOTE(review): this layer is never used by forward(); the predictions come
        # from the wrapped model's own classification head. It is kept because
        # save()/load() persist it and existing checkpoints contain its weights.
        self.linear = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, x):
        """Return per-label probabilities for a batch.

        Args:
            x: mapping of keyword arguments for the wrapped model
               (e.g. `input_ids`, `attention_mask`).

        Raises:
            RuntimeError: if neither `init()` nor `load()` has been called yet.
        """
        if self.model is None:
            raise RuntimeError(
                "Classifier.model is not set; call init() or load() before forward()."
            )
        logits = self.model(**x).logits
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        return torch.sigmoid(logits)

    def print_trainable_parameters(self):
        """Print the number of trainable parameters, all parameters, and their ratio."""
        trainable_params = 0
        all_param = 0
        for param in self.parameters():
            count = param.numel()
            all_param += count
            if param.requires_grad:
                trainable_params += count
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
        )

    def _load_quantized_base(self, model_name):
        """Load the base model with the stored configs and prepare it for k-bit training."""
        base = AutoModelForSequenceClassification.from_pretrained(
            model_name, config=self.config, quantization_config=self.bnb_config
        )
        return prepare_model_for_kbit_training(base)

    def init(self, model_name):
        """Attach a freshly initialised PEFT adapter to the quantised base model."""
        self.model = get_peft_model(self._load_quantized_base(model_name), self.peft_config)
        self.print_trainable_parameters()

    def load(self, model_name, load_path):
        """Restore a checkpoint written by `save()`.

        Args:
            model_name: Hub id or path of the base model.
            load_path: checkpoint prefix; the adapter is read from
                `load_path + "_peft_model"` and the linear layer from
                `load_path + "_linearmodel.pt"`.
        """
        self.model = PeftModel.from_pretrained(
            self._load_quantized_base(model_name), load_path + "_peft_model", is_trainable=True
        )
        # map_location="cpu" lets a checkpoint saved on GPU load on a CPU-only
        # machine; load_state_dict then copies into the layer's current device.
        # torch.load unpickles the file, so only load checkpoints you trust.
        linear_state = torch.load(load_path + "_linearmodel.pt", map_location="cpu")
        self.linear.load_state_dict(linear_state)
        self.print_trainable_parameters()

    def save(self, ckpt_path, str):
        """Write the adapter and the linear layer under `<ckpt_path>_<str>_...`.

        Args:
            ckpt_path: checkpoint path prefix.
            str: tag appended to the prefix (e.g. an epoch label). The name
                shadows the builtin but is kept for keyword-argument
                compatibility; any value that can be formatted is accepted.
        """
        prefix = f"{ckpt_path}_{str}"
        self.model.save_pretrained(prefix + "_peft_model")
        torch.save(self.linear.state_dict(), prefix + "_linearmodel.pt")


In [None]:
# Base checkpoint and number of output labels (one per entry in `emotions`).
model_name = "FacebookAI/roberta-base"
num_labels = len(emotions)

# Quantisation settings: 4-bit NF4 weights with double quantisation,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings.
# NOTE(review): no task_type / modules_to_save is given, so the newly
# initialised classification head may stay frozen apart from the LoRA weights
# on modules named "dense" — verify which head parameters actually train.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value", "dense"],
)

In [None]:
# Build the classifier wrapper, then attach the quantised base model with a
# fresh PEFT adapter; init() also prints the trainable-parameter summary.
classifier = Classifier(model_name, bnb_config, peft_config, num_labels)
classifier.init(model_name)

# Tokenizer matching the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 10736668 || all params: 92641592 || trainable%: 11.59


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Splits to tokenise
train_dataset, val_dataset = ds_train, ds_vali


def preprocess_function(examples):
    """Tokenise the `text` field of a batch of examples.

    Sequences are truncated and padded with `padding="max_length"`, so every
    example ends up with the same length.
    NOTE(review): with no explicit `max_length` this presumably pads to the
    tokenizer's model maximum — confirm, since that inflates every batch.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding="max_length")


# Adds `input_ids` and `attention_mask` columns to each split.
encoded_train_dataset = train_dataset.map(function=preprocess_function, batched=True)
encoded_val_dataset = val_dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

In [None]:
# Display the tokenised validation split to check its columns and row count.
encoded_val_dataset

Dataset({
    features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 42245
})

In [None]:
# Data collator
def collate_fn(batch):
    """Stack a list of tokenised examples into model inputs and a label tensor.

    Trailing columns in which every example of the batch is masked out
    (attention_mask == 0) are dropped. They carry only padding, so removing
    them shortens the sequences the model has to process; this is expected to
    leave the predictions for right-padded inputs unchanged.

    Args:
        batch: list of dicts with equally long `input_ids` and
            `attention_mask` lists and a `labels` list.

    Returns:
        A tuple `(inputs, labels)` where `inputs` is a dict with the
        `input_ids` and `attention_mask` tensors and `labels` is a tensor.
    """
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])

    if attention_mask.dim() == 2 and attention_mask.shape[1] > 0:
        # Indices of columns that hold at least one real (non-padding) token.
        used_columns = (attention_mask != 0).any(dim=0).nonzero()
        # If nothing is attended to at all, leave the tensors untouched.
        if used_columns.numel() > 0:
            keep = int(used_columns.max()) + 1
            input_ids = input_ids[:, :keep]
            attention_mask = attention_mask[:, :keep]

    return {'input_ids': input_ids, 'attention_mask': attention_mask}, labels

# DataLoaders: both splits share the batch size and collate function; only the
# training split is shuffled.
shared_loader_args = {"batch_size": 64, "collate_fn": collate_fn}
train_loader = DataLoader(encoded_train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(encoded_val_dataset, shuffle=False, **shared_loader_args)

In [None]:
# Optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values transformers.AdamW used by
# default (1e-6 and 0.0) so the update rule keeps the same hyperparameters.
# Only parameters that require gradients are handed to the optimizer; the
# learning rate is read from the wandb run config so the logged value is the
# one actually used.
trainable_parameters = [p for p in classifier.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_parameters,
    lr=wandb.config.learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Pick the GPU when available and move the classifier onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier.to(device)




Classifier(
  (linear): Linear(in_features=768, out_features=28, bias=True)
  (model): PeftModel(
    (base_model): LoraModel(
      (model): RobertaForSequenceClassification(
        (roberta): RobertaModel(
          (embeddings): RobertaEmbeddings(
            (word_embeddings): Embedding(50265, 768, padding_idx=1)
            (position_embeddings): Embedding(514, 768, padding_idx=1)
            (token_type_embeddings): Embedding(1, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): RobertaEncoder(
            (layer): ModuleList(
              (0-11): 12 x RobertaLayer(
                (attention): RobertaAttention(
                  (self): RobertaSelfAttention(
                    (query): lora.Linear4bit(
                      (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                      (lora_dropout): ModuleDict(
                

# Training

In [None]:
def _batch_loss(batch):
    """Move one collated batch to `device` and return its binary cross-entropy loss.

    `classifier` returns probabilities (it applies the sigmoid itself), hence
    `F.binary_cross_entropy` rather than a `*_with_logits` loss.
    """
    inputs, labels = batch
    inputs = {key: val.to(device) for key, val in inputs.items()}
    labels = labels.to(device)
    outputs = classifier(inputs)
    return F.binary_cross_entropy(outputs, labels.float())


for epoch in range(wandb.config.epochs):  # Number of epochs
    # ---- Training pass ----
    classifier.train()
    total_loss = 0

    # Using tqdm for progress bar
    train_loader_tqdm = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")

    for batch in train_loader_tqdm:
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        train_loader_tqdm.set_postfix({"Loss": batch_loss})

    avg_train_loss = total_loss / len(train_loader)
    tqdm.write(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}")

    # ---- Validation pass (no gradients, eval mode) ----
    classifier.eval()
    val_loss = 0

    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc=f"Validation Epoch {epoch+1}")

        for batch in val_loader_tqdm:
            batch_loss = _batch_loss(batch).item()
            val_loss += batch_loss
            val_loader_tqdm.set_postfix({"Validation Loss": batch_loss})

    avg_val_loss = val_loss / len(val_loader)
    tqdm.write(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

    # Log both losses in ONE call: every wandb.log call advances the step, so
    # logging them separately put train and validation loss on different steps.
    wandb.log({"epoch": epoch + 1, "train_loss": avg_train_loss, "val_loss": avg_val_loss})

Training Epoch 1: 100%|██████████| 2641/2641 [3:47:04<00:00,  5.16s/it, Loss=0.181]


Epoch 1, Train Loss: 0.21302110857807385


Validation Epoch 1: 100%|██████████| 661/661 [16:13<00:00,  1.47s/it, Validation Loss=0.171]


Epoch 1, Validation Loss: 0.1884339904911399


Training Epoch 2: 100%|██████████| 2641/2641 [3:47:02<00:00,  5.16s/it, Loss=0.18]


Epoch 2, Train Loss: 0.18472935906507354


Validation Epoch 2: 100%|██████████| 661/661 [16:13<00:00,  1.47s/it, Validation Loss=0.154]


Epoch 2, Validation Loss: 0.17800600112115741


Training Epoch 3: 100%|██████████| 2641/2641 [3:47:03<00:00,  5.16s/it, Loss=0.182]


Epoch 3, Train Loss: 0.1781630832469179


Validation Epoch 3: 100%|██████████| 661/661 [16:13<00:00,  1.47s/it, Validation Loss=0.151]

Epoch 3, Validation Loss: 0.17457841000831073



