In [None]:
!pip install datasets peft evaluate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 

In [None]:
import csv
import argparse
import json
import datetime
import random
import string
import os


from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import TrainingArguments, Trainer

import datasets

from torch.utils.data import  Dataset
import torch
import copy




In [None]:
# Flag guarding the Colab-specific steps: the Drive mount below and the
# runtime release in the last cell.
is_colab = True
if is_colab:
    # Imported inside the guard so the import is skipped when is_colab is False.
    from google.colab import drive

    drive.mount("/content/gdrive")

Mounted at /content/gdrive


### Configuration and Hyperparameter Settings


In [None]:
# ---- Paths: both must be filled in before this cell can run ----
root = ""
if root == "":
    raise Exception("Path for root folder not found")

data_path = ""
if data_path == "":
    raise Exception("Path for dataset not found")


# ---- LoRA ----
lora = False  # when True, a later cell wraps the model with LoRA adapters
lora_r = 4
lora_alpha = 4

# ---- Model ----
# Random 8-character id (lowercase letters and digits) that keeps the output
# folders of different runs apart.
random_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=8))
token_path = "Qwen/Qwen2-0.5B"
model_path = "Qwen/Qwen2-0.5B"
# Folder-friendly model tag: "/" replaced by "-", plus "-Lora" for LoRA runs.
_tmp = token_path.replace("/", "-") + ("-Lora" if lora else "")

# ---- Training ----
model_max_length = 2048
num_train_epochs = 4
batch_size = 16


# ---- Output ----
output_dir = f"/{root}/Output/{random_id}-{_tmp}"
print(output_dir)


/content/gdrive/MyDrive/IC_lab_docs/Output/kp9gsnlb-Qwen-Qwen2-0.5B


In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        labels: Sequence of label ids, one per example.
        encoding: Mapping from feature name to a per-example sequence of
            values (for instance the result of a tokenizer call). May be
            ``None``, in which case every item contains only ``labels``.
    """

    def __init__(self, labels, encoding=None):
        self.encodings = encoding
        self.labels = labels

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors with a ``labels`` entry."""
        # ``encoding`` is optional in the constructor, so tolerate its absence
        # here instead of failing on ``None.items()``.
        features = self.encodings.items() if self.encodings is not None else ()
        item = {key: torch.tensor(val[idx]) for key, val in features}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


def get_text_label(data_path, limit = None):
    """Load (text, label) rows from a CSV file and encode the labels as ids.

    The first row is treated as a header and skipped, and so is every row
    that does not have exactly two columns. A short summary of what was
    loaded is printed.

    Args:
        data_path: Path of the CSV file to read (decoded as UTF-8).
        limit: Maximum number of rows to keep, or ``None`` for no limit.

    Returns:
        A tuple ``(texts, labels, label2id, id2label)``. ``labels`` holds the
        integer id of each text's label. ``label2id`` / ``id2label`` map
        between label strings and ids; ids follow the sorted order of the
        distinct labels found in this file, so two files with different label
        sets produce different mappings. All four are empty when no row was
        loaded.
    """
    texts = []
    raw_labels = []
    # newline="" leaves newline handling to the csv module, which is required
    # for quoted fields that contain line breaks to be read back unchanged.
    with open(data_path, "r", encoding="utf-8", newline="") as data_file:
        reader = csv.reader(data_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) != 2:
                continue
            if limit is not None and len(texts) == limit:
                break
            texts.append(row[0])
            raw_labels.append(row[1])

    unique_label = sorted(set(raw_labels))
    label2id = {label: index for index, label in enumerate(unique_label)}
    id2label = {index: label for label, index in label2id.items()}

    labels = [label2id[label] for label in raw_labels]

    print("Successfully load data from", data_path)
    print("There are {} texts and {} labels".format(len(texts), len(labels)))
    if texts:
        # Only show an example when at least one row was loaded.
        print("Example: Text: {}\nLabel: {} - {}".format(texts[0], labels[0], id2label[labels[0]]))
    print(f"Unique labels: {unique_label}")
    return texts, labels, label2id, id2label


In [None]:
# Load the raw text/label pairs; they are tokenized in a later cell.
train_texts, train_labels, label2id, id2label = get_text_label(f"{data_path}/train.csv")
test_texts, test_label_ids, _, test_id2label = get_text_label(f"{data_path}/test.csv")

# Trick: no validation is wanted during training (evaluation is run
# separately), so only a few test rows are kept to "skip" the testing process.
test_texts = test_texts[:5]
# get_text_label numbers the labels per file, so the ids it returned for
# test.csv are not guaranteed to match the training mapping used by the model.
# Re-encode the kept rows with the training label2id; a KeyError here means a
# test label never occurs in train.csv.
test_labels = [label2id[test_id2label[label_id]] for label_id in test_label_ids[:5]]
print("Successfully load data", len(train_texts), len(train_labels), len(test_texts), len(test_labels))

print(label2id)

Successfully load data from /content/gdrive/MyDrive/IC_lab_docs/Dataset/Final/train.csv
There are 15524 texts and 15524 labels
Example: Text: How to avoid a relapse? I've been having a particularly rough year; I attempted suicide, the love of my life left me, I failed my year at university, I've been physically assaulted, I've been sexually assaulted and today my grandfather died. I feel like I'm running on empty and doing the bare minimum to survive for myself while trying to be a rock to others. My emotional resilience has been wiped out. I feel "okay" but I've felt like this in the past and it has turned out that I've just been lying to myself and making things worse. Any advice on how to manage/process my emotions? Or just how to better understand myself?
As a psychologist, read this post and answer the following question. Does this person suffer from depression? Only choose one answer from the following options: Yes, No.
Label: 14 - yes
Unique labels: ['anger', 'depression', 'disg

In [None]:
# Load the tokenizer, then the classifier configured with the label mappings.
# truncation_side='left' makes over-long inputs lose tokens from the start.
tokenizer = AutoTokenizer.from_pretrained(
    token_path,
    truncation_side='left',
    model_max_length=model_max_length,
)
config = AutoConfig.from_pretrained(
    model_path,
    label2id=label2id,
    id2label=id2label,
)
config.num_labels = len(id2label)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    config=config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze the backbone: it lives under `.model` when that attribute exists,
# otherwise under `.transformer`. Parameters outside the backbone are untouched.
backbone = model.model if hasattr(model, "model") else model.transformer
for weight in backbone.parameters():
    weight.requires_grad = False

In [None]:
# Wrap the model with LoRA adapters when the `lora` flag is enabled.
if lora == True:
    # peft is only needed (and only imported) for LoRA runs.
    from peft import LoraConfig, get_peft_model

    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=0.05,
        # The four attention projection layers of each decoder block.
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        # "score" is the newly initialised classification head.
        modules_to_save=["score"],
        bias="lora_only",
        init_lora_weights="gaussian",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

# Printed in both cases so the final architecture is visible in the output.
print(model)


Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): 

In [None]:
# Turn the raw texts into token encodings (truncation enabled, no padding).
train_encodings = tokenizer(train_texts, truncation=True)
test_encodings = tokenizer(test_texts, truncation=True)

# Pair each set of encodings with its labels for the Trainer.
train_dataset = CustomDataset(labels=train_labels, encoding=train_encodings)
test_dataset = CustomDataset(labels=test_labels, encoding=test_encodings)


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# The texts are tokenized without padding and no padding collator is given to
# the Trainer, so the per-device batch size stays at 1 and the configured
# `batch_size` is reached through gradient accumulation instead.
# (`gradient_accumulation_steps` was previously read from a variable that is
# never defined in this notebook, which raised a NameError on a fresh kernel.)
# NOTE(review): the recorded run reports 1940 steps for 4 epochs over 15524
# examples, i.e. about 32 examples per step, so it was presumably launched
# with a different accumulation value - confirm the intended batch size.
train_args = TrainingArguments (
    output_dir = output_dir,
    do_train = True,
    do_eval = False,
    eval_strategy = "steps",
    prediction_loss_only = True,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = batch_size,
    eval_accumulation_steps = 1,
    num_train_epochs = num_train_epochs,
    save_strategy = "steps",
    # Intervals set to the dataset size: with gradient accumulation the run
    # has fewer optimizer steps than that, which appears to be how periodic
    # evaluation/saving/logging is skipped (the recorded loss table is empty).
    eval_steps = len(train_dataset),
    save_steps = len(train_dataset),
    logging_steps = len(train_dataset),
    learning_rate= 1e-4,
    bf16 = True
)

In [None]:
# Assemble the Trainer from the model, the training arguments and both datasets.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [None]:
# Run the fine-tuning loop; the returned training summary is the cell output.
train_output = trainer.train()
train_output

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


TrainOutput(global_step=1940, training_loss=1.1903035901256442, metrics={'train_runtime': 5107.1364, 'train_samples_per_second': 12.159, 'train_steps_per_second': 0.38, 'total_flos': 1.6979185157197824e+16, 'train_loss': 1.1903035901256442, 'epoch': 3.998969337799536})

In [None]:
# On Colab, release the runtime once the cells above have finished.
if is_colab:
    from google.colab import runtime

    runtime.unassign()