## Install libraries

In [None]:
# Transformers installation
# !pip install evaluate -qq
!pip install -qq accelerate==0.21.0
!pip install git+https://github.com/huggingface/peft -qq
# !pip install -qq transformers[torch]
!pip install -qq wandb
# !pip install -qq datasets evaluate accelerate git+https://github.com/huggingface/transformers.git
!pip install -qq datasets evaluate git+https://github.com/huggingface/transformers.git
!pip install bitsandbytes

In [None]:
!wandb login eb9e407fb23283fbaed6da50d7b53f71db7b4576
%env WANDB_PROJECT=vimmrc

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Log in to the Hugging Face Hub without embedding the access token in the notebook.
# SECURITY: the token that used to be hardcoded in this cell has been exposed to
# everyone with access to the file and should be revoked in the Hub settings.
# The token is taken from the HF_TOKEN environment variable when set, otherwise
# it is requested interactively; it is passed inline so it is not kept in a
# notebook-level variable.
login(
    token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "),
    add_to_git_credential=True,
)

## Import libraries

In [1]:
import os
import math
import warnings
import unicodedata
from typing import Sequence
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from dataclasses import dataclass, field

from transformers import pipeline
import datasets 
# Turn off the `datasets` progress bars to keep cell output short.
datasets.disable_progress_bar()

# NOTE(review): this suppresses every Python warning for the rest of the session,
# including deprecation notices from the libraries installed from git — consider
# narrowing the filter to specific warning categories.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


## Hyperparameters

In [2]:
# Class-index <-> class-name mappings for the binary classifier.
# The reverse mapping is derived from the forward one so the two cannot drift apart.
id2label = dict(enumerate(("false", "true")))
label2id = {name: index for index, name in id2label.items()}

In [4]:
# Reference for every TrainingArguments field:
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
import inspect

# NOTE(review): ctx_length only feeds SESSION_NAME here; "block_size"/"max_length"
# below are 256 and the tokenization cell truncates at 2048 — confirm which
# context length is actually intended.
ctx_length = 1024
SESSION_NAME = f"vietcuna-3b_{ctx_length}"
# SESSION_NAME = f"hoa-1b4_{ctx_length}"
# SESSION_NAME = f"ura-llama_{ctx_length}"

# Single dictionary holding both TrainingArguments fields and notebook-level
# bookkeeping (model path, dataset names, ...). Only the former are forwarded to
# TrainingArguments; see the filtering step below.
CONFIG = {
    # stages to run
    "do_train": True,
    "do_eval": True,

    # model hyperparameters
    "model_name_or_path": "vietcuna-3b-v2/kalapa-vietcuna-3b/",
    # "model_name_or_path": "kalapa-ura-llama",
    # Mixed precision is only enabled when a CUDA device is present.
    "fp16": torch.cuda.is_available(),
    # "torch_compile": True,

    # training hyperparameters
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "optim": "adamw_hf",
    "triton": True,
    "learning_rate": 5e-5,
    "weight_decay": 0.1,
    "gradient_accumulation_steps": 8,
    "warmup_ratio": 0.18,
    "max_steps": 500,
    # "num_train_epochs": 1,
    "save_total_limit": 7,

    # dataset
    "dataset_name": "train_kalapa",
    "dataset_name_config": "all",
    "text_column_name": "text",
    "block_size": 256,
    "max_length": 256,

    # eval hyperparameters
    "evaluation_strategy": "steps",
    "eval_steps": 50,

    # directories
    "output_dir": "output/",
    "save_strategy": "steps",
    "save_steps": 50,

    # random seeds and hub
    # ("seed" used to be listed twice in this dict; the duplicate entry was removed)
    "seed": 42,
    "push_to_hub": True,
    "hub_model_id" : SESSION_NAME,
    "hub_strategy": "all_checkpoints",

    # load best model at end for inference
    "load_best_model_at_end": True,

    # logging
    "logging_first_step": True,
    "logging_steps": 50,
    "report_to": "wandb",
    "run_name": SESSION_NAME,

    # seed for data sampling
    "data_seed": 42,
}

# Keep only the entries TrainingArguments actually accepts. The constructor's
# signature is inspected (rather than `__code__.co_varnames`, which also lists
# local variables) so that only real parameters can match.
accepted_args = inspect.signature(TrainingArguments.__init__).parameters
valid_args = {k: v for k, v in CONFIG.items() if k in accepted_args}

training_args = TrainingArguments(
    **valid_args
)

# Echo the effective training arguments for the run log.
for key, value in valid_args.items():
    print(f"{key}: {value}")

do_train: True
do_eval: True
fp16: True
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
optim: adamw_hf
learning_rate: 5e-05
weight_decay: 0.1
gradient_accumulation_steps: 8
warmup_ratio: 0.18
max_steps: 500
save_total_limit: 7
evaluation_strategy: steps
eval_steps: 50
output_dir: output/
save_strategy: steps
save_steps: 50
seed: 42
push_to_hub: True
hub_model_id: vietcuna-3b_1024
hub_strategy: all_checkpoints
load_best_model_at_end: True
logging_first_step: True
logging_steps: 50
report_to: wandb
run_name: vietcuna-3b_1024
data_seed: 42


## Loading Model and Tokenizer

In [11]:
import gc

import torch

# Run the garbage collector first: tensors owned by unreachable objects are
# only freed once those objects are collected, and `empty_cache()` can only
# release memory that is no longer occupied.
n_unreachable = gc.collect()
torch.cuda.empty_cache()
# Show how many unreachable objects the collector found (the cell's output).
n_unreachable

0

In [10]:
# Drop the reference to a previously loaded model, if there is one.
# A bare `del model` raises NameError on a fresh kernel, which breaks
# "Restart & Run All"; the guard makes the cell safe to run in any state.
# NOTE(review): this cell sits after the cache-clearing cell; for the memory to
# be reclaimed it has to run before that cell.
if "model" in globals():
    del model

In [12]:
# `prepare_model_for_int8_training` has been removed from PEFT in favour of
# `prepare_model_for_kbit_training`; since PEFT is installed from git in this
# notebook, importing the old name fails on a fresh install.
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, PeftModel

# Configuration for a two-class sequence classifier on top of the base checkpoint.
model_config = AutoConfig.from_pretrained(
    CONFIG['model_name_or_path'],
    num_labels=2, 
    id2label=id2label,
    label2id=label2id,
    summary_type="last",
)

model_config.max_position_embeddings = 4096
model_config.max_length = 4096
model_config.num_labels = 2

tokenizer = AutoTokenizer.from_pretrained(
    CONFIG['model_name_or_path'],
    config=model_config
)
# tokenizer.sep_token = '[SEP]'
# tokenizer.sep_token = tokenizer.bos_token
# The prompt-building functions join the prompt sections with this separator.
tokenizer.sep_token = "\n\n"

# LoRA settings for training adapters from scratch (kept for reference).
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS, 
#     # inference_mode=False, 
#     # target_modules = ['query_key_value']
#     r=16, lora_alpha=32, 
#     lora_dropout=0.05
# )

# Base model loaded with 8-bit weights.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['model_name_or_path'],
    config=model_config,
#     low_cpu_mem_usage=True,  # try to limit RAM
#     offload_state_dict=True,  # offload onto disk if needed
#     offload_folder="offload",  # offload model to `offload/`
    load_in_8bit=True
)
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

# Adapter weights to attach. The Hub id is kept as a commented alternative; it
# used to be assigned and then immediately overwritten by the local checkpoint.
# adapters_name = 'duyvt6663/vietcuna-3b_1024'
adapters_name = 'output/checkpoint-400'
# NOTE(review): `PeftModel.from_pretrained` loads the adapter frozen unless
# `is_trainable=True` is passed, which matches the recorded
# `print_trainable_parameters()` output (5,120 trainable parameters). If this
# notebook is meant to continue training the LoRA weights, pass
# `is_trainable=True` here — confirm the intent.
model = PeftModel.from_pretrained(model, adapters_name)
# model = model.merge_and_unload()

# model.print_trainable_parameters()
model = model.to(device="cuda")

In [13]:
# model = model.to(device="cuda")
# Put the model in evaluation mode; the returned module is the cell's output,
# so its structure is printed below.
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BloomForSequenceClassification(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 2560)
        (word_embeddings_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): Linear8bitLt(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=7680, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_emb

In [14]:
# Report how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 5,120 || all params: 3,007,482,880 || trainable%: 0.00017024203309845608


## Load dataset

In [None]:
import pandas as pd

# Build the validation CSV from the unrolled level-1 file.
# The raw frame is kept under its own name so the cell can be re-run and the
# unmodified data stays inspectable.
level_1_raw = pd.read_csv('level_1_unroll.csv')
# df = pd.read_csv(train_paths[0])

# Drop the leftover index column and the two columns the prompt builders do not read.
df = level_1_raw.drop(columns=['Unnamed: 0', 'level', 'instruction'])
df.to_csv('level_1_public_test.csv', index=False)

# Preview the written data. This has to be the last expression of the cell to
# be displayed; the earlier mid-cell `df.head()` produced no output.
df.head()

In [None]:
import pandas as pd

data_path = 'mcqa-data/MedMCQA/'
# data_path = 'synthetic-gpt4-data'

# `os.listdir` returns entries in arbitrary order, so the listing is sorted to
# make the order of the training files (and hence of the dataset) reproducible.
# Entries that are not CSV files (e.g. a `.ipynb_checkpoints` directory created
# by Jupyter) are skipped instead of crashing `pd.read_csv`.
train_paths = sorted(
    os.path.join(data_path, p)
    for p in os.listdir(data_path)
    if p.lower().endswith('.csv')
)
newpaths = []
for path in train_paths:
    # if all(p not in path for p in ['1.', '2.']):
    #     continue
    df = pd.read_csv(path)
    # df = df.dropna()
    # df = df.drop(['Unnamed: 0'], axis=1)
    print(len(df))  # row count per file, as a quick sanity check
    # NOTE(review): this rewrites the source file in place. With the clean-up
    # steps above commented out it is only a read/write round trip — consider
    # writing to a separate directory so the raw data is never modified.
    df.to_csv(path, index=False)
    newpaths.append(path)

In [None]:
# Display the list of training CSV paths collected by the previous cell.
newpaths

In [None]:
# Split name -> CSV file(s): every collected training file goes into "train",
# the prepared level-1 file is used as the "validate" split.
data_files = {
    'train': newpaths,
    'validate': 'level_1_public_test.csv',
}
dataset = load_dataset("csv", data_files=data_files)
print(dataset)

## Preprocess

In [None]:
# def create_input_text(examples):
#     questions = examples['question']
#     answers = examples['option']
#     contexts = examples['context']
    
#     new_examples = { "label": examples['label'], "text": [] }
#     for q, a, c in zip(questions, answers, contexts):
# #         text = q + tokenizer.sep_token + a + tokenizer.sep_token + c
#         text = "Dựa vào ngữ cảnh được cung cấp, cho biết rằng câu trả lời dưới đây có phải là đáp án chính xác cho câu hỏi hay không." + tokenizer.sep_token + "Câu hỏi: " + q + tokenizer.sep_token + "Câu trả lời: " + a + tokenizer.sep_token + "Ngữ cảnh: " + c
#         new_examples['text'].append(text)
#     return new_examples

# Instruction shared by both prompt formats (runtime text, kept verbatim).
_INSTRUCTION = "Dựa vào ngữ cảnh được cung cấp, cho biết rằng câu trả lời dưới đây có phải là đáp án chính xác cho câu hỏi hay không."


def _build_prompt_examples(examples, prefix, suffix=""):
    """Turn a batch of question/option/context rows into prompt texts.

    Each prompt is ``prefix``, then the context, question and answer sections
    joined by ``tokenizer.sep_token``, then ``suffix``.

    Rows in which the question, option or context is ``None`` are reported on
    stdout and skipped *together with their label*, so the returned ``label``
    and ``text`` lists always have the same length. (The previous code stopped
    at the first bad row, leaving ``text`` shorter than ``label``, and its
    error handler itself failed when the context was ``None``.)

    Args:
        examples: batch mapping with ``question``, ``option``, ``context`` and
            ``label`` entries, each a list of equal length.
        prefix: text placed before the first separator.
        suffix: text appended after the answer.

    Returns:
        dict with ``label`` and ``text`` lists for the rows that were kept.
    """
    sep = tokenizer.sep_token
    new_examples = {"label": [], "text": []}
    rows = zip(examples['question'], examples['option'], examples['context'], examples['label'])
    for q, a, c, label in rows:
        if q is None or a is None or c is None:
            # Report the incomplete row, using '*' for the missing fields.
            shown = ['*' if part is None else str(part) for part in (q, a, c)]
            print('$$$'.join(shown) + '\n')
            continue
        text = prefix + sep + "Ngữ cảnh: " + c + sep + "Câu hỏi: " + q + sep + "Câu trả lời: " + a + suffix
        new_examples['text'].append(text)
        new_examples['label'].append(label)
    return new_examples


def create_input_text_2(examples):
    """Build plain instruction prompts (instruction, context, question, answer).

    Args:
        examples: batch mapping with ``question``, ``option``, ``context`` and
            ``label`` lists.

    Returns:
        dict with aligned ``label`` and ``text`` lists; rows with a missing
        field are skipped.
    """
    return _build_prompt_examples(examples, _INSTRUCTION)


def create_input_text_3(examples):
    """Build the same prompts wrapped in ``[INST] <<SYS>> ... [/INST]`` markers.

    Args:
        examples: batch mapping with ``question``, ``option``, ``context`` and
            ``label`` lists.

    Returns:
        dict with aligned ``label`` and ``text`` lists; rows with a missing
        field are skipped.
    """
    # NOTE(review): the system section is closed with `<<SYS>>` rather than
    # `<</SYS>>`. It is left untouched because changing the prompt changes the
    # model input — confirm which form the checkpoints were trained with.
    return _build_prompt_examples(
        examples,
        "[INST] <<SYS>> " + _INSTRUCTION + " <<SYS>>",
        suffix=' [/INST]',
    )

In [None]:
# Replace the raw columns with the prompt text (plus label) for every split.
# `os.cpu_count()` may return None when the CPU count cannot be determined, so
# fall back to a single core instead of failing on `None * 2`.
# NOTE(review): this cell overwrites `dataset` and removes its original columns,
# so running it a second time fails — reload the dataset before re-running.
dataset = dataset.map(
    create_input_text_2,
    batched=True,
    num_proc=(os.cpu_count() or 1) * 2,
    remove_columns=dataset['train'].column_names,
)
# Show one processed validation example as a sanity check.
dataset['validate'][0]

In [None]:
def preprocess_function(examples, max_length=2048):
    """Tokenize the ``text`` column of a batch.

    The tokenizer is called directly, which is the recommended replacement for
    the deprecated ``batch_encode_plus`` and behaves the same for a list of texts.

    Args:
        examples: batch mapping whose ``"text"`` entry is a list of prompt strings.
        max_length: truncation limit in tokens. Defaults to 2048, the value
            that was previously hard-coded.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # NOTE(review): CONFIG declares max_length=256 and ctx_length is 1024 while
    # the default here is 2048 — confirm which limit is intended.
    return tokenizer(examples["text"], truncation=True, add_special_tokens=True, max_length=max_length)

In [None]:
# Tokenize every split; the result is stored under a new name so `dataset`
# (the prompt texts) stays available.
# `os.cpu_count()` may return None when the CPU count cannot be determined, so
# fall back to a single core instead of failing on `None * 2`.
data = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=(os.cpu_count() or 1) * 2,
)

## Evaluate

In [None]:
import evaluate
import numpy as np

# Load the "accuracy" metric from the `evaluate` library; compute_metrics below uses it.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.asarray(scores).argmax(axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)


## Train

In [None]:
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
import transformers
from transformers import DataCollatorWithPadding, Trainer
from torch import nn

# Alternative trainer with a class-weighted loss, kept for reference:
# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss (suppose one has 2 labels with different weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.6, 0.4], device=model.device))
#         loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss

# Wire the model, the tokenized splits, the padding collator and the accuracy
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=data["train"],
    eval_dataset=data["validate"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training runs inside a CUDA autocast context.
# NOTE(review): fp16 is already requested through the training arguments;
# presumably the explicit autocast is a workaround — confirm it is still needed.
with torch.autocast(device_type="cuda"):
    trainer.train()

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
import gc

# Run the garbage collector first: tensors owned by unreachable objects are
# only freed once those objects are collected, and `empty_cache()` can only
# release memory that is no longer occupied.
n_unreachable = gc.collect()
torch.cuda.empty_cache()
# Show how many unreachable objects the collector found (the cell's output).
n_unreachable