In [1]:
# !pip install transformers peft accelerate bitsandbytes \
#     -U --no-index --find-links /root/autodl-tmp/lmsys-wheel-files2

In [2]:
# accelarate proxy 
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

### cancel accelaration: unset http_proxy && unset https_proxy

In [3]:
import os
os.environ['HUGGINGFACE_HUB_CACHE'] = '/root/autodl-tmp/huggingface_cache'  # or any path with enough space

In [4]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import polars as pl
import pandas as pd
from tqdm import tqdm
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch

print("Available GPUs:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
    print(f"Memory cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")

Available GPUs: 4
GPU 0: Tesla V100-PCIE-32GB
Memory allocated: 0.00 GB
Memory cached: 0.00 GB
GPU 1: Tesla V100-PCIE-32GB
Memory allocated: 0.00 GB
Memory cached: 0.00 GB
GPU 2: Tesla V100-PCIE-32GB
Memory allocated: 0.00 GB
Memory cached: 0.00 GB
GPU 3: Tesla V100-PCIE-32GB
Memory allocated: 0.00 GB
Memory cached: 0.00 GB


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

VER=4

# FINAL SOLUTION IS USE_QLORA=FALSE, TRAIN_100_PERCENT=TRUE, ADD_33K=TRUE, DEBUG=FALSE
USE_QLORA = True
TRAIN_100_PERCENT = False          # whether include valid data in training phase
ADD_33K = False                    # whether include additional data
DEBUG = True

In [12]:
@dataclass
class Config:
    output_dir: str = f"output-{VER}"
    checkpoint: str = "unsloth/gemma-2-27b-bnb-4bit"
    # unsloth/gemma-2-9b-it-bnb-4bit
    # for training under kaggle
    max_length: int = 2048
    n_splits: int = 5
    fold_idx: int = 0
    optim_type: str = "adamw_8bit"
    # model parallelism
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4  # global batch size is 8 
    per_device_eval_batch_size: int = 4
    n_epochs: int = 1
    # to be determined
    freeze_layers: int = 0  # there're 42 layers in total, 16 head layers can be freeze
    lr: float = 2e-4
    warmup_steps: int = 20
    # to be tested
    lora_r: int = 16
    # to be tested
    lora_alpha: float = 2
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    
config = Config()

In [13]:
training_args = TrainingArguments(
    output_dir=f"output-{VER}",
    overwrite_output_dir=True,
    report_to="none",
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="no",               # doesn't save any checkpoints
    # save_steps=200,
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
    metric_for_best_model='log_loss',
    greater_is_better=False
)

In [14]:
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj", "down_proj", "up_proj", "o_proj", "gate_proj"],
    layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.SEQ_CLS,
    # only if multi-heads are used
    modules_to_save=["score"]
)

In [15]:
qlora = {}
if USE_QLORA:
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4", #nf4 or fp4
        bnb_4bit_use_double_quant = False,
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_skip_modules = ["score"]
    )
    qlora['quantization_config'] = bnb_config
    print("Using QLoRA")

Using QLoRA


In [16]:
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"

In [17]:
# for predicting label 0,1 (model_1, model_2) only
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=2,                 # no tie in this comp
    torch_dtype=torch.float16,
    device_map="auto",
    # **qlora
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Downloading shards: 100%|██████████| 2/2 [20:16<00:00, 608.03s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.34s/it]
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-27b-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 4608, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4608, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4608, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDic

In [18]:
model.print_trainable_parameters()

trainable params: 104,260,608 || all params: 27,331,398,144 || trainable%: 0.3815


In [19]:
# test_df = pl.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet').to_pandas()
train_df = pl.read_parquet('/root/autodl-tmp/wsdm-cup-multilingual-chatbot-arena/train.parquet').to_pandas()

In [20]:
ds = Dataset.from_pandas(train_df[:10000])

In [21]:
import unicodedata

class CustomTokenizer:
    def __init__(
        self, 
        tokenizer: PreTrainedTokenizerBase, 
        max_length: int,
        normalize_unicode: bool = True
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.normalize_unicode = normalize_unicode
        
    def __call__(self, batch: dict) -> dict:
        prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
        response_a = ["\n\n<response_a>: " + self.process_text(t) for t in batch["response_a"]]
        response_b = ["\n\n<response_b>: " + self.process_text(t) for t in batch["response_b"]]
        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        labels=[]
        # as there's no tie in this comp, adjust to binary classification problem
        for win in batch["winner"]:
            if win == "model_a":
                label = 0
            else:
                label = 1
            labels.append(label)
        return {**tokenized, "labels": labels}
        
    # @staticmethod
    # def process_text(text: str) -> str:
    #     return " ".join(eval(text, {"null": ""}))
    
    def process_text(self, text: str) -> str:
        """
        Process text while preserving multilingual characters and formatting.
        
        Args:
            text (str): Input text in any language
            
        Returns:
            str: Processed text with proper handling of multilingual content
        """
        if not isinstance(text, str):
            text = str(text)
            
        # Clean the text while preserving multilingual characters
        text = text.strip()
        
        # Normalize Unicode if enabled (helps with compatibility)
        if self.normalize_unicode:
            text = unicodedata.normalize('NFKC', text)
            
        # Remove zero-width characters and other invisible unicode
        text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')
        
        return text

In [22]:
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True, num_proc=8)

Map (num_proc=8): 100%|██████████| 10000/10000 [00:18<00:00, 553.93 examples/s]


In [23]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(preds).float().softmax(-1).numpy()
    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
    return {"acc": acc, "log_loss": loss}

In [24]:
folds = [
    (
        [i for i in range(len(ds)) if i % config.n_splits != fold_idx],
        [i for i in range(len(ds)) if i % config.n_splits == fold_idx]
    ) 
    for fold_idx in range(config.n_splits)
]

In [25]:
train_idx, eval_idx = folds[config.fold_idx]

trainer = Trainer(
    args=training_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    # data_collator=data_collator,
)
trainer.train()

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [26]:
trainer.save_model(f"LoRA-v{VER}")