# Load kaggle

In [None]:
# JAX is multi-threaded by default, and os.fork() is not compatible with
# multi-threaded code (it can deadlock), so processes are started with "spawn".

import multiprocessing as mp

# force=True keeps this cell re-runnable: without it a second call raises
# RuntimeError because the start method has already been fixed.
mp.set_start_method('spawn', force=True)

In [None]:
# Print the current working directory (IPython shell escape).
! pwd

In [None]:
import pandas as pd
import numpy as np
# Load the competition training CSV from the Kaggle input directory and preview it.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.head()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# import datasets
# from datasets import load_dataset, load_metric, Dataset

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# from accelerate import notebook_launcher, Accelerator, PartialState
# from accelerate.utils import write_basic_config
# from accelerate.inference import prepare_pippy

import transformers
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
    set_seed,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    AutoConfig
)

import os
import shutil
import math
import json
from tqdm import tqdm
import gc
import multiprocessing as mp

In [None]:
# params
# model_name = "/kaggle/input/phi/transformers/1/1"
model_name = "/kaggle/input/flan-t5/pytorch/small/2"
model_path = "model_checkpoint.pth"
seed = 42
# lora_r, quantize_bit and n_sample were commented out although later cells
# read them (default LoRA rank, default bit width of quantize_tensor, and the
# fraction of rows to sample), which raised NameError. They are restored here
# with the values that were left in the comments.
lora_r = 2
quantize_bit = 16
learning_rate = 5e-4
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.999
eps = 1e-9
l1_rate = 1e-10
batch_size = 48
test_batch_size = 1
max_len = 64
test_max_len = 64
n_sample = 0.5
n_split = 2
n_epoch = 2
n_split_back = 2
n_epoch_back = 1
device = "cuda"

In [None]:
# Load the tokenizer stored alongside the checkpoint at `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Preprocessing for Classifier

In [None]:
# Read the training CSV (same path as the earlier load cell) and preview it.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.head()

In [None]:
# Take an explicit copy: the following cells assign into clf_train with .loc,
# and writing into a column slice of `train` triggers pandas'
# SettingWithCopy warning (and may not modify the frame that is used later).
clf_train = train[['prompt','response_a','response_b','winner_model_a','winner_model_b','winner_tie']].copy()

In [None]:
# Each text column holds a JSON-encoded list; keep only its first element.
for text_column in ("prompt", "response_a", "response_b"):
    clf_train.loc[:, text_column] = clf_train[text_column].apply(lambda raw: json.loads(raw)[0])

In [None]:
# Remove rows with any missing value, then renumber the index from 0.
clf_train = clf_train.dropna().reset_index(drop=True)

In [None]:
# clf_train['new_text'] = [ "### prompt: "+clf_train['prompt'][x]+" ### response_a: "+clf_train['response_a'][x]+" ### response_b: "+clf_train['response_b'][x] for x in range(len(clf_train)) ]

In [None]:
# Build the per-row [winner_model_a, winner_model_b, winner_tie] list in one
# vectorised, positional step instead of indexing every Series row by label
# inside a range(len(...)) loop.
clf_train['target'] = clf_train[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy().tolist()

In [None]:
# Keep only the three text columns and the combined target list.
clf_train = clf_train[['prompt','response_a','response_b','target']]

In [None]:
# Preview the preprocessed frame.
clf_train.head()

In [None]:
def cl(x):
    """Map a one-hot winner list to a class id.

    Returns 0 for [1, 0, 0], 1 for [0, 1, 0] and 2 for anything else.
    """
    for class_id, pattern in enumerate(([1, 0, 0], [0, 1, 0])):
        if x == pattern:
            return class_id
    return 2

# Convert every one-hot target list to its integer class id.
clf_train['labels'] = clf_train['target'].apply(cl)

In [None]:
# Character length of the prompt and of each response.
for length_column, text_column in (("p_len", "prompt"), ("a_len", "response_a"), ("b_len", "response_b")):
    clf_train[length_column] = clf_train[text_column].apply(len)

In [None]:
# Total character length of prompt + both responses; used below as the sampling weight.
clf_train['len'] = clf_train['p_len'] + clf_train['a_len']+ clf_train['b_len']

In [None]:
# Draw int(len(clf_train) * n_sample) rows, weighting each row by its `len`
# column (longer rows are more likely to be picked), then renumber the index.
sample_df = clf_train.sample(int(len(clf_train)*n_sample), weights = "len", random_state=seed).reset_index(drop=True)

In [None]:
# Display the sampled frame.
sample_df

In [None]:
# 80/20 train/validation split, stratified on the class label.
# random_state uses the shared `seed` (currently 42) instead of a second
# hard-coded 42, so changing the seed changes the split as well.
t_dat, v_dat = train_test_split(sample_df, test_size=0.2, random_state=seed, stratify=sample_df['labels'])

# Renumber both parts from 0.
t_dat = t_dat.reset_index(drop=True)
v_dat = v_dat.reset_index(drop=True)

In [None]:
# The list-valued `target` column is no longer needed once `labels` exists.
t_dat = t_dat.drop(columns='target')
v_dat = v_dat.drop(columns='target')

In [None]:
# Preview the training split.
t_dat.head()

In [None]:
# List the distinct class ids present in the training split.
np.unique(t_dat['labels'])

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes prompt / response_a / response_b rows.

    Each of the three texts is prefixed with a tag ("##prompt: ",
    "##response_a: ", "##response_b: "), tokenized and truncated to at most
    ``max_len`` tokens; the three token sequences are then concatenated.

    Args:
        df: frame with 'prompt', 'response_a' and 'response_b' columns and an
            optional 'labels' column.
        tokenizer: callable used to tokenize the text; it is called with
            ``return_tensors='pt'`` and must return 'input_ids' and
            'attention_mask' tensors with a leading batch axis of size 1.
        max_len: maximum number of tokens kept for each of the three parts.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.prompt = df['prompt']
        self.response_a = df['response_a']
        self.response_b = df['response_b']
        self.max_len = max_len
        self.targets = df.get('labels', None)

    def __len__(self):
        return len(self.prompt)

    def _encode(self, prefix, text):
        """Tokenize ``prefix + text`` and truncate it to ``self.max_len`` tokens.

        A single truncating call replaces the previous two-pass scheme
        (tokenize to measure, then tokenize again padded to
        min(max_len, length)), which produced the same tokens without padding.
        """
        return self.tokenizer(
            prefix + text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        # .iloc makes the lookup positional, so the dataset also works when
        # the frame's index is not 0..n-1 (e.g. after filtering or splitting).
        parts = [
            self._encode("##prompt: ", str(self.prompt.iloc[index])),
            self._encode("##response_a: ", str(self.response_a.iloc[index])),
            self._encode("##response_b: ", str(self.response_b.iloc[index])),
        ]

        # Concatenate along the token axis, then drop the batch axis.
        input_ids = torch.cat([part['input_ids'] for part in parts], dim=1).flatten()
        attention_mask = torch.cat([part['attention_mask'] for part in parts], dim=1).flatten()

        item = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if self.targets is not None:
            item['labels'] = torch.LongTensor([self.targets.iloc[index]])
        return item

In [None]:
def custom_collate_fn(batch, tokenizer):
    """Pad a list of dataset items to a common length and stack them.

    Args:
        batch: non-empty list of dicts with 1-D 'input_ids' and
            'attention_mask' tensors and, optionally, a 'labels' tensor.
        tokenizer: object whose ``pad_token_id`` is used to pad 'input_ids'.

    Returns:
        Dict with right-padded 'input_ids' and 'attention_mask' (mask padded
        with 0) and, when the first item carries labels, the concatenated
        'labels'.
    """
    has_labels = 'labels' in batch[0]

    id_sequences = [sample['input_ids'] for sample in batch]
    mask_sequences = [sample['attention_mask'] for sample in batch]

    # pad_sequence right-pads every sequence to the longest one in the batch.
    collated = {
        'input_ids': pad_sequence(id_sequences, batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad_sequence(mask_sequences, batch_first=True, padding_value=0),
    }

    if has_labels:
        collated['labels'] = torch.cat([sample['labels'] for sample in batch], dim=0)

    return collated

In [None]:
def create_dataloaders(df,tokenizer,max_len, batch_size, shuffle = True):
    """Wrap `df` in a CustomDataset and return a DataLoader over it.

    Batches are assembled with custom_collate_fn, which pads every item to the
    longest sequence in the batch using the tokenizer's pad token id.
    """
    dataset = CustomDataset(df, tokenizer, max_len)

    def collate(samples):
        return custom_collate_fn(samples, tokenizer)

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)

In [None]:
# tokenizer.decode([2])

# Training Classifier Model

Constraints: the notebook runs offline, so additional libraries cannot be installed, and training time is limited.

- Model: Phi
- Quantization : FP32bit
 - Inapplicable
- Adapter: add a LoRA adapter (r = 2) to the query/value projections
 - Create an attention layer that adds the adapter to the attention value projection, and swap it in for the existing model's attention layer
 - Maintain existing weights
- Add a classifier

In [None]:
# # # 모델 구성 가져오기
# # config = AutoConfig.from_pretrained(model_name)

# # # 활성화 함수 설정 (예: gelu)
# # config.hidden_activation = "gelu"
# # config.use_cache = False

# base_model = AutoModel.from_pretrained(model_name) #   ,  config=config)

In [None]:
# base_model

In [None]:
def quantize_tensor(tensor, num_bits=16):
    """Simulate uniform affine quantization of `tensor` to `num_bits` bits.

    The values are mapped onto the integer grid [0, 2**num_bits - 1], rounded,
    clamped and mapped back, so the result has the same shape and dtype as the
    input but only takes values on that grid.

    Args:
        tensor: tensor to quantize.
        num_bits: bit width of the grid. The default is a literal 16 (the value
            of the notebook's `quantize_bit` setting) so that defining this
            function does not depend on that global being defined.

    Returns:
        A new tensor holding the quantize-dequantized values. Empty tensors
        and tensors whose values are all equal are returned as an unchanged
        copy.
    """
    # min()/max() are undefined for an empty tensor.
    if tensor.numel() == 0:
        return tensor.clone()

    qmin = 0.
    qmax = 2.**num_bits - 1.

    min_val, max_val = tensor.min(), tensor.max()
    scale = (max_val - min_val) / (qmax - qmin)

    # A constant tensor gives scale == 0; dividing by it would fill the
    # result with NaN, so such tensors are passed through as they are.
    if scale == 0:
        return tensor.clone()

    zero_point = qmin - min_val / scale

    quantized_tensor = torch.round(tensor / scale + zero_point)
    quantized_tensor = torch.clamp(quantized_tensor, qmin, qmax)
    quantized_tensor = (quantized_tensor - zero_point) * scale

    return quantized_tensor

def quantize_model(model, num_bits=8):
    """Quantize, in place, the weights and biases of every Linear/Conv2d layer.

    Walks all sub-modules of `model`, replaces the weight (and the bias, when
    present) of each nn.Linear and nn.Conv2d with its quantize_tensor result,
    and returns the same model object.
    """
    for _, layer in model.named_modules():
        if not isinstance(layer, (nn.Linear, nn.Conv2d)):
            continue
        layer.weight.data = quantize_tensor(layer.weight.data, num_bits)
        if layer.bias is not None:
            layer.bias.data = quantize_tensor(layer.bias.data, num_bits)

    return model


# import torch.quantization

# def quantize_model_dynamic(model):
#     model.qconfig = torch.quantization.default_dynamic_qconfig
#     torch.quantization.prepare(model, inplace=True)
#     torch.quantization.convert(model, inplace=True)
#     return model

In [None]:
class LoRA(nn.Module):
    """Low-rank adapter: ``alpha * B(A(x))`` with A: in->rank and B: rank->out.

    Args:
        in_features: size of the input's last dimension.
        out_features: size of the output's last dimension.
        rank: inner dimension of the low-rank factorisation. The default is a
            literal 2 (the value of the notebook's `lora_r` setting) so that
            defining this class does not depend on that global being defined.
        alpha: constant multiplier applied to the adapter output.
    """

    def __init__(self, in_features, out_features, rank=2, alpha=1.0):
        super().__init__()
        self.alpha = alpha
        self.rank = rank
        self.lora_a = nn.Linear(in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_features, bias=False)
        # Zero-initialise B so the adapter contributes nothing at the start
        # and the wrapped layer initially reproduces its pretrained output.
        nn.init.zeros_(self.lora_b.weight)

    def forward(self, x):
        return self.alpha * self.lora_b(self.lora_a(x))

In [None]:
class PhiRotaryEmbedding(nn.Module):
    """Rotary position embedding that caches cos/sin tables per position.

    Args:
        dim: number of rotary channels; the inverse frequencies are built from
            the even indices 0, 2, ..., so each table row has `dim` entries
            when `dim` is even.
        max_position_embeddings: number of positions pre-computed at
            construction time.
        base: base of the geometric progression of inverse frequencies.
        device: device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions 0..seq_len-1."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the (cos, sin) tables for the first `seq_len` positions.

        Args:
            x: [bs, num_attention_heads, seq_len, head_size]; supplies the
                output dtype and, when the cache must grow, the device.
            seq_len: number of positions wanted. When omitted it is taken from
                ``x.shape[-2]``; previously the advertised default of None
                failed on the ``None > int`` comparison.
        """
        if seq_len is None:
            seq_len = x.shape[-2]

        # Grow the cache when more positions are requested than are stored.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
    """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs: it builds the cache
        # through _set_cos_sin_cache, which reads scaling_factor.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Build the cos/sin tables with positions divided by scaling_factor."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=torch.int64).type_as(self.inv_freq)
        scaled_positions = positions / self.scaling_factor

        angles = torch.outer(scaled_positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)


# Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->Phi
class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
    """PhiRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs: it builds the cache
        # through _set_cos_sin_cache, which reads scaling_factor.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Build the cos/sin tables, enlarging the base for long sequences."""
        self.max_seq_len_cached = seq_len

        # Beyond the configured maximum, recompute the inverse frequencies
        # from a base that grows with the requested length.
        if seq_len > self.max_position_embeddings:
            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            scaled_base = self.base * growth ** (self.dim / (self.dim - 2))
            exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
            self.register_buffer("inv_freq", 1.0 / (scaled_base ** exponents), persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.int64).type_as(self.inv_freq)

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)



def rotate_half(x):
    """Split the last axis in two halves (a, b) and return concat(-b, a)."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head `n_rep` times along the head axis.

    Takes a (batch, num_key_value_heads, seq_len, head_dim) tensor and returns
    (batch, num_key_value_heads * n_rep, seq_len, head_dim); with n_rep == 1
    the input is returned as is.
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
        return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
    return hidden_states

In [None]:
class PhiAttention(nn.Module):
    """Multi-head attention with partial rotary embeddings and a LoRA adapter on the value projection.

    The output of `lora_v` is added to the output of `v_proj`; the query and
    key projections are used without an adapter.

    Args:
        config: model configuration; the attributes read here are
            attention_dropout, hidden_size, num_attention_heads,
            num_key_value_heads, max_position_embeddings, rope_theta,
            partial_rotary_factor, qk_layernorm, layer_norm_eps and
            rope_scaling.
        layer_idx: index of the layer this module belongs to; needed when a
            key/value cache is passed to forward().
        rank: rank of the LoRA adapter. None (the default) uses LoRA's own
            default rank. The default used to be the global `lora_r`, which
            was evaluated when the class was defined and failed when that
            global did not exist.
    """

    def __init__(self, config,layer_idx, rank=None):
        super(PhiAttention, self).__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.partial_rotary_factor = config.partial_rotary_factor

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)


        # LoRA adapters for query and value
        # self.lora_q = LoRA(config.hidden_size, self.num_heads * self.head_dim, rank)
        value_dim = self.num_key_value_heads * self.head_dim
        if rank is None:
            # Defer to LoRA's default rank.
            self.lora_v = LoRA(self.hidden_size, value_dim)
        else:
            self.lora_v = LoRA(self.hidden_size, value_dim, rank)


        self.qk_layernorm = config.qk_layernorm
        if self.qk_layernorm:
            self.q_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
            self.k_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )

        self._init_rope()

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling`.

        Raises:
            ValueError: if the scaling type is neither "linear" nor "dynamic".
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = PhiRotaryEmbedding(
                int(self.partial_rotary_factor * self.head_dim),
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = PhiLinearScalingRotaryEmbedding(
                    int(self.partial_rotary_factor * self.head_dim),
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding(
                    int(self.partial_rotary_factor * self.head_dim),
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def forward(
        self,
        hidden_states,
        attention_mask = None,
        position_ids = None,
        past_key_value = None,
        output_attentions: bool = False,
        use_cache: bool = False
    ):
        """Compute self-attention over `hidden_states`.

        Args:
            hidden_states: tensor of shape (batch, seq_len, hidden_size).
            attention_mask: optional additive mask of shape
                (batch, 1, seq_len, kv_seq_len).
            position_ids: indices used to select rows of the cos/sin tables.
            past_key_value: optional cache object; its get_usable_length()
                and update() methods are used.
            output_attentions: when False the returned weights are None.
            use_cache: accepted for interface compatibility; not read here.

        Returns:
            Tuple (attn_output, attn_weights or None, past_key_value).

        Raises:
            ValueError: if a cache is given but layer_idx is None, or if the
                attention weights, mask or output have an unexpected size.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states) #+ self.lora_q(hidden_states)
        key_states = self.k_proj(hidden_states)
        # The LoRA update is added to the frozen value projection.
        value_states = self.v_proj(hidden_states) + self.lora_v(hidden_states)

        if self.qk_layernorm:
            query_states = self.q_layernorm(query_states)
            key_states = self.k_layernorm(key_states)

        # Split the channel axis into heads: (batch, heads, seq_len, head_dim).
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        # Partial rotary embedding
        query_rot, query_pass = (
            query_states[..., : self.rotary_emb.dim],
            query_states[..., self.rotary_emb.dim :],
        )
        key_rot, key_pass = (
            key_states[..., : self.rotary_emb.dim],
            key_states[..., self.rotary_emb.dim :],
        )
        # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]

        # NOTE(review): with position_ids=None the indexing below adds a
        # leading axis instead of selecting positions, which only lines up
        # when q_len == kv_seq_len -- confirm callers always pass position_ids.
        cos = cos[position_ids].unsqueeze(1)
        sin = sin[position_ids].unsqueeze(1)
        query_rot = (query_rot * cos) + (rotate_half(query_rot) * sin)
        key_rot = (key_rot * cos) + (rotate_half(key_rot) * sin)


        # [batch_size, seq_length, num_heads, head_dim]
        query_states = torch.cat((query_rot, query_pass), dim=-1)
        key_states = torch.cat((key_rot, key_pass), dim=-1)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand the key/value heads so they match the number of query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow
        attn_weights = torch.matmul(
            query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Merge the heads back into (batch, seq_len, hidden_size).
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.dense(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

In [None]:
def replace_attention_module(config,layer,layer_idx):
    """Swap `layer.self_attn` for a LoRA-enabled PhiAttention with the same parameters.

    The replacement happens only when the layer has a `self_attn` attribute
    and ``layer_idx // 2 == 0``; otherwise the layer is left untouched.

    Args:
        config: configuration passed on to PhiAttention.
        layer: module whose `self_attn` is replaced in place.
        layer_idx: index of the layer inside the model.
    """
    # NOTE(review): `layer_idx // 2 == 0` is true only for layers 0 and 1;
    # confirm this is intended rather than `layer_idx % 2 == 0`.
    if not hasattr(layer, 'self_attn') or layer_idx//2 != 0:
        return

    old_attention = layer.self_attn
    new_attention = PhiAttention(config,layer_idx)

    with torch.no_grad():
        # Copy the projection weights AND biases. Only the weights used to be
        # copied, which left the freshly created (randomly initialised) biases
        # of the new module in place.
        for proj_name in ('q_proj', 'k_proj', 'v_proj', 'dense'):
            source = getattr(old_attention, proj_name)
            target = getattr(new_attention, proj_name)
            target.weight.copy_(source.weight)
            if source.bias is not None:
                target.bias.copy_(source.bias)
            else:
                # No bias in the source layer: zero the new bias so that the
                # projection computes the same function.
                target.bias.zero_()

        # Carry over the query/key layer norms when both modules have them.
        for norm_name in ('q_layernorm', 'k_layernorm'):
            if hasattr(old_attention, norm_name) and hasattr(new_attention, norm_name):
                getattr(new_attention, norm_name).load_state_dict(
                    getattr(old_attention, norm_name).state_dict()
                )

    # Replace the layer's existing self_attn module with the new one.
    layer.self_attn = new_attention

In [None]:
loss_fn = nn.CrossEntropyLoss()

class LoraModelForClassification(nn.Module):
    """Three-way classifier on top of a backbone's last hidden state.

    The backbone output is averaged over the sequence, passed through dropout
    and projected to 3 logits by a linear layer.

    Args:
        lora_model: backbone; it must expose ``config.hidden_size`` and, when
            called as ``lora_model(input_ids, attention_mask=...)``, return an
            object with a ``last_hidden_state`` of shape
            (batch, seq_len, hidden_size).
    """

    def __init__(self, lora_model):
        super(LoraModelForClassification, self).__init__()
        self.config = lora_model.config
        self.peft_model = lora_model
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.config.hidden_size, 3)
#         self.classifier.weight.data = self.classifier.weight.data.to(torch.float16)
#         self.classifier.bias.data = self.classifier.bias.data.to(torch.float16)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``; `loss` is None when `labels` is None.

        Args:
            input_ids: token ids, passed to the backbone unchanged.
            attention_mask: (batch, seq_len) mask with 1 for real tokens and 0
                for padding; may be None.
            labels: optional class ids of shape (batch,).
        """
        outputs = self.peft_model(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        if attention_mask is None:
            pooled_output = hidden.mean(dim=1)
        else:
            # Average over real tokens only. A plain mean over the sequence
            # also counts padding positions, so the pooled vector would depend
            # on how much padding the batch happens to contain.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_count = mask.sum(dim=1).clamp(min=1)  # guard all-padding rows
            pooled_output = (hidden * mask).sum(dim=1) / token_count

        logits = self.classifier(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            loss = loss_fn(logits, labels)
        return loss, logits

## Parallel Training

Hugging Face Accelerate 학습 코드 이용

In [None]:
# from torch.cuda.amp import GradScaler, autocast

# AMP를 위한 GradScaler 설정 float32+float16 연산
# scaler = GradScaler()

In [None]:
def parallel_function(model_name):
    """Train and evaluate the LoRA classifier under Hugging Face Accelerate.

    Loads the backbone at `model_name` in fp16, quantizes it, swaps in the
    LoRA attention modules, freezes everything except the classifier head and
    the LoRA weights, trains for `n_epoch` epochs, prints accuracy and macro
    F1 after each epoch, and finally saves the state dict to `model_path` from
    the main process.

    NOTE(review): `Accelerator` and `datasets` are used here, but their
    imports are commented out in the import cell -- re-enable them before
    running this function.

    Args:
        model_name: path or identifier of the pretrained backbone.
    """
    mp.set_start_method('spawn', force=True)

    accelerator = Accelerator(mixed_precision="fp16")

    # Verbose library logging on the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    set_seed(seed)

    model = AutoModel.from_pretrained(model_name,torch_dtype=torch.float16) #   ,  config=config)
    model = quantize_model(model)
    for idx, layer in enumerate(model.layers):
        replace_attention_module(model.config,layer,idx)
    model = LoraModelForClassification(model)

    # Freeze the backbone; train only the classifier head ...
    for param in model.peft_model.parameters():
        param.requires_grad = False
    for param in model.classifier.parameters():
        param.requires_grad = True

    # ... and the LoRA matrices of the replaced attention modules.
    for name, module in model.named_modules():
        if isinstance(module, PhiAttention):
    #         module.lora_q.lora_a.weight.requires_grad = True
    #         module.lora_q.lora_b.weight.requires_grad = True
            module.lora_v.lora_a.weight.requires_grad = True
            module.lora_v.lora_b.weight.requires_grad = True

    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad == True)
    print(f"Total trainable parameters: {total_params}")

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    train_dataloader = create_dataloaders(t_dat,tokenizer,max_len,batch_size=batch_size, shuffle = True)

    # Evaluation does not need shuffling.
    eval_dataloader = create_dataloaders(v_dat,tokenizer,max_len,batch_size=batch_size, shuffle = False)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # `n_epoch` is the epoch count from the params cell; the previously used
    # name `num_epochs` is not defined anywhere.
    total_steps = len(train_dataloader) * n_epoch

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_steps,
    )

    progress_bar = tqdm(range(total_steps), disable=not accelerator.is_main_process)

    for epoch in range(n_epoch):
        model.train()
        for step, batch in enumerate(train_dataloader):
            loss, _ = model(**batch)
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        all_predictions = []
        all_labels = []

        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                _, logits = model(**batch)
            predictions = logits.argmax(dim=-1)
            # gather_for_metrics collects the results from all processes and
            # drops the samples duplicated to fill the last batch. The earlier
            # code sliced to len(eval_dataloader), which is the number of
            # BATCHES, so the metrics covered only a handful of samples.
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            all_predictions.append(predictions)
            all_labels.append(references)

        all_predictions = torch.cat(all_predictions).cpu()
        all_labels = torch.cat(all_labels).cpu()

        acc_metric = accuracy_score(all_labels, all_predictions)
        eval_metric = f1_score(all_labels, all_predictions, average="macro")

        accelerator.print(f"epoch: {epoch} \n accuracy: {acc_metric:.3f} \n f1 score: {eval_metric:.3f}")


    model = accelerator.unwrap_model(model)
    accelerator.wait_for_everyone()

    # Save the model's state dict (main process only).
    if accelerator.is_main_process:
        torch.save(model.state_dict(), model_path)

    # Wait until every process has reached this point.
    accelerator.wait_for_everyone()

In [None]:
# Run parallel_function on 2 processes from inside the notebook.
# NOTE(review): the accelerate import that provides notebook_launcher is
# commented out in the import cell -- confirm it is in scope before running.
notebook_launcher(parallel_function, args=(model_name,), num_processes=2)

In [None]:
# config_path = '/root/.cache/huggingface/accelerate/default_config.yaml'
# if os.path.exists(config_path):
#     os.remove(config_path)
#     print(f"{config_path} has been removed.")

# 새로운 설정 파일 작성
# write_basic_config()

In [None]:
# !cat /root/.cache/huggingface/accelerate/default_config.yaml