In [3]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [35]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler

from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# 주요 파라미터

class config():
    def __init__(self):
        self.seed = 600  # 시드 값 설정
        self.max_len = 2048  # 입력 시퀀스의 최대 길이
        self.epochs = 3  # 학습 에폭 수
        self.learning_rate = 2e-4  # 학습률 설정
        self.batch_size = 4  # 배치 크기

        self.lora_r = 8  # LoRA 모델 파라미터: r 값
        self.lora_alpha = 32  # LoRA 모델 파라미터: alpha 값
        self.target_module = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj",
                              "gate_proj", "v_proj"]  # 타겟 모듈 리스트
        self.lora_dropout = 0.05  # LoRA 모델 파라미터: 드롭아웃 비율
        self.lora_tasktype = "CAUSAL_LM"  # LoRA 모델 파라미터: 태스크 유형
        self.lora_bias = 'none'  # LoRA 모델 파라미터: 편향 설정
        self.optimizer = "paged_adamw_8bit"  # 옵티마이저 종류
        self.scheduler = "cosine"  # 스케줄러 종류

# config 클래스의 인스턴스 생성
cfg = config()


## Model Pre

In [3]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=cfg.lora_r,  # LoRA 모델의 r 값
    lora_alpha=cfg.lora_alpha,  # LoRA 모델의 alpha 값
    target_modules=cfg.target_module,  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=cfg.lora_dropout,  # LoRA 모델의 드롭아웃 비율
    bias=cfg.lora_bias,  # LoRA 모델의 편향 설정
    task_type=cfg.lora_tasktype  # LoRA 모델의 태스크 유형
)


In [4]:
model_ID = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
	model_ID,
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
)

model.gradient_checkpointing_enable()

# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(cfg.model_path, trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.04s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# peft 라이브러리에서 k 비트 학습 준비 함수 임포트
from peft import prepare_model_for_kbit_training

# 모델에서 그래디언트 체크포인팅 활성화 (메모리 효율 향상)
model.gradient_checkpointing_enable()

# k 비트 학습을 위해 모델 준비 - prepare_model_for_kbit_training 함수 사용
model = prepare_model_for_kbit_training(model)

# PEFT 적용 
model = get_peft_model(model, peft_config)

# 모델을 학습 장치 (GPU 등)로 이동
model = model.to(device)

# 훈련 가능한 파라미터 출력 
model.print_trainable_parameters()

# 모델 출력
model


Using device: cuda
trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): PeftModelForCausalLM(
          (base_model): LoraModel(
            (model): Phi3ForCausalLM(
              (model): Phi3Model(
                (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
                (embed_dropout): Dropout(p=0.0, inplace=False)
                (layers): ModuleList(
                  (0-31): 32 x Phi3DecoderLayer(
                    (self_attn): Phi3Attention(
                      (o_proj): lora.Linear(
                        (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                        (lora_dropout): ModuleDict(
                          (default): Dropout(p=0.05, inplace=False)
                        )
                        (lora_A): ModuleDict(
                          (default): Linear(in_features=3072, out_features=8, bias=False)
                        )
                    

In [8]:
def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt

In [11]:
from datasets import load_dataset
dataset = load_dataset('qiaojin/PubMedQA', 'pqa_artificial')

Downloading data: 100%|██████████| 233M/233M [00:12<00:00, 18.6MB/s] 
Generating train split: 100%|██████████| 211269/211269 [00:01<00:00, 116801.64 examples/s]


In [33]:
q = dataset['train']['question'].to_frame()
c = dataset['train']['context'].to_frame()
label = dataset['train']['final_decision'].to_frame()
df_all = pd.concat([q,c,label], axis=1)


In [34]:
df_all

Unnamed: 0,question,context,final_decision
0,Are group 2 innate lymphoid cells ( ILC2s ) in...,{'contexts': ['Chronic rhinosinusitis (CRS) is...,yes
1,Does vagus nerve contribute to the development...,{'contexts': ['Phosphatidylethanolamine N-meth...,yes
2,Does psammaplin A induce Sirtuin 1-dependent a...,{'contexts': ['Psammaplin A (PsA) is a natural...,yes
3,Is methylation of the FGFR2 gene associated wi...,{'contexts': ['This study examined links betwe...,yes
4,Do tumor-infiltrating immune cell profiles and...,{'contexts': ['Tumor microenvironment immunity...,yes
...,...,...,...
211264,Is urine production rate related to behavioura...,{'contexts': ['To investigate the relation bet...,yes
211265,Does evaluation of the use of general practice...,{'contexts': ['This study set out to show how ...,yes
211266,Does intracoronary angiotensin-converting enzy...,{'contexts': ['There is increasing recognition...,yes
211267,Does transfusion significantly increase the ri...,{'contexts': ['To determine if splenectomy res...,yes


In [4]:
train_df = pd.read_excel('../train_data.xlsx')
valid_df = pd.read_excel('../valid_data.xlsx')

In [5]:
train_data_prompt_list = []
for i,row in train_df.iterrows():
    train_data_prompt_list.append(make_prompt(row['instruct'], row['answer']))

valid_data_prompt_list = []
for i,row in valid_df.iterrows():
    valid_data_prompt_list.append(make_prompt(row['instruct'], row['answer']))


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



In [6]:
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [7]:
train_dataset = Dataset(train_data_prompt_list)
valid_dataset = Dataset(valid_data_prompt_list)

In [8]:
def train(epoch, loader):
    model.train()
    loss_avg = 0
    for i, prompt in enumerate(loader):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        #loss.backward()
        #optimizer.step()
        scaler.update()
        print(f"epoch : {epoch} - step : {i}/{len(loader)} - loss: {loss.item()}")
        loss_avg += loss.item()
        
        del inputs
        del outputs
        del loss
        
    print(f'Epoch: {epoch}, train_Loss:  {loss_avg/len(loader)}')
    loss_dic['Train'].append(loss_avg/len(loader))

        

In [9]:
def validate(epoch,loader):  
    model.eval()
    loss_avg = 0
    with torch.no_grad():       
        for i, prompt in enumerate(loader):
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            loss_avg += loss.item()
            
            del inputs
            del outputs
            del loss
            
    print(f'Epoch: {epoch}, Valid_Loss:  {loss_avg/len(loader)}')
    loss_dic['Val'].append(loss_avg/len(loader))

In [10]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True)

In [11]:
#optimizer = AdamW(model.parameters(), lr = 1e-5)
optimizer = SGD(model.parameters(), lr=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
scaler = GradScaler()

In [12]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
loss_dic = {"epoch":[],"Train":[], "Val":[]}
best_loss = 100
early_stop_count = 0
for epoch in range(1, 99):
    
    loss_dic['epoch'].append(epoch)
    train(epoch, train_loader)
    validate(epoch, valid_loader)
    scheduler.step()
    
    if loss_dic['Val'][epoch - 1] > best_loss:
        early_stop_count += 1       
        if early_stop_count >= 2:
            loss_dic_df = pd.DataFrame(loss_dic)
            loss_dic_df.to_excel('../../★train/hyeogi/loss.xlsx', index=False)
            torch.save(model.state_dict(), f'../../★train/hyeogi/hyeogi_Solar-10.7B-dpo-v1_checkpoint_{epoch}.pth')
            break
    else:
        best_loss = loss_dic['Val'][epoch - 1]
        early_stop_count = 0

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


epoch : 1 - step : 0/14 - loss: 3.338663339614868
epoch : 1 - step : 1/14 - loss: 3.2922987937927246
epoch : 1 - step : 2/14 - loss: 3.3386213779449463
epoch : 1 - step : 3/14 - loss: 3.3227198123931885
epoch : 1 - step : 4/14 - loss: 3.4006810188293457
epoch : 1 - step : 5/14 - loss: 3.3081371784210205
epoch : 1 - step : 6/14 - loss: 3.4179751873016357
epoch : 1 - step : 7/14 - loss: 2.0940959453582764
epoch : 1 - step : 8/14 - loss: 2.1034910678863525
epoch : 1 - step : 9/14 - loss: 1.8338282108306885
epoch : 1 - step : 10/14 - loss: 1.859686255455017
epoch : 1 - step : 11/14 - loss: 1.6385167837142944
epoch : 1 - step : 12/14 - loss: 1.6211540699005127
epoch : 1 - step : 13/14 - loss: 1.7495423555374146
Epoch: 1, train_Loss:  2.5942436712128774
Epoch: 1, Valid_Loss:  1.6565044522285461
epoch : 2 - step : 0/14 - loss: 1.6129189729690552
epoch : 2 - step : 1/14 - loss: 1.5143346786499023
epoch : 2 - step : 2/14 - loss: 1.4992808103561401
epoch : 2 - step : 3/14 - loss: 1.5427643060684