In [1]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [2]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig,HfArgumentParser, get_scheduler, set_seed

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler

from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = {'mode_ID':"microsoft/Phi-3-mini-4k-instruct",
          'seed': 1 ,
          'max_seq_len' : 4096,
          'epochs': 3,
          'lr': 2e-4,
          'batch': 4,
          'lora_r':8,
          'lora_alpha':32,
          'target_module':["q_proj", "up_proj", "o_proj", "k_proj", "down_proj","gate_proj", "v_proj"],
          'lora_dropout':0.05,
          'lora_tasktype' :'CAUSAL_LM',
          'lora_bias' : 'none',
          'optimizer': 'paged_adamw_8bit',
          'scheduler':'cosine'}

## Model 초기화

In [4]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=config['lora_r'],  # LoRA 모델의 r 값
    lora_alpha=config['lora_alpha'],  # LoRA 모델의 alpha 값
    target_modules=config['target_module'],  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=config['lora_dropout'],  # LoRA 모델의 드롭아웃 비율
    bias=config['lora_bias'],  # LoRA 모델의 편향 설정
    task_type=config['lora_tasktype']  # LoRA 모델의 태스크 유형
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )


In [5]:
# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(config['mode_ID'], trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
	config['mode_ID'],
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
	use_cache=False,
	# quantization_config=bnb_config,
)

model.gradient_checkpointing_enable() # 모델에서 그래디언트 체크포인팅 활성화 (메모리 효율 향상)

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1007)')))"), '(Request ID: 97e0d5e9-78c5-4b50-9cda-99ce4c9ce912)')

In [None]:
print(f'Phi3 크기 : {model.num_parameters()/1000**2:.1f}M개의 파라미터')

Phi3 크기 : 3821.1M개의 파라미터


In [None]:
from peft import prepare_model_for_kbit_training # peft 라이브러리에서 k 비트 학습 준비 함수 임포트

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # CUDA 사용 가능 여부 확인

model = prepare_model_for_kbit_training(model)# k 비트 학습을 위해 모델 준비 - prepare_model_for_kbit_training 함수 사용
model = get_peft_model(model, peft_config) # PEFT 적용 
model = model.to(device) # 모델을 학습 장치 (GPU 등)로 이동
model.print_trainable_parameters()# 훈련 가능한 파라미터 출력 

Using device: cuda
trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165


In [8]:
# model.save_pretrained('models/'+model_ckpt, push_to_hun=True, organiztion=org)

In [9]:
def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt

In [10]:
from datasets import load_dataset
dataset = load_dataset('qiaojin/PubMedQA', 'pqa_artificial')
q = dataset['train']['question']
c = dataset['train']['context']
label = dataset['train']['final_decision']

# 'q'와 'c'를 페어링하여 input 열 만들기
input_list = [f"Question: {q_} Context: {c_}" for q_, c_ in zip(q, c)]


df_all = pd.DataFrame({'input': input_list,'label':label})
# 'no'인 값 10000개 추출
no_df = df_all[df_all['label'] == 'no'].sample(n=10000, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df_all[df_all['label'] == 'yes'].sample(n=10000, random_state=42)

# 두 데이터 프레임 합치기
combined_df = pd.concat([no_df, yes_df])

print(combined_df)

                                                    input label
65847   Question: Is short Time Interval Between Neuro...    no
118346  Question: Does universal health insurance cove...    no
116901  Question: Is the spectrum of in vitro radiosen...    no
51829   Question: Is hyperhomocysteinemia in children ...    no
39075   Question: Does mechanical pleurodesis reduce r...    no
...                                                   ...   ...
125548  Question: Is albumin the major plasma protein ...   yes
157294  Question: Does the trans-chromosomic mouse-der...   yes
209371  Question: Does typical savings from each minut...   yes
91596   Question: Is [ Expression of aquaporin 3 and a...   yes
107726  Question: Is urine cortisol concentration as a...   yes

[20000 rows x 2 columns]


In [11]:
# train, test 데이터셋 나누기
X_train, X_test, y_train, y_test = train_test_split(combined_df['input'], combined_df['label'], test_size=0.2, random_state=42)

# train, val 데이터셋 나누기
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


In [12]:
del dataset, q, c, label, input_list, df_all, no_df, yes_df, combined_df

In [13]:
train_data_prompt_list = []
for x,y in zip(X_train, y_train):
    train_data_prompt_list.append(make_prompt(x,y))

valid_data_prompt_list = []
for x2,y2 in zip(X_val, y_val):
    valid_data_prompt_list.append(make_prompt(x2,y2))

test_data_prompt_list = []
for x3,y3 in zip(X_test, y_test):
    test_data_prompt_list.append(make_prompt(x3,y3))

In [14]:
del x, y, x2, y2, x3, y3

In [15]:
train_data_prompt_list[0]

'<|user|>\nQuestion: Is calcipotriol Plus Betamethasone Dipropionate Aerosol Foam Effective , Independent of Body Mass Index and the Extent and Severity of Psoriasis? Context: {\'contexts\': [\'Good treatment adherence is important in the effective management of psoriasis and is related to both the frequency of applications and the amount of product used versus the recommended dose. The efficacy and safety of fixed combination calcipotriol 50\\xa0µg/g (Cal) and betamethasone 0.5\\xa0mg/g as dipropionate (BD) in the treatment of psoriasis is well established; an aerosol foam formulation has been developed to enhance adherence. This subanalysis from the Phase III PSO-FAST study evaluates the amount of Cal/BD foam used during treatment and the association between the extent and severity of baseline disease.\', "Patients (≥18\\xa0years) with mild-to-severe body psoriasis were randomized 3:1 to once-daily Cal/BD foam or vehicle. The amount of Cal/BD foam and vehicle used over the 4-week stu

In [16]:
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [17]:
train_dataset = Dataset(train_data_prompt_list)
valid_dataset = Dataset(valid_data_prompt_list)

In [18]:
def train(epoch, loader):
    model.train()
    loss_avg = 0
    for i, prompt in enumerate(loader):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        #loss.backward()
        #optimizer.step()
        scaler.update()
        print(f"epoch : {epoch} - step : {i}/{len(loader)} - loss: {loss.item()}")
        loss_avg += loss.item()
        
        del inputs
        del outputs
        del loss
        
    print(f'Epoch: {epoch}, train_Loss:  {loss_avg/len(loader)}')
    loss_dic['Train'].append(loss_avg/len(loader))

        

In [19]:
def validate(epoch,loader):  
    model.eval()
    loss_avg = 0
    with torch.no_grad():       
        for i, prompt in enumerate(loader):
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            loss_avg += loss.item()
            
            del inputs
            del outputs
            del loss
            
    print(f'Epoch: {epoch}, Valid_Loss:  {loss_avg/len(loader)}')
    loss_dic['Val'].append(loss_avg/len(loader))

In [20]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [21]:
optimizer = AdamW(model.parameters(), lr = 3e-4)
# optimizer = SGD(model.parameters(), lr=3e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
scaler = GradScaler()

In [23]:
optimizer = AdamW(model.parameters(), lr = 3e-4)
lr_scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=227,
    num_training_steps=15000
)

In [24]:
from tqdm import tqdm
import time

loss_dic = {"epoch":[],"Train":[], "Val":[]}
best_loss = 100
early_stop_count = 0

for epoch in tqdm(range(1, 10)):
    
    loss_dic['epoch'].append(epoch)
    train(epoch, train_loader)
    validate(epoch, valid_loader)
    lr_scheduler.step()
    
    if loss_dic['Val'][epoch - 1] > best_loss:
        early_stop_count += 1       
        if early_stop_count >= 2:
            loss_dic_df = pd.DataFrame(loss_dic)
            # loss_dic_df.to_excel('./loss.xlsx', index=False)
            # torch.save(model.state_dict(), f'./bestmodel_{epoch}.pth')
            break
    else:
        best_loss = loss_dic['Val'][epoch - 1]
        early_stop_count = 0

  0%|          | 0/9 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences.


epoch : 1 - step : 0/1500 - loss: 3.4250924587249756
epoch : 1 - step : 1/1500 - loss: 3.0902509689331055
epoch : 1 - step : 2/1500 - loss: 4.341043949127197
epoch : 1 - step : 3/1500 - loss: 3.307300090789795
epoch : 1 - step : 4/1500 - loss: 4.039970874786377
epoch : 1 - step : 5/1500 - loss: 3.607780933380127
epoch : 1 - step : 6/1500 - loss: 3.518313407897949
epoch : 1 - step : 7/1500 - loss: 4.145331382751465
epoch : 1 - step : 8/1500 - loss: 2.919867753982544
epoch : 1 - step : 9/1500 - loss: 3.853266477584839
epoch : 1 - step : 10/1500 - loss: 2.9841010570526123
epoch : 1 - step : 11/1500 - loss: 2.738149404525757
epoch : 1 - step : 12/1500 - loss: 3.3627140522003174
epoch : 1 - step : 13/1500 - loss: 3.3168561458587646
epoch : 1 - step : 14/1500 - loss: 3.3926730155944824
epoch : 1 - step : 15/1500 - loss: 2.9461305141448975
epoch : 1 - step : 16/1500 - loss: 3.076738119125366
epoch : 1 - step : 17/1500 - loss: 3.1123063564300537
epoch : 1 - step : 18/1500 - loss: 3.34754657745

  0%|          | 0/9 [14:35<?, ?it/s]


KeyboardInterrupt: 