In [1]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [2]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig,HfArgumentParser, get_scheduler, set_seed

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler

from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = {'mode_ID':"microsoft/Phi-3-mini-4k-instruct",
          'seed': 1 ,
          'max_seq_len' : 4096,
          'epochs': 3,
          'lr': 2e-4,
          'batch': 4,
          'lora_r':8,
          'lora_alpha':32,
          'target_module':["q_proj", "up_proj", "o_proj", "k_proj", "down_proj","gate_proj", "v_proj"],
          'lora_dropout':0.05,
          'lora_tasktype' :'CAUSAL_LM',
          'lora_bias' : 'none',
          'optimizer': 'paged_adamw_8bit',
          'scheduler':'cosine'}

## Model 초기화

In [4]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=config['lora_r'],  # LoRA 모델의 r 값
    lora_alpha=config['lora_alpha'],  # LoRA 모델의 alpha 값
    target_modules=config['target_module'],  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=config['lora_dropout'],  # LoRA 모델의 드롭아웃 비율
    bias=config['lora_bias'],  # LoRA 모델의 편향 설정
    task_type=config['lora_tasktype']  # LoRA 모델의 태스크 유형
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )


In [5]:
# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(config['mode_ID'], trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
	config['mode_ID'],
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
	use_cache=False,
    # attn_implementation='flash_attention_2'
	# quantization_config=bnb_config,
)

model.gradient_checkpointing_enable() # 모델에서 그래디언트 체크포인팅 활성화 (메모리 효율 향상)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.72s/it]


In [6]:
print(f'Phi3 크기 : {model.num_parameters()/1000**2:.1f}M개의 파라미터')

Phi3 크기 : 3821.1M개의 파라미터


In [7]:
from peft import prepare_model_for_kbit_training # peft 라이브러리에서 k 비트 학습 준비 함수 임포트

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # CUDA 사용 가능 여부 확인

model = prepare_model_for_kbit_training(model)# k 비트 학습을 위해 모델 준비 - prepare_model_for_kbit_training 함수 사용
model = get_peft_model(model, peft_config) # PEFT 적용 
model = model.to(device) # 모델을 학습 장치 (GPU 등)로 이동
model.print_trainable_parameters()# 훈련 가능한 파라미터 출력 

Using device: cuda
trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165


In [8]:
def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt

In [9]:
import pandas as pd
import json
with open('./data/pqaa_train_set.json','r') as f:
    train_data = json.load(f)
with open('./data/pqaa_dev_set.json','r') as f:
    test_data = json.load(f)
    
# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in train_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=500, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=500, random_state=42)

# 두 데이터 프레임 합치기
combined_df = pd.concat([no_df, yes_df])

print(combined_df)

                                                    input final_decision
133561  Question:\nDoes aspirin increase bleeding comp...             no
123846  Question:\nAre measures of socioeconomic posit...             no
143951  Question:\nDoes dialysis within 24 hours of tr...             no
79644   Question:\nIs mild renal pelvic dilatation pre...             no
108150  Question:\nDoes acute blood pressure reduction...             no
...                                                   ...            ...
165729  Question:\nDoes cytoplasmic maspin expression ...            yes
75155   Question:\nDo layer-shaped alginate hydrogels ...            yes
193747  Question:\nDoes oral administration of green p...            yes
34532   Question:\nAre mast cells involved in the path...            yes
104153  Question:\nDoes a patient information booklet ...            yes

[1000 rows x 2 columns]


In [10]:
# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in test_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=50, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=50, random_state=42)

# 두 데이터 프레임 합치기
combined_df_test = pd.concat([no_df, yes_df])

print(combined_df_test)

                                                   input final_decision
2774   Question:\nDoes trimetazidine modify blood lev...             no
4083   Question:\nDo aDAMTS-5 deficient mice develop ...             no
10063  Question:\nIs brucellosis a major cause of feb...             no
3060   Question:\nDoes sertraline alter the beta-adre...             no
7219   Question:\nAre salivary biomarkers suitable fo...             no
...                                                  ...            ...
1469   Question:\nDoes varus malalignment negate the ...            yes
3216   Question:\nDo inflammatory protein levels and ...            yes
8590   Question:\nDoes intraaortic balloon pumping im...            yes
4770   Question:\nDo girls ' childhood trajectories o...            yes
6134   Question:\nIs activation of phospholipase A2 a...            yes

[100 rows x 2 columns]


In [11]:
combined_df['final_decision'].value_counts()

final_decision
no     500
yes    500
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

# train, valid 데이터셋 나누기
X_train, X_val, y_train, y_val = train_test_split(combined_df['input'], combined_df['final_decision'], test_size=0.2, random_state=42)

# test 데이터셋
X_test = combined_df_test['input']
y_test = combined_df_test['final_decision']

In [13]:
del no_df, yes_df, combined_df, df

In [14]:
train_data_prompt_list = []
for x,y in zip(X_train, y_train):
    train_data_prompt_list.append(make_prompt(x,y))

valid_data_prompt_list = []
for x2,y2 in zip(X_val, y_val):
    valid_data_prompt_list.append(make_prompt(x2,y2))

test_data_prompt_list = []
for x3,y3 in zip(X_test, y_test):
    test_data_prompt_list.append(make_prompt(x3,y3))
    test_data_prompt_list = [test_data.split('<|end|>')[0] + '<|end|>' for test_data in test_data_prompt_list]

In [15]:
test_data_prompt_list[0].split('<|end|>')[0]

'<|user|>\nQuestion:\nDoes trimetazidine modify blood levels and immunosuppressant effects of cyclosporine A in renal allograft recipients?\nPlease give me the answer in formats: yes or no\nContext:\n(OBJECTIVE) In renal allograft recipients, trimetazidine (Vastarel) was proposed to be associated with the classic immunosuppressant treatments because it displays anti-ischaemic effects which may protect against cyclosporine A nephrotoxicity. The objective of this work was to assess the possibility of coadministering cyclosporin A, Sandimmun, and trimetazidine.\n(METHODS) Twelve renal transplant patients were selected on the basis of the stability of their cyclosporine A blood concentrations for the previous 3 months. They received trimetazidine, 40 mg twice daily orally for 5 days. Other coadministered drugs were kept unchanged during the study. Before and after trimetazidine administration, cyclosporine A blood concentrations, plasma interleukin-2 and soluble interleukin-2 receptor leve

In [16]:
test_data_prompt_list[0]

'<|user|>\nQuestion:\nDoes trimetazidine modify blood levels and immunosuppressant effects of cyclosporine A in renal allograft recipients?\nPlease give me the answer in formats: yes or no\nContext:\n(OBJECTIVE) In renal allograft recipients, trimetazidine (Vastarel) was proposed to be associated with the classic immunosuppressant treatments because it displays anti-ischaemic effects which may protect against cyclosporine A nephrotoxicity. The objective of this work was to assess the possibility of coadministering cyclosporin A, Sandimmun, and trimetazidine.\n(METHODS) Twelve renal transplant patients were selected on the basis of the stability of their cyclosporine A blood concentrations for the previous 3 months. They received trimetazidine, 40 mg twice daily orally for 5 days. Other coadministered drugs were kept unchanged during the study. Before and after trimetazidine administration, cyclosporine A blood concentrations, plasma interleukin-2 and soluble interleukin-2 receptor leve

In [17]:
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [18]:
train_dataset = Dataset(train_data_prompt_list)
valid_dataset = Dataset(valid_data_prompt_list)

In [19]:
def train(epoch, loader):

    model.train()
    loss_avg = 0
    for i, prompt in enumerate(loader):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        #loss.backward()
        #optimizer.step()
        scaler.update()
        print(f"epoch : {epoch} - step : {i}/{len(loader)} - loss: {loss.item()}")
        loss_avg += loss.item()
        
        del inputs
        del outputs
        del loss
        
    print(f'Epoch: {epoch}, train_Loss:  {loss_avg/len(loader)}')
    loss_dic['Train'].append(loss_avg/len(loader))

        

In [20]:
def validate(epoch,loader):  
    model.eval()
    loss_avg = 0
    with torch.no_grad():       
        for i, prompt in enumerate(loader):
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            loss_avg += loss.item()
            
            del inputs
            del outputs
            del loss
            
    print(f'Epoch: {epoch}, Valid_Loss:  {loss_avg/len(loader)}')
    loss_dic['Val'].append(loss_avg/len(loader))

In [24]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [22]:
# optimizer = AdamW(model.parameters(), lr = 3e-4)
# # optimizer = SGD(model.parameters(), lr=3e-4)
# scheduler = CosineAnnealingLR(optimizer, T_max=10)
scaler = GradScaler()

In [23]:
optimizer = AdamW(model.parameters(), lr = 3e-4)
lr_scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=227,
    num_training_steps=15000
)

In [26]:
from tqdm import tqdm
import time

loss_dic = {"epoch":[],"Train":[], "Val":[]}
best_loss = 100
early_stop_count = 0

for epoch in tqdm(range(1, 11)):
    
    loss_dic['epoch'].append(epoch)
    train(epoch, train_loader)
    validate(epoch, valid_loader)
    lr_scheduler.step()
    
    if loss_dic['Val'][epoch - 1] > best_loss:
        early_stop_count += 1       
        if early_stop_count >= 2:
            loss_dic_df = pd.DataFrame(loss_dic)
            loss_dic_df.to_csv(f'./results/base_loss_epoch{epoch}.csv', index=False)
            torch.save(model.state_dict(), f'./savedmodel/base_bestmodel_epoch{epoch}.pth')
            break
    else:
        best_loss = loss_dic['Val'][epoch - 1]
        early_stop_count = 0

  0%|          | 0/10 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences.


epoch : 1 - step : 0/100 - loss: 2.843454599380493
epoch : 1 - step : 1/100 - loss: 3.6309053897857666
epoch : 1 - step : 2/100 - loss: 2.897318124771118
epoch : 1 - step : 3/100 - loss: 5.2232513427734375
epoch : 1 - step : 4/100 - loss: 3.7282521724700928
epoch : 1 - step : 5/100 - loss: 3.6501128673553467
epoch : 1 - step : 6/100 - loss: 3.3229799270629883
epoch : 1 - step : 7/100 - loss: 4.086520671844482
epoch : 1 - step : 8/100 - loss: 3.3903605937957764
epoch : 1 - step : 9/100 - loss: 4.179718494415283
epoch : 1 - step : 10/100 - loss: 3.5778720378875732
epoch : 1 - step : 11/100 - loss: 3.696044921875
epoch : 1 - step : 12/100 - loss: 3.1827173233032227
epoch : 1 - step : 13/100 - loss: 3.647125005722046
epoch : 1 - step : 14/100 - loss: 3.5673530101776123
epoch : 1 - step : 15/100 - loss: 2.90842342376709
epoch : 1 - step : 16/100 - loss: 3.7696001529693604
epoch : 1 - step : 17/100 - loss: 2.7150142192840576
epoch : 1 - step : 18/100 - loss: 3.733722686767578
epoch : 1 - ste

 10%|█         | 1/10 [06:11<55:42, 371.37s/it]

Epoch: 1, Valid_Loss:  3.3627640533447267
epoch : 2 - step : 0/100 - loss: 3.908290147781372
epoch : 2 - step : 1/100 - loss: 4.468196392059326
epoch : 2 - step : 2/100 - loss: 4.456459999084473
epoch : 2 - step : 3/100 - loss: 3.8397469520568848
epoch : 2 - step : 4/100 - loss: 3.140565872192383
epoch : 2 - step : 5/100 - loss: 3.584003448486328
epoch : 2 - step : 6/100 - loss: 3.4396822452545166
epoch : 2 - step : 7/100 - loss: 3.093414545059204
epoch : 2 - step : 8/100 - loss: 2.6797683238983154
epoch : 2 - step : 9/100 - loss: 5.114637851715088
epoch : 2 - step : 10/100 - loss: 3.3077759742736816
epoch : 2 - step : 11/100 - loss: 2.8443753719329834
epoch : 2 - step : 12/100 - loss: 3.3264288902282715
epoch : 2 - step : 13/100 - loss: 3.886211395263672
epoch : 2 - step : 14/100 - loss: 3.705390691757202
epoch : 2 - step : 15/100 - loss: 4.219462871551514
epoch : 2 - step : 16/100 - loss: 3.8457140922546387
epoch : 2 - step : 17/100 - loss: 3.1270666122436523
epoch : 2 - step : 18/10

 20%|██        | 2/10 [12:25<49:43, 372.96s/it]

Epoch: 2, Valid_Loss:  2.4576582527160644
epoch : 3 - step : 0/100 - loss: 2.565932273864746
epoch : 3 - step : 1/100 - loss: 2.252896785736084
epoch : 3 - step : 2/100 - loss: 2.4722654819488525
epoch : 3 - step : 3/100 - loss: 2.4698617458343506
epoch : 3 - step : 4/100 - loss: 2.4050004482269287
epoch : 3 - step : 5/100 - loss: 2.3630874156951904
epoch : 3 - step : 6/100 - loss: 2.2463390827178955
epoch : 3 - step : 7/100 - loss: 2.424567461013794
epoch : 3 - step : 8/100 - loss: 2.161353826522827
epoch : 3 - step : 9/100 - loss: 2.0373098850250244
epoch : 3 - step : 10/100 - loss: 2.352088451385498
epoch : 3 - step : 11/100 - loss: 2.0062639713287354
epoch : 3 - step : 12/100 - loss: 2.029745578765869
epoch : 3 - step : 13/100 - loss: 2.09151029586792
epoch : 3 - step : 14/100 - loss: 2.285493850708008
epoch : 3 - step : 15/100 - loss: 2.0995571613311768
epoch : 3 - step : 16/100 - loss: 2.195481300354004
epoch : 3 - step : 17/100 - loss: 1.9171351194381714
epoch : 3 - step : 18/10

 30%|███       | 3/10 [18:40<43:38, 374.13s/it]

Epoch: 3, Valid_Loss:  1.18987957239151
epoch : 4 - step : 0/100 - loss: 1.0777610540390015
epoch : 4 - step : 1/100 - loss: 1.1038901805877686
epoch : 4 - step : 2/100 - loss: 0.9783695340156555
epoch : 4 - step : 3/100 - loss: 1.0689506530761719
epoch : 4 - step : 4/100 - loss: 1.2480379343032837
epoch : 4 - step : 5/100 - loss: 1.3468263149261475
epoch : 4 - step : 6/100 - loss: 1.2001373767852783
epoch : 4 - step : 7/100 - loss: 1.0687154531478882
epoch : 4 - step : 8/100 - loss: 1.1311532258987427
epoch : 4 - step : 9/100 - loss: 1.1260300874710083
epoch : 4 - step : 10/100 - loss: 1.026847243309021
epoch : 4 - step : 11/100 - loss: 1.1798378229141235
epoch : 4 - step : 12/100 - loss: 1.2726625204086304
epoch : 4 - step : 13/100 - loss: 1.0348143577575684
epoch : 4 - step : 14/100 - loss: 1.1563081741333008
epoch : 4 - step : 15/100 - loss: 1.3850759267807007
epoch : 4 - step : 16/100 - loss: 1.2643425464630127
epoch : 4 - step : 17/100 - loss: 1.1615980863571167
epoch : 4 - step 

 40%|████      | 4/10 [24:55<37:24, 374.13s/it]

Epoch: 4, Valid_Loss:  1.1563247561454773
epoch : 5 - step : 0/100 - loss: 0.9507243633270264
epoch : 5 - step : 1/100 - loss: 1.459241271018982
epoch : 5 - step : 2/100 - loss: 1.3503077030181885
epoch : 5 - step : 3/100 - loss: 0.9100711941719055
epoch : 5 - step : 4/100 - loss: 1.2550173997879028
epoch : 5 - step : 5/100 - loss: 1.1668814420700073
epoch : 5 - step : 6/100 - loss: 1.2165910005569458
epoch : 5 - step : 7/100 - loss: 1.134772539138794
epoch : 5 - step : 8/100 - loss: 1.1548123359680176
epoch : 5 - step : 9/100 - loss: 1.185459017753601
epoch : 5 - step : 10/100 - loss: 1.1787217855453491
epoch : 5 - step : 11/100 - loss: 1.1520094871520996
epoch : 5 - step : 12/100 - loss: 0.8981131911277771
epoch : 5 - step : 13/100 - loss: 0.8694112300872803
epoch : 5 - step : 14/100 - loss: 1.0787346363067627
epoch : 5 - step : 15/100 - loss: 1.2829562425613403
epoch : 5 - step : 16/100 - loss: 1.3118489980697632
epoch : 5 - step : 17/100 - loss: 1.180798888206482
epoch : 5 - step :

 50%|█████     | 5/10 [31:10<31:12, 374.53s/it]

Epoch: 5, Valid_Loss:  1.1333156967163085
epoch : 6 - step : 0/100 - loss: 1.2100551128387451
epoch : 6 - step : 1/100 - loss: 1.2006744146347046
epoch : 6 - step : 2/100 - loss: 1.0828043222427368
epoch : 6 - step : 3/100 - loss: 0.8785880208015442
epoch : 6 - step : 4/100 - loss: 0.9538809657096863
epoch : 6 - step : 5/100 - loss: 1.0412789583206177
epoch : 6 - step : 6/100 - loss: 1.0823692083358765
epoch : 6 - step : 7/100 - loss: 0.9864757657051086
epoch : 6 - step : 8/100 - loss: 0.9890445470809937
epoch : 6 - step : 9/100 - loss: 1.1520756483078003
epoch : 6 - step : 10/100 - loss: 0.9985886812210083
epoch : 6 - step : 11/100 - loss: 1.120221734046936
epoch : 6 - step : 12/100 - loss: 1.1822255849838257
epoch : 6 - step : 13/100 - loss: 0.9229302406311035
epoch : 6 - step : 14/100 - loss: 1.1914290189743042
epoch : 6 - step : 15/100 - loss: 1.2036080360412598
epoch : 6 - step : 16/100 - loss: 1.1680525541305542
epoch : 6 - step : 17/100 - loss: 0.7060254216194153
epoch : 6 - ste

 60%|██████    | 6/10 [37:26<25:00, 375.08s/it]

Epoch: 6, Valid_Loss:  1.1111786103248595
epoch : 7 - step : 0/100 - loss: 1.0150014162063599
epoch : 7 - step : 1/100 - loss: 1.0982633829116821
epoch : 7 - step : 2/100 - loss: 1.138224482536316
epoch : 7 - step : 3/100 - loss: 1.2722336053848267
epoch : 7 - step : 4/100 - loss: 0.6403920650482178
epoch : 7 - step : 5/100 - loss: 0.9785756468772888
epoch : 7 - step : 6/100 - loss: 1.2868642807006836
epoch : 7 - step : 7/100 - loss: 0.7481834888458252
epoch : 7 - step : 8/100 - loss: 1.073380470275879
epoch : 7 - step : 9/100 - loss: 1.085951328277588
epoch : 7 - step : 10/100 - loss: 1.259090542793274
epoch : 7 - step : 11/100 - loss: 1.22862708568573
epoch : 7 - step : 12/100 - loss: 1.0673229694366455
epoch : 7 - step : 13/100 - loss: 1.0509519577026367
epoch : 7 - step : 14/100 - loss: 1.06611967086792
epoch : 7 - step : 15/100 - loss: 1.007073163986206
epoch : 7 - step : 16/100 - loss: 1.1566424369812012
epoch : 7 - step : 17/100 - loss: 1.0148100852966309
epoch : 7 - step : 18/1

 70%|███████   | 7/10 [43:42<18:45, 375.23s/it]

Epoch: 7, Valid_Loss:  1.0677329325675964
epoch : 8 - step : 0/100 - loss: 0.7220598459243774
epoch : 8 - step : 1/100 - loss: 1.190659523010254
epoch : 8 - step : 2/100 - loss: 0.6646177172660828
epoch : 8 - step : 3/100 - loss: 0.9843756556510925
epoch : 8 - step : 4/100 - loss: 1.1163620948791504
epoch : 8 - step : 5/100 - loss: 1.0407356023788452
epoch : 8 - step : 6/100 - loss: 1.3228981494903564
epoch : 8 - step : 7/100 - loss: 0.9495016932487488
epoch : 8 - step : 8/100 - loss: 1.1921871900558472
epoch : 8 - step : 9/100 - loss: 0.8666924238204956
epoch : 8 - step : 10/100 - loss: 0.9412743449211121
epoch : 8 - step : 11/100 - loss: 1.122666835784912
epoch : 8 - step : 12/100 - loss: 0.925174355506897
epoch : 8 - step : 13/100 - loss: 0.8587319850921631
epoch : 8 - step : 14/100 - loss: 0.979549765586853
epoch : 8 - step : 15/100 - loss: 0.9644507169723511
epoch : 8 - step : 16/100 - loss: 0.7420514822006226
epoch : 8 - step : 17/100 - loss: 0.9479364156723022
epoch : 8 - step :

 80%|████████  | 8/10 [50:02<12:33, 376.96s/it]

Epoch: 8, Valid_Loss:  1.0138829803466798
epoch : 9 - step : 0/100 - loss: 1.0058503150939941
epoch : 9 - step : 1/100 - loss: 1.0210704803466797
epoch : 9 - step : 2/100 - loss: 0.889901876449585
epoch : 9 - step : 3/100 - loss: 0.8693370819091797
epoch : 9 - step : 4/100 - loss: 1.2519901990890503
epoch : 9 - step : 5/100 - loss: 0.8669031858444214
epoch : 9 - step : 6/100 - loss: 1.1830183267593384
epoch : 9 - step : 7/100 - loss: 0.7014245390892029
epoch : 9 - step : 8/100 - loss: 1.0002721548080444
epoch : 9 - step : 9/100 - loss: 0.8426473736763
epoch : 9 - step : 10/100 - loss: 0.9966603517532349
epoch : 9 - step : 11/100 - loss: 1.106451153755188
epoch : 9 - step : 12/100 - loss: 0.8935810923576355
epoch : 9 - step : 13/100 - loss: 1.0383384227752686
epoch : 9 - step : 14/100 - loss: 1.0647411346435547
epoch : 9 - step : 15/100 - loss: 1.0439105033874512
epoch : 9 - step : 16/100 - loss: 1.157841682434082
epoch : 9 - step : 17/100 - loss: 1.152036190032959
epoch : 9 - step : 18

 90%|█████████ | 9/10 [56:15<06:15, 375.71s/it]

Epoch: 9, Valid_Loss:  0.9707102751731873
epoch : 10 - step : 0/100 - loss: 0.9551087617874146
epoch : 10 - step : 1/100 - loss: 0.9268535375595093
epoch : 10 - step : 2/100 - loss: 0.8697043657302856
epoch : 10 - step : 3/100 - loss: 0.816990077495575
epoch : 10 - step : 4/100 - loss: 0.8447631001472473
epoch : 10 - step : 5/100 - loss: 0.9637778997421265
epoch : 10 - step : 6/100 - loss: 0.5477986931800842
epoch : 10 - step : 7/100 - loss: 0.970766007900238
epoch : 10 - step : 8/100 - loss: 0.9488482475280762
epoch : 10 - step : 9/100 - loss: 0.8797046542167664
epoch : 10 - step : 10/100 - loss: 0.9117159843444824
epoch : 10 - step : 11/100 - loss: 0.8255541324615479
epoch : 10 - step : 12/100 - loss: 0.8005479574203491
epoch : 10 - step : 13/100 - loss: 0.7647018432617188
epoch : 10 - step : 14/100 - loss: 0.9565283060073853
epoch : 10 - step : 15/100 - loss: 0.9900828003883362
epoch : 10 - step : 16/100 - loss: 0.8712703585624695
epoch : 10 - step : 17/100 - loss: 0.894489169120788

100%|██████████| 10/10 [1:02:28<00:00, 374.87s/it]

Epoch: 10, Valid_Loss:  0.9545117878913879





In [27]:
# Early stopping 안됐을때 모델, 결과 따로 저장
torch.save(model.state_dict(), './savedmodel/base_bestmodel_epoch10.pth')
loss_dic_df = pd.DataFrame(loss_dic)
loss_dic_df.to_csv('./results/base_loss_epoch10.csv', index=False)

# 추론 시작

In [15]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [1]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig,HfArgumentParser, get_scheduler, set_seed

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler

from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = {'mode_ID':"microsoft/Phi-3-mini-4k-instruct",
          'seed': 1 ,
          'max_seq_len' : 4096,
          'epochs': 3,
          'lr': 2e-4,
          'batch': 4,
          'lora_r':8,
          'lora_alpha':32,
          'target_module':["q_proj", "up_proj", "o_proj", "k_proj", "down_proj","gate_proj", "v_proj"],
          'lora_dropout':0.05,
          'lora_tasktype' :'CAUSAL_LM',
          'lora_bias' : 'none',
          'optimizer': 'paged_adamw_8bit',
          'scheduler':'cosine'}

In [3]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=config['lora_r'],  # LoRA 모델의 r 값
    lora_alpha=config['lora_alpha'],  # LoRA 모델의 alpha 값
    target_modules=config['target_module'],  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=config['lora_dropout'],  # LoRA 모델의 드롭아웃 비율
    bias=config['lora_bias'],  # LoRA 모델의 편향 설정
    task_type=config['lora_tasktype']  # LoRA 모델의 태스크 유형
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )


In [4]:
model = AutoModelForCausalLM.from_pretrained(
	config['mode_ID'],
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
	use_cache=False,
    # attn_implementation='flash_attention_2'
	# quantization_config=bnb_config,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.73s/it]


In [5]:
model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)

In [6]:
model = get_peft_model(model, peft_config) # PEFT 적용 

In [7]:
model.load_state_dict(torch.load('./savedmodel/base_bestmodel_epoch10.pth'))

<All keys matched successfully>

In [8]:
# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(config['mode_ID'], trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt


import pandas as pd
import json
with open('./data/pqaa_train_set.json','r') as f:
    train_data = json.load(f)
with open('./data/pqaa_dev_set.json','r') as f:
    test_data = json.load(f)
    
# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in test_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=50, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=50, random_state=42)

# 두 데이터 프레임 합치기
combined_df_test = pd.concat([no_df, yes_df])

# print(combined_df_test)

X_test = combined_df_test['input']
y_test = combined_df_test['final_decision']

test_data_prompt_list = []
for x3,y3 in zip(X_test, y_test):
    test_data_prompt_list.append(make_prompt(x3,y3))
    test_data_prompt_list = [test_data.split('<|end|>')[0] + '<|end|>\n<|assistant|>\n' for test_data in test_data_prompt_list]

class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [9]:
test_dataset = Dataset(test_data_prompt_list)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [10]:
from transformers import pipeline 

def test(loader):
    output_li = []
    model.eval()
    loss_avg = 0

    pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    ) 

    generation_args = { 
        "max_new_tokens": 500, 
        "return_full_text": False, 
        "temperature": 0.5, 
        "do_sample": False, 
    } 

    with torch.no_grad():
        for output in tqdm(pipe(loader, **generation_args)):
            # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            # outputs = model(**inputs, labels=inputs["input_ids"])
            output_li.append(output)
            
    #         loss = outputs.loss
    #         loss_avg += loss.item()
            
    #         del inputs
    #         del outputs
    #         del loss
            
    # print(f'Epoch: {epoch}, Valid_Loss:  {loss_avg/len(loader)}')
    # loss_dic['Val'].append(loss_avg/len(loader))
    return output_li

In [11]:
outputs = test(test_dataset)
print(outputs[0])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM'

[{'generated_text': ' no'}]





In [12]:
pred_li = []
for output in outputs:
    # print(output[0].get('generated_text').strip())
    pred_li.append(output[0].get('generated_text').strip())


In [13]:
df = pd.DataFrame({'true': y_test, 'pred':pred_li})
df

Unnamed: 0,true,pred
2774,no,no
4083,no,yes
10063,no,no
3060,no,no
7219,no,no
...,...,...
1469,yes,no
3216,yes,yes
8590,yes,yes
4770,yes,yes


In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(df['true'],df['pred'])

0.82