In [1]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig,HfArgumentParser, get_scheduler, set_seed

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler
    
from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
config = {'mode_ID':"microsoft/Phi-3-mini-4k-instruct",
          'seed': 1 ,
          'max_seq_len' : 4096,
          'epochs': 3,
          'lr': 2e-4,
          'batch': 4,
          'lora_r':8,
          'lora_alpha':32,
          'target_module':["q_proj", "up_proj", "o_proj", "k_proj", "down_proj","gate_proj", "v_proj"],
          'lora_dropout':0.05,
          'lora_tasktype' :'CAUSAL_LM',
          'lora_bias' : 'none',
          'optimizer': 'paged_adamw_8bit',
          'scheduler':'cosine'}

## Model 초기화

In [6]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=config['lora_r'],  # LoRA 모델의 r 값
    lora_alpha=config['lora_alpha'],  # LoRA 모델의 alpha 값
    target_modules=config['target_module'],  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=config['lora_dropout'],  # LoRA 모델의 드롭아웃 비율
    bias=config['lora_bias'],  # LoRA 모델의 편향 설정
    task_type=config['lora_tasktype']  # LoRA 모델의 태스크 유형
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )


In [7]:
# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(config['mode_ID'], trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
	config['mode_ID'],
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
	use_cache=False,
    # attn_implementation='flash_attention_2'
	# quantization_config=bnb_config,
)

model.gradient_checkpointing_enable() # 모델에서 그래디언트 체크포인팅 활성화 (메모리 효율 향상)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|██████████| 2/2 [00:28<00:00, 14.34s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.20s/it]


In [8]:
print(f'Phi3 크기 : {model.num_parameters()/1000**2:.1f}M개의 파라미터')

Phi3 크기 : 3821.1M개의 파라미터


In [9]:
from peft import prepare_model_for_kbit_training # peft 라이브러리에서 k 비트 학습 준비 함수 임포트

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # CUDA 사용 가능 여부 확인

model = prepare_model_for_kbit_training(model)# k 비트 학습을 위해 모델 준비 - prepare_model_for_kbit_training 함수 사용
model = get_peft_model(model, peft_config) # PEFT 적용 
model = model.to(device) # 모델을 학습 장치 (GPU 등)로 이동
model.print_trainable_parameters()# 훈련 가능한 파라미터 출력 

Using device: cuda
trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165


In [10]:
def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt

In [13]:
# train 만들기

import pandas as pd
import json
with open('./data/pqaa_train_set.json','r') as f:
    train_data = json.load(f)
    
# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in train_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=400, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=400, random_state=42)

# 두 데이터 프레임 합치기
combined_df_train = pd.concat([no_df, yes_df])

print(combined_df_train)

                                                    input final_decision
133561  Question:\nDoes aspirin increase bleeding comp...             no
123846  Question:\nAre measures of socioeconomic posit...             no
143951  Question:\nDoes dialysis within 24 hours of tr...             no
79644   Question:\nIs mild renal pelvic dilatation pre...             no
108150  Question:\nDoes acute blood pressure reduction...             no
...                                                   ...            ...
27494   Question:\nIs sOX7 down-regulated in lung canc...            yes
169373  Question:\nDoes dimethyl sulfoxide promote the...            yes
22938   Question:\nDoes high-Dose Methylprednisolone h...            yes
180033  Question:\nDoes [ Comparative assessment of cr...            yes
83370   Question:\nDo normal Human Lung Epithelial Cel...            yes

[8000 rows x 2 columns]


In [14]:
# valid 만들기
del_li = combined_df_train['input'].to_list()
df = df[~df['input'].isin(del_li)]

no_df = df[df['final_decision'] == 'no'].sample(n=50, random_state=42)

yes_df = df[df['final_decision'] == 'yes'].sample(n=50, random_state=42)

# 두 데이터 프레임 합치기
combined_df_valid = pd.concat([no_df, yes_df])

print(combined_df_valid)

                                                    input final_decision
198521  Question:\nDoes developmental care alter sleep...             no
113081  Question:\nDo genetic variants of the tumour n...             no
6058    Question:\nDoes the Magnitude of Peripheral Mu...             no
176781  Question:\nDoes small carbon monoxide formatio...             no
167253  Question:\nDoes spirometry training guarantee ...             no
...                                                   ...            ...
142775  Question:\nDoes financial strain predict recur...            yes
186427  Question:\nIs pro12Ala sequence variant of the...            yes
75183   Question:\nIs anticoagulation the gold standar...            yes
198118  Question:\nIs pancreaticobiliary anomalies the...            yes
126605  Question:\nIs severe tricuspid regurgitation p...            yes

[1000 rows x 2 columns]


In [15]:
# test 만들기

with open('./data/pqaa_dev_set.json','r') as f:
    test_data = json.load(f)

# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in test_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=50, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=50, random_state=42)

# 두 데이터 프레임 합치기
combined_df_test = pd.concat([no_df, yes_df])

print(combined_df_test)

                                                   input final_decision
2774   Question:\nDoes trimetazidine modify blood lev...             no
4083   Question:\nDo aDAMTS-5 deficient mice develop ...             no
10063  Question:\nIs brucellosis a major cause of feb...             no
3060   Question:\nDoes sertraline alter the beta-adre...             no
7219   Question:\nAre salivary biomarkers suitable fo...             no
...                                                  ...            ...
2413   Question:\nDoes contact allergy to topical cor...            yes
3093   Question:\nDo fGFR3 mutations indicate better ...            yes
9316   Question:\nIs cytomegalovirus immunoglobulin G...            yes
340    Question:\nDoes interatrial septum motion but ...            yes
4633   Question:\nDoes maternal Helminth Infection be...            yes

[1000 rows x 2 columns]


In [16]:
from sklearn.model_selection import train_test_split

X_train = combined_df_train['input']
y_train = combined_df_train['final_decision']

X_valid = combined_df_valid['input']
y_valid = combined_df_valid['final_decision']

# test 데이터셋
X_test = combined_df_test['input']
y_test = combined_df_test['final_decision']

In [17]:
combined_df_train['final_decision'].value_counts(), combined_df_valid['final_decision'].value_counts(), combined_df_test['final_decision'].value_counts()

(final_decision
 no     4000
 yes    4000
 Name: count, dtype: int64,
 final_decision
 no     500
 yes    500
 Name: count, dtype: int64,
 final_decision
 no     500
 yes    500
 Name: count, dtype: int64)

In [19]:
train_data_prompt_list = []
for x,y in zip(X_train, y_train):
    train_data_prompt_list.append(make_prompt(x,y))

valid_data_prompt_list = []
for x2,y2 in zip(X_valid, y_valid):
    valid_data_prompt_list.append(make_prompt(x2,y2))

test_data_prompt_list = []
for x3,y3 in zip(X_test, y_test):
    test_data_prompt_list.append(make_prompt(x3,y3))
    test_data_prompt_list = [test_data.split('<|end|>')[0] + '<|end|>' for test_data in test_data_prompt_list]

In [20]:
test_data_prompt_list[0].split('<|end|>')[0]

'<|user|>\nQuestion:\nDoes trimetazidine modify blood levels and immunosuppressant effects of cyclosporine A in renal allograft recipients?\nPlease give me the answer in formats: yes or no\nContext:\n(OBJECTIVE) In renal allograft recipients, trimetazidine (Vastarel) was proposed to be associated with the classic immunosuppressant treatments because it displays anti-ischaemic effects which may protect against cyclosporine A nephrotoxicity. The objective of this work was to assess the possibility of coadministering cyclosporin A, Sandimmun, and trimetazidine.\n(METHODS) Twelve renal transplant patients were selected on the basis of the stability of their cyclosporine A blood concentrations for the previous 3 months. They received trimetazidine, 40 mg twice daily orally for 5 days. Other coadministered drugs were kept unchanged during the study. Before and after trimetazidine administration, cyclosporine A blood concentrations, plasma interleukin-2 and soluble interleukin-2 receptor leve

In [21]:
test_data_prompt_list[0]

'<|user|>\nQuestion:\nDoes trimetazidine modify blood levels and immunosuppressant effects of cyclosporine A in renal allograft recipients?\nPlease give me the answer in formats: yes or no\nContext:\n(OBJECTIVE) In renal allograft recipients, trimetazidine (Vastarel) was proposed to be associated with the classic immunosuppressant treatments because it displays anti-ischaemic effects which may protect against cyclosporine A nephrotoxicity. The objective of this work was to assess the possibility of coadministering cyclosporin A, Sandimmun, and trimetazidine.\n(METHODS) Twelve renal transplant patients were selected on the basis of the stability of their cyclosporine A blood concentrations for the previous 3 months. They received trimetazidine, 40 mg twice daily orally for 5 days. Other coadministered drugs were kept unchanged during the study. Before and after trimetazidine administration, cyclosporine A blood concentrations, plasma interleukin-2 and soluble interleukin-2 receptor leve

In [22]:
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [23]:
train_dataset = Dataset(train_data_prompt_list)
valid_dataset = Dataset(valid_data_prompt_list)

In [24]:
def train(epoch, loader):

    model.train()
    loss_avg = 0
    for i, prompt in enumerate(loader):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        #loss.backward()
        #optimizer.step()
        scaler.update()
        print(f"epoch : {epoch} - step : {i}/{len(loader)} - loss: {loss.item()}")
        loss_avg += loss.item()
        
        del inputs
        del outputs
        del loss
        
    print(f'Epoch: {epoch}, train_Loss:  {loss_avg/len(loader)}')
    loss_dic['Train'].append(loss_avg/len(loader))

        

In [25]:
def validate(epoch,loader):  
    model.eval()
    loss_avg = 0
    with torch.no_grad():       
        for i, prompt in enumerate(loader):
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            loss_avg += loss.item()
            
            del inputs
            del outputs
            del loss
            
    print(f'Epoch: {epoch}, Valid_Loss:  {loss_avg/len(loader)}')
    loss_dic['Val'].append(loss_avg/len(loader))

In [26]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False)

In [27]:
scaler = GradScaler()

  scaler = GradScaler()


# lr 수정 해당 파트 (optimizer)

In [None]:
optimizer = AdamW(model.parameters(), lr = 1e-5)

In [29]:
lr_scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=227,
    num_training_steps=15000
)

In [31]:
from tqdm import tqdm
import time

loss_dic = {"epoch":[],"Train":[], "Val":[]}
best_loss = 100
early_stop_count = 0

for epoch in tqdm(range(1, 11)):
    
    loss_dic['epoch'].append(epoch)
    train(epoch, train_loader)
    validate(epoch, valid_loader)
    lr_scheduler.step()
    
    if loss_dic['Val'][epoch - 1] > best_loss:
        early_stop_count += 1
        if early_stop_count >= 2:
            loss_dic_df = pd.DataFrame(loss_dic)
            loss_dic_df.to_csv(f'./results/240822_base_loss_1e-5.csv', index=False)
            torch.save(model.state_dict(), f'./savedmodel/240822_base_bestmodel_1e-5.pth')
            break
    else:
        best_loss = loss_dic['Val'][epoch - 1]
        early_stop_count = 0

  with torch.cuda.amp.autocast():


epoch : 1 - step : 0/4000 - loss: 1.8413985967636108
epoch : 1 - step : 1/4000 - loss: 3.4181082248687744
epoch : 1 - step : 2/4000 - loss: 2.9682884216308594
epoch : 1 - step : 3/4000 - loss: 2.215909719467163
epoch : 1 - step : 4/4000 - loss: 1.8580421209335327
epoch : 1 - step : 5/4000 - loss: 2.5541908740997314
epoch : 1 - step : 6/4000 - loss: 2.1940808296203613
epoch : 1 - step : 7/4000 - loss: 1.5143660306930542
epoch : 1 - step : 8/4000 - loss: 2.7098793983459473
epoch : 1 - step : 9/4000 - loss: 2.0974881649017334
epoch : 1 - step : 10/4000 - loss: 2.648761034011841
epoch : 1 - step : 11/4000 - loss: 2.814225196838379
epoch : 1 - step : 12/4000 - loss: 3.2433524131774902
epoch : 1 - step : 13/4000 - loss: 2.1561543941497803
epoch : 1 - step : 14/4000 - loss: 2.474846601486206
epoch : 1 - step : 15/4000 - loss: 2.816920757293701
epoch : 1 - step : 16/4000 - loss: 1.6893850564956665
epoch : 1 - step : 17/4000 - loss: 2.553205728530884
epoch : 1 - step : 18/4000 - loss: 2.5966844

 10%|█         | 1/10 [1:41:34<15:14:14, 6094.94s/it]

Epoch: 1, Valid_Loss:  2.4526045093536375
epoch : 2 - step : 0/4000 - loss: 2.2086706161499023
epoch : 2 - step : 1/4000 - loss: 2.1268160343170166
epoch : 2 - step : 2/4000 - loss: 2.210325002670288
epoch : 2 - step : 3/4000 - loss: 3.9870309829711914
epoch : 2 - step : 4/4000 - loss: 2.1197891235351562
epoch : 2 - step : 5/4000 - loss: 2.3140487670898438
epoch : 2 - step : 6/4000 - loss: 2.7216198444366455
epoch : 2 - step : 7/4000 - loss: 1.9508252143859863
epoch : 2 - step : 8/4000 - loss: 3.242093801498413
epoch : 2 - step : 9/4000 - loss: 2.417687177658081
epoch : 2 - step : 10/4000 - loss: 2.966301441192627
epoch : 2 - step : 11/4000 - loss: 2.453122854232788
epoch : 2 - step : 12/4000 - loss: 1.6420389413833618
epoch : 2 - step : 13/4000 - loss: 2.6914827823638916
epoch : 2 - step : 14/4000 - loss: 2.097768545150757
epoch : 2 - step : 15/4000 - loss: 2.3950912952423096
epoch : 2 - step : 16/4000 - loss: 3.3500609397888184
epoch : 2 - step : 17/4000 - loss: 2.3672218322753906
ep

 20%|██        | 2/10 [3:22:57<13:31:39, 6087.42s/it]

Epoch: 2, Valid_Loss:  1.1188555258512496
epoch : 3 - step : 0/4000 - loss: 0.9424212574958801
epoch : 3 - step : 1/4000 - loss: 1.2405478954315186
epoch : 3 - step : 2/4000 - loss: 0.9733678698539734
epoch : 3 - step : 3/4000 - loss: 1.0233335494995117
epoch : 3 - step : 4/4000 - loss: 0.8186633586883545
epoch : 3 - step : 5/4000 - loss: 0.932309627532959
epoch : 3 - step : 6/4000 - loss: 1.1599006652832031
epoch : 3 - step : 7/4000 - loss: 1.1711139678955078
epoch : 3 - step : 8/4000 - loss: 1.5685957670211792
epoch : 3 - step : 9/4000 - loss: 0.7390902638435364
epoch : 3 - step : 10/4000 - loss: 1.1573878526687622
epoch : 3 - step : 11/4000 - loss: 1.0792540311813354
epoch : 3 - step : 12/4000 - loss: 1.309802532196045
epoch : 3 - step : 13/4000 - loss: 0.9980466961860657
epoch : 3 - step : 14/4000 - loss: 1.0577605962753296
epoch : 3 - step : 15/4000 - loss: 1.1663298606872559
epoch : 3 - step : 16/4000 - loss: 1.2193845510482788
epoch : 3 - step : 17/4000 - loss: 0.989596962928772

 30%|███       | 3/10 [5:04:27<11:50:22, 6088.93s/it]

Epoch: 3, Valid_Loss:  1.1059253170490264
epoch : 4 - step : 0/4000 - loss: 0.9956625699996948
epoch : 4 - step : 1/4000 - loss: 0.8955054879188538
epoch : 4 - step : 2/4000 - loss: 0.9376346468925476
epoch : 4 - step : 3/4000 - loss: 1.0149590969085693
epoch : 4 - step : 4/4000 - loss: 1.0521469116210938
epoch : 4 - step : 5/4000 - loss: 0.9072536826133728
epoch : 4 - step : 6/4000 - loss: 1.183533787727356
epoch : 4 - step : 7/4000 - loss: 1.1491912603378296
epoch : 4 - step : 8/4000 - loss: 1.409759283065796
epoch : 4 - step : 9/4000 - loss: 1.0762966871261597
epoch : 4 - step : 10/4000 - loss: 1.0882341861724854
epoch : 4 - step : 11/4000 - loss: 1.2276324033737183
epoch : 4 - step : 12/4000 - loss: 1.1450666189193726
epoch : 4 - step : 13/4000 - loss: 1.1666617393493652
epoch : 4 - step : 14/4000 - loss: 0.9444427490234375
epoch : 4 - step : 15/4000 - loss: 0.9315544962882996
epoch : 4 - step : 16/4000 - loss: 0.8645615577697754
epoch : 4 - step : 17/4000 - loss: 1.067085504531860

 40%|████      | 4/10 [6:45:56<10:08:53, 6088.94s/it]

Epoch: 4, Valid_Loss:  1.1013658717870711
epoch : 5 - step : 0/4000 - loss: 1.2178235054016113
epoch : 5 - step : 1/4000 - loss: 1.5116533041000366
epoch : 5 - step : 2/4000 - loss: 0.8695870637893677
epoch : 5 - step : 3/4000 - loss: 0.8625642657279968
epoch : 5 - step : 4/4000 - loss: 0.9944226145744324
epoch : 5 - step : 5/4000 - loss: 0.724787712097168
epoch : 5 - step : 6/4000 - loss: 1.2537933588027954
epoch : 5 - step : 7/4000 - loss: 1.1115950345993042
epoch : 5 - step : 8/4000 - loss: 1.4359279870986938
epoch : 5 - step : 9/4000 - loss: 1.0287734270095825
epoch : 5 - step : 10/4000 - loss: 1.2021616697311401
epoch : 5 - step : 11/4000 - loss: 0.7096300721168518
epoch : 5 - step : 12/4000 - loss: 1.5095971822738647
epoch : 5 - step : 13/4000 - loss: 0.6909888386726379
epoch : 5 - step : 14/4000 - loss: 1.3347523212432861
epoch : 5 - step : 15/4000 - loss: 0.927076518535614
epoch : 5 - step : 16/4000 - loss: 0.9412278532981873
epoch : 5 - step : 17/4000 - loss: 1.223964333534240

 50%|█████     | 5/10 [8:27:15<8:27:05, 6085.19s/it] 

Epoch: 5, Valid_Loss:  1.098713474869728
epoch : 6 - step : 0/4000 - loss: 0.9945482611656189
epoch : 6 - step : 1/4000 - loss: 1.1057990789413452
epoch : 6 - step : 2/4000 - loss: 1.2161355018615723
epoch : 6 - step : 3/4000 - loss: 1.0912106037139893
epoch : 6 - step : 4/4000 - loss: 0.7263728380203247
epoch : 6 - step : 5/4000 - loss: 1.2163118124008179
epoch : 6 - step : 6/4000 - loss: 1.0382018089294434
epoch : 6 - step : 7/4000 - loss: 1.1856024265289307
epoch : 6 - step : 8/4000 - loss: 1.0767500400543213
epoch : 6 - step : 9/4000 - loss: 1.1538711786270142
epoch : 6 - step : 10/4000 - loss: 0.8889639973640442
epoch : 6 - step : 11/4000 - loss: 1.383607029914856
epoch : 6 - step : 12/4000 - loss: 1.0277127027511597
epoch : 6 - step : 13/4000 - loss: 1.2937442064285278
epoch : 6 - step : 14/4000 - loss: 1.1714812517166138
epoch : 6 - step : 15/4000 - loss: 1.0042076110839844
epoch : 6 - step : 16/4000 - loss: 1.1133897304534912
epoch : 6 - step : 17/4000 - loss: 0.995542645454406

 60%|██████    | 6/10 [10:08:42<6:45:42, 6085.74s/it]

Epoch: 6, Valid_Loss:  1.0969689185619353
epoch : 7 - step : 0/4000 - loss: 0.9462687373161316
epoch : 7 - step : 1/4000 - loss: 1.1126197576522827
epoch : 7 - step : 2/4000 - loss: 1.4194965362548828
epoch : 7 - step : 3/4000 - loss: 1.3338603973388672
epoch : 7 - step : 4/4000 - loss: 1.2758533954620361
epoch : 7 - step : 5/4000 - loss: 0.9210602045059204
epoch : 7 - step : 6/4000 - loss: 1.246117115020752
epoch : 7 - step : 7/4000 - loss: 1.2967602014541626
epoch : 7 - step : 8/4000 - loss: 1.1435872316360474
epoch : 7 - step : 9/4000 - loss: 1.0666099786758423
epoch : 7 - step : 10/4000 - loss: 0.8767785429954529
epoch : 7 - step : 11/4000 - loss: 1.297690749168396
epoch : 7 - step : 12/4000 - loss: 1.2116256952285767
epoch : 7 - step : 13/4000 - loss: 1.2415839433670044
epoch : 7 - step : 14/4000 - loss: 0.8128910660743713
epoch : 7 - step : 15/4000 - loss: 0.6814886331558228
epoch : 7 - step : 16/4000 - loss: 1.0990442037582397
epoch : 7 - step : 17/4000 - loss: 0.999799489974975

 70%|███████   | 7/10 [11:50:03<5:04:13, 6084.35s/it]

Epoch: 7, Valid_Loss:  1.0958995306491852
epoch : 8 - step : 0/4000 - loss: 1.3496307134628296
epoch : 8 - step : 1/4000 - loss: 0.9904746413230896
epoch : 8 - step : 2/4000 - loss: 1.3087517023086548
epoch : 8 - step : 3/4000 - loss: 1.2613896131515503
epoch : 8 - step : 4/4000 - loss: 1.1676698923110962
epoch : 8 - step : 5/4000 - loss: 0.8926048874855042
epoch : 8 - step : 6/4000 - loss: 1.3913910388946533
epoch : 8 - step : 7/4000 - loss: 1.096596121788025
epoch : 8 - step : 8/4000 - loss: 1.20622718334198
epoch : 8 - step : 9/4000 - loss: 0.9206022620201111
epoch : 8 - step : 10/4000 - loss: 0.9292935729026794
epoch : 8 - step : 11/4000 - loss: 1.14582097530365
epoch : 8 - step : 12/4000 - loss: 1.0728400945663452
epoch : 8 - step : 13/4000 - loss: 0.8489524722099304
epoch : 8 - step : 14/4000 - loss: 0.6543124318122864
epoch : 8 - step : 15/4000 - loss: 0.8984063863754272
epoch : 8 - step : 16/4000 - loss: 1.26884925365448
epoch : 8 - step : 17/4000 - loss: 1.2760248184204102
epo

 80%|████████  | 8/10 [13:31:28<3:22:48, 6084.40s/it]

Epoch: 8, Valid_Loss:  1.095483589053154
epoch : 9 - step : 0/4000 - loss: 1.1223114728927612
epoch : 9 - step : 1/4000 - loss: 1.1698811054229736
epoch : 9 - step : 2/4000 - loss: 1.3685400485992432
epoch : 9 - step : 3/4000 - loss: 1.1673814058303833
epoch : 9 - step : 4/4000 - loss: 0.9751491546630859
epoch : 9 - step : 5/4000 - loss: 1.2315553426742554
epoch : 9 - step : 6/4000 - loss: 0.7235553860664368
epoch : 9 - step : 7/4000 - loss: 1.2510769367218018
epoch : 9 - step : 8/4000 - loss: 1.0100582838058472
epoch : 9 - step : 9/4000 - loss: 0.8123218417167664
epoch : 9 - step : 10/4000 - loss: 0.961699903011322
epoch : 9 - step : 11/4000 - loss: 0.9673562049865723
epoch : 9 - step : 12/4000 - loss: 1.0396398305892944
epoch : 9 - step : 13/4000 - loss: 0.7607871294021606
epoch : 9 - step : 14/4000 - loss: 0.9574764370918274
epoch : 9 - step : 15/4000 - loss: 1.0404504537582397
epoch : 9 - step : 16/4000 - loss: 1.4197527170181274
epoch : 9 - step : 17/4000 - loss: 1.175457596778869

 90%|█████████ | 9/10 [15:12:51<1:41:24, 6084.13s/it]

Epoch: 9, Valid_Loss:  1.0948799550533295
epoch : 10 - step : 0/4000 - loss: 1.2451852560043335
epoch : 10 - step : 1/4000 - loss: 0.8613892197608948
epoch : 10 - step : 2/4000 - loss: 1.2203820943832397
epoch : 10 - step : 3/4000 - loss: 1.0775158405303955
epoch : 10 - step : 4/4000 - loss: 0.9568296670913696
epoch : 10 - step : 5/4000 - loss: 1.1456000804901123
epoch : 10 - step : 6/4000 - loss: 1.140978217124939
epoch : 10 - step : 7/4000 - loss: 1.0383843183517456
epoch : 10 - step : 8/4000 - loss: 0.9151192307472229
epoch : 10 - step : 9/4000 - loss: 0.9451432824134827
epoch : 10 - step : 10/4000 - loss: 1.0543676614761353
epoch : 10 - step : 11/4000 - loss: 1.0250577926635742
epoch : 10 - step : 12/4000 - loss: 1.0643426179885864
epoch : 10 - step : 13/4000 - loss: 1.0667442083358765
epoch : 10 - step : 14/4000 - loss: 1.1127121448516846
epoch : 10 - step : 15/4000 - loss: 1.103274941444397
epoch : 10 - step : 16/4000 - loss: 1.0979442596435547
epoch : 10 - step : 17/4000 - loss:

100%|██████████| 10/10 [16:54:13<00:00, 6085.32s/it] 

Epoch: 10, Valid_Loss:  1.0947736012935638





In [32]:
# Early stopping 안됐을때 모델, 결과 따로 저장
torch.save(model.state_dict(), './savedmodel/240822_base_bestmodel_1e-5.pth')
loss_dic_df = pd.DataFrame(loss_dic)
loss_dic_df.to_csv('./results/240822_base_loss_1e-5.csv', index=False)

# 추론 시작

In [2]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [2]:
import torch
from transformers import  AutoTokenizer, PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig,HfArgumentParser, get_scheduler, set_seed

import pandas as pd
import numpy as np

from torch import nn
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import cuda
from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler

from tqdm import tqdm

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes as bnb
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = {'mode_ID':"microsoft/Phi-3-mini-4k-instruct",
          'seed': 1 ,
          'max_seq_len' : 4096,
          'epochs': 3,
          'lr': 2e-4,
          'batch': 4,
          'lora_r':8,
          'lora_alpha':32,
          'target_module':["q_proj", "up_proj", "o_proj", "k_proj", "down_proj","gate_proj", "v_proj"],
          'lora_dropout':0.05,
          'lora_tasktype' :'CAUSAL_LM',
          'lora_bias' : 'none',
          'optimizer': 'paged_adamw_8bit',
          'scheduler':'cosine'}

In [4]:
from peft import (
    get_peft_config,  # PEFT 설정을 가져오기 위한 함수
    get_peft_model,  # PEFT 모델을 가져오기 위한 함수
    get_peft_model_state_dict,  # PEFT 모델 상태 사전을 가져오기 위한 함수
    set_peft_model_state_dict,  # PEFT 모델 상태 사전을 설정하기 위한 함수
    LoraConfig,  # LoRA 모델 구성을 정의하는 클래스
    PeftType,  # PEFT 모델의 타입을 정의
    PrefixTuningConfig,  # PrefixTuning 모델 구성을 정의하는 클래스
    PromptEncoderConfig,  # PromptEncoder 모델 구성을 정의하는 클래스
    PeftModel,  # PEFT 모델을 정의하는 클래스
    PeftConfig,  # PEFT 모델의 구성을 정의하는 클래스
)

# PEFT 모델의 타입 설정 (LoRA로 설정)
peft_type = PeftType.LORA

# LoRA 모델을 위한 설정
peft_config = LoraConfig(
    r=config['lora_r'],  # LoRA 모델의 r 값
    lora_alpha=config['lora_alpha'],  # LoRA 모델의 alpha 값
    target_modules=config['target_module'],  # LoRA 모델의 타겟 모듈 리스트
    lora_dropout=config['lora_dropout'],  # LoRA 모델의 드롭아웃 비율
    bias=config['lora_bias'],  # LoRA 모델의 편향 설정
    task_type=config['lora_tasktype']  # LoRA 모델의 태스크 유형
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )


In [5]:
model = AutoModelForCausalLM.from_pretrained(
	config['mode_ID'],
	device_map="cuda",
	torch_dtype=torch.float16,
	trust_remote_code=True, 
	use_cache=False,
    # attn_implementation='flash_attention_2'
	# quantization_config=bnb_config,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.56s/it]


In [6]:
model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)

In [7]:
model = get_peft_model(model, peft_config) # PEFT 적용 

In [8]:
model.load_state_dict(torch.load('./savedmodel/240822_base_bestmodel_1e-5.pth'))

  model.load_state_dict(torch.load('./savedmodel/240815_base_bestmodel.pth'))


<All keys matched successfully>

In [9]:
# AutoTokenizer를 사용하여 토크나이저 생성
tokenizer = AutoTokenizer.from_pretrained(config['mode_ID'], trust_remote_code=True, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def make_prompt(user_request, answer):
    
    conversation = [ {'role': 'user', 'content': user_request},
                  {'role': 'assistant', 'content': answer}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return prompt


import pandas as pd
import json

with open('./data/pqaa_dev_set.json','r') as f:
    test_data = json.load(f)
    
# 데이터프레임에 넣을 리스트 초기화
rows = []

# 딕셔너리를 순회하며 데이터프레임용 리스트 생성
for num, details in test_data.items():
    contexts_with_labels = '\n'.join([f"({label}) {context}" for label, context in zip(details['LABELS'], details['CONTEXTS'])])
    input = 'Question:\n' + details['QUESTION'] + '\nPlease give me the answer in formats: yes or no' + '\n' + 'Context:\n' + contexts_with_labels
    row = {
        'input' : input,
        'final_decision': details['final_decision']
    }
    rows.append(row)

# 데이터프레임 생성
df = pd.DataFrame(rows)

no_df = df[df['final_decision'] == 'no'].sample(n=500, random_state=42)

# 'yes'인 값 10000개 추출
yes_df = df[df['final_decision'] == 'yes'].sample(n=500, random_state=42)

# 두 데이터 프레임 합치기
combined_df_test = pd.concat([no_df, yes_df])

# print(combined_df_test)

X_test = combined_df_test['input']
y_test = combined_df_test['final_decision']

test_data_prompt_list = []
for x3,y3 in zip(X_test, y_test):
    test_data_prompt_list.append(make_prompt(x3,y3))
    test_data_prompt_list = [test_data.split('<|end|>')[0] + '<|end|>\n<|assistant|>\n' for test_data in test_data_prompt_list]

class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [10]:
test_dataset = Dataset(test_data_prompt_list)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [11]:
from transformers import pipeline 

def test(loader):
    output_li = []
    model.eval()
    loss_avg = 0

    pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    ) 

    generation_args = { 
        "max_new_tokens": 500, 
        "return_full_text": False, 
        "temperature": 0.5, 
        "do_sample": False, 
    } 

    with torch.no_grad():
        for output in tqdm(pipe(loader, **generation_args)):
            output_li.append(output)
            
    return output_li

In [12]:
outputs = test(test_dataset)
print(outputs[0])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

[{'generated_text': ' no'}]





In [13]:
pred_li = []
for output in outputs:
    # print(output[0].get('generated_text').strip())
    pred_li.append(output[0].get('generated_text').strip().lower())


In [14]:
df = pd.DataFrame({'true': y_test, 'pred':pred_li})
df

Unnamed: 0,true,pred
2774,no,no
4083,no,yes
10063,no,no
3060,no,no
7219,no,no
...,...,...
2413,yes,yes
3093,yes,yes
9316,yes,yes
340,yes,yes


In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(df['true'],df['pred'])

0.919

# Acc 기록

- 1e-5 : 
- 3e-5 : 
- 5e-5 : 
- 1e-4 : 
- 3e-4 : 
- 5e-4 :
- 1e-3 :
- 3e-3 :
- 5e-3 :