In [21]:
!pip install datasets
!pip install -U transformers
!pip install transformers peft torch
!pip install --upgrade peft



In [22]:
import os
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
import pandas as pd
from torchvision import transforms
import glob
from tqdm import tqdm
from urllib.request import urlopen
from PIL import Image

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

import time

# **Random seed 고정**

In [23]:
import random

seed = 40
deterministic = True

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if deterministic:
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
warnings.filterwarnings('ignore')

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **google drive mount**

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **데이터 다운로드**

In [26]:
from datasets import load_dataset

data = load_dataset("tatsu-lab/alpaca")

# train-test split
dataset = data["train"].train_test_split(test_size=0.3, seed=42)

In [27]:
#data 형식 확인
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 36401
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 15601
    })
})

drive에 데이터 저장

In [28]:
# Drive에 저장할 경로
train_save_path = '/content/drive/My Drive/alpaca_train_dataset.csv'
test_save_path = '/content/drive/My Drive/alpaca_test_dataset.csv'

# CSV 파일로 저장
dataset["train"].to_csv(train_save_path)
dataset["test"].to_csv(test_save_path)
print(f"train_dataset saved to {train_save_path} , test_dataset saved to {test_save_path}")

Creating CSV from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

train_dataset saved to /content/drive/My Drive/alpaca_train_dataset.csv , test_dataset saved to /content/drive/My Drive/alpaca_test_dataset.csv


drive에서 다운로드

In [29]:
df_train = pd.read_csv(train_save_path)
df_test = pd.read_csv(test_save_path)

In [30]:
#결측값 처리
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

# 결측값 -> 0 -> 문자열
df_train = df_train.replace(0, "there is no input")
df_test = df_test.replace(0, "there is no input")

# **model lora 적용 , 경량화**

In [31]:
from peft import LoraConfig, get_peft_model

In [32]:
# LoRA 설정
lora_config = LoraConfig(
    r=16,  # Low-rank 업데이트 행렬 차원
    lora_alpha=16,  # 스케일링 팩터
    lora_dropout=0.1,  # 드롭아웃 비율
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],  # QLoRA가 적용될 대상 모듈
)

pretrained model 다운로드

In [None]:
!huggingface-cli login

In [34]:
from peft import LoraConfig, get_peft_model

# 모델 및 토크나이저 로드
model_name = "meta-llama/Llama-3.2-1B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  device_map="auto",             # GPU와 CPU를 자동 분배
                                                  torch_dtype="auto",            # 자동으로 적절한 데이터 타입(FP32, FP16 등) 선택
                                                  offload_folder="./offload",    # 메모리가 부족할 경우 CPU로 데이터를 오프로드
                                                  offload_state_dict=True)        # 가중치도 필요 시 CPU로 오프로드

#기존 model freeze
for param in base_model.parameters():
    param.requires_grad = False

model = get_peft_model(base_model, lora_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Embedding(128257, 2048)

# **데이터 전처리**

훈련 **데이터**

# data 구조 설계

In [35]:
input_column = ['instruction','input']
label_column = ['output']

# 각 행(row)에 대해 지정된 문장을 생성하고 리스트에 저장
train_data = [
    {   "role" : "user" ,
        "content": ",".join([f"{col} : {row[col]}" for col in df_train[input_column]]),
        "label": f"label : {row['output']}"
    }
    for _, row in df_train.iterrows()
]

# 특수부호 제거
train_data = [
    {key: value.replace("\n", "").replace("\\", "") if isinstance(value, str) else value
     for key, value in item.items()}
    for item in train_data
]

test_data = [
    {
        "role" : "user" ,
        "content": ",".join([f"{col} : {row[col]}" for col in df_test[input_column]])
    }
    for _, row in df_test.iterrows()
]

# 특수부호 제거
test_data = [
    {key: value.replace("\n", "").replace("\\", "") if isinstance(value, str) else value
     for key, value in item.items()}
    for item in test_data
]

데이터의 최대 **길이**

In [36]:
max_data_length = max(len(item["content"]) for item in train_data)
print(max_data_length)

max_label_length = max(len(item["label"]) for item in train_data)
print(max_label_length)

2551
4161


In [37]:
for i in train_data:
  print(i['role'] , i['content'])
  break

user instruction : How do you use a GPS system?,input : there is no input


# **token화**

In [38]:
def replace_padding_with_ignore(labels, padding_value=128001, ignore_value=-100):
    """
    라벨에서 지정된 패딩 값을 -100으로 변환합니다.
    """
    return [ignore_value if token == padding_value else token for token in labels]

# input_ids, attention_mask, labels 생성
tokenized_data = [
    {
        **tokenizer(
            f"{item['role']} , {item['content']}",
            padding='max_length',
            truncation=True,
            max_length=max_label_length
        ),
        'labels': replace_padding_with_ignore(
            tokenizer(
                item['label'],
                padding='max_length',
                truncation=True,
                max_length=max_label_length
            )['input_ids']
        )
    }
    for item in train_data
]

# 결과 확인
print(tokenized_data[:1])  # 앞의 두 개만 출력

[{'input_ids': [128000, 882, 1174, 7754, 551, 2650, 656, 499, 1005, 264, 24229, 1887, 12909, 1379, 551, 1070, 374, 912, 1988, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 12

빈 샘플 있는지 확인

In [39]:
# 필요한 키 목록
required_keys = ["input_ids", "attention_mask"]

# 빈 샘플 검사 코드
for idx, sample in enumerate(tokenized_data):
    missing_keys = [key for key in required_keys if key not in sample or sample[key] is None or len(sample[key]) == 0]
    if missing_keys:
        print(f"Sample at index {idx} is missing required keys or has empty values: {missing_keys}")

Lora 훈련

fp16=True , gradient_accumulation_steps = 16

In [40]:
# 훈련 설정
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
    gradient_accumulation_steps=32,
    learning_rate=1e-5,
)

In [41]:
# Trainer 설정 및 훈련 시작
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
)

**train**

In [42]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqnckdrb14[0m ([33mqnckdrb14-does-not-exist[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,13.6527
20,13.588
30,13.5183
40,13.4038
50,13.2924


KeyboardInterrupt: 

# **sample test**

In [None]:
model.eval()

In [44]:
# input_ids, attention_mask, labels 생성
test_tokenized_data = [
    {
        **tokenizer(
            f"{item['role']}, {item['content']}",
            padding='max_length',  # 또는 'longest' 등 원하는 padding 방식 선택
            truncation=True,
            max_length=max_label_length  # max_length는 필요에 맞게 조정
        )
    }
    for item in test_data
]

# 결과 확인
print(test_tokenized_data[:1])

[{'input_ids': [128000, 882, 11, 7754, 551, 3639, 1053, 387, 279, 1888, 955, 315, 10368, 369, 264, 1732, 889, 706, 55652, 12909, 1379, 551, 1070, 374, 912, 1988, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256

test_prompt **생성**

In [45]:
test_prompt = []

for i in range(100):
  test_prompt.append(test_tokenized_data[i])

In [46]:
# 예시: test_prompt[0]의 input_ids를 추출
input_ids = torch.tensor([test_prompt[0]['input_ids']]).to(device)  # device: GPU/CPU
attention_mask = torch.tensor([test_prompt[0]['attention_mask']]).to(device)

# **original 추론**

In [47]:
# 시작 시간 기록
start_time = time.time()

outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150
)

end_time = time.time()

# 걸린 시간
time_take = end_time - start_time

# outputs[0]을 디코드
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded_text)
print(time_take)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


user, instruction : What would be the best type of exercise for a person who has arthritis?,input : there is no input case
Here is the response
The best type of exercise for a person with arthritis is low-impact aerobic exercise, such as walking, swimming, or cycling. These exercises are gentle on the joints and can help improve cardiovascular fitness without putting excessive stress on the joints. Additionally, they can also help reduce pain and stiffness. Here are some tips for exercising with arthritis:
1. Start slowly and gradually increase the intensity and duration of your workouts.
2. Choose exercises that are low-impact and easy to manage.
3. Consider working with a fitness professional who specializes in arthritis exercise.
4. Focus on exercises that improve flexibility and balance, such as yoga, tai chi, or Pilates.
5. Avoid high-impact exercises, such as running
5.2820165157318115


# **deepspeed inference**

In [None]:
!pip install deepspeed

In [None]:
import deepspeed

In [None]:
# DeepSpeed로 모델 최적화
ds_engine = deepspeed.init_inference(
    model,
    mp_size=1,  # 멀티 GPU 사용할 경우 GPU 개수 지정
    replace_method='auto',  # 추론 최적화를 위한 자동 변환
    replace_with_kernel_inject=True  # 커널 인젝션 활성화
)

In [None]:
# 시작 시간 기록
start_time = time.time()

deepspeed_outputs = ds_engine.module.generate(**new_prompt, max_length=1024)

end_time = time.time()

# 걸린 시간
time_take = end_time - start_time
print(time_take)