In [None]:
!pip install datasets
!pip install -U transformers
!pip install transformers peft torch
!pip install --upgrade peft

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
import pandas as pd
from torchvision import transforms
import glob
from tqdm import tqdm
from urllib.request import urlopen
from PIL import Image

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

import time

# **Random seed 고정**

In [3]:
import random

seed = 40
deterministic = True

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if deterministic:
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
warnings.filterwarnings('ignore')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **google drive mount**

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



# **데이터 다운로드**

In [6]:
df_train = pd.read_csv('/content/drive/MyDrive/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
mapping = pd.read_csv('/content/drive/MyDrive/misconception_mapping.csv')

# **model lora 적용 , 경량화**

In [7]:
from peft import LoraConfig, get_peft_model

In [8]:
# LoRA 설정
lora_config = LoraConfig(
    r=16,  # Low-rank 업데이트 행렬 차원
    lora_alpha=16,  # 스케일링 팩터
    lora_dropout=0.1,  # 드롭아웃 비율
    target_modules=["q_proj"],  # QLoRA가 적용될 대상 모듈
)

pretrained model 다운로드

In [9]:
from peft import LoraConfig, get_peft_model

# 모델 및 토크나이저 로드
model_name = "ibm-granite/granite-3.0-8b-instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  device_map="auto",             # GPU와 CPU를 자동 분배
                                                  torch_dtype="auto",            # 자동으로 적절한 데이터 타입(FP32, FP16 등) 선택
                                                  offload_folder="./offload",    # 메모리가 부족할 경우 CPU로 데이터를 오프로드
                                                  offload_state_dict=True)        # 가중치도 필요 시 CPU로 오프로드

#기존 model freeze
for param in base_model.parameters():
    param.requires_grad = False

model = get_peft_model(base_model, lora_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

**model** gpu로 옮기기

In [10]:
model = model.to(device)

# **데이터 전처리**

훈련 **데이터**

# train_data 구조 설계

In [11]:
# 제외할 열 이름
exclude_columns = ['MisconceptionAId',
       'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']

# 제외한 나머지 열 이름 리스트에 담기
remaining_columns = [col for col in df_train.columns if col not in exclude_columns]

# 만약 answer = None -> 0으로 교체
df_train = df_train.fillna(0)

# 각 행(row)에 대해 지정된 문장을 생성하고 리스트에 저장
train_data = [
    {   "instruction" : "You will choose one answer that is correspond to label's Misconceptionid that is not zero if number of non zero value is one. If number of non zero value for each question are more than one, you have to give correspond answers. \
          eaxmple for Answer format when number of non zero value is one is 'questionid_misconceptionid value' . And you have to choose value from context. Now look at data structure carefully",
        "data structure": ",".join([f"{col} is {row[col]}" for col in df_train[remaining_columns]]) ,
        "label_structure" : ",".join([f"{col} is {row[col]}" for col in df_train[exclude_columns]])
    }
    for _, row in df_train.iterrows()
]

# 특수부호 제거
train_data = [
    {key: value.replace("\n", "").replace("\\", "") if isinstance(value, str) else value
     for key, value in item.items()}
    for item in train_data
]

# 데이터베이스 구축

In [12]:
pip install faiss-gpu



misconception embedding space에 구축

In [13]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# encode 모델이 없는 경우 전용 encoding해주는 모델 사용
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# embedding
misconceptions = mapping.MisconceptionName.values
embedding_Misconception = embedder.encode(misconceptions, convert_to_tensor=True)

# 4. FAISS 인덱스 생성 및 선택지 임베딩 추가
dimension = embedding_Misconception.shape[1]  # 임베딩 차원
index = faiss.IndexFlatL2(dimension)    # L2 거리 기반 인덱스 생성
index.add(np.array(embedding_Misconception.cpu().float()))  # 선택지 임베딩 추가 , cuda를 np로 바꿀수 없으므로 cpu로 옮긴다

combined_data **구조**

In [14]:
# instruction과 data structure 결합
combined_data = [
    {
        "combined_text": f"{item['instruction']} {item['data structure']}" ,
        "label": f"{item['label_structure']}"
    }
    for item in train_data
]

misconception 선택지 상위 25개 가져오는 **함수**

In [15]:
def choose_k_misconceptions(prompt):

  embedding_prompt = embedder.encode(prompt, convert_to_tensor=True)

  # FAISS에서 검색을 위해 prompt 임베딩을 numpy 배열로 변환 (FAISS는 numpy float32 필요)
  embedding_prompt_np = np.array(embedding_prompt.cpu(), dtype='float32').reshape(1, -1)

  # 상위 유사 항목 검색
  top_k = 25  # 예: 상위 25개 유사 항목 가져오기
  distances, indices = index.search(embedding_prompt_np, top_k)

  # 검색 결과로부터 misconception 텍스트 가져오기
  similar_misconceptions = [misconceptions[i] for i in indices[0]]
  similar_misconceptions_text = ",".join(similar_misconceptions)

  return similar_misconceptions_text

data + context -> **tokenizer**

In [None]:
# embedding space 생성
# encode 모델이 없는 경우 전용 encoding해주는 모델 사용
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# input_ids, attention_mask, labels 생성
tokenized_data = [
    {
        **tokenizer(
            item['combined_text'] + ". From now, you have to choose one id from these contexts(multiple choices). These are contexts(multiple choices),".join(choose_k_misconceptions(item['combined_text'])),
            padding='max_length',  # 또는 'longest' 등 원하는 padding 방식 선택
            truncation=True,
            max_length=1024  # max_length는 필요에 맞게 조정
        ),
        'labels': tokenizer(
            item['label'],
            padding='max_length',  # 동일한 padding 방식을 적용
            truncation=True,
            max_length=1024  # 동일한 max_length를 적용
        )['input_ids']
    }
    for item in combined_data
]

# 결과 확인
print(tokenized_data[:1])  # 앞의 두 개만 출력

빈 샘플 있는지 확인

In [17]:
# 필요한 키 목록
required_keys = ["input_ids", "attention_mask"]

# 빈 샘플 검사 코드
for idx, sample in enumerate(tokenized_data):
    missing_keys = [key for key in required_keys if key not in sample or sample[key] is None or len(sample[key]) == 0]
    if missing_keys:
        print(f"Sample at index {idx} is missing required keys or has empty values: {missing_keys}")

Lora 훈련

fp16=True , gradient_accumulation_step2 = 16

In [18]:
# 훈련 설정
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
    gradient_accumulation_steps=16,
)

In [19]:
# Trainer 설정 및 훈련 시작
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
)

api key :

In [20]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqnckdrb14[0m ([33mqnckdrb14-does-not-exist[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,12.7392
20,11.8148
30,10.1063
40,7.8462
50,6.1197
60,4.8109
70,3.3959
80,2.3525
90,1.5282
100,1.2774


TrainOutput(global_step=232, training_loss=2.993615009404462, metrics={'train_runtime': 1203.8471, 'train_samples_per_second': 3.105, 'train_steps_per_second': 0.193, 'total_flos': 1.8187641061952717e+17, 'train_loss': 2.993615009404462, 'epoch': 1.9860888175494917})

sample test

In [None]:
model.eval()

In [None]:
df_test

In [23]:
# 각 행(row)에 대해 지정된 문장을 생성하고 리스트에 저장
test_data = [
    {   "instruction" : "You will choose one answer that is correspond to label's Misconceptionid that is not zero if number of non zero value is one. If number of non zero value for each question are more than one, you have to give correspond answers. \
          eaxmple for Answer format when number of non zero value is one is 'questionid_misconceptionid value' . And you have to choose value from context. Now look at data structure carefully",
        "data structure": ",".join([f"{col} is {row[col]}" for col in df_test])
    }
    for _, row in df_test.iterrows()
]

# 특수부호 제거
test_data = [
    {key: value.replace("\n", "").replace("\\", "") if isinstance(value, str) else value
     for key, value in item.items()}
    for item in test_data
]

# test = [ {} , {} , {} , ... ]

In [None]:
test_data

In [25]:
# 테스트 데이터 구조 설계
combined_test_data = [
    {
        "combined_text": f"{item['instruction']}"
    }
    for item in test_data
]

# 딕셔너리 리스트에서 텍스트만 추출
texts = [item['combined_text'] + ". From now, you have to choose one id from these contexts(multiple choices). These are contexts(multiple choices),".join(choose_k_misconceptions(item['combined_text'])) \
         for item in combined_test_data]  # 'text' 키에 따라 변경 필요

# 텍스트 리스트를 tokenizer에 전달
new_prompt = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)

# embedding space에서 misconception 찾아오기

# **original 추론**

In [26]:
# 모델 생성 결과, top_k와 temperature 추가 (필요 시 추가 가능)

# 시작 시간 기록
start_time = time.time()

outputs = model.generate(
    **new_prompt,
    max_new_tokens=100
)

end_time = time.time()

# 걸린 시간
time_take = end_time - start_time
print(time_take)

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.17 GiB. GPU 0 has a total capacity of 39.56 GiB of which 710.81 MiB is free. Process 232086 has 38.86 GiB memory in use. Of the allocated memory 31.42 GiB is allocated by PyTorch, and 6.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# **deepspeed inference**

In [None]:
!pip install deepspeed

In [None]:
import deepspeed

In [None]:
# DeepSpeed로 모델 최적화
ds_engine = deepspeed.init_inference(
    model,
    mp_size=1,  # 멀티 GPU 사용할 경우 GPU 개수 지정
    replace_method='auto',  # 추론 최적화를 위한 자동 변환
    replace_with_kernel_inject=True  # 커널 인젝션 활성화
)

In [None]:
# 시작 시간 기록
start_time = time.time()

deepspeed_outputs = ds_engine.module.generate(**new_prompt, max_length=1024)

end_time = time.time()

# 걸린 시간
time_take = end_time - start_time
print(time_take)

# **텍스트로 만든 후 df화**

original

In [None]:
outputs

# 각 배치의 결과를 개별적으로 디코딩하여 리스트로 저장
#generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

deepspeed