In [1]:
import os
import time

import torch # 텐서 연산과 GPU 처리를 위한 PyTorch 라이브러리
from transformers import AutoModelForCausalLM, AutoTokenizer # 허깅페이스에서 제공하는 모델과 토크나이저를 쉽게 불러오기 위한 라이브러리

In [9]:
# Hugging Face access token, read from the HF_TOKEN environment variable.
# The value is None when the variable is not set.
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Load the model.
# No device_map is passed in this call (compare with the later loading cell
# that adds device_map="auto").
# NOTE(review): presumably this is the CPU baseline referred to in the timing
# notes further down — confirm.
model = AutoModelForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    token=hf_token,  # Hugging Face access token read from the environment
    torch_dtype=torch.bfloat16,  # request bfloat16 weights
    trust_remote_code=True,  # allows model code shipped with the checkpoint repo to run
)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

model-00005-of-00007.safetensors:   5%|5         | 273M/5.27G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   6%|5         | 315M/5.31G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   9%|9         | 482M/5.31G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:  14%|#4        | 283M/1.96G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   4%|3         | 199M/5.03G [00:00<?, ?B/s]

model-00001-of-00007.safetensors:   8%|7         | 419M/5.35G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   7%|6         | 372M/5.37G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [22]:
# Load the model, letting device_map="auto" decide where each module is placed.
# The resulting placement can be inspected through model.hf_device_map.
model = AutoModelForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    token=hf_token,  # Hugging Face access token read from the environment
    torch_dtype=torch.bfloat16,  # request bfloat16 weights
    trust_remote_code=True,  # allows model code shipped with the checkpoint repo to run
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [35]:
# Show which device index each top-level module was assigned to by device_map="auto".
print(model.hf_device_map)

{'transformer.wte': 0, 'transformer.drop': 0, 'transformer.h.0': 0, 'transformer.h.1': 0, 'transformer.h.2': 0, 'transformer.h.3': 0, 'transformer.h.4': 0, 'transformer.h.5': 0, 'transformer.h.6': 1, 'transformer.h.7': 1, 'transformer.h.8': 1, 'transformer.h.9': 1, 'transformer.h.10': 1, 'transformer.h.11': 1, 'transformer.h.12': 1, 'transformer.h.13': 1, 'transformer.h.14': 1, 'transformer.h.15': 1, 'transformer.h.16': 2, 'transformer.h.17': 2, 'transformer.h.18': 2, 'transformer.h.19': 2, 'transformer.h.20': 2, 'transformer.h.21': 2, 'transformer.h.22': 2, 'transformer.h.23': 2, 'transformer.h.24': 2, 'transformer.h.25': 2, 'transformer.h.26': 3, 'transformer.h.27': 3, 'transformer.h.28': 3, 'transformer.h.29': 3, 'transformer.h.30': 3, 'transformer.h.31': 3, 'transformer.ln_f': 3, 'transformer.rotary': 3, 'lm_head': 3}


| GPU      | 로드된 레이어                                    |
| -------- | ------------------------------------------ |
| `cuda:0` | `embedding` 계층, `drop`, `h.0 ~ h.5`        |
| `cuda:1` | `h.6 ~ h.15`                               |
| `cuda:2` | `h.16 ~ h.25`                              |
| `cuda:3` | `h.26 ~ h.31`, `ln_f`, `rotary`, `lm_head` |


- `transformer.wte`: 토큰 임베딩 (word token embedding)
- `transformer.h.n`: Transformer 블록 (총 32개, h.0 ~ h.31)
- `transformer.ln_f`: 마지막 LayerNorm
- `lm_head`: 최종 로짓 계산 계층
- `transformer.rotary`: Rotary positional embedding(회전 위치 임베딩) 관련 모듈

In [23]:
# Load the tokenizer that belongs to the same checkpoint as the model.
# The Hugging Face token is passed here as well, so the tokenizer download
# authenticates the same way as the model download (None means anonymous access).
tokenizer = AutoTokenizer.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    token=hf_token,
)
# Last expression of the cell: display the tokenizer's repr.
tokenizer

GPT2TokenizerFast(name_or_path='LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct', vocab_size=102400, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[|endofturn|]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("                              ", rstrip=False, lstrip=False, sing

In [39]:
# User prompt sent to the model.
prompt = "Explain who you are"
# Alternative Korean prompt, kept for experimentation:
# prompt = "너의 소원을 말해봐"

# Conversation in chat format: a system turn that sets the persona,
# followed by the user turn carrying the prompt.
messages = [
    dict(
        role="system",
        content="You are EXAONE model from LG AI Research, a helpful assistant.",
    ),
    dict(role="user", content=prompt),
]

# Render the conversation with the tokenizer's chat template and tokenize it.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the marker where the assistant's reply begins
    tokenize=True,  # return token ids instead of the rendered text
    return_tensors="pt",  # return a PyTorch tensor
)
# Last expression of the cell: display the token ids.
input_ids

tensor([[  420,   453, 47982,   453,   422,  5094,   937, 11522,   394,  5746,
          1932,  1005,  7401, 10680,  8385,   373,   619, 12913, 19415,   375,
           361,   560,   420,   453, 14719,   453,   422, 42090,   921,  1497,
           904,   937,   560,   420,   453,  1167,  8659,   453,   422]])

In [40]:
# Run inference (text generation) and measure how long it takes.
#
# The prompt ids are moved to `model.device` rather than a hard-coded "cuda",
# so this cell works both for a model loaded on CPU and for one sharded with
# device_map="auto" (where the embedding layer lives on the first device).
# `time` is imported in the notebook's import cell.
start_time = time.perf_counter()  # monotonic clock, intended for measuring intervals
output = model.generate(
    input_ids.to(model.device),
    eos_token_id=tokenizer.eos_token_id,  # stop once the end-of-turn token is produced
    max_new_tokens=128,  # upper bound on the number of newly generated tokens
)
end_time = time.perf_counter()
print(f"추론 출력 시간: {end_time - start_time:.2f}초")

추론 출력 시간: 3.09초


- CPU 사용 시: 약 52초
- GPU 사용 시: 약 3.23초 (`device_map="auto"`)
- `device_map="auto"`가 레이어를 어떻게 배치했는지 확인할 수 있을까? → `model.hf_device_map`으로 확인할 수 있다 (위의 `print(model.hf_device_map)` 셀 참고).

In [41]:
# Display the raw token ids returned by generate(); the sequence starts with
# the prompt ids and continues with the newly generated ids.
output

tensor([[  420,   453, 47982,   453,   422,  5094,   937, 11522,   394,  5746,
          1932,  1005,  7401, 10680,  8385,   373,   619, 12913, 19415,   375,
           361,   560,   420,   453, 14719,   453,   422, 42090,   921,  1497,
           904,   937,   560,   420,   453,  1167,  8659,   453,   422, 33381,
           362,   768,   368,   438, 11522,   394,  5746,   582,   380,   375,
           377,   373,   662,  9944,  5004,  1932,  5218,   956,  7401, 10680,
          8385,   375,  4134,  5364,  1748,   772,   681, 12148,  5288,   956,
          7315,  2454,   373, 32720,  5344,   373,   686, 12429,   851,  4158,
          9867,  1935,  5295,  5004,   375,   768,   368,   438,  5878,   681,
          3527,   686,  8435,  3495,   374, 12541,  2591,  2357,   807,   629,
          1602,   768,   368,   777,  1441, 13078,   807,   375,  4134,  6837,
           772,   681,   821,   619, 12913,   686, 33076, 19415,   666,  1314,
          7551,  6270,   375,  2386,   904,  1072,  

In [43]:
# Turn the first (and only) generated sequence back into text and print it.
# Special tokens are kept, so the chat-template markers remain visible.
first_sequence = output[0]
res = tokenizer.decode(first_sequence)
print(res)

[|system|]You are EXAONE model from LG AI Research, a helpful assistant.[|endofturn|]
[|user|]Explain who you are
[|assistant|]Hello! I'm EXAONE 3.0, an advanced language model developed by LG AI Research. My primary function is to assist users by providing information, answering questions, and helping with various tasks using natural language. I'm designed to understand and generate human-like text based on the data I've been trained on. My goal is to be a helpful and informative assistant in your daily activities. If you have any questions or need assistance, feel free to ask![|endofturn|]
