# GGUF format으로 만들어진 모델 사용하기   
### GGUF는 llama.cpp를 위한 file format  
### llama.cpp는 LLM 모델을 실행 환경에 맞게 최적화하여 실행하는 엔진  
> 저사양 하드웨어에서도 대형 모델을 실행할 수 있도록 양자화(4-bit, 8-bit)와 최적화된 연산 환경을 제공해 주는 wrapper   
> CPU 및 GPU(NVIDIA CUDA, Apple Metal) 지원   
### bitsandbytes와 차이점  
> Hugging Face(transformers)에서 사용하는 4-bit, 8-bit 양자화 lib.
> 메모리 효율적, GPU 효율적 실행이 목적(GPU 환경에 최적화)

In [1]:
%%time
# Install LangChain, its community integrations (the LlamaCpp wrapper imported
# later in this notebook lives in langchain_community) and the Hugging Face Hub
# client. -q keeps pip's output quiet; %%time above reports how long the cell took.
!pip install -q langchain langchain_community huggingface_hub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.5/2.5 MB[0m [31m43.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/438.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/363.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

### llama-cpp-python을 CUDA 지원으로 설치

In [4]:
#!pip uninstall -y llama-cpp-python
!pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu122
Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.9.tar.gz (67.9 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.9-cp311-cp311-linux_x86_64.whl size=4123161 sha256=0435d3b6175e110bdd6b7e252c034736c7cbe05dc62e46fa9f9faaf25371ed14
  Stored in directory: /root/.cache/pip/wheels/9e/8f/bf/148c8eb7d69021eccd6eae6444f3accd48347587054ffd24e

In [5]:
import os
from google.colab import userdata

# Colab Secrets에서 토큰 읽어오기
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

!huggingface-cli whoami

gshong
[1morgs: [0m LLM2506,aicmap


## LlamaCpp로 모델 로드 : GGUF 모델 다운로드


In [6]:
# Download the GGUF model file from the Hugging Face Hub and load it with LlamaCpp.
from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp

model_repo = "unsloth/gemma-3-12b-it-qat-GGUF"
model_file = "gemma-3-12b-it-qat-Q4_0.gguf"

# hf_hub_download returns the local path of the fetched file; local_dir="./"
# stores it in the current working directory.
model_path = hf_hub_download(repo_id=model_repo, filename=model_file, local_dir="./")

llm = LlamaCpp(
    model_path=model_path,
    # -1 asks llama.cpp to offload every layer to the GPU. The earlier
    # hard-coded value of 40 was annotated as suited to an 8B model, while this
    # notebook loads a 12B model. Lower this number if GPU memory runs out.
    n_gpu_layers=-1,
    n_batch=512,      # number of prompt tokens processed per batch
    n_ctx=2048,       # context window length in tokens
    f16_kv=True,      # keep the key/value cache in half precision
    verbose=False,    # set to True to see llama.cpp load/offload logs
    temperature=0.7,
    max_tokens=256,   # upper bound on generated tokens per call
)

gemma-3-12b-it-qat-Q4_0.gguf:   0%|          | 0.00/6.91G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [7]:
# Smoke test: send one question to the loaded model and print the exchange.
query = "첫 번째 달 착륙자는 누구인가요? 한줄로 답변하세요."
response = llm.invoke(query)

# Print the prompt and the model's answer with matching labels.
for label, text in (("Query:", query), ("Response:", response)):
    print(label, text)

Query: 첫 번째 달 착륙자는 누구인가요? 한줄로 답변하세요.
Response: 

닐 암스트롱은 최초로 달에 착륙한 사람이었습니다.
