# Final Project

## LLM Quantization for LLaMA-2(7B) with NF4, GPTQ and Pruning

2024174549 Kim Gi-jung

2024129844 Park Yong-min



### Pip install & import

In [1]:
# Install the notebook's dependencies (IPython shell magics).
# transformers and accelerate are installed from the GitHub default branch,
# so the commit that gets resolved can differ from one run to the next.
print('Installing packages...')
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install torch tqdm
!pip install -U bitsandbytes

Installing packages...
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-pf81_rxe
  Resolved https://github.com/huggingface/transformers.git to commit f42084e6411c39b74309af4a7d6ed640c01a4c9e
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-pf81_rxe'


Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-cbta8r9k
  Resolved https://github.com/huggingface/accelerate.git to commit 200c9eb7833cfa505907f6f224ebf5a275aa6d92
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-cbta8r9k'




#### 런타임 - 세션 다시 시작

In [2]:
# Import Libraries
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, BitsAndBytesConfig
import datasets
from datasets import load_dataset
from functools import partial
import gc
import copy

In [3]:
# Set evaluation
import torch
import psutil
from tqdm import tqdm
from datasets import load_dataset
import torch.nn as nn
def evaluate(model, tokenizer, nsamples=40, seq_len=2048):
    """Compute WikiText-2 test perplexity and sample memory usage per window.

    The test split is joined into one text, tokenized once, and scored in
    consecutive non-overlapping windows of ``seq_len`` tokens.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer, called on the joined text with
            ``return_tensors='pt'``.
        nsamples: Maximum number of windows to score.
        seq_len: Number of tokens per window.

    Returns:
        ``(perplexity, memory_stats)``. ``perplexity`` is a 0-dim tensor.
        ``memory_stats`` maps ``"gpu_allocated"``, ``"gpu_reserved"`` and
        ``"cpu_memory"`` to lists of byte counts, one entry per scored
        window, sampled just before that window's forward pass. The GPU
        lists stay empty when CUDA is unavailable.

    Raises:
        ValueError: If no window with at least two tokens could be scored.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encoded = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    input_ids = encoded.input_ids.to(model.device)
    model = model.eval()

    # Initialize memory tracking
    memory_stats = {
        "gpu_allocated": [],
        "gpu_reserved": [],
        "cpu_memory": []
    }

    # Loop-invariant helpers are created once instead of once per window.
    process = psutil.Process()
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    cuda_available = torch.cuda.is_available()

    nlls = []
    n_predicted = 0  # number of next-token predictions actually scored
    for i in tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seq_len:(i + 1) * seq_len]

        # A window needs at least two tokens to yield one prediction. Once a
        # window is this short, every later window is empty, so stop.
        if batch.size(1) < 2:
            break

        # Measure memory usage before the forward pass
        if cuda_available:
            memory_stats["gpu_allocated"].append(torch.cuda.memory_allocated())
            memory_stats["gpu_reserved"].append(torch.cuda.memory_reserved())
        memory_stats["cpu_memory"].append(process.memory_info().rss)  # Resident Set Size (RSS)

        with torch.no_grad():
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:].to(shift_logits.device)
            # Summed (not averaged) NLL, so that short windows are weighted
            # by the number of tokens they really contain.
            nll = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
        nlls.append(nll)
        n_predicted += shift_labels.numel()

    if not nlls:
        raise ValueError("No evaluation window with at least two tokens was available.")

    # Perplexity = exp(total NLL / total predicted tokens).
    perplexity = torch.exp(torch.stack(nlls).sum() / n_predicted)
    return perplexity, memory_stats


### Part0. LLaMA-2(7B) FP Model

In [4]:
# Hugging Face Hub id of the base model used throughout this notebook.
model_path = "meta-llama/Llama-2-7b-hf"

In [13]:
# Model path and Hugging Face access token.
import os

model_path = "meta-llama/Llama-2-7b-hf"
# Read the access token from the environment instead of embedding a secret
# in the notebook (set HF_TOKEN before starting the kernel). When it is
# unset, None is passed and the library's default credential lookup applies.
token = os.environ.get("HF_TOKEN")

# Load the tokenizer and the FP16 model (both need access to the gated repo).
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
fp_model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", torch_dtype=torch.float16, token=token
)
# Inference only: freeze all parameters and disable the KV cache.
for para in fp_model.parameters():
    para.requires_grad = False
fp_model.config.use_cache = False
fp_model.eval()
gc.collect()
torch.cuda.empty_cache()
print(fp_model)
print(f"Parameter data type: {fp_model.dtype}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [14]:
# Placeholder set to 0; it is not read anywhere in the visible part of the notebook.
fp_model_memory = 0

In [16]:
# Evaluate the FP16 model with memory tracking.
fp_model_perplexity, memory_stats_fp = evaluate(fp_model, tokenizer)

print(torch.cuda.is_available())

print("\n=== Evaluation Results ===")
print(f"FP16 Model Perplexity: {fp_model_perplexity:.2f}\n")

# Convert the sampled byte counts to MiB.
gpu_allocated_fp_mb = [mem / (1024**2) for mem in memory_stats_fp["gpu_allocated"]]
gpu_reserved_fp_mb = [mem / (1024**2) for mem in memory_stats_fp["gpu_reserved"]]
cpu_memory_fp_mb = [mem / (1024**2) for mem in memory_stats_fp["cpu_memory"]]

# Display GPU memory statistics exactly once, under their header. The GPU
# lists are empty when CUDA is unavailable, so min()/max() must be guarded.
print("=== GPU Memory Usage (MB) ===")
if gpu_allocated_fp_mb:
    print(f"Allocated: Min = {min(gpu_allocated_fp_mb):.2f}, Max = {max(gpu_allocated_fp_mb):.2f}, Average = {sum(gpu_allocated_fp_mb)/len(gpu_allocated_fp_mb):.2f}")
else:
    print("GPU Memory Allocated: No data available (Empty sequence).")

if gpu_reserved_fp_mb:
    print(f"Reserved:  Min = {min(gpu_reserved_fp_mb):.2f}, Max = {max(gpu_reserved_fp_mb):.2f}, Average = {sum(gpu_reserved_fp_mb)/len(gpu_reserved_fp_mb):.2f}")
else:
    print("GPU Memory Reserved: No data available (Empty sequence).")
print()

# Display CPU memory statistics
print("=== CPU Memory Usage (MB) ===")
print(f"Min = {min(cpu_memory_fp_mb):.2f}, Max = {max(cpu_memory_fp_mb):.2f}, Average = {sum(cpu_memory_fp_mb)/len(cpu_memory_fp_mb):.2f}")
print(f"Model dtype: {fp_model.dtype}")
gc.collect()
torch.cuda.empty_cache()

evaluating...: 100%|██████████| 40/40 [1:00:22<00:00, 90.57s/it]


True

=== Evaluation Results ===
FP16 Model Perplexity: 5.82

Allocated: Min = 1901.81, Max = 1901.81, Average = 1901.81
Reserved:  Min = 2038.00, Max = 2038.00, Average = 2038.00
=== GPU Memory Usage (MB) ===
Allocated: Min = 1901.81, Max = 1901.81, Average = 1901.81
Reserved:  Min = 2038.00, Max = 2038.00, Average = 2038.00

=== CPU Memory Usage (MB) ===
Min = 37243.95, Max = 38689.31, Average = 37944.62
Model dtype: torch.float32


In [17]:
# Placeholder set to 0. The measured perplexity is stored separately in
# `fp_model_perplexity`; this name is not read in the visible part of the notebook.
fp_model_ppl = 0

### Part1. LLaMA-2(7B) NormalFloat 4bit Quantization


- 런타임 - 세션 다시 시작

In [18]:
# Import Libraries
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, BitsAndBytesConfig
import datasets
from datasets import load_dataset
from functools import partial
import gc
import copy

In [19]:
# Set evaluation
import torch
import psutil
from tqdm import tqdm
from datasets import load_dataset
import torch.nn as nn
def evaluate(model, tokenizer, nsamples=40, seq_len=2048):
    """Compute WikiText-2 test perplexity and sample memory usage per window.

    The test split is joined into one text, tokenized once, and scored in
    consecutive non-overlapping windows of ``seq_len`` tokens.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer, called on the joined text with
            ``return_tensors='pt'``.
        nsamples: Maximum number of windows to score.
        seq_len: Number of tokens per window.

    Returns:
        ``(perplexity, memory_stats)``. ``perplexity`` is a 0-dim tensor.
        ``memory_stats`` maps ``"gpu_allocated"``, ``"gpu_reserved"`` and
        ``"cpu_memory"`` to lists of byte counts, one entry per scored
        window, sampled just before that window's forward pass. The GPU
        lists stay empty when CUDA is unavailable.

    Raises:
        ValueError: If no window with at least two tokens could be scored.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encoded = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    input_ids = encoded.input_ids.to(model.device)
    model = model.eval()

    # Initialize memory tracking
    memory_stats = {
        "gpu_allocated": [],
        "gpu_reserved": [],
        "cpu_memory": []
    }

    # Loop-invariant helpers are created once instead of once per window.
    process = psutil.Process()
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    cuda_available = torch.cuda.is_available()

    nlls = []
    n_predicted = 0  # number of next-token predictions actually scored
    for i in tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seq_len:(i + 1) * seq_len]

        # A window needs at least two tokens to yield one prediction. Once a
        # window is this short, every later window is empty, so stop.
        if batch.size(1) < 2:
            break

        # Measure memory usage before the forward pass
        if cuda_available:
            memory_stats["gpu_allocated"].append(torch.cuda.memory_allocated())
            memory_stats["gpu_reserved"].append(torch.cuda.memory_reserved())
        memory_stats["cpu_memory"].append(process.memory_info().rss)  # Resident Set Size (RSS)

        with torch.no_grad():
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:].to(shift_logits.device)
            # Summed (not averaged) NLL, so that short windows are weighted
            # by the number of tokens they really contain.
            nll = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
        nlls.append(nll)
        n_predicted += shift_labels.numel()

    if not nlls:
        raise ValueError("No evaluation window with at least two tokens was available.")

    # Perplexity = exp(total NLL / total predicted tokens).
    perplexity = torch.exp(torch.stack(nlls).sum() / n_predicted)
    return perplexity, memory_stats


In [20]:
# Load the model in NF4 (4-bit NormalFloat) via bitsandbytes.
import os

model_path = "meta-llama/Llama-2-7b-hf"
# Read the access token from the environment instead of embedding a secret
# in the notebook (set HF_TOKEN before starting the kernel). When it is
# unset, None is passed and the library's default credential lookup applies.
token = os.environ.get("HF_TOKEN")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
# Pass the token to the model load as well: the checkpoint is fetched from
# the same repository as the tokenizer.
q_model_nf = AutoModelForCausalLM.from_pretrained(
    model_path, quantization_config=bnb_config, device_map="auto", token=token
)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
print(q_model_nf)
gc.collect()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=

In [21]:
# Placeholder set to 0; it is not read anywhere in the visible part of the notebook.
q_model_nf_memory = 0

In [22]:
# Evaluate the NF4 model with memory tracking.
q_model_nf_perplexity, memory_stats_nf = evaluate(q_model_nf, tokenizer)

print("\n=== Evaluation Results ===")
print(f"NF4 Model Perplexity: {q_model_nf_perplexity:.2f}\n")

# Convert the sampled byte counts to MiB.
gpu_allocated_nf_mb = [mem / (1024**2) for mem in memory_stats_nf["gpu_allocated"]]
gpu_reserved_nf_mb = [mem / (1024**2) for mem in memory_stats_nf["gpu_reserved"]]
cpu_memory_nf_mb = [mem / (1024**2) for mem in memory_stats_nf["cpu_memory"]]

# Display GPU memory statistics. The GPU lists are empty when CUDA is
# unavailable, so min()/max() must be guarded.
print("=== GPU Memory Usage (MB) ===")
if gpu_allocated_nf_mb:
    print(f"Allocated: Min = {min(gpu_allocated_nf_mb):.2f}, Max = {max(gpu_allocated_nf_mb):.2f}, Average = {sum(gpu_allocated_nf_mb)/len(gpu_allocated_nf_mb):.2f}")
else:
    print("GPU Memory Allocated: No data available (Empty sequence).")

if gpu_reserved_nf_mb:
    print(f"Reserved:  Min = {min(gpu_reserved_nf_mb):.2f}, Max = {max(gpu_reserved_nf_mb):.2f}, Average = {sum(gpu_reserved_nf_mb)/len(gpu_reserved_nf_mb):.2f}")
else:
    print("GPU Memory Reserved: No data available (Empty sequence).")
print()

# Display CPU memory statistics
print("=== CPU Memory Usage (MB) ===")
print(f"Min = {min(cpu_memory_nf_mb):.2f}, Max = {max(cpu_memory_nf_mb):.2f}, Average = {sum(cpu_memory_nf_mb)/len(cpu_memory_nf_mb):.2f}")
gc.collect()
torch.cuda.empty_cache()

evaluating...: 100%|██████████| 40/40 [54:12<00:00, 81.30s/it]



=== Evaluation Results ===
NF4 Model Perplexity: 6.01

=== GPU Memory Usage (MB) ===
Allocated: Min = 4065.36, Max = 4441.36, Average = 4431.02
Reserved:  Min = 4174.00, Max = 6478.00, Average = 6414.15

=== CPU Memory Usage (MB) ===
Min = 25979.50, Max = 27832.00, Average = 27741.74


In [23]:
# Placeholder set to 0. The measured perplexity is stored separately in
# `q_model_nf_perplexity`; this name is not read in the visible part of the notebook.
q_model_nf_ppl = 0

### Part2. LLaMA-2(7B) GPTQ INT4 Quantization

LLaMA Quantization with GPTQ 4-bit


- 런타임 - 런타임 연결해제 및 삭제

In [12]:
# Install the dependencies for the GPTQ part (IPython shell magics).
# transformers and accelerate are installed from the GitHub default branch,
# so the commit that gets resolved can differ from one run to the next.
# NOTE(review): no AWQ code is visible in this part of the notebook, and the
# install log below shows autoawq pulling in torch==2.3.1 — confirm that the
# autoawq install is still needed.
print('Installing packages...')
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install torch tqdm
!pip install autoawq
!pip install -U auto-gptq optimum
!pip install --upgrade torchvision

Installing packages...
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\esslab\appdata\local\temp\pip-req-build-pkkzj2_7
  Resolved https://github.com/huggingface/transformers.git to commit 9e420e02698f73a70ec1c99961f166c1b5df98bd
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\esslab\AppData\Local\Temp\pip-req-build-pkkzj2_7'


Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to c:\users\esslab\appdata\local\temp\pip-req-build-sephw5_q
  Resolved https://github.com/huggingface/accelerate.git to commit cb8b7c637a8588668c52bd306f9b2828f69d9585
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git 'C:\Users\esslab\AppData\Local\Temp\pip-req-build-sephw5_q'


Collecting autoawq
  Downloading autoawq-0.2.7.post3-py3-none-any.whl.metadata (18 kB)
INFO: pip is looking at multiple versions of autoawq to determine which version is compatible with other requirements. This could take a while.
  Using cached autoawq-0.2.7.post2-py3-none-any.whl.metadata (18 kB)
  Using cached autoawq-0.2.7.post1-py3-none-any.whl.metadata (18 kB)
  Using cached autoawq-0.2.7-py3-none-any.whl.metadata (18 kB)
  Using cached autoawq-0.2.6-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting torch==2.3.1 (from autoawq)
  Using cached torch-2.3.1-cp39-cp39-win_amd64.whl.metadata (26 kB)
Collecting autoawq-kernels (from autoawq)
  Using cached autoawq_kernels-0.0.9-cp39-cp39-win_amd64.whl.metadata (2.5 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch==2.3.1->autoawq)
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
INFO: pip is looking at multiple versions of autoawq-kernels to determine which version is compatible with other requirements. This 

ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.




- 세션 다시 시작

In [2]:
# Import Libraries
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, BitsAndBytesConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import datasets
from datasets import load_dataset
from functools import partial
import gc
import copy

CUDA extension not installed.
CUDA extension not installed.


In [3]:
import torch
# Report the CUDA setup before quantizing.
cuda_ok = torch.cuda.is_available()
print(cuda_ok)  # should be True
print(torch.cuda.device_count())  # number of GPUs
if cuda_ok:
    print(torch.cuda.get_device_name(0))  # name of the first GPU
else:
    # get_device_name(0) raises without a CUDA device; report it instead.
    print("No CUDA device available")
gc.collect()
torch.cuda.empty_cache()


True
1
NVIDIA GeForce GTX 1660 SUPER


In [4]:
# 4-bit GPTQ with LLaMA2 (7B) (group size = 128), calibrated on the "c4" dataset.
import os

model_path = "meta-llama/Llama-2-7b-hf"
# Read the access token from the environment instead of embedding a secret
# in the notebook (set HF_TOKEN before starting the kernel). When it is
# unset, None is passed and the library's default credential lookup applies.
token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="c4", tokenizer=tokenizer)
# Passing a GPTQConfig with a dataset makes from_pretrained quantize the
# model while loading (see the "Quantizing ..." progress bars in the output).
q_model_gptq = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", quantization_config=quantization_config, token=token)

# Inference only: freeze all parameters and disable the KV cache.
for para in q_model_gptq.parameters():
    para.requires_grad = False
q_model_gptq.config.use_cache = False
q_model_gptq.eval()

gc.collect()
torch.cuda.empty_cache()

print(q_model_gptq)
print(f"Parameter data type: {q_model_gptq.dtype}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)
Parameter data type: torch.float16


In [5]:
# Save the quantized model and its tokenizer to the local directory
# "llama-2-7b-gptq", from which a later cell reloads them.
q_model_gptq.save_pretrained("llama-2-7b-gptq")
tokenizer.save_pretrained("llama-2-7b-gptq")



Saving checkpoint shards:   0%|          | 0/1 [00:00<?, ?it/s]

('llama-2-7b-gptq\\tokenizer_config.json',
 'llama-2-7b-gptq\\special_tokens_map.json',
 'llama-2-7b-gptq\\tokenizer.model',
 'llama-2-7b-gptq\\added_tokens.json')


- 런타임 - 세션 다시 시작

In [1]:
# Import Libraries
import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, BitsAndBytesConfig
import datasets
from datasets import load_dataset
from functools import partial
import gc
import copy

In [2]:
# Set evaluation
import torch
import psutil
from tqdm import tqdm
from datasets import load_dataset
import torch.nn as nn
def evaluate(model, tokenizer, nsamples=40, seq_len=2048):
    """Compute WikiText-2 test perplexity and sample memory usage per window.

    The test split is joined into one text, tokenized once, and scored in
    consecutive non-overlapping windows of ``seq_len`` tokens.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer, called on the joined text with
            ``return_tensors='pt'``.
        nsamples: Maximum number of windows to score.
        seq_len: Number of tokens per window.

    Returns:
        ``(perplexity, memory_stats)``. ``perplexity`` is a 0-dim tensor.
        ``memory_stats`` maps ``"gpu_allocated"``, ``"gpu_reserved"`` and
        ``"cpu_memory"`` to lists of byte counts, one entry per scored
        window, sampled just before that window's forward pass. The GPU
        lists stay empty when CUDA is unavailable.

    Raises:
        ValueError: If no window with at least two tokens could be scored.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encoded = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    input_ids = encoded.input_ids.to(model.device)
    model = model.eval()

    # Initialize memory tracking
    memory_stats = {
        "gpu_allocated": [],
        "gpu_reserved": [],
        "cpu_memory": []
    }

    # Loop-invariant helpers are created once instead of once per window.
    process = psutil.Process()
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    cuda_available = torch.cuda.is_available()

    nlls = []
    n_predicted = 0  # number of next-token predictions actually scored
    for i in tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seq_len:(i + 1) * seq_len]

        # A window needs at least two tokens to yield one prediction. Once a
        # window is this short, every later window is empty, so stop.
        if batch.size(1) < 2:
            break

        # Measure memory usage before the forward pass
        if cuda_available:
            memory_stats["gpu_allocated"].append(torch.cuda.memory_allocated())
            memory_stats["gpu_reserved"].append(torch.cuda.memory_reserved())
        memory_stats["cpu_memory"].append(process.memory_info().rss)  # Resident Set Size (RSS)

        with torch.no_grad():
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:].to(shift_logits.device)
            # Summed (not averaged) NLL, so that short windows are weighted
            # by the number of tokens they really contain.
            nll = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
        nlls.append(nll)
        n_predicted += shift_labels.numel()

    if not nlls:
        raise ValueError("No evaluation window with at least two tokens was available.")

    # Perplexity = exp(total NLL / total predicted tokens).
    perplexity = torch.exp(torch.stack(nlls).sum() / n_predicted)
    return perplexity, memory_stats

In [3]:

import os

# Read the access token from the environment instead of embedding a secret
# in the notebook (set HF_TOKEN before starting the kernel).
token = os.environ.get("HF_TOKEN")

# Reload the quantized model and tokenizer from the local "llama-2-7b-gptq"
# directory written by the earlier save cell.
q_model_gptq = AutoModelForCausalLM.from_pretrained("llama-2-7b-gptq", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("llama-2-7b-gptq", use_fast=False, token=token)
print(q_model_gptq)
gc.collect()
torch.cuda.empty_cache()

CUDA extension not installed.
CUDA extension not installed.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)


In [4]:
# Placeholder set to 0; it is not read anywhere in the visible part of the notebook.
q_model_gptq_memory = 0

In [12]:
# Evaluate the GPTQ model with memory tracking.
q_model_gptq_perplexity, memory_stats_gptq = evaluate(q_model_gptq, tokenizer)

print("\n=== Evaluation Results ===")
print(f"GPTQ Model Perplexity: {q_model_gptq_perplexity:.2f}\n")

# Convert the sampled byte counts to MiB.
gpu_allocated_gptq_mb = [mem / (1024**2) for mem in memory_stats_gptq["gpu_allocated"]]
gpu_reserved_gptq_mb = [mem / (1024**2) for mem in memory_stats_gptq["gpu_reserved"]]
cpu_memory_gptq_mb = [mem / (1024**2) for mem in memory_stats_gptq["cpu_memory"]]

# Display GPU memory statistics. The GPU lists are empty when CUDA is
# unavailable, so min()/max() must be guarded.
print("=== GPU Memory Usage (MB) ===")
if gpu_allocated_gptq_mb:
    print(f"Allocated: Min = {min(gpu_allocated_gptq_mb):.2f}, Max = {max(gpu_allocated_gptq_mb):.2f}, Average = {sum(gpu_allocated_gptq_mb)/len(gpu_allocated_gptq_mb):.2f}")
else:
    print("GPU Memory Allocated: No data available (Empty sequence).")

if gpu_reserved_gptq_mb:
    print(f"Reserved:  Min = {min(gpu_reserved_gptq_mb):.2f}, Max = {max(gpu_reserved_gptq_mb):.2f}, Average = {sum(gpu_reserved_gptq_mb)/len(gpu_reserved_gptq_mb):.2f}")
else:
    print("GPU Memory Reserved: No data available (Empty sequence).")
print()

# Display CPU memory statistics
print("=== CPU Memory Usage (MB) ===")
print(f"Min = {min(cpu_memory_gptq_mb):.2f}, Max = {max(cpu_memory_gptq_mb):.2f}, Average = {sum(cpu_memory_gptq_mb)/len(cpu_memory_gptq_mb):.2f}")
gc.collect()
torch.cuda.empty_cache()

evaluating...: 100%|██████████| 40/40 [34:48<00:00, 52.22s/it]



=== Evaluation Results ===
GPTQ Model Perplexity: 6.03

=== GPU Memory Usage (MB) ===
Allocated: Min = 3772.33, Max = 4147.35, Average = 4137.97
Reserved:  Min = 4040.00, Max = 5174.00, Average = 5139.40

=== CPU Memory Usage (MB) ===
Min = 1007.66, Max = 1565.83, Average = 1538.02


In [6]:
# Placeholder set to 0. The measured perplexity is stored separately in
# `q_model_gptq_perplexity`; this name is not read in the visible part of the notebook.
q_model_gptq_ppl = 0

### Part3. LLaMA-2(7B) Pruning

In [5]:
# Install the libraries needed for pruning (IPython shell magics).
# transformers and accelerate are installed from the GitHub default branch,
# so the commit that gets resolved can differ from one run to the next.
print('Installing packages...')
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install torch tqdm
!pip install -U bitsandbytes

Installing packages...
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-xxgmedib
  Resolved https://github.com/huggingface/transformers.git to commit f42084e6411c39b74309af4a7d6ed640c01a4c9e
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-xxgmedib'


Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-2jl7ipix
  Resolved https://github.com/huggingface/accelerate.git to commit 200c9eb7833cfa505907f6f224ebf5a275aa6d92
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-2jl7ipix'




In [7]:
# 필요한 라이브러리 임포트
import tqdm
from tqdm import tqdm
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import datasets
from datasets import load_dataset
from functools import partial
import gc
import copy
from torch.nn.utils import prune
import psutil

In [8]:
# Define the evaluation function.
def evaluate(model, tokenizer, nsamples=40, seq_len=2048):
    """Compute WikiText-2 test perplexity and sample memory usage per window.

    The test split is joined into one text, tokenized once, and scored in
    consecutive non-overlapping windows of ``seq_len`` tokens.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer, called on the joined text with
            ``return_tensors='pt'``.
        nsamples: Maximum number of windows to score.
        seq_len: Number of tokens per window.

    Returns:
        ``(perplexity, memory_stats)``. ``perplexity`` is a 0-dim tensor.
        ``memory_stats`` maps ``"gpu_allocated"``, ``"gpu_reserved"`` and
        ``"cpu_memory"`` to lists of byte counts, one entry per scored
        window, sampled just before that window's forward pass. The GPU
        lists stay empty when CUDA is unavailable.

    Raises:
        ValueError: If no window with at least two tokens could be scored.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encoded = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    input_ids = encoded.input_ids.to(model.device)
    model = model.eval()

    memory_stats = {
        "gpu_allocated": [],
        "gpu_reserved": [],
        "cpu_memory": []
    }

    # Loop-invariant helpers are created once instead of once per window.
    process = psutil.Process()
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    cuda_available = torch.cuda.is_available()

    nlls = []
    n_predicted = 0  # number of next-token predictions actually scored
    for i in tqdm(range(nsamples), desc="evaluating..."):
        batch = input_ids[:, i * seq_len:(i + 1) * seq_len]

        # A window needs at least two tokens to yield one prediction. Once a
        # window is this short, every later window is empty, so stop.
        if batch.size(1) < 2:
            break

        # Measure memory usage before the forward pass
        if cuda_available:
            memory_stats["gpu_allocated"].append(torch.cuda.memory_allocated())
            memory_stats["gpu_reserved"].append(torch.cuda.memory_reserved())
        memory_stats["cpu_memory"].append(process.memory_info().rss)

        with torch.no_grad():
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:].to(shift_logits.device)
            # Summed (not averaged) NLL, so that short windows are weighted
            # by the number of tokens they really contain.
            nll = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
        nlls.append(nll)
        n_predicted += shift_labels.numel()

    if not nlls:
        raise ValueError("No evaluation window with at least two tokens was available.")

    # Perplexity = exp(total NLL / total predicted tokens).
    perplexity = torch.exp(torch.stack(nlls).sum() / n_predicted)
    return perplexity, memory_stats

In [12]:
# Model path and Hugging Face access token.
import os

model_path = "meta-llama/Llama-2-7b-hf"
# Read the access token from the environment instead of embedding a secret
# in the notebook (set HF_TOKEN before starting the kernel). When it is
# unset, None is passed and the library's default credential lookup applies.
token = os.environ.get("HF_TOKEN")

# Load the tokenizer and the model (both need access to the gated repo).
# NOTE(review): no torch_dtype is passed here, unlike the Part0 load, which
# requests torch.float16 — confirm which precision the pruning baseline
# is meant to use.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
fp_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", token=token)
# Inference only: freeze all parameters and disable the KV cache.
for para in fp_model.parameters():
    para.requires_grad = False
fp_model.config.use_cache = False
fp_model.eval()
gc.collect()
torch.cuda.empty_cache()
print(fp_model)
print(f"Parameter data type: {fp_model.dtype}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [10]:
def apply_pruning(model, pruning_amount=0.1):
    """Prune the weights of every ``nn.Linear`` submodule of ``model`` in place.

    Each linear layer's ``weight`` is pruned with
    ``prune.l1_unstructured`` and the pruning re-parametrization is then
    removed so the pruned tensor becomes the layer's regular ``weight``.

    Args:
        model: Module whose ``nn.Linear`` submodules are pruned.
        pruning_amount: Value forwarded as ``amount`` to
            ``prune.l1_unstructured`` (default ``0.1``).

    Returns:
        None. ``model`` is modified in place.
    """
    linear_layers = (layer for layer in model.modules() if isinstance(layer, nn.Linear))
    for layer in linear_layers:
        prune.l1_unstructured(layer, name='weight', amount=pruning_amount)
        # Drop the mask/hook so the pruned values are stored directly in `weight`.
        prune.remove(layer, 'weight')

# Prune a deep copy so that fp_model itself is left unmodified.
pruned_model = copy.deepcopy(fp_model)
apply_pruning(pruned_model, pruning_amount=0.1)
pruned_model.eval()
# Run the garbage collector and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [11]:
# Evaluate the pruned model
print("\nEvaluating Pruned model...")
pruned_model_perplexity, memory_stats_pruned = evaluate(pruned_model, tokenizer)

print("\n=== Evaluation Results ===")
print(f"Pruned Model Perplexity: {pruned_model_perplexity:.2f}\n")

# Convert every recorded memory sample to MB (divide by 1024**2)
gpu_allocated_pruned_mb = [sample / (1024**2) for sample in memory_stats_pruned["gpu_allocated"]]
gpu_reserved_pruned_mb = [sample / (1024**2) for sample in memory_stats_pruned["gpu_reserved"]]
cpu_memory_pruned_mb = [sample / (1024**2) for sample in memory_stats_pruned["cpu_memory"]]

# GPU memory statistics: min / max / mean over all samples
print("=== GPU Memory Usage (MB) ===")
print(
    f"Allocated: Min = {min(gpu_allocated_pruned_mb):.2f}, "
    f"Max = {max(gpu_allocated_pruned_mb):.2f}, "
    f"Average = {sum(gpu_allocated_pruned_mb)/len(gpu_allocated_pruned_mb):.2f}"
)
print(
    f"Reserved:  Min = {min(gpu_reserved_pruned_mb):.2f}, "
    f"Max = {max(gpu_reserved_pruned_mb):.2f}, "
    f"Average = {sum(gpu_reserved_pruned_mb)/len(gpu_reserved_pruned_mb):.2f}\n"
)

# CPU memory statistics: min / max / mean over all samples
print("=== CPU Memory Usage (MB) ===")
print(
    f"Min = {min(cpu_memory_pruned_mb):.2f}, "
    f"Max = {max(cpu_memory_pruned_mb):.2f}, "
    f"Average = {sum(cpu_memory_pruned_mb)/len(cpu_memory_pruned_mb):.2f}"
)
# Run the garbage collector and release cached CUDA memory after evaluation.
gc.collect()
torch.cuda.empty_cache()


Evaluating Pruned model...


evaluating...:  20%|██        | 8/40 [19:35<1:18:23, 147.00s/it]


KeyboardInterrupt: 

### Part4. LLaMA-2(7B) Pruning + NF4

LLaMA-2-7b Pruning + NF4

In [None]:
# Install the libraries required for pruning
print('Installing packages...')
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install torch tqdm
!pip install -U bitsandbytes

Installing packages...
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-s2vy5nc2
  Resolved https://github.com/huggingface/transformers.git to commit 0531d7513b617f7c5f8b5f333985c63f0edd5fe2
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-s2vy5nc2'


Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to c:\users\public\documents\estsoft\creatortemp\pip-req-build-5zt0zi4y
  Resolved https://github.com/huggingface/accelerate.git to commit 200c9eb7833cfa505907f6f224ebf5a275aa6d92
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git 'C:\Users\Public\Documents\ESTsoft\CreatorTemp\pip-req-build-5zt0zi4y'




In [None]:
import torch
from torch.nn.utils import prune
from torch import nn
import gc
import psutil
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import copy

In [None]:
def apply_pruning(model, pruning_amount=0.1):
    """Apply pruning to the model, in place.

    Every ``nn.Linear`` submodule has its ``weight`` pruned with
    ``prune.l1_unstructured``; the pruning re-parametrization is removed
    right afterwards so the pruned tensor is stored as the plain ``weight``.

    Args:
        model: Module whose ``nn.Linear`` submodules are pruned.
        pruning_amount: Value forwarded as ``amount`` to
            ``prune.l1_unstructured`` (default ``0.1``).

    Returns:
        None. ``model`` is modified in place.
    """
    for layer in model.modules():
        if not isinstance(layer, nn.Linear):
            continue
        prune.l1_unstructured(layer, name='weight', amount=pruning_amount)
        # Remove the pruning hook/mask to make the pruning permanent.
        prune.remove(layer, 'weight')

# Apply pruning and save the model
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): quantized_model is not assigned in this cell; a later cell in
# the notebook creates it — confirm the intended execution order.
# Prune a deep copy so quantized_model itself is not modified before deletion.
pruned_model = copy.deepcopy(quantized_model)
apply_pruning(pruned_model, pruning_amount=0.1)
pruned_model.eval()
print("Pruning 적용 완료")

# Save the pruned model's state dict
torch.save(pruned_model.state_dict(), "nf4_pruned_model.pth")
del quantized_model  # free memory
gc.collect()
torch.cuda.empty_cache()

Pruning 적용 완료


In [None]:
# 4-bit NF4 quantization settings with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)
# Load the model with the NF4 quantization config, supplying the saved state
# dict in place of the checkpoint weights.
# NOTE(review): "pruned_model.pth" is not written by any cell visible here
# (the visible save writes "nf4_pruned_model.pth") — confirm the file exists.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    state_dict=torch.load("pruned_model.pth"),
    quantization_config=bnb_config,
    device_map="auto",
    # `token` replaces the deprecated `use_auth_token` keyword and matches the
    # keyword used for the tokenizer load.
    token=token
)
print("NF4 양자화 모델 로드 완료")

# Run the garbage collector and release cached CUDA memory after loading.
gc.collect()
torch.cuda.empty_cache()



  state_dict=torch.load("pruned_model.pth"),


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

NF4 양자화 모델 로드 완료


In [None]:
# Evaluate the pruned + NF4-quantized model.
print("Pruned 및 NF4 양자화된 모델 평가 중...")
# Reload the model, supplying the state dict saved to "nf4_pruned_model.pth".
loaded_pruned_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    state_dict=torch.load("nf4_pruned_model.pth"),
    device_map="auto",
    # `token` replaces the deprecated `use_auth_token` keyword and matches the
    # keyword used for the tokenizer load.
    token=token
)
perplexity, memory_stats = evaluate(loaded_pruned_model, tokenizer)

print(f"\n=== 평가 결과 ===")
print(f"Perplexity: {perplexity:.2f}")

# Convert every recorded memory sample to MB (divide by 1024**2)
gpu_allocated_mb = [mem / (1024**2) for mem in memory_stats["gpu_allocated"]]
gpu_reserved_mb = [mem / (1024**2) for mem in memory_stats["gpu_reserved"]]
cpu_memory_mb = [mem / (1024**2) for mem in memory_stats["cpu_memory"]]

# Print GPU memory usage: min / max / mean over all samples
print("=== GPU 메모리 사용량 (MB) ===")
print(f"Allocated: Min = {min(gpu_allocated_mb):.2f}, Max = {max(gpu_allocated_mb):.2f}, Average = {sum(gpu_allocated_mb)/len(gpu_allocated_mb):.2f}")
print(f"Reserved:  Min = {min(gpu_reserved_mb):.2f}, Max = {max(gpu_reserved_mb):.2f}, Average = {sum(gpu_reserved_mb)/len(gpu_reserved_mb):.2f}\n")

# Print CPU memory usage: min / max / mean over all samples
print("=== CPU 메모리 사용량 (MB) ===")
print(f"Min = {min(cpu_memory_mb):.2f}, Max = {max(cpu_memory_mb):.2f}, Average = {sum(cpu_memory_mb)/len(cpu_memory_mb):.2f}")



Pruned 및 NF4 양자화된 모델 평가 중...


  state_dict=torch.load("nf4_pruned_model.pth"),


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

evaluating...: 100%|██████████| 40/40 [46:09<00:00, 69.24s/it]


=== 평가 결과 ===
Perplexity: 5.82
=== GPU 메모리 사용량 (MB) ===
Allocated: Min = 7066.99, Max = 7066.99, Average = 7066.99
Reserved:  Min = 19596.00, Max = 19596.00, Average = 19596.00

=== CPU 메모리 사용량 (MB) ===
Min = 28163.58, Max = 43423.80, Average = 31793.68



