In [25]:
# --- Clean & Reinstall: Colab T4 (CUDA 12.6) ---
!pip -q uninstall -y bitsandbytes triton torch torchvision torchaudio transformers

# 1) PyTorch stack (cu126) — ให้ตรงกับ Colab
!pip -q install --index-url https://download.pytorch.org/whl/cu126 \
  torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0

# 2) Triton ให้ "เข้าชุด" กับ Torch 2.8 (ต้อง 3.4.0)
!pip -q install --no-cache-dir triton==3.4.0

# 3) LLM stack + bnb รุ่นใหม่ที่เข้ากับ Triton 3.x
!pip -q install --no-cache-dir transformers==4.45.2 accelerate sentencepiece
!pip -q install --no-cache-dir bitsandbytes==0.45.2

# 4) ตรวจสอบ
import torch, transformers, pathlib
import bitsandbytes as bnb

print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())

libs = [p.name for p in pathlib.Path(bnb.__file__).parent.glob('libbitsandbytes_cuda*.so')]
print("bitsandbytes libs:", libs)  # ต้องมีอย่างน้อย 'libbitsandbytes_cuda126.so'


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.17.1 requires transformers, which is not installed.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, which is not installed.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m260.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.8.0+cu126
Transformers: 4.45.2
CUDA available: True
bitsandbytes libs: ['libbitsandbytes_cuda120.so', 'libbitsandbytes_cuda123.so', 'libbitsandbytes_cuda126.so', 'libbitsandbytes_cuda117.so', 'libbitsandbytes_cuda121.so', 'libbitsandbytes_cuda125.so', 'libbitsandbytes_cuda118.so', 'libbit

In [26]:
MODEL_NAME = "gpt2"  # small causal LM; good for speed tests
PROMPT = "In one sentence, explain why code reviews improve software quality."
MAX_NEW_TOKENS = 128 #comparing between 128 and 256

In [27]:
import time, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

def count_new_tokens(tokenizer, input_text, output_text):
    in_ids = tokenizer.encode(input_text, add_special_tokens=False)
    out_ids = tokenizer.encode(output_text, add_special_tokens=False)
    return max(1, len(out_ids))

def run_generation(pipe, prompt, max_new_tokens=64):
    start = time.time()
    out = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    text = out[0]['generated_text'] if isinstance(out, list) else out[0]['generated_text']
    elapsed = time.time() - start
    return text, elapsed

def show_gpu_mem():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated()/1e9
        reserved = torch.cuda.memory_reserved()/1e9
        print(f"CUDA Memory - allocated: {allocated:.2f} GB, reserved: {reserved:.2f} GB")
    else:
        print("CUDA not available")


In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_cpu = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
pipe_cpu = pipeline('text-generation', model=model_cpu, tokenizer=tokenizer, device=-1)

print('Running CPU FP32...')
text, t = run_generation(pipe_cpu, PROMPT, MAX_NEW_TOKENS)
new_tokens = count_new_tokens(tokenizer, PROMPT, text)
print(f"Latency: {t:.3f}s | Approx new tokens: {new_tokens} | ~tokens/sec: {new_tokens/max(1e-6,t):.2f}")
print('\nOUTPUT (truncated):\n', text[:400])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Running CPU FP32...
Latency: 11.896s | Approx new tokens: 140 | ~tokens/sec: 11.77

OUTPUT (truncated):
 In one sentence, explain why code reviews improve software quality.

"The best way to improve code quality is to make it easier to understand and understand the code," says Dr. David S. Karp, a professor of computer science at the University of California, Berkeley. "If you can understand the code, you can understand the problem."

The problem is that code reviews are often not as thorough as they


In [29]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

if torch.cuda.is_available():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model_fp16 = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map='auto')
    pipe_fp16 = pipeline('text-generation', model=model_fp16, tokenizer=tokenizer)
    show_gpu_mem()
    print('Running GPU FP16...')
    text, t = run_generation(pipe_fp16, PROMPT, MAX_NEW_TOKENS)
    new_tokens = count_new_tokens(tokenizer, PROMPT, text)
    print(f"Latency: {t:.3f}s | Approx new tokens: {new_tokens} | ~tokens/sec: {new_tokens/max(1e-6,t):.2f}")
    show_gpu_mem()
    print('\nOUTPUT (truncated):\n', text[:400])
else:
    print('No CUDA GPU detected; skip FP16 test.')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


CUDA Memory - allocated: 0.59 GB, reserved: 0.89 GB
Running GPU FP16...
Latency: 3.133s | Approx new tokens: 140 | ~tokens/sec: 44.69
CUDA Memory - allocated: 0.59 GB, reserved: 0.89 GB

OUTPUT (truncated):
 In one sentence, explain why code reviews improve software quality.

"The best way to improve code quality is to make it easier to understand and understand the code," says Dr. David S. Karp, a professor of computer science at the University of California, Berkeley. "The more you understand the code, the more you can understand the code."

The problem is that code reviews are not always easy to un


In [30]:
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

if torch.cuda.is_available():
    bnb_8 = BitsAndBytesConfig(load_in_8bit=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model_int8 = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_8,
        device_map='auto'
    )
    pipe_int8 = pipeline('text-generation', model=model_int8, tokenizer=tokenizer)
    show_gpu_mem()
    print('Running GPU INT8...')
    text, t = run_generation(pipe_int8, PROMPT, MAX_NEW_TOKENS)
    new_tokens = count_new_tokens(tokenizer, PROMPT, text)
    print(f"Latency: {t:.3f}s | Approx new tokens: {new_tokens} | ~tokens/sec: {new_tokens/max(1e-6,t):.2f}")
    show_gpu_mem()
    print('\nOUTPUT (truncated):\n', text[:400])
else:
    print('No CUDA GPU detected; skip INT8 test.')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


CUDA Memory - allocated: 0.59 GB, reserved: 0.79 GB
Running GPU INT8...
Latency: 4.521s | Approx new tokens: 140 | ~tokens/sec: 30.96
CUDA Memory - allocated: 0.59 GB, reserved: 0.79 GB

OUTPUT (truncated):
 In one sentence, explain why code reviews improve software quality.

"The most important thing is to make sure that you're not making mistakes," says Dr. Michael J. Karp, a professor of computer science at the University of California, Berkeley. "If you're not making mistakes, you're not making mistakes. You're making mistakes because you're making mistakes."

The problem is that code reviews are 


In [31]:
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

if torch.cuda.is_available():
    bnb_4 = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True
    )
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model_int4 = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_4,
            device_map='auto'
        )
        pipe_int4 = pipeline('text-generation', model=model_int4, tokenizer=tokenizer)
        show_gpu_mem()
        print('Running GPU INT4...')
        text, t = run_generation(pipe_int4, PROMPT, MAX_NEW_TOKENS)
        new_tokens = count_new_tokens(tokenizer, PROMPT, text)
        print(f"Latency: {t:.3f}s | Approx new tokens: {new_tokens} | ~tokens/sec: {new_tokens/max(1e-6,t):.2f}")
        show_gpu_mem()
        print('\nOUTPUT (truncated):\n', text[:400])
    except Exception as e:
        print('INT4 load failed:', e)
else:
    print('No CUDA GPU detected; skip INT4 test.')


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


CUDA Memory - allocated: 0.64 GB, reserved: 0.78 GB
Running GPU INT4...
Latency: 1.910s | Approx new tokens: 140 | ~tokens/sec: 73.31
CUDA Memory - allocated: 0.64 GB, reserved: 0.78 GB

OUTPUT (truncated):
 In one sentence, explain why code reviews improve software quality.

"The best way to improve software quality is to make sure that the code is written in a way that is easy to understand and maintain," says the paper. "The best way to do that is to make sure that the code is written in a way that is easy to understand and maintain."

The paper's authors, from the University of California, Berkele
