In [None]:
!pip install transformers bitsandbytes llama-stack --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/87.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.2/232.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.9 MB/s[

In [None]:
# Shell out to nvidia-smi as a quick check of which GPU (if any) this runtime has attached.
!nvidia-smi

In [None]:
import torch

# Fail early with an actionable message: without a CUDA device the calls below
# raise a much less obvious initialization error.
if not torch.cuda.is_available():
  raise RuntimeError("No CUDA device available; switch the runtime to a GPU before running this notebook.")

# Report how many GPUs are visible and which one is currently selected.
print(torch.cuda.device_count())
print(torch.cuda.current_device())

# Set to GPU
torch.cuda.set_device("cuda:0")
print(torch.cuda.current_device())

In [None]:
from google.colab import userdata

# Read the Hugging Face token from Colab's secret store.
access_token = userdata.get("HF_TOKEN")

# Never print the token itself: cell output is saved with the notebook and
# would leak the credential to anyone the notebook is shared with.
if not access_token:
  raise ValueError("HF_TOKEN is empty; add it to the Colab secrets panel.")
print("HF_TOKEN loaded.")

In [None]:
# Load the tokenizer and model directly, then run one short generation as a smoke test.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_ID = "meta-llama/Llama-3.2-3B"

# No device_map here: it is a model-loading option and has no role in tokenizer loading.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=access_token)

# device_map="auto" leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    token=access_token
)

prompt = "Tell me about gravity"

# Send the inputs to wherever the model actually landed instead of assuming "cuda:0".
model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Explicit length and pad token: avoids relying on the implicit default
# generation length and matches the settings the benchmark passes to generate().
output = model.generate(
    **model_inputs,
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
# Benchmark workload: a small, fixed set of short prompts.
prompts = [
    "Explain quantum computing in simple terms.",
    "What is the history of artificial intelligence?",
    "Describe the structure of DNA.",
    "What is the difference between Java and Python?",
    "Explain the concept of recursion in programming.",
]

# Number of full passes the benchmark makes over the prompt list.
N_ITERATIONS = 3

# Value passed to generate() as max_new_tokens for every benchmarked prompt.
MAX_NEW_TOKENS = 20

In [None]:
import time
import numpy as np

def benchmark_test(benchmark_prompts):
  """Time `model.generate` on every prompt, repeated N_ITERATIONS times.

  Reads the notebook-level `tokenizer`, `model`, `N_ITERATIONS` and
  `MAX_NEW_TOKENS`.

  Args:
    benchmark_prompts: sequence of prompt strings; each one is tokenized and
      generated from once per iteration.

  Returns:
    dict of 1-D float numpy arrays with one entry per (iteration, prompt) run:
      "inference_times": wall-clock seconds spent inside `model.generate`.
      "input_token_sizes": number of tokens in the tokenized prompt.
      "output_token_sizes": length of the sequence returned by `generate`.
      "generated_token_sizes": output length minus input length.
  """
  elapsed_times = []
  input_token_sizes = []
  output_token_sizes = []

  # CUDA kernels are launched asynchronously, so the device has to be drained
  # before reading the clock or the measurement may miss queued work.
  use_cuda = torch.cuda.is_available()

  for _ in range(N_ITERATIONS):
    for p in benchmark_prompts:
      # Tokenize the prompt being benchmarked (the loop variable, not the
      # notebook-level `prompt`) and move it to the model's device.
      model_inputs = tokenizer(p, return_tensors="pt").to(model.device)
      # input_ids holds a single row for a single prompt; its length is the
      # prompt's token count.
      n_input_tokens = len(model_inputs['input_ids'][0])

      with torch.no_grad():
        if use_cuda:
          torch.cuda.synchronize()
        # perf_counter is monotonic and higher resolution than time.time().
        start_time = time.perf_counter()
        output = model.generate(**model_inputs, max_new_tokens=MAX_NEW_TOKENS,
                                pad_token_id=tokenizer.eos_token_id)
        if use_cuda:
          torch.cuda.synchronize()
        end_time = time.perf_counter()

      elapsed_times.append(end_time - start_time)
      input_token_sizes.append(n_input_tokens)
      output_token_sizes.append(len(output[0]))

  # Build the arrays once at the end; np.append inside the loop copies the
  # whole array on every call. dtype=float keeps the original float arrays.
  inference_arr = np.asarray(elapsed_times, dtype=float)
  input_arr = np.asarray(input_token_sizes, dtype=float)
  output_arr = np.asarray(output_token_sizes, dtype=float)

  return { "inference_times": inference_arr,
           "input_token_sizes": input_arr,
           "output_token_sizes": output_arr,
           "generated_token_sizes": output_arr - input_arr }


In [None]:
# Run benchmark in here
benchmark_results = benchmark_test(prompts)

# output_token_sizes is the length of the full sequence returned by generate()
# (prompt plus continuation), so subtract the prompt length to get the number
# of tokens actually produced during the timed call.
generated_token_sizes = (benchmark_results['output_token_sizes']
                         - benchmark_results['input_token_sizes'])

# Per-token latency is measured against generated tokens, not prompt tokens.
# Clamp to 1 so a run that produced nothing cannot divide by zero.
sec_per_token = benchmark_results['inference_times'] / np.maximum(generated_token_sizes, 1)

print(f"Mean inference time (sec): {benchmark_results['inference_times'].mean()}")
print(f"Mean input token size: {benchmark_results['input_token_sizes'].mean()}")
print(f"Mean generated token size: {generated_token_sizes.mean()}")
print(f"Mean sec/token: {sec_per_token.mean()}")