# Import and init libs

TODO: Add model quantization, following https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

In [1]:
!pip install torch
# !pip install -i https://pypi.org/simple/ bitsandbytes
!pip install bitsandbytes
# !pip install git+https://github.com/huggingface/accelerate.git
!pip install accelerate
# !pip install git+https://github.com/huggingface/transformers.git
!pip install transformers



In [2]:
import os

# Environment switches for the libraries imported by the following cells.
# They are applied here, before torch / accelerate / transformers are imported.
os.environ.update(
    {
        "HF_ENDPOINT": "https://huggingface.co",
        "PYTORCH_ENABLE_MPS_FALLBACK": "1",
        "ACCELERATE_USE_MPS_DEVICE": "True",
    }
)

In [3]:
import torch

print(f'PyTorch Version: {torch.__version__}')

# Pick the first available backend in priority order: Apple MPS, then the first
# CUDA GPU, and finally the CPU when neither accelerator is present.
active_device = next(
    (
        torch.device(*device_spec)
        for backend_ready, device_spec in (
            (torch.backends.mps.is_available, ('mps',)),
            (torch.cuda.is_available, ('cuda', 0)),
        )
        if backend_ready()
    ),
    torch.device('cpu'),
)

# torch.set_default_device("mps")  # Set to MPS if needed

print(f'PyTorch Active Device: {active_device}')

PyTorch Version: 2.2.1
PyTorch Active Device: mps


In [4]:
from accelerate import Accelerator

# Create the Accelerate entry point and report the device it resolved, so it can
# be compared with the device selected manually for PyTorch.
accelerator = Accelerator()
accelerator_device = accelerator.device
print(f'Accelerator device: {accelerator_device}')

Accelerator device: mps


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Checkpoint to load from the Hugging Face Hub.
model_id = "gpt2"
# model_id = "bert-base-cased"

# The empty-string key places the whole model on the one selected device.
device_map = {"": active_device}

# A single timer covers both the tokenizer and the model load.
tm_start = time.monotonic()
tokenizer = AutoTokenizer.from_pretrained(model_id)

load_options = dict(
    device_map=device_map,
    torch_dtype="auto",
    offload_folder="offload",
    offload_state_dict=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

load_seconds = time.monotonic() - tm_start
memory_footprint = model.get_memory_footprint()
print(f'Model [{model_id}] Loaded in {load_seconds} seconds. Memory footprint [{memory_footprint}]')

Model [gpt2] Loaded in 1.5370634169998993 seconds. Memory footprint [510342192]


# Runtime

Tokenizer playground: https://tiktokenizer.vercel.app/

In [7]:
import textwrap

# Fallback used when the user submits nothing: an empty request would encode to
# zero tokens and leave the model with nothing to continue from.
DEFAULT_PROMPT = '''def print_prime(n):
    """
    Print all primes between 1 and n
    """'''

# Ask for the request interactively. The text is used exactly as typed; only a
# blank (empty or whitespace-only) answer is replaced by the default prompt.
prompt = input(f'Request to [{model_id}]: ')
if not prompt.strip():
    prompt = DEFAULT_PROMPT
    print(f'Empty request, using the default prompt:\n{prompt}')

Request to [bert-base-cased]:  Tell me a joke


In [None]:
# Requested upper bound on generated tokens. The effective budget is capped
# below so that prompt + generated tokens fit in the model's context window.
MAX_NEW_TOKENS = 2048

# --- Encode: turn the prompt into a tensor of token ids on the active device.
tm_start = time.monotonic()
inputs = tokenizer.encode(
    prompt,
    return_tensors="pt",
    return_attention_mask=False,
).to(active_device)
print(f'Encoded in {time.monotonic() - tm_start} seconds.')

# Models with learned position embeddings cannot index past their limit
# (1024 positions for gpt2), so asking for 2048 new tokens would overrun it.
# Configs that do not expose the attribute keep the requested budget unchanged.
prompt_length = inputs.shape[-1]
context_limit = getattr(model.config, "max_position_embeddings", None)
max_new_tokens = MAX_NEW_TOKENS
if context_limit is not None:
    remaining_positions = context_limit - prompt_length
    if remaining_positions <= 0:
        raise ValueError(
            f'Prompt uses {prompt_length} tokens, which leaves no room to generate '
            f'within the {context_limit}-token context window of [{model_id}].'
        )
    max_new_tokens = min(MAX_NEW_TOKENS, remaining_positions)

# --- Generate: the prompt is a single un-padded sequence, so every position is
# attended to. The mask is passed explicitly because it cannot be inferred when
# the pad token is the same as the EOS token.
tm_start = time.monotonic()
outputs = model.generate(
    inputs,
    attention_mask=inputs.new_ones(inputs.shape),
    max_new_tokens=max_new_tokens,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
)
print(f'Generated in {time.monotonic() - tm_start} seconds.')

# --- Decode: convert the first (only) returned sequence back to text.
tm_start = time.monotonic()
response = tokenizer.decode(outputs[0])
print(f'Decoded in {time.monotonic() - tm_start} seconds.')

# Re-flow the text to 120 columns for readable notebook output.
print("\n".join(textwrap.wrap(response, width=120)))

Encoded in 0.008685917000093468 seconds.
