In [None]:
!apt-get update

In [None]:
!apt-get install -y wget

In [None]:
!apt-get update

In [None]:
!apt-get install -y git

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install -U safetensors

In [None]:
!pip install -U tokenizers

In [None]:
!pip install --upgrade --no-deps --force-reinstall -U huggingface_hub

In [None]:
!pip install --upgrade --no-deps --force-reinstall -U git+https://github.com/huggingface/transformers.git

In [None]:
!pip install  --upgrade --no-deps --force-reinstall -U git+https://github.com/huggingface/peft.git 

In [None]:
!pip install  --upgrade --no-deps --force-reinstall -U git+https://github.com/huggingface/accelerate.git

In [None]:
#!conda install -y cudatoolkit

In [1]:
import json
import time
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList
from transformers import BitsAndBytesConfig

In [2]:
torch.__version__

'2.1.0.dev20230621'

In [3]:
transformers.__version__

'4.31.0.dev0'

In [4]:
torch.cuda.is_available()

True

In [5]:
torch.version.cuda

'12.1'

In [6]:
# Possible solution to missing libcudart
#!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/libcudart.so
#!ls /opt/conda/lib/python3.10/site-packages/bitsandbytes/
#!cp /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so


In [7]:
def get_model_size(model, in_gb:bool=True) -> str:
    param_size:int = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size:int = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    if in_gb:
        return f"{round((param_size + buffer_size) / 1024**3, 1)} GB"
    else:
        return f"{int((param_size + buffer_size) / 1024**2)} MB"

In [8]:
def time_generation(model, tokenizer, prompt:str, target_token_count:int, is_use_gpu:bool=True) -> dict:
    """
    Given a transformer model, evaluate how long it takes to generate output.
    
    Parameters:
    -----------
    model : transformers.models
        A trained HuggingFace CasualLanguage model to use for generating text
    tokenizer 
        An accompanying tokenizer used by the lanugage model
    target_token_count : int
        The number of tokens to generate, can be less if stopping tokens are encountered
    is_use_gpu : bool
        Flag whether or not to use the gpu for inference
        
    Returns:
    --------
    dict : Containing results from the generation process including:
        output : The generated text
        output_count : The number of tokens in the output
        tokens_per_sec : The number of tokens generated per second
            
    """
    if is_use_gpu:
        tokenized_items = tokenizer(prompt, return_tensors="pt").to("cuda")
    else: 
        tokenized_items = tokenizer(prompt, return_tensors="pt")
    start = time.time()
    logits = model.generate(
                            min_length=target_token_count, 
                            max_length=target_token_count, 
                            do_sample=True,
                            **tokenized_items
                           )
    output = tokenizer.decode(logits[0], skip_special_tokens=True)
    runtime = time.time() - start
    output_count = len(output.split(" "))
    tokens_per_sec = output_count / runtime
    output_count, int(runtime), int(tokens_per_sec)
    return {"output" : output, "output_count": output_count, "tokens_per_sec": tokens_per_sec}

In [9]:
model_id = "PygmalionAI/pygmalion-2.7b"
prompt = '''Billy's Persona: Billy is an angry pirate lost at sea. He misses his leg.
<START>
You: What do you look for in a woman?
Billy:'''

In [10]:
is_run_quant = False

In [12]:
if is_run_quant:
    # Init bits and bytes config
    nf4_config = BitsAndBytesConfig(
       load_in_4bit=True,
       bnb_4bit_quant_type="nf4",
       bnb_4bit_use_double_quant=True,
       bnb_4bit_compute_dtype=torch.bfloat16
    )
    model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    results = time_generation(model_nf4, tokenizer, prompt, 1000)
    # Clear memory
    torch.cuda.empty_cache()
    #print(json.dumps(results,indent=3))
    tokens_per_sec = int(results["tokens_per_sec"])
    print ("Model quantized")
    print (f"Size: {get_model_size(model_nf4)}")
    print (f"Speed: {tokens_per_sec} tps")
else:
    # Init word tokenizer
    full_tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Init language model
    full_model = AutoModelForCausalLM.from_pretrained(model_id)
    full_model.to("cuda")
    results = time_generation(full_model, full_tokenizer, prompt, 1000)
    # Clear memory
    torch.cuda.empty_cache()
    #print(json.dumps(results,indent=3))
    tokens_per_sec = int(results["tokens_per_sec"])
    print ("Non quantized")
    print (f"Size: {get_model_size(full_model)}")
    print (f"Speed: {tokens_per_sec} tps")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Non quantized
Size: 10.0 GB
Speed: 4 tps


Model quantized

Size: 1.5 GB

Speed: 7 tps

Non quantized

Size: 10.0 GB

Speed: 4 tps