# BitsAndBytes

In [1]:
from transformers import  AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
#from codecarbon import EmissionsTracker, track_emissions

In [2]:
# Base checkpoint to quantize, and the local directory / Hub repo name for the result.
# The name states 4-bit because `bnb_config` is defined with `load_in_4bit=True`.
model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct"
quantized_model_name = "Llama-3.1-70B-Instruct-4bit-bnb-quant"

In [3]:
# bitsandbytes quantization settings (4-bit weights, bfloat16 compute); meant to be
# passed as `quantization_config` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    #load_in_8bit=True,
    #llm_int8_skip_modules=["lm_head"],
    #llm_int8_threshold=4.0,
    #llm_int8_has_fp16_weight=True,
    #bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16,
)

In [4]:
# Load the tokenizer of the base checkpoint and reuse the BOS token as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.bos_token

In [5]:


# Load the model and quantize it with the bitsandbytes settings from `bnb_config`.
# Without `quantization_config` the checkpoint is loaded as plain bfloat16 weights,
# so the "quantized" directory / Hub repo would contain the unquantized model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # fill the devices one after another instead of balancing the layers
    device_map="sequential",
    torch_dtype=torch.bfloat16,
)
# Keep the model config in sync with the padding token chosen for the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [6]:
# Write the model weights and config to the local output directory
model.save_pretrained(quantized_model_name)


In [7]:
# Store the tokenizer files in the same directory as the model
tokenizer.save_pretrained(quantized_model_name)

('Llama-3-70B-Instruct-4bit-bnb-quant/tokenizer_config.json',
 'Llama-3-70B-Instruct-4bit-bnb-quant/special_tokens_map.json',
 'Llama-3-70B-Instruct-4bit-bnb-quant/tokenizer.json')

In [8]:
# Upload the model to the Hugging Face Hub under the same repository name
model.push_to_hub(quantized_model_name)

model-00005-of-00009.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/3.02G [00:00<?, ?B/s]

model-00001-of-00009.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dwetzel/Llama-3-70B-Instruct-4bit-bnb-quant/commit/8f14e6f53313e25bab70b44b9672f24f0b780e18', commit_message='Upload LlamaForCausalLM', commit_description='', oid='8f14e6f53313e25bab70b44b9672f24f0b780e18', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
# Upload the tokenizer files to the same Hub repository
tokenizer.push_to_hub(quantized_model_name)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dwetzel/Llama-3-70B-Instruct-4bit-bnb-quant/commit/ec499df15c60bf5948293481fa09850b31464c01', commit_message='Upload tokenizer', commit_description='', oid='ec499df15c60bf5948293481fa09850b31464c01', pr_url=None, pr_revision=None, pr_num=None)

# FP8

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

In [2]:
# Base checkpoint and the output directory / repo name for the FP8 checkpoint
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
quantized_model_name = "Meta-Llama-3-70B-Instruct-8bit-FP8-quant"

In [3]:
# Load the tokenizer; the EOS token doubles as padding token for the padded calibration batch
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load 128 dataset samples, render them with the chat template and tokenize them
# for calibration of activation scales.
# NOTE(review): the recorded output warns that no max_length is available, so
# truncation=True does not actually truncate here.
ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(128))
examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
# Define quantization config with static activation scales
quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")

# For dynamic activation scales, there is no need for calibration examples
#examples = []

# Load the model; quantizing and saving the checkpoint happen in the following cell
model = AutoFP8ForCausalLM.from_pretrained(model_name, quantize_config)

Loading model with the following kwargs: {'torch_dtype': 'auto', 'device_map': 'auto', 'cache_dir': None, 'force_download': False, 'proxies': None, 'resume_download': False, 'local_files_only': False, 'use_auth_token': None, 'revision': None, 'subfolder': '', '_commit_hash': None}




Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [6]:
# Quantize the weights, calibrate the activation scales on the tokenized examples,
# then write the FP8 checkpoint to the output directory
model.quantize(examples)
model.save_quantized(quantized_model_name)

Quantizing weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1126/1126 [00:00<00:00, 3556.46it/s]
Calibrating activation scales: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [33:41<00:00, 15.79s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 8192)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): FP8StaticLinear()
          (k_proj): FP8StaticLinear()
          (v_proj): FP8StaticLinear()
          (o_proj): FP8StaticLinear()
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): FP8StaticLinear()
          (up_proj): FP8StaticLinear()
          (down_proj): FP8StaticLinear()
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=8192, out_features=128256, bias=False)
)
Saving the model to Meta-Llama-3-70B-Instruct-8bit-FP8-quant


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Store the tokenizer files in the same directory as the FP8 checkpoint
tokenizer.save_pretrained(quantized_model_name)

('Meta-Llama-3-70B-Instruct-8bit-FP8-quant/tokenizer_config.json',
 'Meta-Llama-3-70B-Instruct-8bit-FP8-quant/special_tokens_map.json',
 'Meta-Llama-3-70B-Instruct-8bit-FP8-quant/tokenizer.json')

In [8]:
# Push the quantized model to the Hugging Face Hub.
# To use use_auth_token=True, log in first via `huggingface-cli login`,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# NOTE(review): the recorded run of this cell fails on model.push_to_hub with
# "AttributeError: 'AutoFP8ForCausalLM' object has no attribute 'push_to_hub'",
# so the tokenizer upload below is never reached. The checkpoint written by
# save_quantized() presumably has to be uploaded another way (e.g. by uploading
# the saved directory) — TODO confirm and replace this call.
repo_id = f"{quantized_model_name}"
commit_message = f"AutoFP8 model for {model_name}: 8bits quantization with static activation scales"
model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

AttributeError: 'AutoFP8ForCausalLM' object has no attribute 'push_to_hub'

# GPTQ

In [1]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset, concatenate_datasets

import json
import random

In [2]:
# Alternative base model, kept for reference:
#pretrained_model_dir = "meta-llama/Llama-2-70b-chat-hf"
#quantized_model_dir = "Llama-2-70b-chat-8bit-GPTQ-quant"

# Base checkpoint to quantize and the output directory / repo name for the GPTQ checkpoint
pretrained_model_dir = "meta-llama/Meta-Llama-3-70B-Instruct"
quantized_model_dir = "Meta-Llama-3-70B-Instruct-8bit-GPTQ-quant"


In [3]:
def extract_question_content(file_path):
    """Read a question JSONL file and return the first-turn text of every record.

    Args:
        file_path: Path to a JSONL file in which every non-blank line is a JSON
            object holding the prompt under ``turns[0]["content"]``.

    Returns:
        list: The ``content`` values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError: If a record lacks the expected structure.
    """
    contents = []
    # Read as UTF-8 explicitly instead of relying on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) hold no record
            if not line.strip():
                continue
            data = json.loads(line)
            contents.append(data['turns'][0]['content'])
    return contents

In [4]:
def extract_answer_content(file_path):
    """Read an answer JSONL file and return the first answer text of every record.

    Args:
        file_path: Path to a JSONL file in which every non-blank line is a JSON
            object holding the answer under ``choices[0]["turns"][0]["content"]``.

    Returns:
        list: The ``content`` values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError: If a record lacks the expected structure.
    """
    contents = []
    # Read as UTF-8 explicitly instead of relying on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) hold no record
            if not line.strip():
                continue
            data = json.loads(line)
            contents.append(data['choices'][0]['turns'][0]['content'])
    return contents

In [5]:
# Paths of the question file and of the GPT-4o answer file whose texts are used as calibration data
question_file_path = 'arena_hard_tests/question.jsonl'
answer_file_path = 'arena_hard_tests/model_answers/gpt-4o.jsonl'

In [6]:
# Extract the question texts and the answer texts from the two JSONL files
questions = extract_question_content(question_file_path)
answers = extract_answer_content(answer_file_path)

In [7]:
# Report how many questions were loaded and preview a few of them instead of
# dumping the complete list into the notebook output
print(len(questions))
questions[:3]



In [8]:
# Draw the calibration texts: up to 250 questions plus up to 250 answers.
# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts without touching the global `random` state.
CALIBRATION_SEED = 42
SAMPLES_PER_SOURCE = 250
calibration_rng = random.Random(CALIBRATION_SEED)

# New names instead of rebinding `questions` / `answers`, so re-running this cell
# always samples from the complete lists.
sampled_questions = calibration_rng.sample(questions, min(SAMPLES_PER_SOURCE, len(questions)))
sampled_answers = calibration_rng.sample(answers, min(SAMPLES_PER_SOURCE, len(answers)))

sampled_data = sampled_questions + sampled_answers

# Show the size and a short preview rather than printing every text
print(len(sampled_data))
sampled_data[:3]

500


In [9]:
# Tokenizer of the base model; the EOS token serves as padding token
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir)
tokenizer.pad_token = tokenizer.eos_token

# Encode every calibration text on its own, padded / truncated to 1024 tokens
encode_kwargs = dict(padding='max_length', truncation=True, max_length=1024)
tokenized_examples = [tokenizer(text, **encode_kwargs) for text in sampled_data]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
quantize_config = BaseQuantizeConfig(
    bits=8,  # quantize model to 8-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, but perplexity may be slightly worse
)

In [11]:
# Load the un-quantized model; by default it is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)



Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00003-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00005-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00007-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00008-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00009-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00010-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00011-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00012-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00013-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00014-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00015-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00016-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00017-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00018-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Inspect the first tokenized calibration example
print(tokenized_examples[0])

{'input_ids': [128000, 17059, 685, 10991, 7761, 1749, 512, 998, 6205, 505, 264, 4288, 3977, 1630, 449, 281, 962, 840, 282, 55, 11, 2980, 2500, 4288, 198, 10014, 816, 449, 13072, 282, 56, 1174, 1778, 430, 1070, 6866, 264, 6926, 272, 871, 220, 15, 449, 198, 69, 55, 2120, 340, 69, 56, 320, 87, 340, 126863, 272, 1174, 55800, 87, 449, 282, 55, 2120, 8, 871, 220, 15, 16853, 6806, 20400, 379, 505, 279, 8141, 449, 17915, 734, 282, 56, 16853, 6806, 20400, 577, 505, 264, 14113, 320, 15, 11, 220, 16, 8, 8141, 627, 6806, 1442, 577, 38394, 282, 55, 7166, 25239, 9991, 56, 320, 88, 5850, 1243, 1935, 379, 439, 279, 12974, 49803, 26, 6062, 345, 693, 311, 3094, 220, 16, 627, 56, 1288, 387, 1054, 46122, 863, 311, 7068, 323, 272, 1288, 387, 1903, 439, 2678, 439, 3284, 13, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 12800

In [None]:
# Quantize the model; the examples must be a list of dicts whose only keys are
# "input_ids" and "attention_mask"
model.quantize(tokenized_examples)

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
INFO - Quantizing self_attn.v_proj in layer 1/32...
INFO - Quantizing self_attn.q_proj in layer 1/32...
INFO - Quantizing self_attn.o_proj in layer 1/32...
INFO - Quantizing mlp.up_proj in layer 1/32...
INFO - Quantizing mlp.gate_proj in layer 1/32...
INFO - Quantizing mlp.down_proj in layer 1/32...
INFO - Start quantizing layer 2/32
INFO - Quantizing self_attn.k_proj in layer 2/32...
INFO - Quantizing self_attn.v_proj in layer 2/32...
INFO - Quantizing self_attn.q_proj in layer 2/32...
INFO - Quantizing self_attn.o_proj in layer 2/32...
INFO - Quantizing mlp.up_proj in layer 2/32...
INFO - Quantizing mlp.gate_proj in layer 2/32...
INFO - Quantizing mlp.down_proj in layer 2/32...
INFO - Start quantizing layer 3/32
INFO - Quantizing self_attn.k_proj in layer 3/32...
INFO - Quantizing self_attn.v_proj in layer 3/32...
INFO - Quantizing self_attn.q_proj in layer 3/32...
INFO - Quantizing self_attn.o_pro

In [None]:
# Save the quantized model as safetensors and store the tokenizer files in the same directory
model.save_quantized(quantized_model_dir, use_safetensors=True)
tokenizer.save_pretrained(quantized_model_dir)

('Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant/tokenizer_config.json',
 'Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant/special_tokens_map.json',
 'Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant/tokenizer.json')

In [None]:
# Publish the quantized checkpoint and its tokenizer on the Hugging Face Hub.
# use_auth_token=True relies on a prior `huggingface-cli login`; an explicit
# token string (use_auth_token="hf_xxxxxxx") can be passed instead.
repo_id = f"{quantized_model_dir}"
commit_message = (
    f"AutoGPTQ model for {pretrained_model_dir}: "
    f"{quantize_config.bits}bits, gr{quantize_config.group_size}, "
    f"desc_act={quantize_config.desc_act}"
)
push_kwargs = dict(commit_message=commit_message, use_auth_token=True)
model.push_to_hub(repo_id, **push_kwargs)
tokenizer.push_to_hub(repo_id, **push_kwargs)

INFO - Uploading the following files to dwetzel/Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant: config.json,tokenizer.json,tokenizer_config.json,special_tokens_map.json,gptq_model-8bit-128g.safetensors,quantize_config.json


gptq_model-8bit-128g.safetensors:   0%|          | 0.00/9.25G [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/dwetzel/Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant/commit/75703a41f002a09ada8673595e2371f3095f29b8', commit_message='AutoGPTQ model for meta-llama/Meta-Llama-3-8B-Instruct: 8bits, gr128, desc_act=False', commit_description='', oid='75703a41f002a09ada8673595e2371f3095f29b8', pr_url=None, pr_revision=None, pr_num=None)

# Test Model (with vLLM) - Quantization Support in vLLM is currently not ideal

In [None]:
# Execute in Shell
#%%sh
#python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 --tensor-parallel-size 8 --max-model-len 4096 --gpu-memory-utilization 0.85 --enable-chunked-prefill --served-model-name llama3.1-70b-fp8

In [None]:
# Execute in Shell - For Docker
# %%sh
# docker run --runtime nvidia --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface --env "HUGGING_FACE_HUB_TOKEN=..." -p 8000:8000 --ipc=host vllm/vllm-openai:latest --model neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 --tensor-parallel-size 8 --max-model-len 4096 --gpu-memory-utilization 0.85 --enable-chunked-prefill --served-model-name llama3.1-70b-fp8

In [36]:
from openai import OpenAI

from datasets import load_dataset, concatenate_datasets
from transformers import  AutoTokenizer

import torch
import json
from time import time
import pandas as pd

In [37]:
class bcolors:
    """ANSI escape sequences for coloring and styling terminal / notebook output.

    Concatenate a code in front of a text and append ``ENDC`` to reset the style.
    """

    # Semantic shortcuts (bright foreground colors)
    HEADER = '\x1b[95m'
    OKBLUE = '\x1b[94m'
    OKCYAN = '\x1b[96m'
    OKGREEN = '\x1b[92m'
    WARNING = '\x1b[93m'
    FAIL = '\x1b[91m'

    # Reset and text attributes
    ENDC = '\x1b[0m'
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

    # Background colors
    CBLACKBG = '\x1b[40m'
    CREDBG = '\x1b[41m'
    CGREENBG = '\x1b[42m'
    CYELLOWBG = '\x1b[43m'
    CBLUEBG = '\x1b[44m'
    CVIOLETBG = '\x1b[45m'
    CBEIGEBG = '\x1b[46m'
    CWHITEBG = '\x1b[47m'

    # Foreground colors
    CBLACK = '\x1b[30m'
    CRED = '\x1b[31m'
    CGREEN = '\x1b[32m'
    CYELLOW = '\x1b[33m'
    CBLUE = '\x1b[34m'
    CVIOLET = '\x1b[35m'
    CBEIGE = '\x1b[36m'
    CWHITE = '\x1b[37m'

In [38]:
%%sh 
nvidia-smi

Sun Aug  4 20:58:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      On  | 00000000:9F:00.0 Off |                    0 |
| N/A   62C    P0              30W /  72W |  21650MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA L4                      On  | 00000000:A1:00.0 Off |  

|   4  NVIDIA L4                      On  | 00000000:AE:00.0 Off |                    0 |
| N/A   64C    P0              31W /  72W |  21650MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA L4                      On  | 00000000:B0:00.0 Off |                    0 |
| N/A   59C    P0              30W /  72W |  21650MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   6  NVIDIA L4                      On  | 00000000:B2:00.0 Off |                    0 |
| N/A   60C    P0              30W /  72W |  21650MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+---------

In [39]:
# Name of the model to request from the server; alternatives kept for reference
#model_name = "dwetzel/Meta-Llama-3-8B-Instruct-8bit-GPTQ-quant"
#model_name = "llama3.1-70b-fp8"
model_name = "llama3.1-70b"

# Point the OpenAI client at the local vLLM API server instead of OpenAI's API.
# Without an explicit api_key the client would fall back to os.environ.get("OPENAI_API_KEY").
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [40]:
# Smoke test: send a single greeting (non-streaming) and print the returned message object
prompt = [{"role": "user", "content": "Hello!"}]

completion = client.chat.completions.create(
  model=model_name,
  messages=prompt
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Hello. Is there something I can help you with or would you like to chat?', role='assistant', function_call=None, tool_calls=[])


In [41]:
def print_prompt(prompt, completion, with_system=False):
    """Pretty-print a chat prompt followed by the model's reply.

    Args:
        prompt: List of chat messages, each a dict with 'role' and 'content'.
        completion: Chat completion; the message of its first choice is printed.
        with_system: When True, the non-user messages of the prompt are printed
            as well (in violet); otherwise only user messages are shown.
    """
    banner = "=" * 30
    print(banner + f" Chat with  --- {model_name} ---  LLM using vLLM " + banner + "\n")

    for message in prompt:
        role = message['role']
        is_user = role == 'user'
        # Non-user messages are only shown on request
        if not is_user and not with_system:
            continue
        color = bcolors.CBLUE if is_user else bcolors.CVIOLET
        print(color + f"[ {role.upper()} ]" + bcolors.ENDC)
        print(message['content'] + "\n")

    reply = completion.choices[0].message
    print(bcolors.OKGREEN + f"[ {reply.role.upper()} ]" + bcolors.ENDC)
    print(reply.content)

In [42]:
# Render the greeting exchange from the smoke test
print_prompt(prompt, completion)


[34m[ USER ][0m
Hello!

[92m[ ASSISTANT ][0m
Hello. Is there something I can help you with or would you like to chat?


In [43]:
# Build a two-message chat (system + user); the user text wraps the instruction
# in a "###Question: ... ###Answer:" frame
instruction = "Who are you and how can you help me?\n\n Also can you help me with the following question: \n\n What is the Distance from Pluto to the Sun?"
user_message = f"###Question:\n{instruction} \n\n ###Answer:"

question = [
    {"role": "system", "content": "You are a helpful assistant. Your task is to help the user with the following question:"},
    {"role": "user", "content": user_message},
]

In [44]:

## GEMMA 2 does not support the "system" role, so we will only use the "user" role.

# system = "You are a helpful assistant. Your task is to help the user with the following question:"
# instruction = "Who are you and how can you help me?\n\n Also can you help me with the following question: \n\n What is the Distance from Pluto to the Sun?"
# user_message = f"###SYSTEM:\n {system} \n\n ###QUESTION:\n{instruction} \n\n ###ANSWER:"

# question = [
#     {"role": "user", "content": user_message},
# ]

In [45]:
def print_stream(prompt, stream, with_system=False):
    """Pretty-print a chat prompt and then the streamed reply as it arrives.

    Args:
        prompt: List of chat messages, each a dict with 'role' and 'content'.
        stream: Iterable of streaming chunks; the text of each chunk is read from
            ``chunk.choices[0].delta.content``.
        with_system: When True, the non-user messages of the prompt are printed
            as well (in violet); otherwise only user messages are shown.
    """
    banner = "=" * 30
    print(banner + f" Chat with  --- {model_name} ---  LLM using vLLM " + banner + "\n")

    for message in prompt:
        role = message['role']
        is_user = role == 'user'
        # Non-user messages are only shown on request
        if not is_user and not with_system:
            continue
        color = bcolors.CBLUE if is_user else bcolors.CVIOLET
        print(color + f"[ {role.upper()} ]" + bcolors.ENDC)
        print(message['content'] + "\n")

    print(bcolors.OKGREEN + "[ ASSISTANT ]" + bcolors.ENDC + "\n")
    for chunk in stream:
        text = chunk.choices[0].delta.content
        # Chunks without text carry nothing to print
        if text is None:
            continue
        print(text, end="")

In [46]:
# Request a streamed answer for the question prompt and print it chunk by chunk.
# min_p is handed over via extra_body, i.e. as an extra field of the request body.
stream = client.chat.completions.create(
    model=model_name,
    messages=question,
    stream=True, 
    extra_body={
        "min_p" : 0.05 # If it's set to 0.05, then it will allow tokens at least 1/20th as probable as the top token to be generated.
    }
)
print_stream(question , stream)


[34m[ USER ][0m
###Question:
Who are you and how can you help me?

 Also can you help me with the following question: 

 What is the Distance from Pluto to the Sun? 

 ###Answer:

[92m[ ASSISTANT ][0m

**I'm an AI Assistant**

I'm an artificial intelligence language model designed to assist and provide helpful information to users. I can help with a wide range of topics, from answering simple questions to providing more in-depth explanations and guidance.

**How can I help you?**

I can assist with:

* Answering general knowledge questions
* Providing definitions and explanations
* Offering suggestions and recommendations
* Helping with language-related tasks, such as proofreading and text analysis
* Generating text and responding to prompts

**Now, about your question...**

**What is the Distance from Pluto to the Sun?**

The average distance from Pluto to the Sun is approximately **3.67 billion miles (5.9 billion kilometers)**. This is also known as an astronomical unit (AU). Ho

In [47]:
def load_questions_to_df(question_file: str):
    """Load questions from a JSONL file into a DataFrame.

    Args:
        question_file: Path to a JSONL file; every non-blank line is a JSON object
            with ``question_id``, ``cluster`` and the prompt under
            ``turns[0]["content"]``.

    Returns:
        pd.DataFrame: One row per record in file order, with the columns
        ``question_id``, ``content`` and ``cluster``. A file without records
        yields an empty frame that still has these columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError: If a record lacks the expected structure.
    """
    records = []
    with open(question_file, "r", encoding="utf-8") as ques_file:
        for line in ques_file:
            # Lines keep their trailing newline, so a bare truthiness check never
            # filters anything; strip first to really skip blank lines.
            if not line.strip():
                continue
            question = json.loads(line)
            records.append({
                "question_id": question["question_id"],
                "content": question["turns"][0]["content"],
                "cluster": question["cluster"],
            })

    return pd.DataFrame(records, columns=["question_id", "content", "cluster"])

In [48]:
# Load the questions and preview the first rows
question_df = load_questions_to_df("arenahard_question.jsonl")
question_df.head()

Unnamed: 0,question_id,content,cluster
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges
3,9f25ff7c0d6a4d74846bfe76af8d925c,Design a semikinematic mounting for a right an...,AI & Sequence Alignment Challenges
4,04ba0aeb79524f6c8520d47cada34f25,I have a dataset which contains a list of 2D i...,AI Image Upscaling


In [49]:
def prepare_prompts(df):
    """Build one chat prompt per question and return them in a ``prompt`` column.

    Args:
        df: DataFrame with a ``cluster`` and a ``content`` column.

    Returns:
        pd.DataFrame: A copy of ``df`` with an added ``prompt`` column. Each cell
        holds a list of two chat messages (system, user). The input frame is not
        modified, so the calling cell can be re-run safely.
    """
    prompts = []

    for _, row in df.iterrows():
        # The leading newline and the indentation inside this literal are part of
        # the system prompt text as it is sent to the model.
        system_prompt = f"""
        You are a sophisticated AI-Expert there to help users solve tasks in several domains efficiently and accurately.
        Now solve the following task from the domain "{row['cluster']}".\n
        """

        user_message = f"{row['content']}"

        prompts.append([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ])

    # Attach the prompts to a copy instead of mutating the caller's frame
    result = df.copy()
    result['prompt'] = prompts
    return result

In [50]:
# Add the chat prompts to the question frame and preview the result
question_df = prepare_prompts(question_df)
question_df.head()

Unnamed: 0,question_id,content,cluster,prompt
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups,"[{'role': 'system', 'content': '  You a..."
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups,"[{'role': 'system', 'content': '  You a..."
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges,"[{'role': 'system', 'content': '  You a..."
3,9f25ff7c0d6a4d74846bfe76af8d925c,Design a semikinematic mounting for a right an...,AI & Sequence Alignment Challenges,"[{'role': 'system', 'content': '  You a..."
4,04ba0aeb79524f6c8520d47cada34f25,I have a dataset which contains a list of 2D i...,AI Image Upscaling,"[{'role': 'system', 'content': '  You a..."


In [51]:
def store_results_to_df(df, outputs, model_id):
    """Attach model outputs to a copy of the question DataFrame.

    Nothing is written to disk; persisting the returned frame is up to the caller.

    Args:
        df: DataFrame with one row per output.
        outputs: One entry per row. Each entry is either a dict with a
            ``'generated_text'`` key or a list/tuple whose first element is such
            a dict (the nesting produced by batched pipeline calls).
        model_id: Identifier stored for every row.

    Returns:
        pd.DataFrame: Copy of ``df`` with the added columns ``answer_id``
        (1-based running number), ``model_id``, ``answer``, ``token_len``
        (number of whitespace-separated words in the answer) and ``timestamp``
        (value of ``time()`` at the moment of the call).
    """
    answers = []
    for output in outputs:
        # Unwrap the per-prompt list that batched pipeline calls return
        if isinstance(output, (list, tuple)):
            output = output[0]
        answers.append(output['generated_text'])

    # Work on a copy so the caller's frame is not modified
    result = df.copy()
    result['answer_id'] = list(range(1, len(answers) + 1))
    result['model_id'] = model_id
    result['answer'] = answers
    result['token_len'] = [len(answer.split()) for answer in answers]
    result['timestamp'] = time()

    return result

In [52]:
# Keep only the first three questions for a quick streaming test
question_df_subset = question_df.head(3)
question_df_subset

Unnamed: 0,question_id,content,cluster,prompt
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups,"[{'role': 'system', 'content': '  You a..."
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups,"[{'role': 'system', 'content': '  You a..."
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges,"[{'role': 'system', 'content': '  You a..."


In [53]:
# Stream an answer for every question of the subset.
# The counter comes from enumerate() rather than from the DataFrame index label, so
# the "Question i of n" header stays correct for frames whose index is not 0..n-1.
total_questions = len(question_df_subset)
for position, (_, row) in enumerate(question_df_subset.iterrows(), start=1):
    print("\n\n\n" + "-"*10 + f"Question {position} of {total_questions}" + "-"*10)
    question = row['prompt']
    stream = client.chat.completions.create(
        model=model_name,
        messages=question,
        stream=True,
        extra_body={
            "min_p": 0.05  # allow tokens at least 1/20th as probable as the top token
        }
    )
    print_stream(question, stream)




----------Question 1 of 3----------

[34m[ USER ][0m
Use ABC notation to write a melody in the style of a folk tune.

[92m[ ASSISTANT ][0m

Here's a melody in the style of a folk tune written in ABC notation:

X:1
T:Folk Tune
M:4/4
L:1/8
R:folk
K:C
C2 E2 G2 G2 | A2 G2 F2 E2 | D2 D2 E2 G2 | C2 E2 G2 G2 |
A2 G2 F2 E2 | D2 D2 E2 G2 | E2 D2 C2 E2 | G2 G2 F2 E2 |
D2 D2 E2 G2 | C2 E2 G2 G2 | A2 G2 F2 E2 | D2 D2 E2 G2 |
E2 D2 C2 E2 | G2 G2 F2 E2 |

Let's break down this ABC notation:

- `X:1` is the reference number for the tune.
- `T:Folk Tune` is the title of the tune.
- `M:4/4` specifies the time signature (common time).
- `L:1/8` specifies the default note length (an eighth note).
- `R:folk` specifies the rhythm (folk style).
- `K:C` specifies the key (C major).

The vertical bars (`|`) separate measures. The letters `C`, `E`, `G`, `A`, `D`, `F` represent the corresponding musical notes. The numbers `2` after the notes specify that they are eighth notes.

You can play around with t

# Test Model with Transformers Pipeline

In [1]:
from transformers import  AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import json
from time import time
import pandas as pd

In [2]:
# Checkpoint that is loaded for the Transformers pipeline test
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"

In [3]:
# Load the tokenizer with left-side padding and reuse the BOS token as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token = tokenizer.bos_token

In [4]:
# Load the model in bfloat16 with device_map="auto".
# Flash Attention 2 is NOT enabled: attn_implementation is commented out below.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    #attn_implementation="flash_attention_2", #FlashAttention only supports Ampere GPUs or newer.
)
# Use the tokenizer's padding token during generation as well
model.generation_config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
# Initialize the text-generation pipeline around the already loaded model and tokenizer
text_generation_pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer, device_map="auto", batch_size=64, num_workers=48 )

In [6]:
def query_model_pipeline(prompt_list, temperature=0.8, min_p=0.05, max_length=2048):
    """Run the text-generation pipeline on the prompts and time the call.

    Args:
        prompt_list: Prompts handed to ``text_generation_pipeline``.
        temperature: Sampling temperature.
        min_p: Value forwarded as the ``min_p`` sampling argument.
        max_length: Forwarded as ``max_new_tokens``.

    Returns:
        tuple: ``(outputs, ttime)`` with the pipeline result and the elapsed
        wall-clock time in seconds.
    """
    started = time()

    # Generation stops at the tokenizer's EOS token or at the "<|eot_id|>" token
    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    generation_kwargs = dict(
        do_sample=True,
        min_p=min_p,
        temperature=temperature,
        eos_token_id=stop_token_ids,
        max_new_tokens=max_length,
        return_full_text=False,  # only the newly generated text is returned
        pad_token_id=tokenizer.pad_token_id,
    )
    outputs = text_generation_pipeline(prompt_list, **generation_kwargs)

    ttime = time() - started
    return outputs, ttime

In [7]:
def load_questions_to_df(question_file: str):
    """Load questions from a JSONL file into a DataFrame.

    Args:
        question_file: Path to a JSONL file; every non-blank line is a JSON object
            with ``question_id``, ``cluster`` and the prompt under
            ``turns[0]["content"]``.

    Returns:
        pd.DataFrame: One row per record in file order, with the columns
        ``question_id``, ``content`` and ``cluster``. A file without records
        yields an empty frame that still has these columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError: If a record lacks the expected structure.
    """
    records = []
    with open(question_file, "r", encoding="utf-8") as ques_file:
        for line in ques_file:
            # Lines keep their trailing newline, so a bare truthiness check never
            # filters anything; strip first to really skip blank lines.
            if not line.strip():
                continue
            question = json.loads(line)
            records.append({
                "question_id": question["question_id"],
                "content": question["turns"][0]["content"],
                "cluster": question["cluster"],
            })

    return pd.DataFrame(records, columns=["question_id", "content", "cluster"])


In [8]:
# Load the questions and preview the first rows
question_df = load_questions_to_df("arenahard_question.jsonl")
question_df.head()

Unnamed: 0,question_id,content,cluster
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges
3,9f25ff7c0d6a4d74846bfe76af8d925c,Design a semikinematic mounting for a right an...,AI & Sequence Alignment Challenges
4,04ba0aeb79524f6c8520d47cada34f25,I have a dataset which contains a list of 2D i...,AI Image Upscaling


In [9]:
        # Follow these guidelines when responding to user instructions:
        
        # 1. **Understand the Task**: Carefully read and comprehend the given instruction or task description.
        # 2. **Generate a Plan**: Create a Detailed Plan to solve the task, including all necessary steps and considerations.
        # 3. **Solve the Task Step by Step**: Implement the plan step by step, ensuring that each step is accurate and complete. Write down each step and the corresponding output.
        # 4. **Create an Answer Summary**: Summarize the Results of each Step and come up with a Final Solution for the Task.
        # 5. **Critically Review the Solution**: Review the Final Solution and ensure that it is accurate, complete, and well-structured.

In [10]:
def prepare_prompts(df):
    """Render one chat-formatted prompt per question row.

    For every row a system message (naming the row's ``cluster`` as the task
    domain) and a user message (the row's ``content``) are passed through the
    module-level ``tokenizer``'s chat template with the generation prompt
    appended. The rendered strings are written to a new ``prompt`` column.

    Args:
        df: DataFrame with at least the columns ``cluster`` and ``content``.

    Returns:
        The same DataFrame object, modified in place with the ``prompt`` column.
    """
    rendered = []

    for _, record in df.iterrows():
        # The literal below is runtime text: its line breaks and leading
        # spaces are part of the system message and are kept verbatim.
        system_text = f"""
        You are a sophisticated AI-Expert there to help users solve tasks in several domains efficiently and accurately.
        Now solve the following task from the domain "{record['cluster']}".\n
        """

        chat = [
            {"role": "system", "content": system_text},
            {"role": "user", "content": f"{record['content']}"},
        ]

        # tokenize=False returns the templated text rather than token ids.
        rendered.append(
            tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=True,
            )
        )

    df['prompt'] = rendered
    return df

In [11]:
# Add the chat-templated 'prompt' column to the questions and preview the result.
question_df = prepare_prompts(question_df)
question_df.head()

Unnamed: 0,question_id,content,cluster,prompt
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups,<|begin_of_text|><|start_header_id|>system<|en...
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups,<|begin_of_text|><|start_header_id|>system<|en...
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges,<|begin_of_text|><|start_header_id|>system<|en...
3,9f25ff7c0d6a4d74846bfe76af8d925c,Design a semikinematic mounting for a right an...,AI & Sequence Alignment Challenges,<|begin_of_text|><|start_header_id|>system<|en...
4,04ba0aeb79524f6c8520d47cada34f25,I have a dataset which contains a list of 2D i...,AI Image Upscaling,<|begin_of_text|><|start_header_id|>system<|en...


In [12]:
def store_results_to_df(df, outputs, model_id):
    """Attach model outputs and run metadata to the question DataFrame.

    Nothing is written to disk: the columns are added to ``df`` in place and
    the same object is returned.

    Args:
        df: DataFrame with one row per prompt, in the same order as ``outputs``.
        outputs: One entry per prompt. An entry is either a dict with a
            ``generated_text`` key or a non-empty list/tuple of such dicts
            (the nested shape this notebook indexes as ``outputs[i][0]``);
            for a list only the first candidate is used.
        model_id: Identifier written to the ``model_id`` column of every row.

    Returns:
        ``df`` with the added columns ``answer_id`` (1-based position),
        ``model_id``, ``answer``, ``token_len`` and ``timestamp``.

    Raises:
        ValueError: Raised by pandas when ``len(outputs)`` differs from ``len(df)``.
    """
    def _generated_text(output):
        # Accept both the flat dict shape and the nested per-prompt list shape.
        first = output[0] if isinstance(output, (list, tuple)) else output
        return first['generated_text']

    answers = [_generated_text(output) for output in outputs]

    df['answer_id'] = list(range(1, len(answers) + 1))
    df['model_id'] = model_id
    df['answer'] = answers
    # Whitespace-separated word count, not a tokenizer-based token count.
    df['token_len'] = [len(answer.split()) for answer in answers]
    # A single time() reading is shared by all rows of this batch.
    df['timestamp'] = time()

    return df

In [13]:
# Work on an explicit copy of the first three questions: head() returns a slice
# of question_df, and assigning new columns to such a slice can trigger pandas'
# SettingWithCopyWarning.
question_df_subset = question_df.head(3).copy()
question_df_subset

Unnamed: 0,question_id,content,cluster,prompt
0,328c149ed45a41c0b9d6f14659e63599,Use ABC notation to write a melody in the styl...,ABC Sequence Puzzles & Groups,<|begin_of_text|><|start_header_id|>system<|en...
1,b43c07656ead4150b360294ee932b410,SOLVE THIS IN C++ : There are three cards with...,ABC Sequence Puzzles & Groups,<|begin_of_text|><|start_header_id|>system<|en...
2,1f07cf6d146d4038b2b93aaba3935ce0,Explain the book the Alignment problem by Bria...,AI & Sequence Alignment Challenges,<|begin_of_text|><|start_header_id|>system<|en...


In [14]:
# Run the pipeline on the selected prompts and report the timing.
prompts = question_df_subset['prompt'].tolist()
outputs, inference_time = query_model_pipeline(prompts)
print(f"Total Inference Time: {inference_time:.2f} seconds \n")
print(f"Finished total number of {len(outputs)} prompts \n")

# Emit one QUESTION/ANSWER section per prompt instead of hard-coding indices
# 0-2, so the cell keeps working when the subset holds fewer or more prompts.
# Each entry of `outputs` is indexed with [0] before 'generated_text' is read.
qa_sections = "".join(
    f"\n\n QUESTION:\n {prompt}\n\n ANSWER:\n {output[0]['generated_text']} \n\n"
    for prompt, output in zip(prompts, outputs)
)
print("="*20 + "Output" + "="*20 + qa_sections)


Total Inference Time: 455.45 seconds 

Finished total number of 3 prompts 


 QUESTION:
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a sophisticated AI-Expert there to help users solve tasks in several domains efficiently and accurately.
        Now solve the following task from the domain "ABC Sequence Puzzles & Groups".<|eot_id|><|start_header_id|>user<|end_header_id|>

Use ABC notation to write a melody in the style of a folk tune.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



 ANSWER:
 What a delightful task!

Here's a melody in the style of a folk tune, written in ABC notation:

X:1
T:Folkish Delight
M:4/4
L:1/8
R:Folk Tune
K:C
C E G c | e g a g | f e d c | d e g a |
g a g f | e d c d | e g a g | f e d c |
d e g a | g a g f | e d c d | e g a g |
f e d c | d e g a | g a g f | e d c d |

Let me explain what each line represents:

* `X:1`: This is the file header, indicating that this is an ABC file, version 1.
* `T:Folkish Delight`: This is the t

In [None]:
# Benchmark size: total_prompts is the number of runs times the prompts per run.
# NOTE(review): `words` is not defined in the visible part of this notebook —
# confirm where it is supposed to come from before running this cell.
runs = 150
num_prompts = len(words)
total_prompts = runs * num_prompts


# Running token totals; the loop over `outputs` further down adds to them.
total_input_tok = 0
total_output_tok = 0

# Announce the test configuration.
print("="*10 + f" INFERENCE TEST with {model_name}" + "="*10 + 
"\n\n" + 
f"""
Starting Test with {runs} Runs and {num_prompts} Prompts / Run. \n
Total Prompts: {total_prompts}\n\n
""")

# Run label reused for the W&B run, the emissions tracker project and the CSV path.
# NOTE(review): if model_name contains "/" (as a Hub id such as "org/model"
# does), `name` does too and the CSV path built from it gains an extra
# directory level — confirm this is intended.
name=f"vLLM_{model_name}_1GPUs"

# NOTE(review): prepare_prompts(df) as defined earlier in this notebook takes a
# single DataFrame argument; this call passes runs/num_examples/multiply_by and
# looks like it targets a different prepare_prompts — TODO confirm.
prompts = prepare_prompts(words, runs=runs, num_examples=5, multiply_by=1)


# Open the Weights & Biases run that receives this benchmark's metrics.
wandb.init(
    # Project the run is logged under.
    project="Inference_Params_Comp",
    name=name,
    # Hyperparameters and run metadata stored alongside the run.
    config=dict(
        runs=runs,
        num_prompts=num_prompts,
        total_prompts=total_prompts,
        framework='vLLM',
        model=model_name,
        num_gpus=1,
    ),
)

# Track the emissions of the inference call; results are also written to
# emissions_params.csv by the tracker (save_to_file=True).
tracker = EmissionsTracker(
    save_to_file=True,
    project_name=name,
    log_level="error",
    pue=1.22,
    output_file="emissions_params.csv",
)
tracker.start()

try:
    outputs, ttime = query_model_vllm(prompts, max_length=100*5)
finally:
    # Stop the tracker even when inference raises, so a failed run does not
    # leave a started tracker behind in the kernel.
    emissions: float = tracker.stop()



# Add the prompt and completion token counts of every result to the running
# totals. Only the first completion (outputs[0]) of each result is counted.
# The prompt text and generated text are not needed for the metrics, so they
# are no longer copied into throwaway notebook-level variables.
for output in outputs:
    total_input_tok += len(output.prompt_token_ids)
    total_output_tok += len(output.outputs[0].token_ids)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if not denominator:
        return float("nan")
    return numerator / denominator


# Calculate averages. A zero denominator (no prompts, no generated tokens, or
# a zero elapsed time) yields NaN instead of raising ZeroDivisionError, so the
# reporting and export steps that follow still run after the inference pass.
avg_time_per_prompt = _ratio(ttime, total_prompts) * 1000
avg_toks_per_sec = _ratio(total_output_tok, ttime)
avg_input_tokens = _ratio(total_input_tok, total_prompts)
avg_output_tokens = _ratio(total_output_tok, total_prompts)

# Emissions normalised per 1,000,000 input tokens, per 1,000,000 output tokens
# and per 10,000 prompts.
em_i = _ratio(emissions, total_input_tok) * 1_000_000
em_o = _ratio(emissions, total_output_tok) * 1_000_000
em_p = _ratio(emissions, total_prompts) * 10_000

# Print the run summary: timing and token statistics first, emissions second.
# The unit label is written as "CO₂eq"; the previous text contained a
# mis-encoded subscript two.
print("="*15 + f" RESULTS for {name} " + "="*15 +
    "\n\n" +
    f"""
    Finished {runs} Runs with {num_prompts} Prompts/Run.\n\n
    Total Time: {ttime:.2f}s, AVG/Prompt: {avg_time_per_prompt:.2f}ms\n\n
    Average tokens per second: {avg_toks_per_sec:.2f}\n\n
    Total Prompts: {total_prompts}\n
    Total Input Tokens: {total_input_tok}, AVG/Prompt: {avg_input_tokens}\n
    Total Output Tokens: {total_output_tok}, AVG/Prompt: {avg_output_tokens}\n
    """ +

    "-"*50 + "\n" +

    f"""
    Total Inference Emissions: {emissions:.3f}kg CO₂eq\n\n
    Emissions / 1.000.000 Input Tokens: {em_i:.3f}kg CO₂eq\n
    Emissions / 1.000.000 Output Tokens: {em_o:.3f}kg CO₂eq\n
    Emissions / 10.000 Prompts: {em_p:.3f}kg CO₂eq\n

    """
    )

# Send the summary metrics to the active W&B run, then close the run.
wandb.log(
    {
        # Timing and throughput
        "Total Time": ttime,
        "AVG. Time / Prompt": avg_time_per_prompt,
        "AVG. Tokens / Second": avg_toks_per_sec,
        # Token statistics
        "AVG. Input Tokens": avg_input_tokens,
        "AVG. Output Tokens": avg_output_tokens,
        # Emissions, absolute and normalised
        "Total Emissions": emissions,
        "Emissions / 1.000.000 Input Tokens": em_i,
        "Emissions / 1.000.000 Output Tokens": em_o,
        "Emissions / 10.000 Prompts": em_p,
    }
)

wandb.finish()

# Export the summary as a two-column (Metric, Value) CSV report.
results = [
    # Run configuration
    ["Runs", runs],
    ["Prompts / Run", num_prompts],
    ["Total Prompts", total_prompts],
    # Timing and throughput
    ["Total Time", ttime],
    ["AVG. Time / Prompt", avg_time_per_prompt],
    ["AVG. Tokens / Second", avg_toks_per_sec],
    # Token statistics
    ["Total Input Tokens", total_input_tok],
    ["AVG. Input Tokens / Prompt", avg_input_tokens],
    ["Total Output Tokens", total_output_tok],
    ["AVG. Output Tokens / Prompt", avg_output_tokens],
    # Emissions, absolute and normalised
    ["Total Emissions", emissions],
    ["Emissions / 1.000.000 Input Tokens", em_i],
    ["Emissions / 1.000.000 Output Tokens", em_o],
    ["Emissions / 10.000 Prompts", em_p],
]

# Create the target directory (derived from the file path) if it is missing.
output_file_path = f"emission_data/{name}_emission_data.csv"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# newline='' leaves line-ending handling to the csv module.
with open(output_file_path, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows([["Metric", "Value"], *results])

print(f"Results saved to {output_file_path}\n\n")