In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [2]:

import torch
from transformers import  AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

from codecarbon import EmissionsTracker
from time import time
import csv
from vllm import LLM, SamplingParams
from openai import OpenAI

import wandb
import gc

In [3]:
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


In [4]:
# Show active GPUs - pyTorch Stores the number of GPUs once Cuda ist firstly initialized. Therefore this needs to be executed on first startup
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPUs")


Found 2 GPUs


In [5]:
#model_name = "meta-llama/CodeLlama-7b-Instruct-hf"
model_name = "meta-llama/CodeLlama-13b-Instruct-hf"
#model_name = "meta-llama/CodeLlama-34b-Instruct-hf"
#model_name = "meta-llama/CodeLlama-70b-Instruct-hf"

In [6]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    CBLACKBG  = '\33[40m'
    CREDBG    = '\33[41m'
    CGREENBG  = '\33[42m'
    CYELLOWBG = '\33[43m'
    CBLUEBG   = '\33[44m'
    CVIOLETBG = '\33[45m'
    CBEIGEBG  = '\33[46m'
    CWHITEBG  = '\33[47m'
    CBLACK  = '\33[30m'
    CRED    = '\33[31m'
    CGREEN  = '\33[32m'
    CYELLOW = '\33[33m'
    CBLUE   = '\33[34m'
    CVIOLET = '\33[35m'
    CBEIGE  = '\33[36m'
    CWHITE  = '\33[37m'

# Preparing the Test Data

In [7]:
# List of words that should be used to create sentences.

words = [
    "apple", "book", "car", "dog", "elephant", "forest", "guitar", "house", 
    "island", "jacket", "kangaroo", "lamp", "mountain", "notebook", "ocean", 
    "pencil", "queen", "river", "star", "tree", "umbrella", "village", 
    "window", "xylophone", "yacht", "zebra", "balloon", "camera", "desert", 
    "engine", "flower", "garden", "honey", "iceberg", "jungle", "kite", 
    "ladder", "moon", "nest", "octopus", "pirate", "quilt", "robot", "swan", 
    "telescope", "unicorn", "violin", "whale", "x-ray", "laptop"
]

In [8]:
# List of examples to be provided in the system prompt

example_sentences = [
    "Why did the bicycle fall over? Because it was two-tired!",
    "Why don't scientists trust atoms? Because they make up everything!",
    "Why did the scarecrow win an award? Because he was outstanding in his field!",
    "Why don't skeletons fight each other? They don't have the guts!",
    "Why did the computer go to the doctor? Because it had a virus!",
    "Why was the math book sad? Because it had too many problems!",
    "Why did the coffee file a police report? It got mugged!",
    "Why did the tomato turn red? Because it saw the salad dressing!",
    "Why don't eggs tell jokes? They might crack up!",
    "Why did the golfer bring two pairs of pants? In case he got a hole in one!",
    "Why do cows wear bells? Because their horns don't work!",
    "Why don't some couples go to the gym? Because some relationships don't work out!",
    "Why did the photo go to jail? It was framed!",
    "Why don't programmers like nature? It has too many bugs!",
    "Why did the bicycle stand up by itself? It was two-tired!",
    "Why did the music teacher need a ladder? To reach the high notes!",
    "Why did the cookie go to the doctor? Because it felt crummy!",
    "Why did the student eat his homework? Because his teacher told him it was a piece of cake!",
    "Why don't oysters donate to charity? Because they are shellfish!",
    "Why did the broom get a promotion? Because it swept the competition!",
    "Why don't we see elephants hiding in trees? Because they are so good at it!",
    "Why did the fish blush? Because it saw the ocean's bottom!",
    "Why did the barber win the race? Because he knew all the shortcuts!",
    "Why did the banana go to the doctor? Because it wasn't peeling well!",
    "Why don't dinosaurs talk? Because they are extinct!",
    "Why did the clock go back to school? To learn about time management!",
    "Why did the farmer win an award? Because he was outstanding in his field!",
    "Why did the astronaut break up with his girlfriend? He needed space!",
    "Why did the shoe go to the party alone? Because it didn't want to be a pair!",
    "Why did the tree go to the dentist? To get its roots checked!",
    "Why did the calendar go on a diet? It wanted to lose some days!",
    "Why did the stadium get hot? All the fans left!",
    "Why did the belt go to jail? It held up a pair of pants!",
    "Why did the cookie cry? Because its mom was a wafer too long!",
    "Why did the cow jump over the moon? To get to the milky way!",
    "Why did the skeleton go to the party alone? He had no body to go with!",
    "Why did the grape stop in the middle of the road? Because it ran out of juice!",
    "Why did the bee get married? Because he found his honey!",
    "Why did the soccer ball quit the team? It was tired of being kicked around!",
    "Why did the traffic light turn red? You would too if you had to change in the middle of the street!",
    "Why did the scarecrow become a successful neurosurgeon? He was outstanding in his field!",
    "Why did the chicken cross the playground? To get to the other slide!",
    "Why did the belt get a promotion? It was a cinch!",
    "Why did the teacher wear sunglasses? Because her students were so bright!",
    "Why did the clock get kicked out of class? It was tocking too much!",
    "Why did the frog call his insurance company? He had a jump in his car!",
    "Why did the keyboard get a speeding ticket? It had a problem with the space bar!",
    "Why did the painting go to art school? It wanted to brush up on its skills!",
    "Why did the scientist install a knocker on his door? He wanted to win the No-bell prize!",
    "Why did the tomato turn green? Because it was embarrassed to ketchup!",
    "Why did the pencil go to jail? It was caught in a sketchy situation!",
    "Why did the music note need a loan? It needed some major funding!",
    "Why did the fisherman put peanut butter into the sea? To go with the jellyfish!",
    "Why did the bicycle bring a map? Because it didn't want to get lost on its wheel-y big adventure!",
    "Why did the pizza go to the party? It wanted to slice up the dance floor!",
    "Why did the phone sit on a bench? It wanted to recharge its batteries!",
    "Why did the blanket get arrested? It was covering up a crime!",
    "Why did the chef go to jail? Because he beat the eggs and whipped the cream!",
    "Why did the sandwich go to the beach? To get a little bologna-sun!",
    "Why did the banker switch careers? He lost interest!",
    "Why did the shovel go to therapy? It had too much dirt on its mind!",
    "Why did the soccer player bring string to the game? So he could tie the score!",
    "Why did the alarm clock break up with the pillow? It couldn't handle the pressure!",
    "Why did the frog take the bus to work? His car got toad away!",
    "Why did the dentist become a gardener? He wanted to brush up on his roots!",
    "Why did the light bulb fail his test? He wasn't too bright!",
    "Why did the ocean break up with the shore? It needed some space to tide things over!",
    "Why did the shoe store close down? It lost its sole!",
    "Why did the banana go out with the prune? Because it couldn't find a date!",
    "Why did the carpenter become a musician? He wanted to nail every note!"
]


In [9]:
def prepare_prompts(word_list, runs=1, num_examples=1, multiply_by=1):
    system_message = f"""
    You are an AI assistant designed to write python code that prints out a number of short jokes.
    Generate python code that prints out {num_examples*multiply_by} short joke(s) that are exactly 2 short sentences long. 
    Do not include any follow-up questions or explanations.

    For example, a joke for the word "yogurt" would look like this:
    'Why did the yogurt go to the art gallery? Because it wanted to be cultured!'

    Here are a few further examples: 
    - 'Why did the bicycle fall over? Because it was two-tired!'
    - 'Why don't scientists trust atoms? Because they make up everything!'

    
    Be as close as possible to the example jokes!
    Respond with only the code to print out the {num_examples*multiply_by} short joke(s). Do not include any introductory phrases.

    """

    prompts = []
    
    for r in range(runs):

        for word in word_list:
            user_message = "Question: " + f'Now, tell me {num_examples*multiply_by} short joke(s) for the word: "{word}"\n' + " Answer:"
            
            messages = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ]
            prompt = tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
            )

            prompts.append(prompt)
    
    
    return prompts

# vLLM

## Creating the Model

In [10]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.bos_token

tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [11]:
# Create an LLM.
llm = LLM(model=model_name,
          tensor_parallel_size=2, 
          dtype='bfloat16',
          enable_chunked_prefill=True,
          max_model_len=2048,
          gpu_memory_utilization=0.9,
          )

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

INFO 07-31 19:28:23 config.py:715] Defaulting to use mp for distributed inference
INFO 07-31 19:28:23 config.py:806] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 07-31 19:28:23 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='meta-llama/CodeLlama-13b-Instruct-hf', speculative_config=None, tokenizer='meta-llama/CodeLlama-13b-Instruct-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Cod

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO 07-31 19:28:23 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager
[1;36m(VllmWorkerProcess pid=34477)[0;0m INFO 07-31 19:28:23 multiproc_worker_utils.py:215] Worker ready; awaiting tasks
[1;36m(VllmWorkerProcess pid=34477)[0;0m ERROR 07-31 19:28:23 multiproc_worker_utils.py:226] Exception in worker VllmWorkerProcess while processing method init_device: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method, Traceback (most recent call last):
[1;36m(VllmWorkerProcess pid=34477)[0;0m ERROR 07-31 19:28:23 multiproc_worker_utils.py:226]   File "/opt/conda/envs/pytorch/lib/python3.11/site-packages/vllm/executor/multiproc_worker_utils.py", line 223, in _run_worker_process
[1;36m(VllmWorkerProcess pid=34477)[0;0m ERROR 07-31 19:28:23 multiproc_worker_utils.py:226]     output = executor(*args, **kwargs)
[1;36m(VllmWorkerProcess pid=34477)[0;0m ERRO

## Testing the Model

In [10]:
def query_model_vllm(prompt_list, temperature=0.8, min_p=0.05, max_length=500):

    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=temperature, min_p=min_p, max_tokens=max_length)

    # Start Timer for Inference
    start_time = time()

    outputs = llm.generate(prompt_list, sampling_params)

    # End Timer for Inference
    end_time = time()

    ttime = end_time-start_time

    return outputs, ttime

In [11]:
runs = 150
num_prompts = len(words)
total_prompts = runs * num_prompts


total_input_tok = 0
total_output_tok = 0

print("="*10 + f" INFERENCE TEST with {model_name}" + "="*10 + 
"\n\n" + 
f"""
Starting Test with {runs} Runs and {num_prompts} Prompts / Run. \n
Total Prompts: {total_prompts}\n\n
""")

name=f"vLLM_{model_name}_4GPUs"

prompts = prepare_prompts(words, runs=runs, num_examples=5, multiply_by=1)


wandb.init(
    # set the wandb project where this run will be logged
    project="Inference_Params_Comp",

    # track hyperparameters and run metadata
    config={
    "runs": runs,
    "num_prompts": num_prompts,
    "total_prompts": total_prompts,
    "framework": 'vLLM',
    "model": model_name,
    "num_gpus": 4,
    },

    name=name,
)

tracker = EmissionsTracker(save_to_file=True, project_name=f"{name}", log_level="error", pue = 1.22, output_file=f"emissions_params.csv")
tracker.start()


outputs, ttime = query_model_vllm(prompts, max_length=100*5)

emissions: float = tracker.stop()



for output in outputs: 


    # Extracting information
    prompt = output.prompt
    generated_text = output.outputs[0].text
    input_tokens = output.prompt_token_ids
    output_tokens = output.outputs[0].token_ids
    num_input_tokens = len(input_tokens)
    num_output_tokens = len(output_tokens)

    # Updating cumulative counts
    total_input_tok += num_input_tokens
    total_output_tok += num_output_tokens


# Calculate averages
avg_time_per_prompt = (ttime / total_prompts)*1000
avg_toks_per_sec = total_output_tok/ttime
avg_input_tokens = total_input_tok / total_prompts
avg_output_tokens = total_output_tok / total_prompts

em_i = emissions/total_input_tok *1_000_000
em_o = emissions/total_output_tok *1_000_000
em_p = emissions/total_prompts *10_000

print("="*15 + f" RESULTS for {name} " + "="*15 + 
    "\n\n" + 
    f"""
    Finished {runs} Runs with {num_prompts} Prompts/Run.\n\n
    Total Time: {ttime:.2f}s, AVG/Prompt: {avg_time_per_prompt:.2f}ms\n\n
    Average tokens per second: {avg_toks_per_sec:.2f}\n\n
    Total Prompts: {total_prompts}\n
    Total Input Tokens: {total_input_tok}, AVG/Prompt: {avg_input_tokens}\n
    Total Output Tokens: {total_output_tok}, AVG/Prompt: {avg_output_tokens}\n
    """ + 
    
    "-"*50 + "\n" +
    
    f"""
    Total Inference Emissions: {emissions:.3f}kg CO₂eq\n\n
    Emissions / 1.000.000 Input Tokens: {em_i:.3f}kg CO₂eq\n
    Emissions / 1.000.000 Output Tokens: {em_o:.3f}kg CO₂eq\n
    Emissions / 10.000 Prompts: {em_p:.3f}kg CO₂eq\n

    """
    )

wandb.log({"Total Time": ttime,
    "AVG. Time / Prompt": avg_time_per_prompt,
            "AVG. Tokens / Second": avg_toks_per_sec,
            "AVG. Input Tokens": avg_input_tokens,
            "AVG. Output Tokens": avg_output_tokens,
            "Total Emissions": emissions,
            "Emissions / 1.000.000 Input Tokens": em_i,
            "Emissions / 1.000.000 Output Tokens": em_o,
            "Emissions / 10.000 Prompts": em_p,
            })

wandb.finish()

# Save results to a CSV file
results = [
    ["Runs", runs],
    ["Prompts / Run", num_prompts],
    ["Total Prompts", total_prompts],
    ["Total Time", ttime], 
    ["AVG. Time / Prompt", avg_time_per_prompt],
    ["AVG. Tokens / Second", avg_toks_per_sec],
    ["Total Input Tokens", total_input_tok],
    ["AVG. Input Tokens / Prompt", avg_input_tokens],
    ["Total Output Tokens", total_output_tok],
    ["AVG. Output Tokens / Prompt", avg_output_tokens],
    ["Total Emissions", emissions],
    ["Emissions / 1.000.000 Input Tokens", em_i],
    ["Emissions / 1.000.000 Output Tokens", em_o],
    ["Emissions / 10.000 Prompts", em_p]
]

# Ensure the directory exists
output_file_path = f"emission_data/{name}_emission_data.csv"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

with open(output_file_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Metric", "Value"])
    writer.writerows(results)

print(f"Results saved to {output_file_path}\n\n")



Starting Test with 150 Runs and 50 Prompts / Run. 

Total Prompts: 7500





Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdaniel-wetzel[0m ([33mllm-emissions[0m). Use [1m`wandb login --relogin`[0m to force relogin


Processed prompts: 100%|██████████| 7500/7500 [15:34<00:00,  8.02it/s, est. speed input: 2053.99 toks/s, output: 1059.52 toks/s]




    Finished 150 Runs with 50 Prompts/Run.


    Total Time: 936.63s, AVG/Prompt: 124.88ms


    Average tokens per second: 1057.41


    Total Prompts: 7500

    Total Input Tokens: 1920000, AVG/Prompt: 256.0

    Total Output Tokens: 990403, AVG/Prompt: 132.05373333333333

    --------------------------------------------------

    Total Inference Emissions: 0.076kg CO₂eq


    Emissions / 1.000.000 Input Tokens: 0.039kg CO₂eq

    Emissions / 1.000.000 Output Tokens: 0.076kg CO₂eq

    Emissions / 10.000 Prompts: 0.101kg CO₂eq


    


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AVG. Input Tokens,▁
AVG. Output Tokens,▁
AVG. Time / Prompt,▁
AVG. Tokens / Second,▁
Emissions / 1.000.000 Input Tokens,▁
Emissions / 1.000.000 Output Tokens,▁
Emissions / 10.000 Prompts,▁
Total Emissions,▁
Total Time,▁

0,1
AVG. Input Tokens,256.0
AVG. Output Tokens,132.05373
AVG. Time / Prompt,124.884
AVG. Tokens / Second,1057.41115
Emissions / 1.000.000 Input Tokens,0.0394
Emissions / 1.000.000 Output Tokens,0.07638
Emissions / 10.000 Prompts,0.10086
Total Emissions,0.07564
Total Time,936.62999


Results saved to emission_data/vLLM_meta-llama/CodeLlama-7b-Instruct-hf_4GPUs_emission_data.csv




# Note: Idle Performance

- In idle each L4 GPU needs about 27W to store its maximum capacity in VRAM. 
- In full idle with empty VRAM each L4 needs about 16W