In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [2]:

import torch
from transformers import  AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

from codecarbon import EmissionsTracker
from time import time
import csv
from vllm import LLM, SamplingParams
from openai import OpenAI

import wandb
import gc

In [3]:
# Environment variables for the Weights & Biases integration; set here so they
# are in place before wandb.init() is called in the benchmark cell.
# NOTE(review): these two settings look like they target the Hugging Face
# Trainer callback; no training happens in this notebook, so they may have no
# effect here — confirm.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


In [4]:
# Show the GPUs visible to PyTorch. PyTorch stores the number of GPUs once CUDA
# is first initialised, so this cell has to be executed right after kernel
# start-up.
# `num_gpus` is always defined (0 when CUDA is unavailable) and the no-GPU case
# is reported explicitly instead of passing silently.
num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
if num_gpus:
    print(f"Found {num_gpus} GPUs")
else:
    print("No CUDA-capable GPU found")


Found 1 GPUs


In [5]:
# Hugging Face model id of the checkpoint to benchmark. The same id is used to
# load the tokenizer, to create the vLLM engine and to build the run name.
# The commented-out lines are the other model sizes; swap the active line to
# benchmark one of them.
model_name = "meta-llama/CodeLlama-7b-Instruct-hf"
#model_name = "meta-llama/CodeLlama-13b-Instruct-hf"
#model_name = "meta-llama/CodeLlama-34b-Instruct-hf"
#model_name = "meta-llama/CodeLlama-70b-Instruct-hf"

In [6]:
class bcolors:
    """Named ANSI escape sequences for styling text printed to a terminal.

    Each attribute is a plain string; prepend it to the text to style and
    append ``ENDC`` afterwards.
    """

    # General-purpose styles
    HEADER = '\x1b[95m'
    OKBLUE = '\x1b[94m'
    OKCYAN = '\x1b[96m'
    OKGREEN = '\x1b[92m'
    WARNING = '\x1b[93m'
    FAIL = '\x1b[91m'
    ENDC = '\x1b[0m'
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

    # Background variants (codes 40-47)
    CBLACKBG = '\x1b[40m'
    CREDBG = '\x1b[41m'
    CGREENBG = '\x1b[42m'
    CYELLOWBG = '\x1b[43m'
    CBLUEBG = '\x1b[44m'
    CVIOLETBG = '\x1b[45m'
    CBEIGEBG = '\x1b[46m'
    CWHITEBG = '\x1b[47m'

    # Foreground variants (codes 30-37)
    CBLACK = '\x1b[30m'
    CRED = '\x1b[31m'
    CGREEN = '\x1b[32m'
    CYELLOW = '\x1b[33m'
    CBLUE = '\x1b[34m'
    CVIOLET = '\x1b[35m'
    CBEIGE = '\x1b[36m'
    CWHITE = '\x1b[37m'

# Preparing the Test Data

In [7]:
# Topic words for the benchmark: prepare_prompts() builds one joke prompt per
# entry (and repeats the whole list once per run).

words = [
    "apple", "book", "car", "dog", "elephant",
    "forest", "guitar", "house", "island", "jacket",
    "kangaroo", "lamp", "mountain", "notebook", "ocean",
    "pencil", "queen", "river", "star", "tree",
    "umbrella", "village", "window", "xylophone", "yacht",
    "zebra", "balloon", "camera", "desert", "engine",
    "flower", "garden", "honey", "iceberg", "jungle",
    "kite", "ladder", "moon", "nest", "octopus",
    "pirate", "quilt", "robot", "swan", "telescope",
    "unicorn", "violin", "whale", "x-ray", "laptop",
]

In [8]:
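# List of example jokes intended for the system prompt.
# NOTE: prepare_prompts() currently hard-codes its own examples in the system
# message and does not read this list.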

example_sentences = [
    "Why did the bicycle fall over? Because it was two-tired!",
    "Why don't scientists trust atoms? Because they make up everything!",
    "Why did the scarecrow win an award? Because he was outstanding in his field!",
    "Why don't skeletons fight each other? They don't have the guts!",
    "Why did the computer go to the doctor? Because it had a virus!",
    "Why was the math book sad? Because it had too many problems!",
    "Why did the coffee file a police report? It got mugged!",
    "Why did the tomato turn red? Because it saw the salad dressing!",
    "Why don't eggs tell jokes? They might crack up!",
    "Why did the golfer bring two pairs of pants? In case he got a hole in one!",
    "Why do cows wear bells? Because their horns don't work!",
    "Why don't some couples go to the gym? Because some relationships don't work out!",
    "Why did the photo go to jail? It was framed!",
    "Why don't programmers like nature? It has too many bugs!",
    "Why did the bicycle stand up by itself? It was two-tired!",
    "Why did the music teacher need a ladder? To reach the high notes!",
    "Why did the cookie go to the doctor? Because it felt crummy!",
    "Why did the student eat his homework? Because his teacher told him it was a piece of cake!",
    "Why don't oysters donate to charity? Because they are shellfish!",
    "Why did the broom get a promotion? Because it swept the competition!",
    "Why don't we see elephants hiding in trees? Because they are so good at it!",
    "Why did the fish blush? Because it saw the ocean's bottom!",
    "Why did the barber win the race? Because he knew all the shortcuts!",
    "Why did the banana go to the doctor? Because it wasn't peeling well!",
    "Why don't dinosaurs talk? Because they are extinct!",
    "Why did the clock go back to school? To learn about time management!",
    "Why did the farmer win an award? Because he was outstanding in his field!",
    "Why did the astronaut break up with his girlfriend? He needed space!",
    "Why did the shoe go to the party alone? Because it didn't want to be a pair!",
    "Why did the tree go to the dentist? To get its roots checked!",
    "Why did the calendar go on a diet? It wanted to lose some days!",
    "Why did the stadium get hot? All the fans left!",
    "Why did the belt go to jail? It held up a pair of pants!",
    "Why did the cookie cry? Because its mom was a wafer too long!",
    "Why did the cow jump over the moon? To get to the milky way!",
    "Why did the skeleton go to the party alone? He had no body to go with!",
    "Why did the grape stop in the middle of the road? Because it ran out of juice!",
    "Why did the bee get married? Because he found his honey!",
    "Why did the soccer ball quit the team? It was tired of being kicked around!",
    "Why did the traffic light turn red? You would too if you had to change in the middle of the street!",
    "Why did the scarecrow become a successful neurosurgeon? He was outstanding in his field!",
    "Why did the chicken cross the playground? To get to the other slide!",
    "Why did the belt get a promotion? It was a cinch!",
    "Why did the teacher wear sunglasses? Because her students were so bright!",
    "Why did the clock get kicked out of class? It was tocking too much!",
    "Why did the frog call his insurance company? He had a jump in his car!",
    "Why did the keyboard get a speeding ticket? It had a problem with the space bar!",
    "Why did the painting go to art school? It wanted to brush up on its skills!",
    "Why did the scientist install a knocker on his door? He wanted to win the No-bell prize!",
    "Why did the tomato turn green? Because it was embarrassed to ketchup!",
    "Why did the pencil go to jail? It was caught in a sketchy situation!",
    "Why did the music note need a loan? It needed some major funding!",
    "Why did the fisherman put peanut butter into the sea? To go with the jellyfish!",
    "Why did the bicycle bring a map? Because it didn't want to get lost on its wheel-y big adventure!",
    "Why did the pizza go to the party? It wanted to slice up the dance floor!",
    "Why did the phone sit on a bench? It wanted to recharge its batteries!",
    "Why did the blanket get arrested? It was covering up a crime!",
    "Why did the chef go to jail? Because he beat the eggs and whipped the cream!",
    "Why did the sandwich go to the beach? To get a little bologna-sun!",
    "Why did the banker switch careers? He lost interest!",
    "Why did the shovel go to therapy? It had too much dirt on its mind!",
    "Why did the soccer player bring string to the game? So he could tie the score!",
    "Why did the alarm clock break up with the pillow? It couldn't handle the pressure!",
    "Why did the frog take the bus to work? His car got toad away!",
    "Why did the dentist become a gardener? He wanted to brush up on his roots!",
    "Why did the light bulb fail his test? He wasn't too bright!",
    "Why did the ocean break up with the shore? It needed some space to tide things over!",
    "Why did the shoe store close down? It lost its sole!",
    "Why did the banana go out with the prune? Because it couldn't find a date!",
    "Why did the carpenter become a musician? He wanted to nail every note!"
]


In [9]:
def prepare_prompts(word_list, runs=1, num_examples=1, multiply_by=1, chat_tokenizer=None):
    """Build chat-formatted joke prompts, one per word and run.

    Args:
        word_list: Words to ask jokes about; one prompt is built per word.
        runs: How many times the full set of word prompts is repeated.
        num_examples: Only used as a factor; ``num_examples * multiply_by`` is
            the number of jokes requested in the system and user messages.
        multiply_by: Second factor of the requested joke count (see above).
        chat_tokenizer: Object providing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``. When ``None`` (the
            default) the notebook-level ``tokenizer`` is used, so existing
            calls behave as before.

    Returns:
        A list with ``runs * len(word_list)`` prompt strings, ordered run by
        run and, within each run, in ``word_list`` order.
    """
    joke_count = num_examples * multiply_by

    system_message = f"""
    You are an AI assistant designed to write python code that prints out a number of short jokes.
    Generate python code that prints out {joke_count} short joke(s) that are exactly 2 short sentences long. 
    Do not include any follow-up questions or explanations.

    For example, a joke for the word "yogurt" would look like this:
    'Why did the yogurt go to the art gallery? Because it wanted to be cultured!'

    Here are a few further examples: 
    - 'Why did the bicycle fall over? Because it was two-tired!'
    - 'Why don't scientists trust atoms? Because they make up everything!'

    
    Be as close as possible to the example jokes!
    Respond with only the code to print out the {joke_count} short joke(s). Do not include any introductory phrases.

    """

    # Nothing to build (and no tokenizer needed) when no run is requested.
    if runs <= 0:
        return []

    # Resolved at call time, so the cell that defines the global `tokenizer`
    # may still run after the cell defining this function.
    active_tokenizer = tokenizer if chat_tokenizer is None else chat_tokenizer

    # The prompt text depends only on the word, not on the run, so the chat
    # template is rendered once per word and the result is repeated per run.
    single_run_prompts = []
    for word in word_list:
        user_message = "Question: " + f'Now, tell me {joke_count} short joke(s) for the word: "{word}"\n' + " Answer:"

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        single_run_prompts.append(
            active_tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        )

    return single_run_prompts * runs

# vLLM

## Creating the Model

In [10]:
# Load the tokenizer that belongs to the selected checkpoint. prepare_prompts()
# uses this notebook-level object to render chat messages into prompt strings.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the BOS token as the padding token.
# NOTE(review): presumably because the tokenizer ships without a pad token —
# confirm; the vLLM path below only receives ready-made prompt strings.
tokenizer.pad_token = tokenizer.bos_token

tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [11]:
# Create the vLLM engine for the selected checkpoint. query_model_vllm() uses
# this notebook-level `llm` object for generation.
#  - tensor_parallel_size=1: matches the "1GPUs" label used for the benchmark run.
#  - dtype='bfloat16': the engine log reports dtype=torch.bfloat16.
#  - enable_chunked_prefill=True: the engine log reports chunked prefill with
#    max_num_batched_tokens=512.
#  - max_model_len=2048: the engine log reports max_seq_len=2048.
#  - gpu_memory_utilization=0.9: NOTE(review): per vLLM docs this is the
#    fraction of GPU memory the engine may use — lower it on out-of-memory.
llm = LLM(model=model_name,
          tensor_parallel_size=1, 
          dtype='bfloat16',
          enable_chunked_prefill=True,
          max_model_len=2048,
          gpu_memory_utilization=0.9,
          )

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

INFO 07-31 20:10:18 config.py:806] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 07-31 20:10:18 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='meta-llama/CodeLlama-7b-Instruct-hf', speculative_config=None, tokenizer='meta-llama/CodeLlama-7b-Instruct-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/CodeLlama-7b-Instruct-hf, use_v2_block_manager=False, enable_prefix_caching=False)


generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO 07-31 20:10:19 model_runner.py:680] Starting to load model meta-llama/CodeLlama-7b-Instruct-hf...
INFO 07-31 20:10:19 weight_utils.py:223] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 07-31 20:10:44 model_runner.py:692] Loading model weights took 12.5562 GB
INFO 07-31 20:10:45 gpu_executor.py:102] # GPU blocks: 878, # CPU blocks: 512
INFO 07-31 20:10:48 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-31 20:10:48 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-31 20:11:03 model_runner.py:1181] Graph capturing finished in 16 secs.


## Testing the Model

In [12]:
def query_model_vllm(prompt_list, temperature=0.8, min_p=0.05, max_length=500):
    """Generate completions for all prompts with the notebook-level ``llm`` and time the call.

    Args:
        prompt_list: Prompt strings passed to ``llm.generate`` in one call.
        temperature: Sampling temperature forwarded to ``SamplingParams``.
        min_p: ``min_p`` value forwarded to ``SamplingParams``.
        max_length: Forwarded to ``SamplingParams`` as ``max_tokens``.

    Returns:
        Tuple ``(outputs, ttime)``: whatever ``llm.generate`` returned and the
        elapsed time of that call in seconds.
    """
    # Local import: the notebook's import cell only provides `time.time`.
    # perf_counter() is monotonic, so a system clock adjustment during a long
    # benchmark cannot distort (or even turn negative) the measured duration.
    from time import perf_counter

    sampling_params = SamplingParams(temperature=temperature, min_p=min_p, max_tokens=max_length)

    # Only the generate() call itself is timed.
    started_at = perf_counter()
    outputs = llm.generate(prompt_list, sampling_params)
    ttime = perf_counter() - started_at

    return outputs, ttime

In [13]:
# Benchmark cell: build `runs` x len(words) prompts, generate them with vLLM
# while CodeCarbon tracks emissions, then report the metrics to stdout,
# Weights & Biases and a CSV file.
runs = 150
num_prompts = len(words)
total_prompts = runs * num_prompts

# GPU count used to label this run (run name and W&B config).
# NOTE(review): keep in sync with tensor_parallel_size of the LLM(...) engine.
run_gpu_count = 1

print("="*10 + f" INFERENCE TEST with {model_name}" + "="*10 + 
"\n\n" + 
f"""
Starting Test with {runs} Runs and {num_prompts} Prompts / Run. \n
Total Prompts: {total_prompts}\n\n
""")

name = f"vLLM_{model_name}_{run_gpu_count}GPUs"

prompts = prepare_prompts(words, runs=runs, num_examples=5, multiply_by=1)


wandb.init(
    # set the wandb project where this run will be logged
    project="Inference_Params_Comp",

    # track hyperparameters and run metadata
    config={
        "runs": runs,
        "num_prompts": num_prompts,
        "total_prompts": total_prompts,
        "framework": 'vLLM',
        "model": model_name,
        "num_gpus": run_gpu_count,
    },

    name=name,
)

tracker = EmissionsTracker(save_to_file=True, project_name=name, log_level="error", pue=1.22, output_file="emissions_params.csv")
tracker.start()

try:
    # NOTE(review): 100*5 looks like 100 tokens per joke x 5 requested jokes
    # (num_examples=5 above) — confirm.
    outputs, ttime = query_model_vllm(prompts, max_length=100*5)
except BaseException:
    # Do not leave the tracker running or the W&B run open when inference
    # fails or is interrupted; mark the run as failed and re-raise.
    tracker.stop()
    wandb.finish(exit_code=1)
    raise

emissions = tracker.stop()
if emissions is None:
    # CodeCarbon could not produce a value. Keep the timing/throughput results
    # of the (long) run and report the emission metrics as NaN instead of
    # crashing on the divisions below.
    print("WARNING: CodeCarbon returned no emissions value; emission metrics are reported as NaN.")
    emissions = float("nan")

# Token totals over all requests (first completion of each request).
total_input_tok = sum(len(output.prompt_token_ids) for output in outputs)
total_output_tok = sum(len(output.outputs[0].token_ids) for output in outputs)

# Calculate averages
avg_time_per_prompt = (ttime / total_prompts)*1000  # milliseconds per prompt
avg_toks_per_sec = total_output_tok/ttime
avg_input_tokens = total_input_tok / total_prompts
avg_output_tokens = total_output_tok / total_prompts

# Emissions normalised per 1,000,000 tokens and per 10,000 prompts.
em_i = emissions/total_input_tok *1_000_000
em_o = emissions/total_output_tok *1_000_000
em_p = emissions/total_prompts *10_000

print("="*15 + f" RESULTS for {name} " + "="*15 + 
    "\n\n" + 
    f"""
    Finished {runs} Runs with {num_prompts} Prompts/Run.\n\n
    Total Time: {ttime:.2f}s, AVG/Prompt: {avg_time_per_prompt:.2f}ms\n\n
    Average tokens per second: {avg_toks_per_sec:.2f}\n\n
    Total Prompts: {total_prompts}\n
    Total Input Tokens: {total_input_tok}, AVG/Prompt: {avg_input_tokens}\n
    Total Output Tokens: {total_output_tok}, AVG/Prompt: {avg_output_tokens}\n
    """ + 
    
    "-"*50 + "\n" +
    
    f"""
    Total Inference Emissions: {emissions:.3f}kg CO₂eq\n\n
    Emissions / 1.000.000 Input Tokens: {em_i:.3f}kg CO₂eq\n
    Emissions / 1.000.000 Output Tokens: {em_o:.3f}kg CO₂eq\n
    Emissions / 10.000 Prompts: {em_p:.3f}kg CO₂eq\n

    """
    )

wandb.log({"Total Time": ttime,
           "AVG. Time / Prompt": avg_time_per_prompt,
           "AVG. Tokens / Second": avg_toks_per_sec,
           "AVG. Input Tokens": avg_input_tokens,
           "AVG. Output Tokens": avg_output_tokens,
           "Total Emissions": emissions,
           "Emissions / 1.000.000 Input Tokens": em_i,
           "Emissions / 1.000.000 Output Tokens": em_o,
           "Emissions / 10.000 Prompts": em_p,
           })

wandb.finish()

# Save results to a CSV file
results = [
    ["Runs", runs],
    ["Prompts / Run", num_prompts],
    ["Total Prompts", total_prompts],
    ["Total Time", ttime], 
    ["AVG. Time / Prompt", avg_time_per_prompt],
    ["AVG. Tokens / Second", avg_toks_per_sec],
    ["Total Input Tokens", total_input_tok],
    ["AVG. Input Tokens / Prompt", avg_input_tokens],
    ["Total Output Tokens", total_output_tok],
    ["AVG. Output Tokens / Prompt", avg_output_tokens],
    ["Total Emissions", emissions],
    ["Emissions / 1.000.000 Input Tokens", em_i],
    ["Emissions / 1.000.000 Output Tokens", em_o],
    ["Emissions / 10.000 Prompts", em_p]
]

# Ensure the directory exists
# NOTE(review): `name` contains the "/" of the model id, so the file ends up in
# a nested directory (emission_data/vLLM_<org>/<model>_..._emission_data.csv).
# Left unchanged so existing result paths stay valid — confirm this is intended.
output_file_path = f"emission_data/{name}_emission_data.csv"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

with open(output_file_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Metric", "Value"])
    writer.writerows(results)

print(f"Results saved to {output_file_path}\n\n")



Starting Test with 150 Runs and 50 Prompts / Run. 

Total Prompts: 7500





Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdaniel-wetzel[0m ([33mllm-emissions[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112712800000837, max=1.0…

Processed prompts:   0%|          | 0/7500 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:   2%|▏         | 168/7500 [01:09<1:03:33,  1.92it/s, est. speed input: 621.97 toks/s, output: 318.92 toks/s]



Processed prompts:   6%|▌         | 419/7500 [02:42<43:38,  2.70it/s, est. speed input: 659.45 toks/s, output: 338.42 toks/s]  



Processed prompts:   8%|▊         | 637/7500 [04:11<1:31:46,  1.25it/s, est. speed input: 647.41 toks/s, output: 333.09 toks/s]



Processed prompts:  12%|█▏        | 881/7500 [05:40<30:28,  3.62it/s, est. speed input: 661.83 toks/s, output: 341.08 toks/s]  



Processed prompts:  15%|█▍        | 1093/7500 [07:02<1:33:19,  1.14it/s, est. speed input: 661.67 toks/s, output: 341.06 toks/s]



Processed prompts:  17%|█▋        | 1304/7500 [08:24<2:00:31,  1.17s/it, est. speed input: 662.23 toks/s, output: 340.96 toks/s]



Processed prompts:  21%|██        | 1551/7500 [09:56<24:19,  4.08it/s, est. speed input: 665.37 toks/s, output: 342.24 toks/s]  



Processed prompts:  23%|██▎       | 1761/7500 [11:19<2:16:47,  1.43s/it, est. speed input: 663.14 toks/s, output: 341.38 toks/s]



Processed prompts:  27%|██▋       | 2013/7500 [12:53<1:01:24,  1.49it/s, est. speed input: 666.49 toks/s, output: 342.84 toks/s]



Processed prompts:  30%|██▉       | 2221/7500 [14:12<55:54,  1.57it/s, est. speed input: 666.94 toks/s, output: 343.00 toks/s]  



Processed prompts:  32%|███▏      | 2431/7500 [15:30<39:03,  2.16it/s, est. speed input: 668.73 toks/s, output: 343.98 toks/s]  



Processed prompts:  36%|███▌      | 2672/7500 [17:02<14:44,  5.46it/s, est. speed input: 669.14 toks/s, output: 344.05 toks/s]  



Processed prompts:  39%|███▊      | 2891/7500 [18:23<13:41,  5.61it/s, est. speed input: 670.41 toks/s, output: 344.91 toks/s]  



Processed prompts:  41%|████▏     | 3104/7500 [19:45<31:31,  2.32it/s, est. speed input: 670.47 toks/s, output: 344.90 toks/s]  



Processed prompts:  44%|████▍     | 3315/7500 [21:06<42:00,  1.66it/s, est. speed input: 670.05 toks/s, output: 344.70 toks/s]  



Processed prompts:  47%|████▋     | 3520/7500 [22:24<48:18,  1.37it/s, est. speed input: 670.06 toks/s, output: 344.87 toks/s]  



Processed prompts:  50%|████▉     | 3731/7500 [23:43<28:45,  2.18it/s, est. speed input: 670.78 toks/s, output: 345.18 toks/s]  



Processed prompts:  53%|█████▎    | 3939/7500 [25:01<23:30,  2.52it/s, est. speed input: 671.57 toks/s, output: 345.58 toks/s]  



Processed prompts:  55%|█████▌    | 4155/7500 [26:28<34:36,  1.61it/s, est. speed input: 669.51 toks/s, output: 344.59 toks/s]  



Processed prompts:  59%|█████▊    | 4402/7500 [27:58<28:17,  1.82it/s, est. speed input: 671.27 toks/s, output: 345.60 toks/s]  



Processed prompts:  62%|██████▏   | 4654/7500 [29:32<28:03,  1.69it/s, est. speed input: 672.25 toks/s, output: 346.12 toks/s]  



Processed prompts:  65%|██████▍   | 4866/7500 [30:52<39:43,  1.11it/s, est. speed input: 672.33 toks/s, output: 346.15 toks/s]  



Processed prompts:  68%|██████▊   | 5118/7500 [32:26<16:32,  2.40it/s, est. speed input: 673.17 toks/s, output: 346.57 toks/s]  



Processed prompts:  71%|███████   | 5329/7500 [33:46<14:11,  2.55it/s, est. speed input: 673.07 toks/s, output: 346.62 toks/s]  



Processed prompts:  74%|███████▍  | 5580/7500 [35:21<21:05,  1.52it/s, est. speed input: 673.21 toks/s, output: 346.64 toks/s]



Processed prompts:  77%|███████▋  | 5789/7500 [36:41<25:13,  1.13it/s, est. speed input: 673.25 toks/s, output: 346.82 toks/s]



Processed prompts:  81%|████████  | 6038/7500 [38:13<05:21,  4.55it/s, est. speed input: 673.91 toks/s, output: 347.31 toks/s]



Processed prompts:  84%|████████▍ | 6292/7500 [39:48<05:11,  3.88it/s, est. speed input: 674.46 toks/s, output: 347.68 toks/s]



Processed prompts:  87%|████████▋ | 6508/7500 [41:11<12:11,  1.36it/s, est. speed input: 673.99 toks/s, output: 347.35 toks/s]



Processed prompts:  90%|█████████ | 6756/7500 [42:42<07:23,  1.68it/s, est. speed input: 675.02 toks/s, output: 347.92 toks/s]



Processed prompts:  93%|█████████▎| 7005/7500 [44:13<01:48,  4.56it/s, est. speed input: 675.88 toks/s, output: 348.27 toks/s]



Processed prompts:  96%|█████████▋| 7220/7500 [45:34<02:05,  2.23it/s, est. speed input: 676.01 toks/s, output: 348.30 toks/s]



Processed prompts:  99%|█████████▉| 7431/7500 [46:54<00:49,  1.39it/s, est. speed input: 675.81 toks/s, output: 348.28 toks/s]



Processed prompts: 100%|██████████| 7500/7500 [47:16<00:00,  2.64it/s, est. speed input: 676.98 toks/s, output: 348.92 toks/s]




    Finished 150 Runs with 50 Prompts/Run.


    Total Time: 2837.99s, AVG/Prompt: 378.40ms


    Average tokens per second: 348.69


    Total Prompts: 7500

    Total Input Tokens: 1920000, AVG/Prompt: 256.0

    Total Output Tokens: 989579, AVG/Prompt: 131.94386666666668

    --------------------------------------------------

    Total Inference Emissions: 0.087kg CO₂eq


    Emissions / 1.000.000 Input Tokens: 0.046kg CO₂eq

    Emissions / 1.000.000 Output Tokens: 0.088kg CO₂eq

    Emissions / 10.000 Prompts: 0.117kg CO₂eq


    


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AVG. Input Tokens,▁
AVG. Output Tokens,▁
AVG. Time / Prompt,▁
AVG. Tokens / Second,▁
Emissions / 1.000.000 Input Tokens,▁
Emissions / 1.000.000 Output Tokens,▁
Emissions / 10.000 Prompts,▁
Total Emissions,▁
Total Time,▁

0,1
AVG. Input Tokens,256.0
AVG. Output Tokens,131.94387
AVG. Time / Prompt,378.39846
AVG. Tokens / Second,348.69028
Emissions / 1.000.000 Input Tokens,0.04557
Emissions / 1.000.000 Output Tokens,0.08842
Emissions / 10.000 Prompts,0.11666
Total Emissions,0.08749
Total Time,2837.98847


Results saved to emission_data/vLLM_meta-llama/CodeLlama-7b-Instruct-hf_1GPUs_emission_data.csv




# Note: Idle Performance

- When idle with its VRAM filled to maximum capacity, each L4 GPU draws about 27 W.
- When fully idle with empty VRAM, each L4 GPU draws about 16 W.