In [15]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [16]:
# One Metric/Value summary CSV per inference engine, keyed by engine name
csv_files_output = dict(
    vllm='emission_data/vllm_meta-llama/Meta-Llama-3.1-8B-Instruct_emission_data.csv',
    transformers='emission_data/transformers_meta-llama/Meta-Llama-3.1-8B-Instruct_emission_data.csv',
)

# Load the emissions measurements (consumed row-by-row in a later cell)
emissions_data = pd.read_csv('emissions_vllm_transformers_llama3_1.csv')

In [17]:
# Accumulators filled by the extraction cells below; every name gets its own
# fresh list (each `[]` literal is a distinct object, nothing is aliased).

# Timing / throughput
total_time, time_per_prompt, tok_per_sec = [], [], []

# Run identity
engine, parameters, num_prompts = [], [], []

# Emissions and energy breakdown
total_emissions, cpu_energy, gpu_energy, ram_energy, total_energy = [], [], [], [], []

# Token counts
total_output_tokens, total_input_tokens = [], []
avg_input_tokens, avg_output_tokens = [], []

In [18]:
# Pull the run-level metrics for every engine out of its summary CSV
def _metric_value(frame, metric_name):
    """Return the first 'Value' entry of ``frame`` whose 'Metric' equals ``metric_name``."""
    return frame.loc[frame['Metric'] == metric_name, 'Value'].values[0]


for model, file in csv_files_output.items():
    data = pd.read_csv(file)

    # Resolve every metric before touching the accumulator lists
    run_time = _metric_value(data, 'Total Time')
    prompt_time = _metric_value(data, 'AVG. Time / Prompt') / 1000  # source value is in ms
    throughput = _metric_value(data, 'AVG. Tokens / Second')
    prompt_count = _metric_value(data, 'Total Prompts')
    out_tok_total = _metric_value(data, 'Total Output Tokens')
    in_tok_total = _metric_value(data, 'Total Input Tokens')
    in_tok_avg = _metric_value(data, 'AVG. Input Tokens / Prompt')
    out_tok_avg = _metric_value(data, 'AVG. Output Tokens / Prompt')

    # Record the values, coercing to plain Python numbers
    total_time.append(float(run_time))
    time_per_prompt.append(float(prompt_time))
    tok_per_sec.append(float(throughput))
    parameters.append(8)  # presumably the model size in billions (8B model) -- confirm
    num_prompts.append(int(prompt_count))
    engine.append(model)
    total_output_tokens.append(float(out_tok_total))
    total_input_tokens.append(float(in_tok_total))
    avg_input_tokens.append(float(in_tok_avg))
    avg_output_tokens.append(float(out_tok_avg))

In [19]:
# Collect emissions / energy figures for each engine.
# NOTE(review): rows of emissions_data are matched to engines purely by
# position (row i <-> i-th key of csv_files_output); no filtering by engine
# happens here -- confirm the CSV row order matches.
for idx, model in enumerate(csv_files_output):
    model_emissions = emissions_data
    for column, sink in (
        ('emissions', total_emissions),
        ('cpu_energy', cpu_energy),
        ('gpu_energy', gpu_energy),
        ('ram_energy', ram_energy),
        ('energy_consumed', total_energy),
    ):
        sink.append(model_emissions[column].values[idx])


In [20]:
# Quick look at the collected emissions / energy values, one list per line
for collected in (total_emissions, cpu_energy, gpu_energy, ram_energy, total_energy):
    print(collected)

[0.0480402278404932, 0.1929542349933592]
[0.0092630084421402, 0.0429047904024687]
[0.048142913980967, 0.1785292698233014]
[0.0148406416413424, 0.0687452567050357]
[0.0722465640644497, 0.2901793169308061]


In [21]:
# Turn every accumulator list into a NumPy array so the cells below can use
# element-wise arithmetic on them.

# Timing / throughput
total_time, time_per_prompt, tok_per_sec = (
    np.array(seq) for seq in (total_time, time_per_prompt, tok_per_sec)
)

# Run identity
parameters, num_prompts, engine = (
    np.array(seq) for seq in (parameters, num_prompts, engine)
)

# Token counts
total_output_tokens, total_input_tokens, avg_input_tokens, avg_output_tokens = (
    np.array(seq)
    for seq in (total_output_tokens, total_input_tokens, avg_input_tokens, avg_output_tokens)
)

# Emissions and energy breakdown
total_emissions, cpu_energy, gpu_energy, ram_energy, total_energy = (
    np.array(seq)
    for seq in (total_emissions, cpu_energy, gpu_energy, ram_energy, total_energy)
)

In [22]:
# Sanity check: 'Total Time' value per engine, in csv_files_output order
print(total_time)

[ 643.117311   2978.90883017]


In [23]:
# Idle draw of the GPUs: 28 W per GPU across 4 GPUs
watts_per_idle_gpu = 28
gpu_count = 4
idle_gpu_power = watts_per_idle_gpu * gpu_count

# Energy the GPUs would have used idling for the whole run (W -> kW, s -> h)
idle_kilowatts = idle_gpu_power / 1000
run_hours = total_time / 3600
total_idle_gpu_energy = idle_kilowatts * run_hours

# NOTE: despite "per_thousand" in the variable names, the values are scaled
# to 10,000 prompts (matching the *_per_10k_prompts columns built later).
idle_gpu_energy_per_thousand_prompts = total_idle_gpu_energy / num_prompts * 10_000

print(total_idle_gpu_energy)
print(idle_gpu_energy_per_thousand_prompts)

# Measured GPU energy with the idle baseline subtracted
gpu_energy_without_idle = gpu_energy - total_idle_gpu_energy
gpu_energy_without_idle_per_thousand_prompts = gpu_energy_without_idle / num_prompts * 10_000

print(gpu_energy)
print(gpu_energy_without_idle_per_thousand_prompts)

[0.02000809 0.09267716]
[0.02667746 0.12356955]
[0.04814291 0.17852927]
[0.03751309 0.11446947]


In [24]:
# Normalise every emissions / energy figure to a 10,000-prompt basis
def _per_10k_prompts(values):
    """Scale a per-run quantity to 10,000 prompts using the global ``num_prompts``."""
    return values / num_prompts * 10_000


# The variable name says "thousand", but all entries are per 10,000 prompts
emissions_per_thousand_prompts = {
    'Total Emissions': _per_10k_prompts(total_emissions),
    'CPU Energy': _per_10k_prompts(cpu_energy),
    'GPU Energy': _per_10k_prompts(gpu_energy),
    # The idle / non-idle split was already scaled in the previous cell
    'GPU Energy (without idle)': gpu_energy_without_idle_per_thousand_prompts,
    'GPU Energy (idle)': idle_gpu_energy_per_thousand_prompts,
    'RAM Energy': _per_10k_prompts(ram_energy),
    'Total Energy': _per_10k_prompts(total_energy),
}

In [25]:
# Compare total GPU energy with its idle / non-idle split (all per 10,000 prompts).
# The first line prints the *total* GPU energy, so it must not carry the
# "Idle" label used by the second line.
print(f"GPU Energy per 10.000 prompts: {emissions_per_thousand_prompts['GPU Energy']}")
print(f"Idle GPU Energy per 10.000 prompts: {idle_gpu_energy_per_thousand_prompts}")
print(f"GPU Energy without idle per 10.000 prompts: {gpu_energy_without_idle_per_thousand_prompts}")

Idle GPU Energy per 10.000 prompts: [0.06419055 0.23803903]
Idle GPU Energy per 10.000 prompts: [0.02667746 0.12356955]
GPU Energy without idle per 10.000 prompts: [0.03751309 0.11446947]


In [26]:
# Dump the full per-10,000-prompt dictionary (one array per quantity, one entry per engine)
print(emissions_per_thousand_prompts)

{'Total Emissions': array([0.06405364, 0.25727231]), 'CPU Energy': array([0.01235068, 0.05720639]), 'GPU Energy': array([0.06419055, 0.23803903]), 'GPU Energy (without idle)': array([0.03751309, 0.11446947]), 'GPU Energy (idle)': array([0.02667746, 0.12356955]), 'RAM Energy': array([0.01978752, 0.09166034]), 'Total Energy': array([0.09632875, 0.38690576])}


In [27]:
# Create the dataframe: one row per engine, combining run metadata with the
# per-10,000-prompt emissions / energy figures.
df = pd.DataFrame({
    # Run identity
    'engine': engine,
    # Same model label for every row; sized from `engine` so the frame still
    # builds when engines are added to or removed from csv_files_output.
    'model_type': ['Llama-3-8B'] * len(engine),
    'parameters': parameters,
    'num_prompts': num_prompts,
    # Timing / throughput
    'total_time': total_time,
    'time_per_prompt': time_per_prompt,
    'tok_per_sec': tok_per_sec,
    # Token counts
    'total_out_tok': total_output_tokens,
    'total_in_tok': total_input_tokens,
    'avg_out_tok': avg_output_tokens,
    'avg_in_tok': avg_input_tokens,
    # Emissions / energy, all scaled to 10,000 prompts
    'actual_emissions_per_10k_prompts': emissions_per_thousand_prompts['Total Emissions'],
    'actual_total_energy_per_10k_prompts': emissions_per_thousand_prompts['Total Energy'],
    'actual_cpu_energy_per_10k_prompts': emissions_per_thousand_prompts['CPU Energy'],
    'actual_gpu_energy_per_10k_prompts': emissions_per_thousand_prompts['GPU Energy'],
    'actual_ram_energy_per_10k_prompts': emissions_per_thousand_prompts['RAM Energy'],
    'actual_idle_gpu_energy_per_10k_prompts': emissions_per_thousand_prompts['GPU Energy (idle)'],
    'actual_non_idle_gpu_energy_per_10k_prompts': emissions_per_thousand_prompts['GPU Energy (without idle)'],
})

df

Unnamed: 0,engine,model_type,parameters,num_prompts,total_time,time_per_prompt,tok_per_sec,total_out_tok,total_in_tok,avg_out_tok,avg_in_tok,actual_emissions_per_10k_prompts,actual_total_energy_per_10k_prompts,actual_cpu_energy_per_10k_prompts,actual_gpu_energy_per_10k_prompts,actual_ram_energy_per_10k_prompts,actual_idle_gpu_energy_per_10k_prompts,actual_non_idle_gpu_energy_per_10k_prompts
0,vllm,Llama-3-8B,8,7500,643.117311,0.085749,1009.821986,649434.0,1549950.0,86.5912,206.66,0.064054,0.096329,0.012351,0.064191,0.019788,0.026677,0.037513
1,transformers,Llama-3-8B,8,7500,2978.90883,0.397188,219.681111,654410.0,1549950.0,87.254667,206.66,0.257272,0.386906,0.057206,0.238039,0.09166,0.12357,0.114469


In [28]:
# Store the results in a CSV file.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first; otherwise the export fails on a fresh checkout.
results_csv = Path('results/data/transformers_vs_vllm_llama3_1.csv')
results_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_csv, index=False)