In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.linear_model import LinearRegression

In [2]:
# Define paths to the individual CSV files
csv_files_output = {
    '1': 'outputtest-meta-llama/Meta-Llama-3-8B-Instruct-OutputTest-emissiondata-1-examples.csv',
    '5': 'outputtest-meta-llama/Meta-Llama-3-8B-Instruct-OutputTest-emissiondata-5-examples.csv',
    '10': 'outputtest-meta-llama/Meta-Llama-3-8B-Instruct-OutputTest-emissiondata-10-examples.csv',
    '30': 'outputtest-meta-llama/Meta-Llama-3-8B-Instruct-OutputTest-emissiondata-30-examples.csv',
    '90': 'outputtest-meta-llama/Meta-Llama-3-8B-Instruct-OutputTest-emissiondata-90-examples.csv',
}

csv_files_input = {
    '1': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-1-examples.csv',
    '5': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-5-examples.csv',
    '10': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-10-examples.csv',
    '20': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-20-examples.csv',
    '30': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-30-examples.csv',
    '70': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-70-examples.csv',
    '210': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-210-examples.csv',
    '350': 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-350-examples.csv',
}

csv_files_params_llama2 = {
    '7': 'test_1_meta-llama/Llama-2-7b-chat-hf-emissiondata.csv',
    '13': 'test_1_meta-llama/Llama-2-13b-chat-hf-emissiondata.csv',
    '70': 'test_1_meta-llama/Llama-2-70b-chat-hf-emissiondata.csv',    
}

csv_files_params_llama3 = {
    '8': 'test_1_meta-llama/Meta-Llama-3-8B-Instruct-emissiondata.csv',
    '70': 'test_1_meta-llama/Meta-Llama-3-70B-Instruct-emissiondata.csv',  
}

framework_comp = pd.read_csv('Inference_Framework_Comparison.csv')

# Read the emissions data
emissions_data = pd.read_csv('emissions.csv')

In [3]:
# Initialize lists to store metadata
parameters_output = []
num_examples_output = []
num_prompts_output = []
total_emissions_output = []
cpu_energy_output = []
gpu_energy_output = []
ram_energy_output = []
total_output_tokens_output = []
total_input_tokens_output = []
avg_input_tokens_output = []
avg_output_tokens_output = []

parameters_input = []
num_examples_input = []
num_prompts_input = []
total_emissions_input = []
cpu_energy_input = []
gpu_energy_input = []
ram_energy_input = []
total_output_tokens_input = []
total_input_tokens_input = []
avg_input_tokens_input = []
avg_output_tokens_input = []

parameters_llama2 = []
num_examples_llama2 = []
num_prompts_llama2 = []
total_emissions_llama2 = []
cpu_energy_llama2 = []
gpu_energy_llama2 = []
ram_energy_llama2 = []
total_output_tokens_llama2 = []
total_input_tokens_llama2 = []
avg_input_tokens_llama2 = []
avg_output_tokens_llama2 = []

parameters_llama3 = []
num_examples_llama3 = []
num_prompts_llama3 = []
total_emissions_llama3 = []
cpu_energy_llama3 = []
gpu_energy_llama3 = []
ram_energy_llama3 = []
total_output_tokens_llama3 = []
total_input_tokens_llama3 = []
avg_input_tokens_llama3 = []
avg_output_tokens_llama3 = []

In [4]:
# Read and extract metadata from each CSV file
for model, file in csv_files_output.items():
    data = pd.read_csv(file)
    prompts = data.loc[data['Metric'] == 'Total Prompts', 'Value'].values[0]
    output_tokens = data.loc[data['Metric'] == 'Total Output Tokens', 'Value'].values[0]
    input_tokens = data.loc[data['Metric'] == 'Total Input Tokens', 'Value'].values[0]
    avg_i_tok = data.loc[data['Metric'] == 'Avg Input Tokens per Prompt', 'Value'].values[0]
    avg_o_tok = data.loc[data['Metric'] == 'Avg Output Tokens per Prompt', 'Value'].values[0]
    parameters_output.append(8)
    num_examples_output.append(int(model))
    num_prompts_output.append(int(prompts))
    total_output_tokens_output.append(float(output_tokens))
    total_input_tokens_output.append(float(input_tokens))
    avg_input_tokens_output.append(float(avg_i_tok))
    avg_output_tokens_output.append(float(avg_o_tok))    

In [5]:
for model, file in csv_files_input.items():
    data = pd.read_csv(file)
    prompts = data.loc[data['Metric'] == 'Total Prompts', 'Value'].values[0]
    output_tokens = data.loc[data['Metric'] == 'Total Output Tokens', 'Value'].values[0]
    input_tokens = data.loc[data['Metric'] == 'Total Input Tokens', 'Value'].values[0]
    avg_i_tok = data.loc[data['Metric'] == 'Avg Input Tokens per Prompt', 'Value'].values[0]
    avg_o_tok = data.loc[data['Metric'] == 'Avg Output Tokens per Prompt', 'Value'].values[0]
    parameters_input.append(8)
    num_examples_input.append(int(model))
    num_prompts_input.append(int(prompts))
    total_output_tokens_input.append(float(output_tokens))
    total_input_tokens_input.append(float(input_tokens))
    avg_input_tokens_input.append(float(avg_i_tok))
    avg_output_tokens_input.append(float(avg_o_tok))

In [6]:
for model, file in csv_files_params_llama2.items():
    data = pd.read_csv(file)
    prompts = data.loc[data['Metric'] == 'Total Prompts', 'Value'].values[0]
    output_tokens = data.loc[data['Metric'] == 'Total Output Tokens', 'Value'].values[0]
    input_tokens = data.loc[data['Metric'] == 'Total Input Tokens', 'Value'].values[0]
    avg_i_tok = data.loc[data['Metric'] == 'Avg Input Tokens per Prompt', 'Value'].values[0]
    avg_o_tok = data.loc[data['Metric'] == 'Avg Output Tokens per Prompt', 'Value'].values[0]
    parameters_llama2.append(int(model))
    num_examples_llama2.append(1)
    num_prompts_llama2.append(int(prompts))
    total_output_tokens_llama2.append(float(output_tokens))
    total_input_tokens_llama2.append(float(input_tokens))
    avg_input_tokens_llama2.append(float(avg_i_tok))
    avg_output_tokens_llama2.append(float(avg_o_tok))

In [7]:
for model, file in csv_files_params_llama3.items():
    data = pd.read_csv(file)
    prompts = data.loc[data['Metric'] == 'Total Prompts', 'Value'].values[0]
    output_tokens = data.loc[data['Metric'] == 'Total Output Tokens', 'Value'].values[0]
    input_tokens = data.loc[data['Metric'] == 'Total Input Tokens', 'Value'].values[0]
    avg_i_tok = data.loc[data['Metric'] == 'Avg Input Tokens per Prompt', 'Value'].values[0]
    avg_o_tok = data.loc[data['Metric'] == 'Avg Output Tokens per Prompt', 'Value'].values[0]
    parameters_llama3.append(int(model))
    num_examples_llama3.append(1)
    num_prompts_llama3.append(int(prompts))
    total_output_tokens_llama3.append(float(output_tokens))
    total_input_tokens_llama3.append(float(input_tokens))
    avg_input_tokens_llama3.append(float(avg_i_tok))
    avg_output_tokens_llama3.append(float(avg_o_tok))

In [8]:
# Extract emissions data
for model in csv_files_output.keys():
    model_emissions = emissions_data[emissions_data['project_name'].str.contains("Llama-3-8B-Instruct-OutputTest-emissiondata-" + model + "-")]
    total_emissions_output.append(model_emissions['emissions'].values[0])
    cpu_energy_output.append(model_emissions['cpu_energy'].values[0])
    gpu_energy_output.append(model_emissions['gpu_energy'].values[0])
    ram_energy_output.append(model_emissions['ram_energy'].values[0])

In [9]:
print(avg_input_tokens_output)
print(avg_output_tokens_output)
print(total_output_tokens_output)
print(total_emissions_output)

[186.66, 186.66, 186.66, 186.66, 186.66]
[18.027, 73.053, 176.913, 587.78, 2124.58]
[2704.0, 10958.0, 26537.0, 29389.0, 106229.0]
[0.0107639213548215, 0.0439809911016423, 0.1084166041887974, 0.1236233887845658, 0.4659625892970863]


In [10]:
for model in csv_files_input.keys(): 
    model_emissions = emissions_data[emissions_data['project_name'].str.contains("Llama-3-8B-Instruct-emissiondata-" + model + "-")]
    total_emissions_input.append(model_emissions['emissions'].values[0])
    cpu_energy_input.append(model_emissions['cpu_energy'].values[0])
    gpu_energy_input.append(model_emissions['gpu_energy'].values[0])
    ram_energy_input.append(model_emissions['ram_energy'].values[0])

In [11]:
print(avg_input_tokens_input)
print(avg_output_tokens_input)
print(total_output_tokens_input)
print(total_emissions_input)

[158.66, 222.66, 301.66, 462.66, 627.66, 1343.66, 3743.66, 6143.66]
[18.007, 18.04, 17.94, 18.047, 18.033, 18.24, 18.513, 18.807]
[2701.0, 2706.0, 2691.0, 2707.0, 2705.0, 2736.0, 2777.0, 2821.0]
[0.0199979467296797, 0.0203268454993705, 0.0205117219125248, 0.0211612903588763, 0.021899838445093, 0.0248178578752835, 0.0364961148272292, 0.0511827769119716]


In [12]:
for model in csv_files_params_llama2.keys():
    model_emissions = emissions_data[emissions_data['project_name'].str.contains("meta-llama/Llama-2-" + model + "b-chat-hf")]
    total_emissions_llama2.append(model_emissions['emissions'].values[0])
    cpu_energy_llama2.append(model_emissions['cpu_energy'].values[0])
    gpu_energy_llama2.append(model_emissions['gpu_energy'].values[0])
    ram_energy_llama2.append(model_emissions['ram_energy'].values[0])

In [13]:
print(avg_input_tokens_llama2)
print(avg_output_tokens_llama2)
print(total_output_tokens_llama2)
print(total_emissions_llama2)

[168.0, 168.0, 147.0]
[37.616, 41.008, 39.848]
[9404.0, 10252.0, 9962.0]
[0.0672881651768405, 0.0968092248640126, 0.3127521811156471]


In [14]:
for model in csv_files_params_llama3.keys():
    model_emissions = emissions_data[emissions_data['project_name'].str.contains("meta-llama/Meta-Llama-3-" + model + "B-Instruct-params")]
    total_emissions_llama3.append(model_emissions['emissions'].values[0])
    cpu_energy_llama3.append(model_emissions['cpu_energy'].values[0])
    gpu_energy_llama3.append(model_emissions['gpu_energy'].values[0])
    ram_energy_llama3.append(model_emissions['ram_energy'].values[0])

In [15]:
print(avg_input_tokens_llama3)
print(avg_output_tokens_llama3)
print(total_output_tokens_llama3)
print(total_emissions_llama3)

[132.66, 132.66]
[18.096, 18.79]
[4524.0, 1879.0]
[0.0337420699294056, 0.0532702232629774]


In [16]:
# Prepare data for regression and visualization
parameters_output = np.array(parameters_output)
num_examples_output = np.array(num_examples_output)
num_prompts_output = np.array(num_prompts_output)
total_output_tokens_output = np.array(total_output_tokens_output)
total_input_tokens_output = np.array(total_input_tokens_output)
avg_input_tokens_output = np.array(avg_input_tokens_output)
avg_output_tokens_output = np.array(avg_output_tokens_output)
total_emissions_output = np.array(total_emissions_output)
cpu_energy_output = np.array(cpu_energy_output)
gpu_energy_output = np.array(gpu_energy_output)
ram_energy_output = np.array(ram_energy_output)

In [17]:
parameters_input = np.array(parameters_input)
num_examples_input = np.array(num_examples_input)
num_prompts_input = np.array(num_prompts_input)
total_output_tokens_input = np.array(total_output_tokens_input)
total_input_tokens_input = np.array(total_input_tokens_input)
avg_input_tokens_input = np.array(avg_input_tokens_input)
avg_output_tokens_input = np.array(avg_output_tokens_input)
total_emissions_input = np.array(total_emissions_input)
cpu_energy_input = np.array(cpu_energy_input)
gpu_energy_input = np.array(gpu_energy_input)
ram_energy_input = np.array(ram_energy_input)

In [18]:
parameters_llama2 = np.array(parameters_llama2)
num_examples_llama2 = np.array(num_examples_llama2)
num_prompts_llama2 = np.array(num_prompts_llama2)
total_output_tokens_llama2 = np.array(total_output_tokens_llama2)
total_input_tokens_llama2 = np.array(total_input_tokens_llama2)
avg_input_tokens_llama2 = np.array(avg_input_tokens_llama2)
avg_output_tokens_llama2 = np.array(avg_output_tokens_llama2)
total_emissions_llama2 = np.array(total_emissions_llama2)
cpu_energy_llama2 = np.array(cpu_energy_llama2)
gpu_energy_llama2 = np.array(gpu_energy_llama2)
ram_energy_llama2 = np.array(ram_energy_llama2)

In [19]:
parameters_llama3 = np.array(parameters_llama3)
num_examples_llama3 = np.array(num_examples_llama3)
num_prompts_llama3 = np.array(num_prompts_llama3)
total_output_tokens_llama3 = np.array(total_output_tokens_llama3)
total_input_tokens_llama3 = np.array(total_input_tokens_llama3)
avg_input_tokens_llama3 = np.array(avg_input_tokens_llama3)
avg_output_tokens_llama3 = np.array(avg_output_tokens_llama3)
total_emissions_llama3 = np.array(total_emissions_llama3)
cpu_energy_llama3 = np.array(cpu_energy_llama3)
gpu_energy_llama3 = np.array(gpu_energy_llama3)
ram_energy_llama3 = np.array(ram_energy_llama3)

In [20]:
# Calculate emissions per 10,000 prompts
emissions_per_thousand_prompts = {
    'Total Emissions Output Tok': total_emissions_output / num_prompts_output * 10_000,
    'CPU Energy Output Tok': cpu_energy_output / num_prompts_output * 10_000,
    'GPU Energy Output Tok': gpu_energy_output / num_prompts_output * 10_000,
    'RAM Energy Output Tok': ram_energy_output / num_prompts_output * 10_000,

    'Total Emissions Input Tok': total_emissions_input / num_prompts_input * 10_000,
    'CPU Energy Input Tok': cpu_energy_input / num_prompts_input * 10_000,
    'GPU Energy Input Tok': gpu_energy_input / num_prompts_input * 10_000,
    'RAM Energy Input Tok': ram_energy_input / num_prompts_input * 10_000,

    # We have to normalize the emissions to the number of output tokens for llama2, because the llama2 and llama3 output differed to drastically
    'Total Emissions Llama2 Params': (total_emissions_llama2 / num_prompts_llama2) / avg_output_tokens_llama2 * 10_000 * 18.5, 
    'CPU Energy Llama2 Params': (cpu_energy_llama2 / num_prompts_llama2) / avg_output_tokens_llama2 * 10_000 * 18.5,
    'GPU Energy Llama2 Params': (gpu_energy_llama2 / num_prompts_llama2) / avg_output_tokens_llama2 * 10_000 * 18.5,
    'RAM Energy Llama2 Params': (ram_energy_llama2 / num_prompts_llama2) / avg_output_tokens_llama2 * 10_000 * 18.5,

    'Total Emissions Llama3 Params': total_emissions_llama3 / num_prompts_llama3 * 10_000,
    'CPU Energy Llama3 Params': cpu_energy_llama3 / num_prompts_llama3 * 10_000,
    'GPU Energy Llama3 Params': gpu_energy_llama3 / num_prompts_llama3 * 10_000,
    'RAM Energy Llama3 Params': ram_energy_llama3 / num_prompts_llama3 * 10_000,
}


In [21]:
print(emissions_per_thousand_prompts)

{'Total Emissions Output Tok': array([ 0.71759476,  2.93206607,  7.22777361, 24.72467776, 93.19251786]), 'CPU Energy Output Tok': array([ 0.34119865,  1.38108   ,  3.41097504, 11.68430222, 43.35962526]), 'GPU Energy Output Tok': array([ 0.37330112,  1.55227741,  3.81306506, 13.01042801, 50.44779913]), 'RAM Energy Output Tok': array([ 0.36467404,  1.47610751,  3.64563793, 12.48812872, 46.34260226]), 'Total Emissions Input Tok': array([1.33319645, 1.35512303, 1.36744813, 1.41075269, 1.45998923,
       1.65452386, 2.43307432, 3.41218513]), 'CPU Energy Input Tok': array([0.34934319, 0.34979593, 0.34966325, 0.36020338, 0.36834101,
       0.41130111, 0.58992712, 0.8125549 ]), 'GPU Energy Input Tok': array([1.09583639, 1.12760167, 1.14650559, 1.18416797, 1.23706802,
       1.41778184, 2.12381272, 3.01694606]), 'RAM Energy Input Tok': array([0.55978307, 0.56053992, 0.56030411, 0.57722631, 0.59023429,
       0.6591164 , 0.94530317, 1.30200367]), 'Total Emissions Llama2 Params': array([1.3237250

In [22]:
print(parameters_input)
print(parameters_output)
print(parameters_llama2)
print(parameters_llama3)

[8 8 8 8 8 8 8 8]
[8 8 8 8 8]
[ 7 13 70]
[ 8 70]


In [23]:
# Perform regression analysis
def perform_regression(x, y):
    x = x.reshape(-1, 1)
    model = LinearRegression()
    model.fit(x, y)
    predicted = model.predict(x)
    return model, predicted

In [24]:
models = {}
predictions = {}
for name, y in emissions_per_thousand_prompts.items():
    if(name == "Total Emissions Output Tok"):
        model, predicted = perform_regression(avg_output_tokens_output, y)
    elif(name == "Total Emissions Input Tok"):
        model, predicted = perform_regression(avg_input_tokens_input, y)
    elif(name == "Total Emissions Llama2 Params"):
        model, predicted = perform_regression(parameters_llama2, y)
    elif(name == "Total Emissions Llama3 Params"):
        model, predicted = perform_regression(parameters_llama3, y)
    else:
        print("Error, could not find model")
    models[name] = model
    predictions[name] = predicted
    print(f"{name} - Intercept: {model.intercept_}, Coefficient: {model.coef_[0]}")

Total Emissions Output Tok - Intercept: -0.4666966237208392, Coefficient: 0.04399751075726668
Error, could not find model
CPU Energy Output Tok - Intercept: -0.4666966237208392, Coefficient: 0.04399751075726668
Error, could not find model
GPU Energy Output Tok - Intercept: -0.4666966237208392, Coefficient: 0.04399751075726668
Error, could not find model
RAM Energy Output Tok - Intercept: -0.4666966237208392, Coefficient: 0.04399751075726668
Total Emissions Input Tok - Intercept: 1.2480841224646957, Coefficient: 0.00034155061699822484
Error, could not find model
CPU Energy Input Tok - Intercept: 1.2480841224646957, Coefficient: 0.00034155061699822484
Error, could not find model
GPU Energy Input Tok - Intercept: 1.2480841224646957, Coefficient: 0.00034155061699822484
Error, could not find model
RAM Energy Input Tok - Intercept: 1.2480841224646957, Coefficient: 0.00034155061699822484
Total Emissions Llama2 Params - Intercept: 0.8233785594775198, Coefficient: 0.07120580743041335
Error, cou

In [25]:
# Define the test types and model types
test_types = ['Output-tok', 'Input-tok', 'Llama2 Params', 'Llama3 Params']
model_types = ['llama3', 'llama3', 'llama2', 'llama3']

# Define the parameters
parameters = np.concatenate([parameters_output, parameters_input, parameters_llama2, parameters_llama3])
num_examples = np.concatenate([num_examples_output, num_examples_input, num_examples_llama2, num_examples_llama3])
num_prompts = np.concatenate([num_prompts_output, num_prompts_input, num_prompts_llama2, num_prompts_llama3])
total_out_tok = np.concatenate([total_output_tokens_output, total_output_tokens_input, total_output_tokens_llama2, total_output_tokens_llama3])
total_in_tok = np.concatenate([total_input_tokens_output, total_input_tokens_input, total_input_tokens_llama2, total_input_tokens_llama3])
avg_out_tok = np.concatenate([avg_output_tokens_output, avg_output_tokens_input, avg_output_tokens_llama2, avg_output_tokens_llama3])
avg_in_tok = np.concatenate([avg_input_tokens_output, avg_input_tokens_input, avg_input_tokens_llama2, avg_input_tokens_llama3])

pred_emissions_per_10k_prompts = np.concatenate([
    predictions['Total Emissions Output Tok'],
    predictions['Total Emissions Input Tok'],
    predictions['Total Emissions Llama2 Params'],
    predictions['Total Emissions Llama3 Params']
])

pred_cpu_energy_per_10k_prompts = np.concatenate([
    predictions['CPU Energy Output Tok'],
    predictions['CPU Energy Input Tok'],
    predictions['CPU Energy Llama2 Params'],
    predictions['CPU Energy Llama3 Params']
])

pred_gpu_energy_per_10k_prompts = np.concatenate([
    predictions['GPU Energy Output Tok'],
    predictions['GPU Energy Input Tok'],
    predictions['GPU Energy Llama2 Params'],
    predictions['GPU Energy Llama3 Params']
])

pred_ram_energy_per_10k_prompts = np.concatenate([
    predictions['RAM Energy Output Tok'],
    predictions['RAM Energy Input Tok'],
    predictions['RAM Energy Llama2 Params'],
    predictions['RAM Energy Llama3 Params']
])

actual_emissions_per_10k_prompts = np.concatenate([
    emissions_per_thousand_prompts['Total Emissions Output Tok'],
    emissions_per_thousand_prompts['Total Emissions Input Tok'],
    emissions_per_thousand_prompts['Total Emissions Llama2 Params'],
    emissions_per_thousand_prompts['Total Emissions Llama3 Params']
])

actual_cpu_energy_per_10k_prompts = np.concatenate([
    emissions_per_thousand_prompts['CPU Energy Output Tok'],
    emissions_per_thousand_prompts['CPU Energy Input Tok'],
    emissions_per_thousand_prompts['CPU Energy Llama2 Params'],
    emissions_per_thousand_prompts['CPU Energy Llama3 Params']
])

actual_gpu_energy_per_10k_prompts = np.concatenate([
    emissions_per_thousand_prompts['GPU Energy Output Tok'],
    emissions_per_thousand_prompts['GPU Energy Input Tok'],
    emissions_per_thousand_prompts['GPU Energy Llama2 Params'],
    emissions_per_thousand_prompts['GPU Energy Llama3 Params']
])

actual_ram_energy_per_10k_prompts = np.concatenate([
    emissions_per_thousand_prompts['RAM Energy Output Tok'],
    emissions_per_thousand_prompts['RAM Energy Input Tok'],
    emissions_per_thousand_prompts['RAM Energy Llama2 Params'],
    emissions_per_thousand_prompts['RAM Energy Llama3 Params']
])

In [26]:
# Repeat test types and model types for each data point
test_type_column = np.concatenate([
    np.repeat(test_types[0], len(parameters_output)),
    np.repeat(test_types[1], len(parameters_input)),
    np.repeat(test_types[2], len(parameters_llama2)),
    np.repeat(test_types[3], len(parameters_llama3))
])

model_type_column = np.concatenate([
    np.repeat(model_types[0], len(parameters_output)),
    np.repeat(model_types[1], len(parameters_input)),
    np.repeat(model_types[2], len(parameters_llama2)),
    np.repeat(model_types[3], len(parameters_llama3))
])

In [27]:
# Create the dataframe
df = pd.DataFrame({
    'test_type': test_type_column,
    'model_type': model_type_column,
    'parameters': parameters,
    'num_examples': num_examples,
    'num_prompts': num_prompts,
    'total_out_tok': total_out_tok,
    'total_in_tok': total_in_tok,
    'avg_out_tok': avg_out_tok,
    'avg_in_tok': avg_in_tok,
    'actual_emissions_per_10k_prompts': actual_emissions_per_10k_prompts,
    'actual_cpu_energy_per_10k_prompts': actual_cpu_energy_per_10k_prompts,
    'actual_gpu_energy_per_10k_prompts': actual_gpu_energy_per_10k_prompts,
    'actual_ram_energy_per_10k_prompts': actual_ram_energy_per_10k_prompts,
    'pred_emissions_per_10k_prompts': pred_emissions_per_10k_prompts,
    'pred_cpu_energy_per_10k_prompts': pred_cpu_energy_per_10k_prompts,
    'pred_gpu_energy_per_10k_prompts': pred_gpu_energy_per_10k_prompts,
    'pred_ram_energy_per_10k_prompts': pred_ram_energy_per_10k_prompts
})

df

Unnamed: 0,test_type,model_type,parameters,num_examples,num_prompts,total_out_tok,total_in_tok,avg_out_tok,avg_in_tok,actual_emissions_per_10k_prompts,actual_cpu_energy_per_10k_prompts,actual_gpu_energy_per_10k_prompts,actual_ram_energy_per_10k_prompts,pred_emissions_per_10k_prompts,pred_cpu_energy_per_10k_prompts,pred_gpu_energy_per_10k_prompts,pred_ram_energy_per_10k_prompts
0,Output-tok,llama3,8,1,150,2704.0,27999.0,18.027,186.66,0.717595,0.341199,0.373301,0.364674,0.326447,0.326447,0.326447,0.326447
1,Output-tok,llama3,8,5,150,10958.0,27999.0,73.053,186.66,2.932066,1.38108,1.552277,1.476108,2.747454,2.747454,2.747454,2.747454
2,Output-tok,llama3,8,10,150,26537.0,27999.0,176.913,186.66,7.227774,3.410975,3.813065,3.645638,7.317035,7.317035,7.317035,7.317035
3,Output-tok,llama3,8,30,50,29389.0,9333.0,587.78,186.66,24.724678,11.684302,13.010428,12.488129,25.39416,25.39416,25.39416,25.39416
4,Output-tok,llama3,8,90,50,106229.0,9333.0,2124.58,186.66,93.192518,43.359625,50.447799,46.342602,93.009535,93.009535,93.009535,93.009535
5,Input-tok,llama3,8,1,150,2701.0,23799.0,18.007,158.66,1.333196,0.349343,1.095836,0.559783,1.302275,1.302275,1.302275,1.302275
6,Input-tok,llama3,8,5,150,2706.0,33399.0,18.04,222.66,1.355123,0.349796,1.127602,0.56054,1.324134,1.324134,1.324134,1.324134
7,Input-tok,llama3,8,10,150,2691.0,45249.0,17.94,301.66,1.367448,0.349663,1.146506,0.560304,1.351116,1.351116,1.351116,1.351116
8,Input-tok,llama3,8,20,150,2707.0,69399.0,18.047,462.66,1.410753,0.360203,1.184168,0.577226,1.406106,1.406106,1.406106,1.406106
9,Input-tok,llama3,8,30,150,2705.0,94149.0,18.033,627.66,1.459989,0.368341,1.237068,0.590234,1.462462,1.462462,1.462462,1.462462


In [29]:
framework_comp

Unnamed: 0,Name,State,Notes,User,Tags,Created,Runtime,Sweep,framework,model,...,total_time,AVG. Input Tokens,AVG. Output Tokens,AVG. Time / Prompt,AVG. Tokens / Second,Emissions / 1.000.000 Input Tokens,Emissions / 1.000.000 Output Tokens,Emissions / 10.000 Prompts,Total Emissions,Total Time
0,vLLM_Inference_7500_prompts,finished,-,,,2024-06-07T18:20:15.000Z,168,,vLLM,bigscience/bloomz-560m,...,,172.22,30.482133,20.897599,1458.642858,0.007615,0.043024,0.013115,0.009836,156.731992
1,vLLM_Inference_7500_prompts,finished,-,,,2024-06-07T18:14:32.000Z,217,,vLLM,bigscience/bloomz-1b1,...,,172.22,35.4056,27.639732,1280.967557,0.010474,0.050949,0.018039,0.013529,207.29799
2,vLLM_Inference_7500_prompts,finished,-,,,2024-06-07T18:08:45.000Z,238,,vLLM,bigscience/bloomz-1b7,...,,172.22,32.533733,30.266593,1074.905689,0.01177,0.062306,0.02027,0.015203,226.99945
3,vLLM_Inference_7500_prompts,finished,-,,,2024-06-07T18:01:25.000Z,291,,vLLM,bigscience/bloomz-3b,...,,172.22,26.118533,37.542294,695.709568,0.015074,0.099394,0.02596,0.01947,281.567207
4,vLLM_Inference_7500_prompts,finished,-,,,2024-06-07T17:48:04.000Z,467,,vLLM,bigscience/bloomz-7b1,...,,172.22,34.083333,60.393297,564.356227,0.02637,0.133245,0.045414,0.034061,452.949728
5,vLLM_Inference_5000_prompts,finished,-,,,2024-06-07T15:36:41.000Z,253,,vLLM,meta-llama/Meta-Llama-3-8B-Instruct,...,,186.66,18.2004,48.790199,373.033938,0.018982,0.194673,0.035431,0.017716,243.950994


In [30]:
# Mapping columns from framework_comp to df
mapped_data = {
    'test_type': ['framework_comp_vllm'] * len(framework_comp),
    'model_type': ['vLLM'] * len(framework_comp),
    'parameters': [.560, 1.100, 1.700, 3.000, 7.100, 8.000],
    'num_examples': [np.nan] * len(framework_comp),
    'num_prompts': framework_comp['total_prompts'],
    'total_out_tok': [np.nan] * len(framework_comp),
    'total_in_tok': [np.nan] * len(framework_comp),
    'avg_out_tok': framework_comp['AVG. Output Tokens'],
    'avg_in_tok': framework_comp['AVG. Input Tokens'],
    'actual_emissions_per_10k_prompts': framework_comp['Emissions / 10.000 Prompts'],
    'actual_emissions_per_1M_out_tok': framework_comp['Emissions / 1.000.000 Output Tokens'],
    'actual_cpu_energy_per_10k_prompts': [np.nan] * len(framework_comp),
    'actual_gpu_energy_per_10k_prompts': [np.nan] * len(framework_comp),
    'actual_ram_energy_per_10k_prompts': [np.nan] * len(framework_comp),
    'pred_emissions_per_10k_prompts': [np.nan] * len(framework_comp),
    'pred_cpu_energy_per_10k_prompts': [np.nan] * len(framework_comp),
    'pred_gpu_energy_per_10k_prompts': [np.nan] * len(framework_comp),
    'pred_ram_energy_per_10k_prompts': [np.nan] * len(framework_comp)
}

# Add the model names
mapped_data['model_type'] = framework_comp['model'].apply(lambda x: x.split('/')[-1])

# Create the mapped dataframe
mapped_df = pd.DataFrame(mapped_data)

# Append to the original dataframe
result_df = pd.concat([df, mapped_df], ignore_index=True)

In [31]:
result_df

Unnamed: 0,test_type,model_type,parameters,num_examples,num_prompts,total_out_tok,total_in_tok,avg_out_tok,avg_in_tok,actual_emissions_per_10k_prompts,actual_cpu_energy_per_10k_prompts,actual_gpu_energy_per_10k_prompts,actual_ram_energy_per_10k_prompts,pred_emissions_per_10k_prompts,pred_cpu_energy_per_10k_prompts,pred_gpu_energy_per_10k_prompts,pred_ram_energy_per_10k_prompts,actual_emissions_per_1M_out_tok
0,Output-tok,llama3,8.0,1.0,150,2704.0,27999.0,18.027,186.66,0.717595,0.341199,0.373301,0.364674,0.326447,0.326447,0.326447,0.326447,
1,Output-tok,llama3,8.0,5.0,150,10958.0,27999.0,73.053,186.66,2.932066,1.38108,1.552277,1.476108,2.747454,2.747454,2.747454,2.747454,
2,Output-tok,llama3,8.0,10.0,150,26537.0,27999.0,176.913,186.66,7.227774,3.410975,3.813065,3.645638,7.317035,7.317035,7.317035,7.317035,
3,Output-tok,llama3,8.0,30.0,50,29389.0,9333.0,587.78,186.66,24.724678,11.684302,13.010428,12.488129,25.39416,25.39416,25.39416,25.39416,
4,Output-tok,llama3,8.0,90.0,50,106229.0,9333.0,2124.58,186.66,93.192518,43.359625,50.447799,46.342602,93.009535,93.009535,93.009535,93.009535,
5,Input-tok,llama3,8.0,1.0,150,2701.0,23799.0,18.007,158.66,1.333196,0.349343,1.095836,0.559783,1.302275,1.302275,1.302275,1.302275,
6,Input-tok,llama3,8.0,5.0,150,2706.0,33399.0,18.04,222.66,1.355123,0.349796,1.127602,0.56054,1.324134,1.324134,1.324134,1.324134,
7,Input-tok,llama3,8.0,10.0,150,2691.0,45249.0,17.94,301.66,1.367448,0.349663,1.146506,0.560304,1.351116,1.351116,1.351116,1.351116,
8,Input-tok,llama3,8.0,20.0,150,2707.0,69399.0,18.047,462.66,1.410753,0.360203,1.184168,0.577226,1.406106,1.406106,1.406106,1.406106,
9,Input-tok,llama3,8.0,30.0,150,2705.0,94149.0,18.033,627.66,1.459989,0.368341,1.237068,0.590234,1.462462,1.462462,1.462462,1.462462,


In [253]:
# Define chart width and height
chart_width = 600
chart_height = 350

llama2_note = 'Note: Emissions normalized to number of output tokens for Llama2 because the Llama2 and Llama3 output differed drastically'

# Function to create charts for each test type
def create_chart(df, test_type, color, remove_x_title=False):
    chart_data = df[df['test_type'] == test_type]

    if test_type == 'Output-tok': 
        x_title = 'Average Output Tokens per Prompt'
        x_data = 'avg_out_tok'
    elif test_type == 'Input-tok':
        x_title = 'Average Input Tokens per Prompt'
        x_data = 'avg_in_tok'
    elif test_type == 'Llama2 Params':
        x_title = 'Parameters (billions)'
        x_data = 'parameters'
    elif test_type == 'Llama3 Params':
        x_title = 'Parameters (billions)'
        x_data = 'parameters'
    
    scatter = alt.Chart(chart_data).mark_circle(size=100).encode(
        x=alt.X(x_data, title=x_title),
        y=alt.Y('actual_emissions_per_10k_prompts', title='Actual Emissions per 10,000 Prompts'),
        color = alt.Color('test_type:N', title='Test Type').sort(df['test_type'].unique()),
        tooltip=[
            alt.Tooltip('parameters', title='Parameters (billions)'),
            alt.Tooltip('actual_emissions_per_10k_prompts', title='Actual Emissions per 10,000 Prompts'),
            alt.Tooltip('pred_emissions_per_10k_prompts', title='Predicted Emissions per 10,000 Prompts'),
            alt.Tooltip('avg_out_tok', title='Average Output Tokens per Prompt'),
            alt.Tooltip('avg_in_tok', title='Average Input Tokens per Prompt'),
            alt.Tooltip('num_examples', title='Number of Examples'),
            alt.Tooltip('num_prompts', title='Number of Prompts'),
            alt.Tooltip('model_type', title='Model Type'),
            alt.Tooltip('test_type', title='Test Type'),
            alt.Tooltip('test_type', title='Test Type'),
        ]
    ).properties(
        title=f'Actual Emissions for {test_type} per 10,000 Prompts',
        width=chart_width,
        height=chart_height
    )

    # Create line plots for predicted emissions
    line = alt.Chart(chart_data).mark_line().encode(
        x=alt.X(x_data, title=x_title),
        y=alt.Y('pred_emissions_per_10k_prompts', title='Predicted Emissions per 10,000 Prompts'),
        color = alt.Color('test_type:N', title='Test Type').sort(df['test_type'].unique()),
        tooltip=[
            alt.Tooltip('parameters', title='Parameters (billions)'),
            alt.Tooltip('actual_emissions_per_10k_prompts', title='Actual Emissions per 10,000 Prompts'),
            alt.Tooltip('pred_emissions_per_10k_prompts', title='Predicted Emissions per 10,000 Prompts'),
            alt.Tooltip('avg_out_tok', title='Average Output Tokens per Prompt'),
            alt.Tooltip('avg_in_tok', title='Average Input Tokens per Prompt'),
            alt.Tooltip('num_examples', title='Number of Examples'),
            alt.Tooltip('num_prompts', title='Number of Prompts'),
            alt.Tooltip('model_type', title='Model Type'),
            alt.Tooltip('test_type', title='Test Type'),
        ]
    ).properties(
        width=chart_width,
        height=chart_height
    )
    
    # Add note for Llama2
    if test_type == 'Llama2 Params':
        chart_data['Note'] = llama2_note
        #print(chart_data)
        scatter = scatter.encode(
            tooltip=[
                alt.Tooltip('parameters', title='Parameters (billions)'),
                alt.Tooltip('actual_emissions_per_10k_prompts', title='Actual Emissions per 10,000 Prompts'),
                alt.Tooltip('pred_emissions_per_10k_prompts', title='Predicted Emissions per 10,000 Prompts'),
                alt.Tooltip('avg_out_tok', title='Average Output Tokens per Prompt'),
                alt.Tooltip('avg_in_tok', title='Average Input Tokens per Prompt'),
                alt.Tooltip('num_examples', title='Number of Examples'),
                alt.Tooltip('num_prompts', title='Number of Prompts'),
                alt.Tooltip('model_type', title='Model Type'),
                alt.Tooltip('test_type', title='Test Type'),
                alt.Tooltip('Note', title='Normalization Note')
            ]
        )
    
    if remove_x_title:
        scatter = scatter.encode(
            x=alt.X(x_data, title=None, axis=None),
        )
        line = line.encode(
            x=alt.X(x_data, title=None, axis=None),
        )


    return scatter + line

In [258]:
# Create charts for each test type
charts = []
colors = ['blue', 'green', 'red', 'purple']
for test_type, color in zip(df['test_type'].unique(), colors):
    charts.append(create_chart(df, test_type, color))

# Arrange the charts in a grid
grid_chart = alt.vconcat(*[alt.hconcat(*charts[i:i+2]) for i in range(0, len(charts), 2)]).resolve_scale(
    y='independent'
)

grid_chart.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chart_data['Note'] = llama2_note


In [261]:
# Create charts for each test type
stacked_charts = []
stacked_df = df[df.num_examples != 90]
for test_type, color in zip(stacked_df['test_type'].unique(), colors):
    stacked_charts.append(create_chart(stacked_df, test_type, color, remove_x_title=True))


# Create a combined chart with overlays for all emission types
combined_chart = alt.layer(*stacked_charts).resolve_scale(
    x='independent'
).properties(
    title='Emissions per Ten-Thousand Prompts by Test Type',
    width=1300,  # Adjusted width for combined chart
    height=700  # Adjusted height for combined chart
)

combined_chart.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chart_data['Note'] = llama2_note


In [33]:
# Store the results in a CSV file
result_df.to_csv('results/emission_regression.csv', index=False)