In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.linear_model import LinearRegression

In [2]:
# Map each example count (kept as a string key) to that run's metrics CSV.
# All runs share one naming scheme, so the paths are generated from the counts.
csv_files = {
    str(count): f'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata-{count}-examples.csv'
    for count in (1, 5, 10, 20, 30, 70, 210, 350)
}

# Load the emissions table; later cells select its rows by `project_name`.
emissions_data = pd.read_csv(filepath_or_buffer='emissions.csv')

In [3]:
# Empty, independent accumulators; the following cells append one entry per run.
parameters, num_examples = [], []
total_emissions = []
cpu_energy, gpu_energy, ram_energy = [], [], []
total_output_tokens, total_input_tokens, avg_input_tokens = [], [], []

In [4]:
# Read and extract metadata from each CSV file.
# The accumulators are rebound here (not only appended to) so that re-running
# this cell never duplicates entries, even after a later cell has converted
# them to NumPy arrays.
parameters, num_examples = [], []
total_output_tokens, total_input_tokens, avg_input_tokens = [], [], []

# Every file in `csv_files` belongs to Meta-Llama-3-8B-Instruct (the "8B" in the
# file names), so the parameter count recorded per run is 8 billion.
MODEL_PARAMETERS_BILLIONS = 8


def _read_metric(metrics, metric_name, source):
    """Return the first `Value` whose `Metric` equals `metric_name`, as a float.

    Parameters:
        metrics: DataFrame with `Metric` and `Value` columns.
        metric_name: exact label to look up in the `Metric` column.
        source: path of the file `metrics` came from; used only in the error text.

    Raises:
        KeyError: if no row carries `metric_name`.
    """
    values = metrics.loc[metrics['Metric'] == metric_name, 'Value']
    if values.empty:
        raise KeyError(f"Metric {metric_name!r} not found in {source}")
    return float(values.iloc[0])


for n_examples, path in csv_files.items():
    metrics = pd.read_csv(path)
    # Look everything up before appending so a missing metric cannot leave the
    # accumulators with different lengths.
    output_tokens = _read_metric(metrics, 'Total Output Tokens', path)
    input_tokens = _read_metric(metrics, 'Total Input Tokens', path)
    avg_i_tok = _read_metric(metrics, 'Avg Input Tokens per Prompt', path)
    parameters.append(MODEL_PARAMETERS_BILLIONS)
    num_examples.append(int(n_examples))
    total_output_tokens.append(output_tokens)
    total_input_tokens.append(input_tokens)
    avg_input_tokens.append(avg_i_tok)

In [5]:
# Extract emissions data for each run, in the same order as `csv_files`.
# The accumulators are rebound here so that re-running this cell cannot append
# a second copy of every value.
total_emissions = []
cpu_energy, gpu_energy, ram_energy = [], [], []

for n_examples in csv_files:
    # The trailing "-" stops a short count such as "1" from also matching "10".
    pattern = "8B-Instruct-emissiondata-" + n_examples + "-"
    # Literal substring match; rows with a missing project_name count as
    # non-matching instead of putting NaN into the boolean mask.
    is_run = emissions_data['project_name'].str.contains(pattern, regex=False, na=False)
    model_emissions = emissions_data[is_run]
    if model_emissions.empty:
        raise ValueError(
            f"No row in emissions_data has a project_name containing {pattern!r}"
        )
    # If several rows match, the first one is used.
    total_emissions.append(model_emissions['emissions'].iloc[0])
    cpu_energy.append(model_emissions['cpu_energy'].iloc[0])
    gpu_energy.append(model_emissions['gpu_energy'].iloc[0])
    ram_energy.append(model_emissions['ram_energy'].iloc[0])

In [6]:
# Print the collected per-run output-token totals, then the emissions totals.
for collected in (total_output_tokens, total_emissions):
    print(collected)

[2701.0, 2706.0, 2691.0, 2707.0, 2705.0, 2736.0, 2777.0, 2821.0]
[0.0199979467296797, 0.0203268454993705, 0.0205117219125248, 0.0211612903588763, 0.021899838445093, 0.0248178578752835, 0.0364961148272292, 0.0511827769119716]


In [7]:
# Convert every accumulated list to a NumPy array in one pass; the later cells
# rely on element-wise division and `reshape`.
(
    parameters,
    num_examples,
    total_output_tokens,
    total_input_tokens,
    avg_input_tokens,
    total_emissions,
    cpu_energy,
    gpu_energy,
    ram_energy,
) = [
    np.array(values)
    for values in (
        parameters,
        num_examples,
        total_output_tokens,
        total_input_tokens,
        avg_input_tokens,
        total_emissions,
        cpu_energy,
        gpu_energy,
        ram_energy,
    )
]

In [8]:
# Normalise each measured series to "per 1,000,000 output tokens":
# divide by the run's output-token total, then scale by one million.
emissions_per_million_output_tokens = {
    label: measured / total_output_tokens * 1_000_000
    for label, measured in (
        ('Total Emissions', total_emissions),
        ('CPU Energy', cpu_energy),
        ('GPU Energy', gpu_energy),
        ('RAM Energy', ram_energy),
    )
}

In [9]:
# Perform regression analysis
def perform_regression(x, y):
    x = x.reshape(-1, 1)
    model = LinearRegression()
    model.fit(x, y)
    predicted = model.predict(x)
    return model, predicted

In [10]:
# Fit one regression per normalised series against the average prompt length,
# keeping both the fitted estimator and its in-sample predictions.
models, predictions = {}, {}
for label, target in emissions_per_million_output_tokens.items():
    fitted, fitted_values = perform_regression(avg_input_tokens, target)
    models[label], predictions[label] = fitted, fitted_values
    print(f"{label} - Intercept: {fitted.intercept_}, Coefficient: {fitted.coef_[0]}")

Total Emissions - Intercept: 6.982023176464173, Coefficient: 0.001764938846279883
CPU Energy - Intercept: 1.8107571471319526, Coefficient: 0.00039545527961131003
GPU Energy - Intercept: 5.787662062686252, Coefficient: 0.0016251565814090578
RAM Energy - Intercept: 2.9016805480915053, Coefficient: 0.0006336379866482673


In [11]:
# Prepare data for Altair visualization: one row per run, holding the run
# metadata, the measured per-million values and the regression predictions.
emission_labels = ('Total Emissions', 'CPU Energy', 'GPU Energy', 'RAM Energy')

vis_data = pd.DataFrame({
    'Parameters (billions)': parameters.flatten(),
    'Number of Examples': num_examples.flatten(),
    'Total Output Tokens': total_output_tokens.flatten(),
    'Total Input Tokens': total_input_tokens.flatten(),
    'Avg Input Tokens per Prompt': avg_input_tokens.flatten(),
    # Measured columns first, then predicted ones, each in `emission_labels` order.
    **{
        f'{label} per Million Output Tokens': emissions_per_million_output_tokens[label]
        for label in emission_labels
    },
    **{f'{label} Predicted': predictions[label] for label in emission_labels},
})

In [12]:
# Size shared by every per-type panel
chart_width = 600
chart_height = 350

# One layered chart per emission type: measured values as circles, regression
# predictions as a line, both drawn in the colour assigned to that type.
series_colors = {
    'Total Emissions': 'blue',
    'CPU Energy': 'green',
    'GPU Energy': 'red',
    'RAM Energy': 'purple',
}

charts = []
for emission_type, color in series_colors.items():
    measured_field = f'{emission_type} per Million Output Tokens'
    # Both layers expose the same hover details.
    hover_fields = [
        'Parameters (billions)',
        measured_field,
        'Avg Input Tokens per Prompt',
        'Number of Examples',
    ]

    # Measured values
    scatter = (
        alt.Chart(vis_data)
        .mark_circle(size=100, color=color)
        .encode(
            x='Avg Input Tokens per Prompt',
            y=measured_field,
            tooltip=hover_fields,
        )
        .properties(
            title=f'Actual {emission_type} per Million Output Tokens',
            width=chart_width,
            height=chart_height,
        )
    )

    # Regression predictions
    line = (
        alt.Chart(vis_data)
        .mark_line(color=color)
        .encode(
            x='Avg Input Tokens per Prompt',
            y=f'{emission_type} Predicted',
            tooltip=hover_fields,
        )
        .properties(width=chart_width, height=chart_height)
    )

    charts.append(scatter + line)

# Arrange the charts in a 2x2 grid, each panel keeping its own y scale
top_row = alt.hconcat(charts[0], charts[1])
bottom_row = alt.hconcat(charts[2], charts[3])
grid_chart = alt.vconcat(top_row, bottom_row).resolve_scale(y='independent')

grid_chart.show()

In [13]:
# Overlay all per-type charts in one figure; the y scales are resolved
# independently so the series do not share a single axis.
overlay = alt.layer(*charts)
overlay = overlay.resolve_scale(y='independent')
combined_chart = overlay.properties(
    title='Emissions per Million Output Tokens by Type vs Avg Input Tokens',
    width=600,   # size of the combined figure
    height=400,
)

combined_chart.show()