In [2]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.linear_model import LinearRegression

In [8]:
# Per-model CSV exports, keyed by model-size tag (insertion order: 70b, 13b, 7b)
csv_files = dict([
    ('70b', 'meta-llama/Llama-2-70b-chat-hf-emissiondata.csv'),
    ('13b', 'meta-llama/Llama-2-13b-chat-hf-emissiondata.csv'),
    ('7b', 'meta-llama/Llama-2-7b-chat-hf-emissiondata.csv'),
])

# Load the shared emissions log; later cells filter it by 'project_name'
# and read its 'emissions', 'cpu_energy', 'gpu_energy' and 'ram_energy' columns.
emissions_csv_path = 'emissions.csv'
emissions_data = pd.read_csv(emissions_csv_path)

In [22]:
# Empty accumulators, one entry per model, filled in by the extraction cells below
parameters, total_output_tokens = [], []
total_emissions = []
cpu_energy, gpu_energy, ram_energy = [], [], []

In [23]:
# Read each per-model CSV and record its parameter count and total output tokens.
#
# Both lists are rebuilt from scratch here so the cell is safe to re-run: it no
# longer double-appends, and it no longer fails with AttributeError once these
# names have been rebound to NumPy arrays further down the notebook.
parameters = []
total_output_tokens = []
for model, file in csv_files.items():
    data = pd.read_csv(file)
    token_rows = data.loc[data['Metric'] == 'Total Output Tokens', 'Value']
    if token_rows.empty:
        # Fail with a message that names the file instead of a bare IndexError.
        raise ValueError(f"'Total Output Tokens' metric not found in {file}")
    # Keys are size tags such as '70b' / '13b' / '7b'; dropping the trailing 'b'
    # gives the parameter count in billions. Deriving it from the key keeps
    # `parameters` and `total_output_tokens` the same length for every key,
    # whereas a hard-coded if/elif chain silently skips unknown sizes.
    parameters.append(int(model.lower().rstrip('b')))
    total_output_tokens.append(float(token_rows.iloc[0]))

In [24]:
# Extract emissions and energy figures for each model from the emissions log.
#
# The lists are rebuilt from scratch so the cell can be re-run without
# double-appending (or failing after the names are rebound to NumPy arrays).
total_emissions, cpu_energy, gpu_energy, ram_energy = [], [], [], []
for model in csv_files:
    # Literal substring match (not a regex). Rows whose project_name is missing
    # count as non-matches instead of putting NaN into the boolean mask.
    matches_model = emissions_data['project_name'].str.contains(model, regex=False, na=False)
    model_emissions = emissions_data[matches_model]
    if model_emissions.empty:
        # Fail with a message that names the model instead of a bare IndexError.
        raise ValueError(f"No rows in emissions data with project_name containing {model!r}")
    # Only the first matching row is used for each model.
    total_emissions.append(model_emissions['emissions'].values[0])
    cpu_energy.append(model_emissions['cpu_energy'].values[0])
    gpu_energy.append(model_emissions['gpu_energy'].values[0])
    ram_energy.append(model_emissions['ram_energy'].values[0])

In [25]:
# Convert the accumulated Python lists into NumPy arrays (same names) so the
# following cells can use element-wise arithmetic and regression.
(parameters, total_output_tokens, total_emissions,
 cpu_energy, gpu_energy, ram_energy) = [
    np.array(values)
    for values in (parameters, total_output_tokens, total_emissions,
                   cpu_energy, gpu_energy, ram_energy)
]

In [26]:
# Scale every raw total to a rate per 1,000,000 output tokens
emissions_per_million_output_tokens = {
    label: totals / total_output_tokens * 1_000_000
    for label, totals in (
        ('Total Emissions', total_emissions),
        ('CPU Energy', cpu_energy),
        ('GPU Energy', gpu_energy),
        ('RAM Energy', ram_energy),
    )
}

In [27]:
# Perform regression analysis
def perform_regression(x, y):
    """Fit a single-feature linear regression of ``y`` on ``x``.

    Args:
        x: Array-like of feature values (NumPy array, list, or pandas
            Series). It is converted with ``np.asarray`` and reshaped into a
            single-column 2-D matrix before fitting.
        y: Target values, passed to ``LinearRegression.fit`` unchanged.

    Returns:
        tuple: ``(model, predicted)`` where ``model`` is the fitted
        ``LinearRegression`` instance and ``predicted`` is the result of
        ``model.predict`` on the same inputs used for fitting.
    """
    # np.asarray lets callers pass lists or Series, which have no usable
    # .reshape; an ndarray input is handled exactly as before.
    features = np.asarray(x).reshape(-1, 1)
    model = LinearRegression()
    model.fit(features, y)
    predicted = model.predict(features)
    return model, predicted

In [28]:
# Fit one regression per metric against parameter count, keeping both the
# fitted model and its in-sample predictions keyed by metric name.
models, predictions = {}, {}
for name in emissions_per_million_output_tokens:
    y = emissions_per_million_output_tokens[name]
    model, predicted = perform_regression(parameters, y)
    models.update({name: model})
    predictions.update({name: predicted})
    print(f"{name} - Intercept: {model.intercept_}, Coefficient: {model.coef_[0]}")

Total Emissions - Intercept: 4.450694916094701, Coefficient: 0.38489625638061276
CPU Energy - Intercept: 1.2499486513028675, Coefficient: 0.08936725010484739
GPU Energy - Intercept: 3.440219327790901, Coefficient: 0.34627411218242776
RAM Energy - Intercept: 2.0031269914168046, Coefficient: 0.14319502743796003


In [29]:
# Assemble one wide table for Altair: the parameter counts, then the observed
# per-million-token value for each metric, then the regression prediction for each.
emission_labels = ('Total Emissions', 'CPU Energy', 'GPU Energy', 'RAM Energy')
vis_data = pd.DataFrame({
    'Parameters (billions)': parameters.flatten(),
    **{
        f'{label} per Million Output Tokens': emissions_per_million_output_tokens[label]
        for label in emission_labels
    },
    **{
        f'{label} Predicted': predictions[label]
        for label in emission_labels
    },
})

In [44]:
# Size shared by every panel in the grid
chart_width, chart_height = 600, 350

# One panel per metric: observed points overlaid with the fitted regression line
x_field = 'Parameters (billions)'
series_colors = {
    'Total Emissions': 'blue',
    'CPU Energy': 'green',
    'GPU Energy': 'red',
    'RAM Energy': 'purple',
}

charts = []
for emission_type, color in series_colors.items():
    actual_field = f'{emission_type} per Million Output Tokens'
    predicted_field = f'{emission_type} Predicted'

    # Observed values as points
    scatter = (
        alt.Chart(vis_data)
        .mark_circle(size=100, color=color)
        .encode(x=x_field, y=actual_field, tooltip=[x_field, actual_field])
        .properties(
            title=f'Actual {emission_type} per Million Output Tokens',
            width=chart_width,
            height=chart_height,
        )
    )

    # Regression predictions as a line in the same colour
    line = (
        alt.Chart(vis_data)
        .mark_line(color=color)
        .encode(x=x_field, y=predicted_field, tooltip=[x_field, predicted_field])
        .properties(width=chart_width, height=chart_height)
    )

    charts.append(scatter + line)

# 2x2 layout: two panels per row, each panel keeping its own y scale
top_row = alt.hconcat(charts[0], charts[1])
bottom_row = alt.hconcat(charts[2], charts[3])
grid_chart = alt.vconcat(top_row, bottom_row).resolve_scale(y='independent')

grid_chart.show()

In [49]:
# Overlay every per-metric panel in a single chart, each layer keeping an
# independent y scale, and size/title the result as one figure.
combined_title = 'Emissions per Million Output Tokens by Type vs Number of Parameters'
combined_chart = alt.layer(*charts)
combined_chart = combined_chart.resolve_scale(y='independent')
combined_chart = combined_chart.properties(
    title=combined_title,
    width=600,   # width of the combined chart
    height=400,  # height of the combined chart
)

combined_chart.show()