In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.linear_model import LinearRegression

In [43]:
# Per-model metadata CSVs, keyed by the short model name that later cells
# iterate over and match against the emissions table.
csv_files = dict([
    ('Llama-2-70b', 'meta-llama/Llama-2-70b-chat-hf-emissiondata.csv'),
    ('Llama-2-13b', 'meta-llama/Llama-2-13b-chat-hf-emissiondata.csv'),
    ('Llama-2-7b', 'meta-llama/Llama-2-7b-chat-hf-emissiondata.csv'),
    ('Llama-3-70B', 'meta-llama/Meta-Llama-3-70B-Instruct-emissiondata.csv'),
    ('Llama-3-8B', 'meta-llama/Meta-Llama-3-8B-Instruct-emissiondata.csv'),
])

# Emissions table; a later cell looks rows up by their `project_name` column.
emissions_data = pd.read_csv('emissions.csv')

In [44]:
# Per-model accumulators that the following cells append to.
# The generator yields a fresh list each time, so the seven names never alias.
(
    parameters,
    total_emissions,
    cpu_energy,
    gpu_energy,
    ram_energy,
    total_output_tokens,
    model_types,
) = ([] for _ in range(7))

In [45]:
# Read each model's metadata CSV and record its parameter count, model family,
# and total output-token count.

# Known model tags -> (parameter count in billions, model family).
model_metadata = {
    'Llama-2-70b': (70, 'Llama 2'),
    'Llama-2-13b': (13, 'Llama 2'),
    'Llama-2-7b': (7, 'Llama 2'),
    'Llama-3-70B': (70, 'Llama 3'),
    'Llama-3-8B': (8, 'Llama 3'),
}

for model_key, file in csv_files.items():
    # Resolve the model before touching any list: an unrecognised key used to
    # append a token count with no matching parameter/family entry, leaving
    # the lists different lengths and breaking the array maths further down.
    metadata = next(
        (meta for tag, meta in model_metadata.items() if tag in model_key),
        None,
    )
    if metadata is None:
        raise ValueError(
            f"Unrecognised model key {model_key!r}; add it to model_metadata"
        )

    data = pd.read_csv(file)
    token_rows = data.loc[data['Metric'] == 'Total Output Tokens', 'Value']
    if token_rows.empty:
        raise ValueError(f"No 'Total Output Tokens' row found in {file}")
    # Convert up front so a malformed value fails before any list is extended.
    output_tokens = float(token_rows.values[0])

    billions, family = metadata
    parameters.append(billions)
    model_types.append(family)
    total_output_tokens.append(output_tokens)

In [46]:
# Extract emissions data: for each model, take the first row of the emissions
# table whose `project_name` contains the model name.
for model in csv_files.keys():
    # regex=False matches the name as a literal substring instead of a regular
    # expression; na=False treats missing project names as non-matches so the
    # boolean mask never contains NaN.
    name_matches = emissions_data['project_name'].str.contains(model, regex=False, na=False)
    model_emissions = emissions_data[name_matches]

    if model_emissions.empty:
        # Report the model looked up in *this* loop (previously this printed a
        # stale variable left over from the metadata cell).
        # NOTE(review): skipping leaves these lists shorter than the metadata
        # lists, so later element-wise maths will not line up - confirm intent.
        print(f"No emissions data found for {model}")
        continue

    total_emissions.append(model_emissions['emissions'].values[0])
    cpu_energy.append(model_emissions['cpu_energy'].values[0])
    gpu_energy.append(model_emissions['gpu_energy'].values[0])
    ram_energy.append(model_emissions['ram_energy'].values[0])

In [47]:
# Turn every accumulator list into a NumPy array so the following cells can
# use element-wise arithmetic and boolean-mask indexing.
(
    parameters,
    total_output_tokens,
    total_emissions,
    cpu_energy,
    gpu_energy,
    ram_energy,
    model_types,
) = (
    np.array(values)
    for values in (
        parameters,
        total_output_tokens,
        total_emissions,
        cpu_energy,
        gpu_energy,
        ram_energy,
        model_types,
    )
)

In [48]:
# Normalise each measurement to a rate per 1,000,000 output tokens.
raw_measurements = {
    'Total Emissions': total_emissions,
    'CPU Energy': cpu_energy,
    'GPU Energy': gpu_energy,
    'RAM Energy': ram_energy,
}

emissions_per_million_output_tokens = {
    label: measured / total_output_tokens * 1_000_000
    for label, measured in raw_measurements.items()
}

In [49]:
def perform_regression(x, y):
    """Fit a linear regression of ``y`` on the single feature ``x``.

    Args:
        x: NumPy array of feature values; it is reshaped into one column
            before being handed to scikit-learn.
        y: Target values, one per row of the reshaped ``x``.

    Returns:
        A ``(model, predicted)`` tuple: the fitted ``LinearRegression`` and
        its predictions for the same ``x`` values it was fitted on.
    """
    features = x.reshape(-1, 1)
    regressor = LinearRegression().fit(features, y)
    return regressor, regressor.predict(features)

In [50]:
# Split the per-million values and parameter counts by model family.
# Each boolean mask is computed once and reused for every metric.
is_llama2 = model_types == 'Llama 2'
is_llama3 = model_types == 'Llama 3'

llama2_parameters = parameters[is_llama2]
llama2_data = {
    metric: series[is_llama2]
    for metric, series in emissions_per_million_output_tokens.items()
}

llama3_parameters = parameters[is_llama3]
llama3_data = {
    metric: series[is_llama3]
    for metric, series in emissions_per_million_output_tokens.items()
}

In [51]:
# Fit one regression per metric and per model family, keeping both the fitted
# model and its in-sample predictions keyed by metric name.
models_llama2, predictions_llama2 = {}, {}
models_llama3, predictions_llama3 = {}, {}

# (parameters, per-metric data, model store, prediction store) for each family,
# listed in the order they are fitted.
family_inputs = (
    (llama2_parameters, llama2_data, models_llama2, predictions_llama2),
    (llama3_parameters, llama3_data, models_llama3, predictions_llama3),
)

for name in emissions_per_million_output_tokens:
    for family_parameters, family_data, family_models, family_predictions in family_inputs:
        model, predicted = perform_regression(family_parameters, family_data[name])
        family_models[name] = model
        family_predictions[name] = predicted

In [54]:
# Prepare data for Altair visualization: one row per model holding the actual
# per-million values plus each family's regression predictions (NaN on rows
# that belong to the other family).
def _place_predictions(predicted, row_mask):
    """Return a float column with ``predicted`` on the ``row_mask`` rows and NaN elsewhere."""
    column = np.full(row_mask.shape, np.nan)
    column[row_mask] = predicted
    return column


# Predictions are placed by mask rather than by concatenating
# "Llama 2 values + NaNs": concatenation is only correct when every Llama 2
# row comes before every Llama 3 row, whereas the mask follows the actual row
# order of `model_types`.
llama2_rows_mask = model_types == 'Llama 2'
llama3_rows_mask = model_types == 'Llama 3'

vis_columns = {
    'Parameters (billions)': parameters.flatten(),
    'Model Type': model_types,
}
for name, values in emissions_per_million_output_tokens.items():
    vis_columns[f'{name} per Million Output Tokens'] = values
for name in emissions_per_million_output_tokens:
    vis_columns[f'{name} Predicted Llama 2'] = _place_predictions(
        predictions_llama2[name], llama2_rows_mask
    )
    vis_columns[f'{name} Predicted Llama 3'] = _place_predictions(
        predictions_llama3[name], llama3_rows_mask
    )

vis_data = pd.DataFrame(vis_columns)

In [55]:
# Dimensions shared by every per-metric chart.
chart_width = 600
chart_height = 350

x_field = 'Parameters (billions)'

# Metric name -> (Llama 2 colour, Llama 3 colour).
series_colors = {
    'Total Emissions': ('blue', 'lightblue'),
    'CPU Energy': ('green', 'lightgreen'),
    'GPU Energy': ('red', 'pink'),
    'RAM Energy': ('purple', 'violet'),
}

# Row subsets for the scatter layers; they are the same for every metric.
llama2_rows = vis_data[vis_data['Model Type'] == 'Llama 2']
llama3_rows = vis_data[vis_data['Model Type'] == 'Llama 3']

# One layered chart per metric: actual values as points, predictions as lines.
charts = []
for emission_type, (llama2_color, llama3_color) in series_colors.items():
    actual_field = f'{emission_type} per Million Output Tokens'
    llama2_fit_field = f'{emission_type} Predicted Llama 2'
    llama3_fit_field = f'{emission_type} Predicted Llama 3'

    # Actual values: circles for Llama 2, squares for Llama 3.
    llama2_points = (
        alt.Chart(llama2_rows)
        .mark_circle(size=100, color=llama2_color, filled=True)
        .encode(x=x_field, y=actual_field, tooltip=[x_field, actual_field])
    )
    llama3_points = (
        alt.Chart(llama3_rows)
        .mark_square(size=100, color=llama3_color, filled=True)
        .encode(x=x_field, y=actual_field, tooltip=[x_field, actual_field])
    )

    # Predicted values: solid line for Llama 2, dashed line for Llama 3.
    llama2_fit = (
        alt.Chart(vis_data)
        .mark_line(color=llama2_color)
        .encode(x=x_field, y=llama2_fit_field, tooltip=[x_field, llama2_fit_field])
    )
    llama3_fit = (
        alt.Chart(vis_data)
        .mark_line(color=llama3_color, strokeDash=[5, 5])
        .encode(x=x_field, y=llama3_fit_field, tooltip=[x_field, llama3_fit_field])
    )

    layered = llama2_points + llama3_points + llama2_fit + llama3_fit
    charts.append(
        layered.properties(
            title=f'Actual and Predicted {emission_type} per Million Output Tokens',
            width=chart_width,
            height=chart_height,
        )
    )

# Lay the four per-metric charts out as two rows of two, giving every panel
# its own y scale.
top_row = alt.hconcat(charts[0], charts[1])
bottom_row = alt.hconcat(charts[2], charts[3])
grid_chart = alt.vconcat(top_row, bottom_row).resolve_scale(y='independent')

grid_chart.show()


In [56]:
# Overlay all four per-metric charts in a single plot, each layer keeping an
# independent y scale.
overlay = alt.layer(*charts).resolve_scale(y='independent')
combined_chart = overlay.properties(
    title='Emissions per Million Output Tokens by Type vs Number of Parameters',
    width=600,   # width of the combined chart
    height=400,  # height of the combined chart
)

combined_chart.show()