# Task 4

We are interested in conversations from the state of Michigan, USA, during the time period between March 1, 2024, and March 2, 2024.

Once again, we first load the dataset and import all necessary libraries.

In [1]:
# load the dataset
from datasets import load_dataset
import tiktoken

# Load WildChat-1M through the Hugging Face `datasets` library. The result
# (displayed below) is a DatasetDict, not a single Dataset.
ds = load_dataset("allenai/WildChat-1M")
ds

DatasetDict({
    train: Dataset({
        features: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header'],
        num_rows: 837989
    })
})

In [2]:
# select the 'train' split so that `ds` is a plain Dataset from here on
ds = ds['train']

***

After removing unnecessary columns, we filter the dataset so that it only contains entries from Michigan within the requested time interval.

In [3]:
# drop every column that the later filtering and token counting do not read
columns_to_drop = [
    "conversation_hash",
    "turn",
    "language",
    "openai_moderation",
    "detoxify_moderation",
    "toxic",
    "redacted",
    "hashed_ip",
    "header",
]
ds = ds.remove_columns(columns_to_drop)

In [4]:
# keep only the rows whose origin is Michigan, United States
def is_from_michigan(row):
    """Return True when the row's state is 'Michigan' and its country is 'United States'."""
    return row['state'] == 'Michigan' and row['country'] == 'United States'

ds = ds.filter(is_from_michigan)

In [5]:
# Keep rows whose (year, month, day) lies in [2024-03-01, 2024-03-02).
# The upper bound is exclusive, so only timestamps dated March 1, 2024 pass.
def in_time_window(row):
    """Return True when the row's timestamp date is >= (2024, 3, 1) and < (2024, 3, 2)."""
    year_month_day = row['timestamp'].timetuple()[:3]
    return (2024, 3, 1) <= year_month_day < (2024, 3, 2)

ds = ds.filter(in_time_window)
ds

Dataset({
    features: ['model', 'timestamp', 'conversation', 'state', 'country'],
    num_rows: 198
})

We then check all the unique models used. 

In [6]:
ds.unique('model')

['gpt-3.5-turbo-0125']

The encoding for this model is cl100k_base.

We create two new columns containing the output and input tokens per conversation.

In [7]:
# initialize the tiktoken encoder for the cl100k_base encoding; it is used
# below to tokenize the user and assistant messages
enc = tiktoken.get_encoding("cl100k_base")

# given an example, count the number of output tokens from the assistant and input tokens from the user
def count_output_tokens(example, encoder=None):
    """Add per-conversation token counts to ``example`` and return it.

    Despite its name, this counts both directions of the conversation:
    ``example['output_token_count']`` is the number of tokens over all turns
    with role ``'assistant'``, and ``example['input_token_count']`` is the
    number over all turns with role ``'user'``. Turns with any other role are
    ignored.

    Args:
        example: mapping with a ``'conversation'`` entry, an iterable of turns
            that each provide ``'role'`` and ``'content'``.
        encoder: optional object with a tiktoken-style
            ``encode(text, disallowed_special=...)`` method. When omitted, the
            notebook-level ``enc`` encoder is used.

    Returns:
        The same ``example`` object, updated in place with the two count fields.
    """
    tokenizer = enc if encoder is None else encoder
    counts = {'assistant': 0, 'user': 0}
    for turn in example['conversation']:
        role = turn['role']
        if role not in counts:
            continue
        # A missing (None) message body is counted as zero tokens instead of crashing.
        text = turn['content'] or ''
        # disallowed_special=() makes strings such as "<|endoftext|>" inside chat
        # messages encode as ordinary text; the default raises ValueError on them.
        counts[role] += len(tokenizer.encode(text, disallowed_special=()))
    example['output_token_count'] = counts['assistant']
    example['input_token_count'] = counts['user']
    return example

# map the function to the dataset; this adds the 'output_token_count' and
# 'input_token_count' columns (see the feature list displayed below)
ds = ds.map(count_output_tokens)
ds

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Dataset({
    features: ['model', 'timestamp', 'conversation', 'state', 'country', 'output_token_count', 'input_token_count'],
    num_rows: 198
})

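Since the largest number of input tokens in any single (possibly multi-turn) conversation is only 1403 (computed in the next cell), we can assume that input tokens do not contribute disproportionately more than output tokens, so we treat input and output tokens the same way.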

In [9]:
# largest per-conversation input token count in the filtered dataset
max_input = max(ds['input_token_count'])
max_input

1403

We then add all input and output tokens together.

In [19]:
# sum the per-conversation input and output token counts over the filtered dataset
total_input_tokens = sum(ds['input_token_count'])
total_output_tokens = sum(ds['output_token_count'])
total_tokens = total_output_tokens + total_input_tokens

print(f"Total tokens: {total_tokens}")

Total tokens: 153691


***

## Final Calculations

Assuming that gpt-3.5-turbo has about 20B parameters (its size is uncertain; some sources say 20B, others 175B), we calculate the total FLOPs required for inference.

In [18]:
# Inference-only estimate: FLOPs per token is roughly 2 * model size (in parameters)
model_size_in_billions = 20  # for gpt-3.5-turbo, some say 20B others 175B
tokens_processed = total_output_tokens + total_input_tokens
total_flops = 2 * model_size_in_billions * tokens_processed * 1e9

print(f"Total FLOPs: {total_flops:.2e} FLOPs")

Total FLOPs: 6.15e+15 FLOPs


Next, we estimate the total energy (in kWh) required by one cluster, given the hardware assumptions stated in the code below.

In [16]:
# Throughput assumptions: peak FLOP/s and the fraction of it actually achieved.
peak_flops = 9.89e14 # 989 TFLOPS
real_flops = peak_flops * 0.1 # 10% efficiency
# time to execute total_flops at the effective rate; seconds converted to hours
hours_needed = total_flops / real_flops / 3600

# effective power draw in watts
h100 = 700 * 0.7 # 700W H100 GPU with 70% consumption
total_Wh = hours_needed * h100 
total_kWh_one_gpu = total_Wh / 1000
# NOTE(review): this scales the single-GPU energy by 16, i.e. it assumes all 16 GPUs
# draw power for the full `hours_needed` derived from one GPU's throughput —
# confirm this is the intended cluster model.
total_kWh_cluster = total_kWh_one_gpu * 16  # assuming 16 GPUs
print(f"Total kWh used: {total_kWh_cluster:.4f} kWh")

Total kWh used: 0.1354 kWh


Finally, we calculate the total emissions.

In [17]:
PUE = 1.1 # data center Power Usage Effectiveness
# NOTE(review): this is a US-wide approximation although the data was filtered to
# Michigan — consider a state-level carbon intensity if one is available.
CI = 0.4 # carbon intensity in kgCO2eq/kWh approximate for US in 2024
# emissions (kgCO2eq) = cluster energy (kWh) * PUE * carbon intensity (kgCO2eq/kWh)
total_emissions_kg = total_kWh_cluster * PUE * CI

print(f"Total emissions: {total_emissions_kg:.4f} kgCO2eq")

Total emissions: 0.0596 kgCO2eq


***