In [1]:
import json
import re
from typing import Optional

import pandas as pd

In [2]:
# Load the per-machine benchmark reports and stack them into one frame.
# NOTE(review): each file keeps its own row labels, so the combined frame
# contains repeated index values.
report_paths = [
    './data/llm_perf/hf-dgx-01_perf-report.csv',
    './data/llm_perf/audace_perf-report.csv',
]
df = pd.concat([pd.read_csv(path) for path in report_paths])

In [3]:
# Keep only the first entry of the GPU list stored as text in `environment.gpus`.
# Single quotes are swapped for double quotes so the text can be parsed as JSON.
df['gpu'] = df['environment.gpus'].map(
    lambda gpus: json.loads(gpus.replace('\'', '"'))[0]
)

Le nombre de paramètres des modèles n'est pas directement disponible. Une fonction de parsing est donc nécessaire.

In [4]:
def parse_model_parameters_count(model_name: str) -> Optional[float]:
    """Extract a model's parameter count, in billions, from its name.

    The name is scanned for a run of digits and dots immediately followed by
    ``b`` or ``m`` (case-insensitive), e.g. ``7b`` or ``410M``. A count given
    in millions is converted to billions.

    Args:
        model_name: Model identifier, e.g. ``'org/model-7b'``.

    Returns:
        The parameter count in billions taken from the first usable match,
        or ``None`` when the name contains no such size.
    """
    for match in re.finditer(r'([0-9.]+)(b|m)', model_name, re.IGNORECASE):
        try:
            count = float(match[1])
        except ValueError:
            # The character class also accepts runs that are not numbers
            # (e.g. the "." in "foo.bar"); skip them instead of crashing.
            continue
        if match[2].lower() == 'm':
            count /= 1000
        return count
    return None
       

In [5]:
# Parse the parameter count from each model name; unparsed names give a missing value.
df['parameters_count'] = df['model'].map(parse_model_parameters_count)

Certains modèles ne sont pas pris en compte par cette fonction.

In [6]:
# Distinct names of the models whose parameter count is still missing.
missing_count = df['parameters_count'].isna()
no_params_models = df.loc[missing_count, 'model'].unique()

In [7]:
no_params_models.tolist()

['NYTK/PULI-GPTrio',
 'ai-forever/mGPT',
 'Writer/palmyra-base',
 'Writer/palmyra-large',
 'gpt2',
 'cyberagent/open-calm-large',
 'BEE-spoke-data/NanoLlama-GQA-L10-A32_KV8-v13-KI',
 'BEE-spoke-data/verysmol_llama-v11-KIx2',
 'TurkuNLP/gpt3-finnish-large',
 'TurkuNLP/gpt3-finnish-small',
 'rishiraj/CatPPT-base',
 'LLM360/Amber',
 'golaxy/gowizardlm',
 'bigcode/tiny_starcoder_py',
 'bigcode/gpt_bigcode-santacoder',
 'bit-dny/MindLLM',
 'SaylorTwift/gpt2_test',
 'microsoft/phi-1_5',
 'gpt2-xl',
 'bn22/tinyllama_frankenmerge',
 'microsoft/phi-2',
 'vishesht27/22-Neuro_Model']

Il est possible de récupérer manuellement cette information sur le site de Hugging Face (ou sur llm.extractum.io pour *golaxy/gowizardlm*).

In [8]:
# Manually collected parameter counts (same unit as the parsed counts),
# keyed by model name so each value stays attached to its model whatever
# order `unique()` returns the names in.
# source: huggingface.co, llm.extractum.io for gowizardlm
params_by_model = {
    'NYTK/PULI-GPTrio': 7.67,
    'ai-forever/mGPT': 1.3,
    'Writer/palmyra-base': 5,
    'Writer/palmyra-large': 20,
    'gpt2': 0.124,
    'cyberagent/open-calm-large': 0.4,
    'BEE-spoke-data/NanoLlama-GQA-L10-A32_KV8-v13-KI': 0.218,
    'BEE-spoke-data/verysmol_llama-v11-KIx2': 0.058,
    'TurkuNLP/gpt3-finnish-large': 0.881,
    'TurkuNLP/gpt3-finnish-small': 0.186,
    'rishiraj/CatPPT-base': 7.24,
    'LLM360/Amber': 6.74,
    'golaxy/gowizardlm': 7,
    'bigcode/tiny_starcoder_py': 0.164,
    'bigcode/gpt_bigcode-santacoder': 1.12,
    'bit-dny/MindLLM': 1.3,
    'SaylorTwift/gpt2_test': 0.137,
    'microsoft/phi-1_5': 1.3,
    'gpt2-xl': 1.61,
    'bn22/tinyllama_frankenmerge': 1.54,
    'microsoft/phi-2': 2.78,
    'vishesht27/22-Neuro_Model': 7.24,
}

# One value per entry of `no_params_models`, in the same order. A model
# without a manual entry raises KeyError instead of shifting every value.
params = [params_by_model[model_name] for model_name in no_params_models]

In [9]:
# Pair each unparsed model name with its manually collected count.
no_params_models_dict = {
    name: count for name, count in zip(no_params_models, params)
}

In [10]:
# Write the manually collected count on every row of each listed model.
for model, count in no_params_models_dict.items():
    df.loc[df['model'] == model, 'parameters_count'] = count

Les informations concernant les data types, les méthodes d'optimisation et les méthodes de quantification sont dans le champ `experiment_name`.

In [11]:
sorted(df['experiment_name'].unique())

['pytorch+cuda+bfloat16',
 'pytorch+cuda+float16',
 'pytorch+cuda+float16+awq-4bit+gemm',
 'pytorch+cuda+float16+awq-4bit+gemv',
 'pytorch+cuda+float16+bettertransformer',
 'pytorch+cuda+float16+bnb-4bit',
 'pytorch+cuda+float16+bnb-4bit+bettertransformer',
 'pytorch+cuda+float16+bnb-8bit',
 'pytorch+cuda+float16+bnb-8bit+bettertransformer',
 'pytorch+cuda+float16+flash-attention-v2',
 'pytorch+cuda+float16+gptq-4bit+cuda-fp16',
 'pytorch+cuda+float16+gptq-4bit+exllama-v1',
 'pytorch+cuda+float16+gptq-4bit+exllama-v2',
 'pytorch+cuda+float32']

In [12]:
# The data type is the third '+'-separated field of the experiment name.
df['dtype'] = df['experiment_name'].map(lambda name: name.split('+')[2])

In [13]:
def get_optim(exp_name):
    """Return the optimization label found in an experiment name.

    The first marker contained in ``exp_name`` decides the label; the string
    ``'None'`` is returned when no marker is present.
    """
    labels_by_marker = (
        ('bettertransformer', 'BetterTransformer'),
        ('flash-attention-v2', 'FlashAttentionV2'),
    )
    for marker, label in labels_by_marker:
        if marker in exp_name:
            return label
    return 'None'

In [14]:
# Label each run with the optimization named in its experiment name.
df['optimization'] = df['experiment_name'].map(get_optim)

In [15]:
def get_quant(exp_name):
    """Return the quantization label found in an experiment name.

    Names containing ``bnb`` or ``cuda-fp16`` yield the fourth '+'-separated
    field alone; names containing ``awq`` or ``gptq`` yield the fourth and
    fifth fields joined by '+'. Any other name yields the string ``'None'``.
    """
    single_field = ('bnb' in exp_name) or ('cuda-fp16' in exp_name)
    two_fields = ('awq' in exp_name) or ('gptq' in exp_name)
    if not (single_field or two_fields):
        return 'None'

    fields = exp_name.split('+')
    # The single-field rule takes precedence when both kinds of marker appear.
    if single_field:
        return fields[3]
    return '+'.join(fields[3:5])

In [16]:
# Label each run with the quantization named in its experiment name.
df['quantization'] = df['experiment_name'].map(get_quant)

In [17]:
# Flag the runs whose experiment name mentions 'cuda-fp16'.
df['cuda-fp16'] = df['experiment_name'].map(lambda name: 'cuda-fp16' in name)

In [18]:
# Same response length on every row.
# NOTE(review): presumably the number of generated tokens per run — confirm against the benchmark setup.
df = df.assign(response_length=256)

In [19]:
# kWh per token -> Wh per token
wh_per_token = df['generate.energy_consumption(kWh/token)'] * 1000
df['energy_per_token'] = wh_per_token

# Wh -> J (factor 3600), scaled by the response length
joules_per_token = 3600 * df['energy_per_token']
df['energy'] = joules_per_token * df['response_length']

In [20]:
# Shorten the names of the benchmark metric columns.
df = df.rename(columns={
    'generate.throughput(tokens/s)': 'throughput',
    'generate.latency(s)': 'latency',
})

In [21]:
# Every row gets the same task label.
df = df.assign(task='chat')

In [22]:
# Columns kept for the aggregated export, in output order.
export_columns = [
    'model',
    'throughput',
    'response_length',
    'latency',
    'energy',
    'gpu',
    'task',
    'parameters_count',
    'energy_per_token',
    'dtype',
    'optimization',
    'quantization',
    'cuda-fp16',
]
sub = df[export_columns]



In [23]:
# Summary of the exported columns, with non-null counts.
sub.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1944 entries, 0 to 175
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   model             1944 non-null   object 
 1   throughput        1944 non-null   float64
 2   response_length   1944 non-null   int64  
 3   latency           1944 non-null   float64
 4   energy            1944 non-null   float64
 5   gpu               1944 non-null   object 
 6   task              1944 non-null   object 
 7   parameters_count  1944 non-null   float64
 8   energy_per_token  1944 non-null   float64
 9   dtype             1944 non-null   object 
 10  optimization      1944 non-null   object 
 11  quantization      1944 non-null   object 
 12  cuda-fp16         1944 non-null   bool   
dtypes: bool(1), float64(5), int64(1), object(6)
memory usage: 199.3+ KB


In [24]:
# Write the selected columns to CSV without the index column.
sub.to_csv(path_or_buf='./data/aggregated_llm_perf.csv', index=False)