In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

### Open LLM Dataset

In [2]:
# Load the Open LLM leaderboard export (path is relative to the notebook's working directory).
open_llm_df = pd.read_csv(filepath_or_buffer='./data/open_llm/open-llm-leaderboard.csv')

In [3]:
# Lift pandas' display limits so frames are rendered with every column and every row.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

In [4]:
# Preview the first five rows of the raw leaderboard frame.
open_llm_df.head(n=5)

Unnamed: 0,T,Model,Average ‚¨ÜÔ∏è,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Type,Architecture,Weight type,Precision,Merged,Hub License,#Params (B),Hub ‚ù§Ô∏è,Available on the hub,Model sha,Flagged,MoE
0,üî∂,davidkim205/Rhea-72b-v0.5,81.22,79.78,91.15,77.95,74.5,87.85,76.12,fine-tuned on domain-specific datasets,LlamaForCausalLM,Original,float16,False,apache-2.0,72.29,38.0,True,fda5cf998a0f2d89b53b5fa490793e3e50bb8239,False,False
1,üí¨,Contamination/contaminated_proof_7b_v1.0_safet...,81.14,78.07,90.22,78.92,82.29,88.16,69.14,"chat models (RLHF, DPO, IFT, ...)",MistralForCausalLM,Original,float16,False,unknown,7.24,9.0,True,5d7fcb3724d6b08cf82e1b0c1faa1695b9fd6932,True,False
2,üí¨,Contamination/contaminated_proof_7b_v1.0,81.14,78.07,90.22,78.92,82.29,88.16,69.14,"chat models (RLHF, DPO, IFT, ...)",MistralForCausalLM,Original,float16,False,unknown,7.0,3.0,True,b1415875faed65cd29fd804941f5dcf835e99608,True,False
3,üî∂,davidkim205/Rhea-72b-v0.4,81.09,78.5,90.75,78.01,73.91,86.74,78.62,fine-tuned on domain-specific datasets,LlamaForCausalLM,Original,float16,False,apache-2.0,72.29,0.0,False,5502123c46485914a580d6794eeb5fb3554b46aa,False,False
4,üí¨,MTSAIR/MultiVerse_70B,81.0,78.67,89.77,78.22,75.18,87.53,76.65,"chat models (RLHF, DPO, IFT, ...)",LlamaForCausalLM,Original,bfloat16,False,other,72.29,6.0,True,ea2b4ff8e5acd7a48993f56b2d7b99e049eb6939,False,False


In [5]:
# Normalise the two column names used downstream: the emoji-suffixed average
# column becomes 'Average', and 'Model' becomes 'model' so it matches the join
# key of the LLM Perf frame.
# Reassign instead of using inplace=True: the cell then produces a new frame
# rather than mutating the object loaded above.
open_llm_df = open_llm_df.rename(columns={'Average ‚¨ÜÔ∏è': 'Average', 'Model': 'model'})

In [6]:
# Keep only the listed subset of columns (a missing column raises KeyError).
open_llm_df = open_llm_df.loc[
    :,
    [
        'model', 'Average', 'ARC', 'HellaSwag', 'MMLU', 'TruthfulQA', 'Winogrande', 'GSM8K',
        'Precision', '#Params (B)', 'Flagged', 'MoE',
    ],
]

In [7]:
# Preview the first five rows after renaming and column selection.
open_llm_df.head(n=5)

Unnamed: 0,model,Average,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Precision,#Params (B),Flagged,MoE
0,davidkim205/Rhea-72b-v0.5,81.22,79.78,91.15,77.95,74.5,87.85,76.12,float16,72.29,False,False
1,Contamination/contaminated_proof_7b_v1.0_safet...,81.14,78.07,90.22,78.92,82.29,88.16,69.14,float16,7.24,True,False
2,Contamination/contaminated_proof_7b_v1.0,81.14,78.07,90.22,78.92,82.29,88.16,69.14,float16,7.0,True,False
3,davidkim205/Rhea-72b-v0.4,81.09,78.5,90.75,78.01,73.91,86.74,78.62,float16,72.29,False,False
4,MTSAIR/MultiVerse_70B,81.0,78.67,89.77,78.22,75.18,87.53,76.65,bfloat16,72.29,False,False


#### Precision

La colonne `Precision` comprend à la fois des data types (`float16`, `bfloat16`) et des méthodes de quantization (`4bit`, `8bit` et `GPTQ`).

In [8]:
# Distinct values found in the Precision column (NaN included), in order of appearance.
open_llm_df.loc[:, 'Precision'].unique()

array(['float16', 'bfloat16', '4bit', '8bit', 'GPTQ', nan], dtype=object)

Il existe plusieurs modèles avec plusieurs valeurs de `Precision`.

In [9]:
# Count the distinct non-null Precision values recorded for each model
# (nunique ignores NaN by default) ...
unique_precisions_per_model = open_llm_df['Precision'].groupby(open_llm_df['model']).nunique()
# ... and keep the models that appear with more than one of them.
models_with_multiple_precision = unique_precisions_per_model.loc[
    unique_precisions_per_model.gt(1)
]
models_with_multiple_precision

model
01-ai/Yi-34B-Chat                                                2
01-ai/Yi-6B                                                      2
AA051610/A0106                                                   2
AA051610/FT                                                      2
AI-Sweden-Models/gpt-sw3-126m                                    2
AIGym/deepseek-coder-1.3b-chat                                   2
AIGym/deepseek-coder-6.7b-chat                                   2
Abhaykoul/qwen1.5-vortex                                         2
Azazelle/Bianca-7b                                               2
BAAI/Aquila2-34B                                                 2
BEE-spoke-data/smol_llama-101M-GQA                               2
BarraHome/Wistral-7B-Instruct-v0.3                               2
BryanSwk/LaserPipe-7B-SLERP                                      2
CausalLM/72B-preview                                             2
CausalLM/72B-preview-canary-llamafied-qwen-llamafy-unbia

In [10]:
# Load the filtered LLM Perf data; the first CSV column is read as the index ...
llm_perf_filtered_df = pd.read_csv('./data/llm_perf_filtered.csv', index_col=0)
# ... and then discarded in favour of a fresh 0..n-1 index.
llm_perf_filtered_df = llm_perf_filtered_df.reset_index(drop=True)

Si l'on croise avec les modèles de LLM Perf, il n'y a que `CodeLlama-34b-hf` qui est concerné.

In [11]:
# LLM Perf models that also appear with several Precision values on the leaderboard.
set(llm_perf_filtered_df['model']) & set(models_with_multiple_precision.index)

{'codellama/CodeLlama-34b-hf'}

Seul le modèle `Yi-34B-200K` manque dans le dataset d'Open LLM.

In [12]:
# LLM Perf models that have no entry in the Open LLM leaderboard frame.
set(llm_perf_filtered_df['model'].unique()).difference(open_llm_df['model'].unique())

{'01-ai/Yi-34B-200K'}

### Merge avec LLM Perf

In [13]:
# Inner join on 'model': only models present in both frames are kept, and a
# model with several leaderboard rows yields one output row per matching pair.
open_llm_perf_filtered_df = pd.merge(
    llm_perf_filtered_df,
    open_llm_df,
    on='model',
    how='inner',
)

In [14]:
# Preview the first five rows of the merged frame.
open_llm_perf_filtered_df.head(n=5)

Unnamed: 0,model,throughput,response_length,latency,energy,gpu,task,parameters_count,energy_per_token,dtype,optimization,quantization,cuda-fp16,Average,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Precision,#Params (B),Flagged,MoE
0,EleutherAI/gpt-neox-20b,27.4,256,9.34,3013.632,NVIDIA A100-SXM4-80GB,chat,20.0,0.00327,float16,,gptq-4bit+exllama-v1,False,41.69,45.73,73.45,25.0,31.61,68.9,5.46,float16,20.74,False,False
1,EleutherAI/gpt-neox-20b,25.6,256,10.0,2995.2,NVIDIA A100-SXM4-80GB,chat,20.0,0.00325,float16,,awq-4bit+gemv,False,41.69,45.73,73.45,25.0,31.61,68.9,5.46,float16,20.74,False,False
2,EleutherAI/gpt-neox-20b,27.5,256,9.32,2598.912,NVIDIA A100-SXM4-80GB,chat,20.0,0.00282,float16,,gptq-4bit+exllama-v2,False,41.69,45.73,73.45,25.0,31.61,68.9,5.46,float16,20.74,False,False
3,EleutherAI/gpt-neox-20b,23.5,256,10.9,3216.384,NVIDIA A100-SXM4-80GB,chat,20.0,0.00349,float16,,awq-4bit+gemm,False,41.69,45.73,73.45,25.0,31.61,68.9,5.46,float16,20.74,False,False
4,EleutherAI/pythia-12b,36.6,256,6.99,2101.248,NVIDIA A100-SXM4-80GB,chat,12.0,0.00228,float16,,gptq-4bit+exllama-v1,False,38.82,39.59,68.82,26.76,31.85,64.17,1.74,float16,12.0,False,False


#### Analyse de paramètres

In [15]:
# Rows where the two parameter-count columns disagree by more than 3
# (both columns appear to be expressed in billions -- TODO confirm for 'parameters_count').
diff_params = open_llm_perf_filtered_df.loc[
    open_llm_perf_filtered_df['parameters_count']
    .sub(open_llm_perf_filtered_df['#Params (B)'])
    .abs()
    .gt(3)
]

In [16]:
# Show only the model name next to the two parameter counts being compared.
diff_params.loc[:, ['model', 'parameters_count', '#Params (B)']]

Unnamed: 0,model,parameters_count,#Params (B)
21,NYTK/PULI-GPTrio,7.67,0.0
22,NYTK/PULI-GPTrio,7.67,0.0
23,NYTK/PULI-GPTrio,7.67,0.0
24,NYTK/PULI-GPTrio,7.67,0.0
80,Writer/palmyra-large,20.0,0.0
81,Writer/palmyra-large,20.0,0.0
82,Writer/palmyra-large,20.0,0.0
188,golaxy/gowizardlm,7.0,0.0
189,golaxy/gowizardlm,7.0,0.0
190,golaxy/gowizardlm,7.0,0.0


Pour les trois premiers modèles, ce doit être un problème de qualité des données. En revanche, pour `Qwen-1_8B-Llamafied`, la model card sur Hugging Face précise bien 1.84 Md de paramètres. Je propose de l'exclure du périmètre.

In [17]:
# Exclude every row of the Qwen-1_8B-Llamafied model from the merged frame.
open_llm_perf_filtered_df = open_llm_perf_filtered_df.loc[
    open_llm_perf_filtered_df['model'].ne('KnutJaegersberg/Qwen-1_8B-Llamafied')
]

Il n'y a aucun modèle "flaggé", i.e. soumis à discussion.

In [18]:
# True if at least one remaining row is marked as flagged.
open_llm_perf_filtered_df['Flagged'].any()

False

Il y a 4 modèles identifiés en tant que MoE.

In [19]:
# Distinct model names among the rows whose MoE column is True.
open_llm_perf_filtered_df['model'].loc[open_llm_perf_filtered_df['MoE']].unique()

array(['uukuguy/Orca-2-7b-f16', '01-ai/Yi-34B', 'rishiraj/CatPPT-base',
       'upstage/SOLAR-10.7B-v1.0'], dtype=object)

Au vu des correspondances ci-dessous, la colonne `Precision` n'a pas l'air très précise ^^'. Je propose de ne pas en tenir compte.

In [20]:
# Cross-tabulate the LLM Perf dtype/quantization against the leaderboard Precision.
# dropna=False keeps combinations where one of the three columns is missing:
# with the default (dropna=True) such rows are silently left out of the counts,
# and the leaderboard Precision column is known to contain NaN.
open_llm_perf_filtered_df.value_counts(['dtype', 'quantization', 'Precision'], dropna=False)

dtype    quantization          Precision
float16  awq-4bit+gemm         float16      51
         awq-4bit+gemv         float16      50
         gptq-4bit+exllama-v1  float16      47
         gptq-4bit+exllama-v2  float16      42
         gptq-4bit             float16      18
         awq-4bit+gemm         bfloat16     13
         awq-4bit+gemv         bfloat16     12
         gptq-4bit+exllama-v1  bfloat16     11
         gptq-4bit+exllama-v2  bfloat16     10
         gptq-4bit             bfloat16      4
         awq-4bit+gemm         4bit          1
         awq-4bit+gemv         4bit          1
         gptq-4bit             4bit          1
         gptq-4bit+exllama-v1  4bit          1
         gptq-4bit+exllama-v2  4bit          1
Name: count, dtype: int64

Le modèle `CodeLlama-34b-hf` est le seul avec plusieurs valeurs de `Precision`.

In [21]:
# Inspect every merged row belonging to CodeLlama-34b-hf.
open_llm_perf_filtered_df.loc[
    open_llm_perf_filtered_df['model'].eq('codellama/CodeLlama-34b-hf')
]

Unnamed: 0,model,throughput,response_length,latency,energy,gpu,task,parameters_count,energy_per_token,dtype,optimization,quantization,cuda-fp16,Average,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Precision,#Params (B),Flagged,MoE
262,codellama/CodeLlama-34b-hf,24.4,256,10.5,3741.696,NVIDIA A100-SXM4-80GB,chat,34.0,0.00406,float16,,awq-4bit+gemv,False,55.33,54.1,75.82,55.02,39.11,73.56,34.34,bfloat16,33.74,False,False
263,codellama/CodeLlama-34b-hf,24.4,256,10.5,3741.696,NVIDIA A100-SXM4-80GB,chat,34.0,0.00406,float16,,awq-4bit+gemv,False,55.28,54.18,75.82,54.92,39.11,73.32,34.34,float16,33.74,False,False
264,codellama/CodeLlama-34b-hf,22.7,256,11.3,3861.504,NVIDIA A100-SXM4-80GB,chat,34.0,0.00419,float16,,awq-4bit+gemm,False,55.33,54.1,75.82,55.02,39.11,73.56,34.34,bfloat16,33.74,False,False
265,codellama/CodeLlama-34b-hf,22.7,256,11.3,3861.504,NVIDIA A100-SXM4-80GB,chat,34.0,0.00419,float16,,awq-4bit+gemm,False,55.28,54.18,75.82,54.92,39.11,73.32,34.34,float16,33.74,False,False


Je propose de garder `float16` en cohérence avec notre filtrage.

In [22]:
# Select the CodeLlama-34b-hf rows that come from its bfloat16 leaderboard entry ...
rows_to_drop = open_llm_perf_filtered_df.loc[
    open_llm_perf_filtered_df['model'].eq('codellama/CodeLlama-34b-hf')
    & open_llm_perf_filtered_df['Precision'].eq('bfloat16')
]

# ... and remove them by index label, leaving only the float16 entry for that model.
open_llm_perf_filtered_df = open_llm_perf_filtered_df.drop(index=rows_to_drop.index)

In [23]:
# Persist the merged, cleaned frame; the (now non-contiguous) index is written
# as the first CSV column, which is the to_csv default.
open_llm_perf_filtered_df.to_csv(path_or_buf='./data/open_llm_perf_filtered.csv', index=True)