# Mistral 7B LM Eval Testing

Code to evaluate three variants of Mistral-7B on the Open LLM Leaderboard benchmarks, using EleutherAI's lm-evaluation-harness.

In [None]:
# print the version of the CUDA compiler (nvcc) available in this runtime
!nvcc --version

In [None]:
# print the NVIDIA driver / GPU status report for this runtime
!nvidia-smi

In [None]:
# Install/upgrade the runtime dependencies.
# `%pip` (instead of `!pip`) guarantees the packages are installed into the
# environment of the running kernel rather than whichever `pip` is first on PATH.
%pip install -q -U transformers peft torch accelerate einops sentencepiece bitsandbytes

In [2]:
# clone repository to the explicit path the next cell changes into.
# The directory test makes the cell re-runnable: `git clone` aborts when the
# destination already exists, and a relative clone would land in the wrong place
# once the working directory has been changed.
![ -d /content/lm-evaluation-harness ] || git clone https://github.com/EleutherAI/lm-evaluation-harness.git /content/lm-evaluation-harness

Cloning into 'lm-evaluation-harness'...
remote: Enumerating objects: 26919, done.[K
remote: Counting objects: 100% (6920/6920), done.[K
remote: Compressing objects: 100% (958/958), done.[K
remote: Total 26919 (delta 6303), reused 6338 (delta 5947), pack-reused 19999[K
Receiving objects: 100% (26919/26919), 21.51 MiB | 10.60 MiB/s, done.
Resolving deltas: 100% (18607/18607), done.


In [None]:
# change to repo directory
import os

os.chdir("/content/lm-evaluation-harness")
# editable install of the harness from the current directory;
# `%pip` (instead of `!pip`) installs into the running kernel's environment
%pip install -e .

In [7]:
import datetime
import os  # imported here as well so this cell does not rely on an earlier cell

# Timestamp (local time, one-second resolution) used as the name of this run's
# log folder. `now` is kept as a string because a later cell exports it as-is.
now = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# One sub-folder per benchmark under /content/<timestamp>/.
# os.makedirs creates the parent folder on the first iteration, and
# exist_ok=True keeps the cell from raising FileExistsError if a folder exists.
for task in ("arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k"):
    os.makedirs(f"/content/{now}/{task}", exist_ok=True)


In [None]:
# Publish the run timestamp as the environment variable `now_log_folder`
# (presumably so `!` shell cells can read it as $now_log_folder).
# NOTE(review): none of the cells shown here reference it yet — confirm usage.
os.environ.update({"now_log_folder": now})

# ARC Challenge

AI2 Reasoning Challenge (25-shot) - a set of grade-school science questions.


In [5]:
# ARC-Challenge, 25-shot: mistralai/Mistral-7B-Instruct-v0.2 loaded in bfloat16, batch size 16, on cuda:0.
# NOTE(review): --output_path is relative to the current working directory, not the
# /content/<timestamp>/arc folder created earlier — confirm this is intended.
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,dtype="bfloat16" \
    --tasks arc_challenge \
    --batch_size 16 \
    --write_out \
    --output_path arc_challenge_mistralai_Mistral-7B-Instruct-v0.2_lm_eval.json \
    --device cuda:0 \
    --num_fewshot 25 \
    --verbosity DEBUG


2023-12-25:18:47:48,773 INFO     [utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-25:18:47:48,773 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2023-12-25:18:47:49,029 INFO     [config.py:58] PyTorch version 2.1.2 available.
2023-12-25:18:47:49,029 INFO     [config.py:95] TensorFlow version 2.15.0 available.
2023-12-25:18:47:49,031 INFO     [config.py:108] JAX version 0.4.23 available.
2023-12-25 18:47:49.540291: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-25 18:47:49.540343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-25 18:47:49.542200: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to regis

In [1]:
# ARC-Challenge, 25-shot: mistralai/Mistral-7B-Instruct-v0.1 loaded in bfloat16, batch size 16, on cuda:0.
# NOTE(review): --output_path is relative to the current working directory, not the
# /content/<timestamp>/arc folder created earlier — confirm this is intended.
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.1,dtype="bfloat16" \
    --tasks arc_challenge \
    --batch_size 16 \
    --write_out \
    --output_path arc_challenge_mistralai_Mistral-7B-Instruct-v0.1_lm_eval.json \
    --device cuda:0 \
    --num_fewshot 25 \
    --verbosity DEBUG


2023-12-25:18:08:02,807 INFO     [utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-25:18:08:02,807 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2023-12-25:18:08:03,058 INFO     [config.py:58] PyTorch version 2.1.2 available.
2023-12-25:18:08:03,059 INFO     [config.py:95] TensorFlow version 2.15.0 available.
2023-12-25:18:08:03,060 INFO     [config.py:108] JAX version 0.4.23 available.
2023-12-25 18:08:03.550277: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-25 18:08:03.550321: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-25 18:08:03.551472: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to regis

In [2]:
# ARC-Challenge, 25-shot: base model mistralai/Mistral-7B-v0.1 in bfloat16 with
# dfurman/Mistral-7B-Instruct-v0.2 passed through the `peft=` model argument
# (i.e. loaded as an adapter on top of the base weights).
# NOTE(review): --output_path is relative to the current working directory, not the
# /content/<timestamp>/arc folder created earlier — confirm this is intended.
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-v0.1,dtype="bfloat16",peft=dfurman/Mistral-7B-Instruct-v0.2 \
    --tasks arc_challenge \
    --batch_size 16 \
    --write_out \
    --output_path arc_challenge_dfurman_Mistral-7B-Instruct-v0.2_lm_eval.json \
    --device cuda:0 \
    --num_fewshot 25 \
    --verbosity DEBUG


2023-12-25:18:20:17,541 INFO     [utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-25:18:20:17,541 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2023-12-25:18:20:17,794 INFO     [config.py:58] PyTorch version 2.1.2 available.
2023-12-25:18:20:17,795 INFO     [config.py:95] TensorFlow version 2.15.0 available.
2023-12-25:18:20:17,796 INFO     [config.py:108] JAX version 0.4.23 available.
2023-12-25 18:20:18.286429: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-25 18:20:18.286476: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-25 18:20:18.288111: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to regis

In [3]:
# ARC-Challenge, 25-shot: base model mistralai/Mistral-7B-v0.1 in bfloat16 with
# dfurman/Mistral-7B-Instruct-v0.1 passed through the `peft=` model argument
# (i.e. loaded as an adapter on top of the base weights).
# NOTE(review): --output_path is relative to the current working directory, not the
# /content/<timestamp>/arc folder created earlier — confirm this is intended.
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-v0.1,dtype="bfloat16",peft=dfurman/Mistral-7B-Instruct-v0.1 \
    --tasks arc_challenge \
    --batch_size 16 \
    --write_out \
    --output_path arc_challenge_dfurman_Mistral-7B-Instruct-v0.1_lm_eval.json \
    --device cuda:0 \
    --num_fewshot 25 \
    --verbosity DEBUG


2023-12-25:18:32:57,513 INFO     [utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-25:18:32:57,513 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2023-12-25:18:32:57,768 INFO     [config.py:58] PyTorch version 2.1.2 available.
2023-12-25:18:32:57,769 INFO     [config.py:95] TensorFlow version 2.15.0 available.
2023-12-25:18:32:57,770 INFO     [config.py:108] JAX version 0.4.23 available.
2023-12-25 18:32:58.263473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-25 18:32:58.263519: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-25 18:32:58.265125: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to regis

# HellaSwag

HellaSwag (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.


# MMLU

MMLU (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.


# TruthfulQA

TruthfulQA (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is effectively a 6-shot task at minimum, because 6 examples are always prepended to the prompt, even when it is launched with the number of few-shot examples set to 0.

# Winogrande
Winogrande (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.


# GSM8k

GSM8k (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.

In [13]:
# print the task names known to the harness; uses the same `lm_eval` entry point
# as the evaluation cells instead of the `lm-eval` alias
!lm_eval --tasks list

2023-12-25:17:55:04,896 INFO     [utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-25:17:55:04,896 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2023-12-25:17:55:05,148 INFO     [config.py:58] PyTorch version 2.1.2 available.
2023-12-25:17:55:05,148 INFO     [config.py:95] TensorFlow version 2.15.0 available.
2023-12-25:17:55:05,149 INFO     [config.py:108] JAX version 0.4.23 available.
2023-12-25 17:55:05.645036: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-25 17:55:05.645086: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-25 17:55:05.646919: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to regis

In [9]:
# mmlu_abstract_algebra,mmlu_anatomy,mmlu_astronomy,mmlu_business_ethics,mmlu_clinical_knowledge,mmlu_college_biology,mmlu_college_chemistry,mmlu_college_computer_science,mmlu_college_mathematics,mmlu_college_medicine,mmlu_college_physics,mmlu_computer_security,mmlu_conceptual_physics,mmlu_econometrics,mmlu_electrical_engineering,mmlu_elementary_mathematics,mmlu_formal_logic,mmlu_global_facts,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_european_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_macroeconomics,mmlu_high_school_mathematics,mmlu_high_school_microeconomics,mmlu_high_school_physics,mmlu_high_school_psychology,mmlu_high_school_statistics,mmlu_high_school_us_history,mmlu_high_school_world_history,mmlu_human_aging,mmlu_human_sexuality,mmlu_humanities,mmlu_international_law,mmlu_jurisprudence,mmlu_logical_fallacies,mmlu_machine_learning,mmlu_management,mmlu_marketing,mmlu_medical_genetics,mmlu_miscellaneous,mmlu_moral_disputes,mmlu_moral_scenarios,mmlu_nutrition,mmlu_other,mmlu_philosophy,mmlu_prehistory,mmlu_professional_accounting,mmlu_professional_law,mmlu_professional_medicine,mmlu_professional_psychology,mmlu_public_relations,mmlu_security_studies,mmlu_social_sciences,mmlu_sociology,mmlu_stem,mmlu_us_foreign_policy,mmlu_virology,mmlu_world_religions \