In [None]:
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
import torch
import json
from transformers import AutoTokenizer


# Hugging Face Hub identifiers of the checkpoints to benchmark.
models = ["meta-math/MetaMath-Mistral-7B"]

# Evaluation output collected per checkpoint, keyed by model identifier.
results = dict()

# The target device does not change between checkpoints, so resolve it once.
eval_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Evaluate each checkpoint on GSM8K and keep the full harness output in
# `results`, keyed by the model identifier.
for model_name in models:
    print(f"\nEvaluating {model_name}...")

    # Drop the previous iteration's model *before* loading the next one;
    # otherwise the old wrapper stays bound to `model` while the new
    # checkpoint is being constructed and both sets of weights are resident
    # at the same time. This is a no-op on the first iteration.
    # NOTE(review): assumes nothing else holds a reference to the previous
    # HFLM instance - confirm if the harness caches models.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = HFLM(
        pretrained=model_name,
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        device=eval_device,
        batch_size=32,
        trust_remote_code=True,
        dtype=torch.float16,
    )

    model_results = evaluator.simple_evaluate(
        model=model,
        tasks=["gsm8k"],
        num_fewshot=8,
        limit=256,  # Remove this line to evaluate on full dataset
    )

    results[model_name] = model_results

    print(f"Results for {model_name}:")
    print(model_results['results']['gsm8k'])

def _json_fallback(obj):
    """Convert a value the ``json`` encoder cannot handle natively.

    Sets become lists, objects exposing ``tolist()`` (e.g. numpy scalars and
    arrays) are converted through it, and anything else is stored as its
    ``str()`` form so the dump never aborts on an exotic value.
    """
    if isinstance(obj, (set, frozenset)):
        return list(obj)
    to_list = getattr(obj, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(obj)


# The harness output may contain values that are not JSON-native, which would
# make a plain json.dump raise TypeError. Serialise to a string first so that
# a failure cannot leave a truncated file on disk.
serialized_results = json.dumps(results, indent=2, default=_json_fallback)

with open('gsm8k_results.json', 'w', encoding='utf-8') as f:
    f.write(serialized_results)


Evaluating meta-math/MetaMath-Mistral-7B...


2024-12-02:02:22:46,851 INFO     [huggingface.py:129] Using device 'cuda'
2024-12-02:02:22:46,944 INFO     [huggingface.py:481] Using model type 'default'
2024-12-02:02:22:46,950 INFO     [huggingface.py:365] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-02:02:22:53,357 INFO     [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2024-12-02:02:22:53,358 INFO     [evaluator.py:217] Using pre-initialized model
2024-12-02:02:23:02,977 INFO     [task.py:415] Building contexts for gsm8k on rank 0...
100%|██████████| 256/256 [00:01<00:00, 237.81it/s]
2024-12-02:02:23:04,060 INFO     [evaluator.py:489] Running generate_until requests
Running generate_until requests:   0%|          | 0/256 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1888 > 1024). Running this sequence through the model will result in indexing errors
Running generate_until requests: 100%|██████████| 256/256 [05:49<00:00,  1.36s/it]
fatal: not a git repository (or any of the parent directories): .git


Results for meta-math/MetaMath-Mistral-7B:
{'alias': 'gsm8k', 'exact_match,strict-match': np.float64(0.69921875), 'exact_match_stderr,strict-match': 0.02871850463421181, 'exact_match,flexible-extract': np.float64(0.70703125), 'exact_match_stderr,flexible-extract': 0.028500984607927556}
