In [7]:
from transformers import CONFIG_MAPPING, MODEL_MAPPING
from transformers.models.auto.configuration_auto import _LazyConfigMapping
from transformers.models.auto.auto_factory import _LazyAutoMapping
from transformers.models.auto import MODEL_FOR_CAUSAL_LM_MAPPING

from models.flash_stu_550M.model import FlashSTU, FlashSTUConfig
import lm_eval as evaluator
import os

# Process-wide flags for the Hugging Face / lm-eval tooling used in this cell.
# NOTE(review): these are assigned after `transformers` was imported above; if
# the libraries read them at import time the values may be set too late -- confirm.
os.environ.update(
    {
        "HF_ALLOW_CODE_EVAL": "1",
        "TOKENIZERS_PARALLELISM": "false",
        "HF_HUB_OFFLINE": "1",
    }
)

# Expose FlashSTU through the auto-class registries. Each entry lists the
# registry, the lazy-mapping type it may be an instance of, and the key/value
# pair to insert. Lazy mappings are extended through `_extra_content`; anything
# else is treated as a plain dict-like mapping.
for _registry, _lazy_type, _key, _value in (
    (CONFIG_MAPPING, _LazyConfigMapping, "flashstu", FlashSTUConfig),
    (MODEL_MAPPING, _LazyAutoMapping, FlashSTUConfig, FlashSTU),
    (MODEL_FOR_CAUSAL_LM_MAPPING, _LazyAutoMapping, FlashSTUConfig, FlashSTU),
):
    if isinstance(_registry, _lazy_type):
        _registry._extra_content[_key] = _value
    else:
        _registry.update({_key: _value})

print("Registered FlashSTU model and configuration.")

# Benchmarks to run, in execution order.
tasks = [
    "mmlu",
    "hellaswag",
    "piqa",
    "boolq",
    "winogrande",
    "commonsense_qa",
    "openbookqa",
    "arc_easy",
    "arc_challenge",
]

# Few-shot count per task. The sentinel -1 means "do not pass num_fewshot at all".
tasks_fewshot = {
    "hellaswag": 0,
    "mmlu": 5,
    "piqa": 0,
    "boolq": 0,
    "winogrande": -1,
    "commonsense_qa": 7,
    "openbookqa": -1,
    "arc_easy": -1,
    "arc_challenge": -1,
}

# The model specification does not depend on the task, so build it once.
model_args = (
    "pretrained=Hazan-Lab/Flash_STU_550M,"
    "trust_remote_code=True,"
    "dtype=bfloat16,"
    "cache_dir=/path/to/your/cache"
)

all_results = {}

for task in tasks:
    print(f"Evaluating task: {task}")

    # Only forward num_fewshot when the task has an explicit (non -1) setting.
    shots = tasks_fewshot.get(task, -1)
    fewshot_kwargs = {} if shots == -1 else {"num_fewshot": shots}

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=model_args,
        tasks=[task],
        batch_size=1,
        device="cuda:0",
        **fewshot_kwargs,
    )

    # Keep just this task's entry; fall back to an empty dict if it is absent.
    all_results[task] = results["results"].get(task, {})
    print(f"Results for {task}:")
    print(all_results[task])
    print("\n" + "="*50 + "\n")

print("All Evaluation Results:")
for task, result in all_results.items():
    print(f"{task}: {result}")



Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Registered FlashSTU model and configuration.
Evaluating task: mmlu


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like Hazan-Lab/Flash_STU_550M is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [8]:
import os
import time

from typing import Dict, List, Optional

import lm_eval as evaluator
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, MambaForCausalLM

# Process-wide environment flags for the Hugging Face tooling used below.
os.environ.update(
    {
        "HF_ALLOW_CODE_EVAL": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Ask PyTorch for the "high" float32 matmul precision mode.
torch.set_float32_matmul_precision("high")


class ModelConfig:
    """Plain attribute bag describing one model to evaluate.

    Attributes mirror the constructor arguments one-to-one:
    ``name`` (display label), ``checkpoint`` (identifier handed to the
    loader), ``model_class`` / ``tokenizer_class`` (stored as given),
    ``dtype`` (defaults to ``torch.bfloat16``) and ``device`` (defaults to
    ``"cuda:0"``).
    """

    def __init__(
        self,
        name: str,
        checkpoint: str,
        model_class,
        tokenizer_class,
        dtype: torch.dtype = torch.bfloat16,
        device: str = "cuda:0",
    ):
        # Identification
        self.name, self.checkpoint = name, checkpoint
        # Loader classes (kept verbatim; no validation is performed here)
        self.model_class, self.tokenizer_class = model_class, tokenizer_class
        # Placement / precision
        self.dtype, self.device = dtype, device


def setup_device() -> tuple[str, torch.dtype]:
    """Choose the compute device and a matching default dtype.

    Returns:
        ``("cuda", torch.bfloat16)`` when CUDA is available, otherwise
        ``("cpu", torch.float32)``.
    """
    if torch.cuda.is_available():
        return "cuda", torch.bfloat16
    return "cpu", torch.float32


def evaluate_model(
    model_config: "ModelConfig",
    tasks: List[str],
    tasks_fewshot: Dict[str, Optional[int]],
    batch_size: str = "auto",
    cache_dir: Optional[str] = None,
) -> Dict:
    """Evaluate a model on the given tasks, one harness call per task.

    Args:
        model_config: Object providing ``name``, ``checkpoint``, ``dtype`` and
            ``device`` attributes.
        tasks: Task names; each is evaluated in its own ``simple_evaluate`` call.
        tasks_fewshot: Per-task few-shot count. A missing entry, ``None`` or
            ``-1`` means "do not pass ``num_fewshot``".
        batch_size: Forwarded unchanged to ``simple_evaluate``.
        cache_dir: If truthy, appended to the model arguments as ``cache_dir=...``.

    Returns:
        Mapping ``task -> results["results"][task]`` (an empty dict when the
        harness result has no entry for that task).
    """
    print(f"\nEvaluating {model_config.name}...")
    start_time = time.time()

    # Derive the dtype name from the dtype itself ("torch.float16" -> "float16").
    # The previous two-way check reported every non-bfloat16 dtype as float32.
    dtype_str = str(model_config.dtype).split(".")[-1]

    # Comma-separated "key=value" model specification handed to the harness.
    model_args = (
        f"pretrained={model_config.checkpoint},"
        "trust_remote_code=True,"
        f"dtype={dtype_str},"
        f"device_map={model_config.device}"
    )
    if cache_dir:
        model_args += f",cache_dir={cache_dir}"

    all_results = {}
    for task in tasks:
        print(f"\nRunning task: {task}")
        eval_kwargs = dict(
            model="hf",
            model_args=model_args,
            tasks=[task],
            batch_size=batch_size,
            device=model_config.device,
        )

        # Only override the few-shot count when one is explicitly configured.
        few_shot_value = tasks_fewshot.get(task, -1)
        if few_shot_value is not None and few_shot_value != -1:
            eval_kwargs["num_fewshot"] = few_shot_value

        results = evaluator.simple_evaluate(**eval_kwargs)
        all_results[task] = results["results"].get(task, {})

    elapsed_time = time.time() - start_time
    print(f"\nEvaluation completed in {elapsed_time:.2f} seconds")
    return all_results


def format_results(model_results: Dict[str, Dict]) -> None:
    """Print one fixed-width table per model.

    ``model_results`` maps ``model name -> task name -> metric name -> value``.
    Every table uses the union of all tasks (columns) and all metrics (rows)
    seen across every model; cells with no value are left blank. Numeric
    values are shown with four decimals, everything else via ``str``.
    """
    # Union of task and metric names over all models, in sorted order.
    task_names = sorted({task for per_model in model_results.values() for task in per_model.keys()})
    metric_names = sorted(
        {
            metric
            for per_model in model_results.values()
            for scores in per_model.values()
            for metric in scores.keys()
        }
    )

    banner = "=" * 120
    rule = "-" * 80
    blank_cell = " " * 15  # padding used when a model lacks a task/metric

    print("\n" + banner)
    print("EVALUATION RESULTS")
    print(banner)

    for model_name in sorted(model_results.keys()):
        per_task = model_results[model_name]

        print(f"\nModel: {model_name}")
        print(rule)
        print("Metric".ljust(25) + "".join(f"{task}".rjust(15) for task in task_names))
        print(rule)

        for metric in metric_names:
            cells = []
            for task in task_names:
                if task not in per_task or metric not in per_task[task]:
                    cells.append(blank_cell)
                    continue
                value = per_task[task][metric]
                if isinstance(value, (int, float)):
                    cells.append(f"{value:15.4f}")
                else:
                    cells.append(f"{str(value):>15}")
            print(metric.ljust(25) + "".join(cells))

        print(rule)


def main(
    cache_dir: Optional[str] = "/scratch/gpfs/mn4560/hazan-lab/tensorized_filters/tensorized_filters/eval/cache",
):
    """Evaluate the configured baseline models and print a results table.

    Args:
        cache_dir: Cache directory forwarded to ``evaluate_model``. The default
            is the path that used to be hard-coded in the body; pass your own
            folder instead of editing the function.
    """
    # Define tasks and their few-shot settings
    tasks = [
        "hellaswag",
        # "piqa",
        # "siqa",
        # "boolq",
        # "winogrande",
        # "commonsense_qa",
        # "openbookqa",
        # "arc",
        # "arc_easy",
        # "arc_challenge",
    ]

    tasks_fewshot = {
        "hellaswag": None,
        # "piqa": 0,
        # "siqa": 0,
        # "boolq": 0,
        # "winogrande": -1,
        # "commonsense_qa": 7,
        # "openbookqa": -1,
        # "arc": -1,
        # "arc_easy": -1,
        # "arc_challenge": -1,
    }

    # Setup device and models
    device, dtype = setup_device()

    model_configs = [
        ModelConfig(
            name="SmolLM-360M",
            checkpoint="HuggingFaceTB/SmolLM-360M",
            model_class=AutoModelForCausalLM,
            tokenizer_class=AutoTokenizer,
            dtype=dtype,
            device=device,
        ),
        ModelConfig(
            name="Mamba-370m-hf",
            checkpoint="state-spaces/mamba-370m-hf",
            model_class=MambaForCausalLM,
            tokenizer_class=AutoTokenizer,
            dtype=dtype,
            device=device,
        ),
    ]

    # Run evaluations, collecting results under each model's display name
    all_model_results = {}
    for config in model_configs:
        all_model_results[config.name] = evaluate_model(
            config,
            tasks,
            tasks_fewshot,
            batch_size="auto",
            cache_dir=cache_dir,
        )

    # Print results in a clean format
    format_results(all_model_results)


# Run the evaluation when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Evaluating SmolLM-360M...

Running task: hellaswag


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like HuggingFaceTB/SmolLM-360M is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [9]:
import time
import sys

import torch
import torch.nn.functional as F
import torch.utils.benchmark as benchmark

from transformers import AutoModelForCausalLM, AutoTokenizer, MambaForCausalLM

# Select the "high" float32 matmul precision mode for everything run in this cell.
torch.set_float32_matmul_precision("high")


def setup_device():
    """Return ``(device, dtype)``: CUDA with bfloat16 if available, else CPU with float32."""
    has_cuda = torch.cuda.is_available()
    return ("cuda", torch.bfloat16) if has_cuda else ("cpu", torch.float32)


def load_model(checkpoint, model_class, tokenizer_class, device, dtype):
    """Load a tokenizer and a model from the same checkpoint.

    The tokenizer is loaded first; if it has no pad token, its EOS token is
    used as the pad token. The model is loaded with ``device_map=device`` and
    ``torch_dtype=dtype`` and then moved to ``device``.

    Returns:
        ``(model, tokenizer)``
    """
    tokenizer = tokenizer_class.from_pretrained(checkpoint)
    if tokenizer.pad_token is None:
        # Fall back to EOS so padding is possible.
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {"device_map": device, "torch_dtype": dtype}
    model = model_class.from_pretrained(checkpoint, **load_kwargs)
    model = model.to(device)
    return model, tokenizer


def warmup_model(model, inputs, device, iterations=3):
    """Run ``model.generate(**inputs)`` a few times before timing anything.

    Args:
        model: Object exposing ``generate(**inputs)``.
        inputs: Keyword arguments forwarded to ``generate``.
        device: Device the model runs on. Accepts anything ``torch.device``
            understands (``"cpu"``, ``"cuda"``, ``"cuda:0"``, a ``torch.device``).
        iterations: Number of warm-up generations.
    """
    # Compare on the device *type* so indexed devices such as "cuda:0" are
    # synchronised too; a plain `device == "cuda"` check skipped them.
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    for _ in range(iterations):
        model.generate(**inputs)
        if on_cuda:
            # Wait for queued GPU work before starting the next iteration.
            torch.cuda.synchronize(target)


def stream_generate(model, tokenizer, inputs, max_new_tokens=150, temperature=0.7, top_k=50):
    """Generate tokens one at a time, printing each as it is produced.

    The full sequence is fed to the model on every step (no cache is used), and
    indexing assumes a batch of exactly one sequence.

    Args:
        model: Callable as ``model(ids, attention_mask=...)`` returning an object
            with ``.logits``; must expose ``.device`` and ``.eval()``.
        tokenizer: Provides ``decode`` and ``eos_token_id``.
        inputs: Mapping with ``"input_ids"`` and optionally ``"attention_mask"``.
            A missing mask is treated as all ones.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Softmax temperature. Values ``<= 0`` select greedy decoding
            (argmax); previously such values silently sampled at temperature 1.
        top_k: Number of highest-probability tokens to sample from. Clamped to
            the vocabulary size; ``None`` or non-positive means the whole vocabulary.

    Returns:
        ``(generated_ids, tokens_per_sec)`` where ``generated_ids`` is the prompt
        followed by the new tokens.
    """
    model.eval()
    input_ids = inputs["input_ids"].to(model.device)  # [1, seq_len]

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask supplied: attend to every prompt position.
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(model.device)  # [1, seq_len]

    generated_ids = input_ids.clone()
    greedy = temperature <= 0

    # Fixed seed so sampled output is repeatable between runs.
    sample_rng = torch.Generator(device=model.device)
    sample_rng.manual_seed(1746)

    print("Streaming output:", end=" ", flush=True)

    new_tokens = 0
    start_time = time.perf_counter()

    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(generated_ids, attention_mask=attention_mask)
            logits = outputs.logits[:, -1, :]  # [1, vocab_size]

            if greedy:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)  # [1, 1]
            else:
                probs = F.softmax(logits / temperature, dim=-1)  # [1, vocab_size]

                # torch.topk raises when k exceeds the last dimension.
                vocab_size = probs.size(-1)
                k = vocab_size if (top_k is None or top_k <= 0) else min(top_k, vocab_size)

                top_k_probs, top_k_indices = torch.topk(probs, k, dim=-1)  # [1, k]
                # multinomial accepts unnormalised weights, so no renormalisation is needed.
                ix = torch.multinomial(top_k_probs, 1, generator=sample_rng)  # [1, 1]
                next_token = torch.gather(top_k_indices, -1, ix)  # [1, 1]

            generated_ids = torch.cat([generated_ids, next_token], dim=-1)  # [1, seq_len + 1]
            attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)

            token_str = tokenizer.decode(next_token[0], skip_special_tokens=True)
            print(token_str, end="", flush=True)

            new_tokens += 1
            if next_token[0].item() == tokenizer.eos_token_id:
                break

    elapsed_time = time.perf_counter() - start_time
    tokens_per_sec = new_tokens / elapsed_time if elapsed_time > 0 else 0

    print()  # Newline after streaming
    return generated_ids, tokens_per_sec


def benchmark_model(model, inputs, min_run_time=0.2):
    """Time ``model.generate(**inputs)`` with ``torch.utils.benchmark``.

    Args:
        model: Object exposing ``generate(**inputs)``.
        inputs: Keyword arguments forwarded to ``generate``.
        min_run_time: Passed to ``Timer.blocked_autorange``.

    Returns:
        The ``median`` of the resulting measurement.
    """
    # Names the timed statement can see.
    namespace = {"model": model, "inputs": inputs}
    timer = benchmark.Timer(stmt="model.generate(**inputs)", globals=namespace)
    return timer.blocked_autorange(min_run_time=min_run_time).median


def main():
    """Compare SmolLM-360M and Mamba-370m-hf on streaming speed, memory and latency.

    Phases run in a fixed order for all models: load, warm up, stream-generate,
    read the memory footprint, benchmark, then print a comparison table.
    """
    device, dtype = setup_device()
    prompt = "Hi, I'm a student at Princeton University, and"

    # (display label, checkpoint, model class) for each model under comparison.
    model_specs = [
        ("SmolLM-360M", "HuggingFaceTB/SmolLM-360M", AutoModelForCausalLM),
        ("Mamba-370m-hf", "state-spaces/mamba-370m-hf", MambaForCausalLM),
    ]

    # Load every model and tokenize the shared prompt for it.
    loaded = []
    for label, checkpoint, model_class in model_specs:
        model, tokenizer = load_model(checkpoint, model_class, AutoTokenizer, device, dtype)
        encoded = tokenizer(prompt, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in encoded.items()}
        loaded.append((label, model, tokenizer, inputs))

    # Warm-up
    print("Warming up models...")
    for _, model, _, inputs in loaded:
        warmup_model(model, inputs, device)

    # Streaming generation with tokens/sec
    print("\nGenerating text samples...")
    tokens_per_sec = []
    for label, model, tokenizer, inputs in loaded:
        print(f"\n{label}:")
        _, rate = stream_generate(model, tokenizer, inputs)
        tokens_per_sec.append(rate)

    # Memory footprint. Checked per model, with NaN as the fallback, so the
    # table below can always be printed. Previously only the first model was
    # checked and the variables were left unbound when the method was missing.
    memory_mb = [
        model.get_memory_footprint() / 1e6 if hasattr(model, "get_memory_footprint") else float("nan")
        for _, model, _, _ in loaded
    ]

    # Benchmarking
    print("\nRunning benchmarks...")
    latencies = [benchmark_model(model, inputs) for _, model, _, inputs in loaded]

    # Present results in a clean tabular format
    print("\n" + "=" * 50)
    print("MODEL COMPARISON RESULTS")
    print("=" * 50)
    print(f"{'Metric':<20} " + " ".join(f"{label:<15}" for label, _, _, _ in loaded))
    print("-" * 50)
    print(f"{'Tokens/sec':<20} " + " ".join(f"{value:<15.2f}" for value in tokens_per_sec))
    print(f"{'Memory (MB)':<20} " + " ".join(f"{value:<15.2f}" for value in memory_mb))
    print(f"{'Inference speed (s)':<20} " + " ".join(f"{value:<15.4f}" for value in latencies))
    print("=" * 50)


# Run the model comparison when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like HuggingFaceTB/SmolLM-360M is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.