# Use this Colab notebook to do evaluation (perplexity, throughput, model size)




In [None]:
# install dependencies
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U bitsandbytes

In [None]:
# load model

from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# The tokenizer and the weights are fetched from the same Hugging Face checkpoint.
checkpoint = "state-spaces/mamba-370m-hf"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = MambaForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=None,  # no quantization at load time; layers are swapped manually later
    output_hidden_states=False,
    device_map="auto",
)


In [None]:
# Print the model's module structure; handy for locating the mixer projection
# layers (in_proj, x_proj, dt_proj, out_proj) that later cells replace.
print(model)

In [4]:
# Snapshot the original (unquantized) Mamba weights to disk. The layer-replacement
# cells below reload this file after swapping in new linear layers.
torch.save(obj=model.state_dict(), f="mamba_model.pt")

## Manually changing layers: run at most 1 cell in this section, depending on which layers you want to replace


This loop replaces each linear layer with an int8 linear layer. Comment out any layer types you don't want to replace.

In [5]:
import torch
import torch.nn as nn
import bitsandbytes as bnb

# Threshold value passed to every bitsandbytes int8 layer created below.
threshold = 6.0

# Mixer projection layers that get replaced with an int8 linear layer.
# Comment out any layer types you don't want to replace.
int8_layer_names = (
    "in_proj",
    "x_proj",
    "dt_proj",
    "out_proj",
)

# Assuming 'model' is your pre-trained MambaForCausalLM model.
# This modifies the original model in place: every listed linear layer of every block
# is swapped for a freshly constructed Linear8bitLt with the same in/out features and
# the same bias setting. The new layers do not carry the trained weights yet; those
# are restored from disk right after the loop.
for block in model.backbone.layers:
    mixer = block.mixer
    for layer_name in int8_layer_names:
        old_layer = getattr(mixer, layer_name)
        # Make sure to set has_fp16_weights=False for inference-focused quantization
        new_layer = bnb.nn.Linear8bitLt(
            old_layer.in_features,
            old_layer.out_features,
            bias=old_layer.bias is not None,
            has_fp16_weights=False,
            threshold=threshold,
        )
        setattr(mixer, layer_name, new_layer)

# Load the saved state_dict back into the model so the replaced layers receive the
# original weights.
model.load_state_dict(torch.load("mamba_model.pt"))

# If your deployment environment supports it, move the model to the appropriate device
# For example, using CUDA device 0
bit_model = model.to('cuda:0') # This also triggers the internal quantization process in bitsandbytes


This loop replaces each linear layer with an int8 or fp4 linear layer. Comment out any layer types you don't want to replace.

In [None]:
import torch
import torch.nn as nn
import bitsandbytes as bnb

# Threshold value passed to every bitsandbytes int8 layer created below.
threshold = 6.0

# Which replacement each mixer projection layer receives:
#   "int8" -> bnb.nn.Linear8bitLt, "fp4" -> bnb.nn.Linear4bit (constructed with its defaults).
# Comment out any layer types you don't want to replace.
layer_precisions = {
    "in_proj": "int8",
    "x_proj": "fp4",
    "dt_proj": "int8",
    "out_proj": "fp4",
}

# Assuming 'model' is your pre-trained MambaForCausalLM model.
# This modifies the model in place: every listed linear layer of every block is swapped
# for a freshly constructed bitsandbytes layer with the same in/out features and the
# same bias setting. The trained weights are restored from disk right after the loop.
for block in model.backbone.layers:
    mixer = block.mixer
    for layer_name, precision in layer_precisions.items():
        old_layer = getattr(mixer, layer_name)
        has_bias = old_layer.bias is not None

        if precision == "int8":
            # Make sure to set has_fp16_weights=False for inference-focused quantization
            new_layer = bnb.nn.Linear8bitLt(
                old_layer.in_features,
                old_layer.out_features,
                bias=has_bias,
                has_fp16_weights=False,
                threshold=threshold,
            )
        elif precision == "fp4":
            new_layer = bnb.nn.Linear4bit(
                old_layer.in_features,
                old_layer.out_features,
                bias=has_bias,
            )
        else:
            # Fail loudly on a typo in the table instead of silently skipping a layer.
            raise ValueError(
                "unknown precision {!r} for layer {!r}".format(precision, layer_name)
            )

        setattr(mixer, layer_name, new_layer)

# Load the saved state_dict back into the model so the replaced layers receive the
# original weights.
model.load_state_dict(torch.load("mamba_model.pt"))

# If your deployment environment supports it, move the model to the appropriate device
# For example, using CUDA device 0
bit_model = model.to('cuda:0') # This also triggers the internal quantization process in bitsandbytes

# Model Size

In [None]:
# Display the module tree after layer replacement.
# `bit_model` only exists once one of the layer-replacement cells above has been run.
bit_model

This cell prints the model size.

In [None]:
# Bytes occupied by all parameters: element count times bytes per element.
param_size = sum(p.nelement() * p.element_size() for p in bit_model.parameters())
# Same accounting for the registered buffers.
buffer_size = sum(b.nelement() * b.element_size() for b in bit_model.buffers())

# Convert bytes to MiB (1024**2 bytes) and report.
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

# Throughput and perplexity



In [None]:
#!pip install datasets
from datasets import load_dataset

# Test split of WikiText-2 (raw variant).
test = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="test")

# Join all rows into one long document separated by blank lines, then tokenize it
# in a single call so the evaluation can slide a window over the token stream.
corpus_text = "\n\n".join(test["text"])
encodings = tokenizer(corpus_text, return_tensors="pt")

This loop performs perplexity/throughput benchmarks.

In [None]:
import torch
from tqdm import tqdm

# Sliding-window perplexity and throughput benchmark over the tokenized corpus.
#
# Each window holds up to `max_length` tokens and advances by `stride`; only the tokens
# that were not scored by the previous window contribute to the loss (the rest act as
# context and are masked with -100).
#
# `model` is used here; after a layer-replacement cell has run it is the same module
# object that `bit_model` refers to, and using it also lets this cell run on the
# unmodified baseline model.
device = "cuda"
max_length = 1024 #bit_model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nll_sum = 0.0          # sum over all scored tokens of the negative log-likelihood
n_loss_tokens = 0      # number of tokens that actually contributed to the loss
num_output_tokens = 0  # number of token positions pushed through the model
prev_end_loc = 0

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()

for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # The model emits one prediction per input position, so the number of processed
    # tokens is simply the window size. (Taking an argmax over the logits and filtering
    # on -100 gives the same count, but adds a vocabulary-wide reduction to the timed
    # region.) Overlapping context tokens are counted once per window they appear in.
    num_output_tokens += input_ids.numel()

    # outputs.loss is the *mean* over the labels that survive the model's internal
    # one-position shift, so it must be weighted by that label count before windows
    # with different numbers of scored tokens can be combined.
    window_loss_tokens = int((target_ids[:, 1:] != -100).sum())
    if window_loss_tokens > 0:
        nll_sum = nll_sum + outputs.loss * window_loss_tokens
        n_loss_tokens += window_loss_tokens
        print("Current:", torch.exp(nll_sum / n_loss_tokens))

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Token-weighted average negative log-likelihood -> perplexity.
if n_loss_tokens > 0:
    ppl = torch.exp(nll_sum / n_loss_tokens)
else:
    # Nothing could be scored (corpus shorter than two tokens).
    ppl = torch.tensor(float("nan"))

end.record()

# Waits for everything to finish running
torch.cuda.synchronize()

elapsed_seconds = start.elapsed_time(end) / 1000  # elapsed_time() reports milliseconds

print("RESULTS:")
print("num output tokens:", num_output_tokens)
print("time in seconds:", elapsed_seconds)
print("output tokens per second", num_output_tokens / elapsed_seconds)
print("perplexity", ppl)