In [1]:
# fix numpy in colab
import numpy
from IPython.display import clear_output

# fix triton in colab
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

!git clone https://github.com/dvmazur/mixtral-offloading.git --quiet
!cd mixtral-offloading && pip install -r requirements.txt

clear_output()

In [1]:
import sys
sys.path.append("mixtral-offloading")

import torch
from torch.nn import functional as F

from transformers import AutoConfig, AutoTokenizer

from huggingface_hub import snapshot_download
from IPython.display import clear_output
from tqdm.auto import trange

from hqq.core.quantize import BaseQuantizeConfig

from src.build_model import build_model, OffloadConfig, QuantConfig


model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
config = AutoConfig.from_pretrained(model_name)

state_path = snapshot_download("lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo")
clear_output()

device = torch.device("cuda:0")




offload_config = OffloadConfig(
    main_size=config.num_local_experts * config.num_hidden_layers * 4 // 8,
    offload_size=config.num_local_experts * config.num_hidden_layers * 4 // 8,
    buffer_size=4,
    offload_per_layer=4,
)

attn_config = BaseQuantizeConfig(
    nbits=4,
    group_size=64,
    quant_zero=True,
    quant_scale=True,
)
attn_config["scale_quant_params"]["group_size"] = 256


ffn_config = BaseQuantizeConfig(
    nbits=2,
    group_size=16,
    quant_zero=True,
    quant_scale=True,
)

quant_config = QuantConfig(ffn_config=ffn_config, attn_config=attn_config)


offload_config = OffloadConfig(
    main_size=config.num_local_experts * config.num_hidden_layers * 4 // 8,
    offload_size=config.num_local_experts * config.num_hidden_layers * 4 // 8,
    buffer_size=4,
    offload_per_layer=4,
)

model = build_model(
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    state_path=state_path,
)



Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
id_to_token = {v: k for k, v in tokenizer.get_vocab().items()}


input_ids = tokenizer.apply_chat_template(
    [
      dict(
            role="user",
            content="Integrate exp(-x^2)"
          )
    ],
    return_tensors='pt',
).to(device)

inputs = dict(input_ids=input_ids, attention_mask=torch.ones_like(input_ids))

with torch.no_grad():
    past_key_values = None
    for _ in trange(100):
        out = model(**inputs, past_key_values=past_key_values, output_hidden_states=True)
        past_key_values = out.past_key_values
        probs = F.softmax(out.logits[0, -1] / 0.9, dim=-1)
        token_id = torch.multinomial(probs, 1)
        token = id_to_token[token_id.item()]

        if token[0] == '▁':
            print(end=' ')
            token = token[1:]
        print(token, end='')

        inp = token_id.reshape(1, 1)
        inputs = dict(
            input_ids=inp,
        )

  0%|          | 0/100 [00:00<?, ?it/s]

 It's not possible to find a general antiderivative of f(x) = e^(-x^2) in terms of elementary functions. However, we can represent the antiderivative as an improper integral:<0x0A><0x0A><0xE2><0x88><0xAB> e^(-x^2) dx = F(x) = Erf(x)/sqrt(π)<0x0A><0x0A>where Erf(x) is the error function, defined as<0x0A><0x0A>Erf(x) =