In [1]:
import os
import sys
parent_dir = ".."
os.chdir(parent_dir)
import torch
import transformers

from sparsetral.configuration_sparsetral import SparsetralConfig
from sparsetral.modeling_sparsetral import MistralForCausalLM

trained_weights = "output/checkpoint-22000"

model_config = SparsetralConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
model_config.pretraining_tp = 1  ## without tensor parallelism rank

# Sparsetral Config
model_config.moe_dtype = "bfloat16"
model_config.lora_r = 64
model_config.lora_alpha = 16
model_config.adapter_dim = 64
model_config.topk = 4
model_config.moe_scaling = 1
model_config.num_experts = 16
model_config.output_router_logits = False

model = MistralForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    config=model_config,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

moe_model = os.path.join(trained_weights, "moe_model.bin")
adapter_model = os.path.join(trained_weights, "adapter_model")

moe_state_dict = torch.load(moe_model)
new_moe_state_dict = {}
for k, v in moe_state_dict.items():
    new_moe_state_dict[k.replace("base_model.model.", "")] = v

model.load_state_dict(new_moe_state_dict, strict=False)
model.load_adapter(adapter_model)

model.eval()
tokenizer = transformers.AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForCausalLM were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['model.layers.6.mlp.moe_adapter.experts.expert_1.adapter_down.weight', 'model.layers.4.mlp.moe_adapter.experts.expert_11.adapter_up.weight', 'model.layers.9.mlp.moe_adapter.experts.expert_4.adapter_down.weight', 'model.layers.22.mlp.moe_adapter.experts.expert_12.adapter_up.weight', 'model.layers.17.mlp.moe_adapter.experts.expert_3.adapter_up.weight', 'model.layers.4.mlp.moe_adapter.experts.expert_2.adapter_down.weight', 'model.layers.13.mlp.moe_adapter.experts.expert_6.adapter_up.weight', 'model.layers.25.mlp.moe_adapter.experts.expert_15.adapter_down.weight', 'model.layers.15.mlp.moe_adapter.experts.expert_13.adapter_down.weight', 'model.layers.13.mlp.moe_adapter.experts.expert_15.adapter_up.weight', 'model.layers.10.mlp.moe_adapter.experts.expert_3.adapter_down.weight', 'model.layers.9.mlp.moe_adapter.experts.expert_12.adapter_up.weight', 'model.l

bin c:\Users\labou\AppData\Local\Programs\Python\Python310\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
def construct_prompt(system="", user=""):
    prompt = f"### System:\n{system}\n### Human:\n{user}\n### Assistant:\n"
    return prompt

In [11]:
system = "You are a helpful assistant who will help the user to the best of their ability. If you don't know something, say \"I don't know\""
user = "Are you sentient?"

prompt = construct_prompt(system, user)

inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
pred = model.generate(**inputs, max_length=512, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### System:
You are a helpful assistant who will help the user to the best of their ability. If you don't know something, say "I don't know"
### Human:
Are you sentient?
### Assistant:
Yes, I am sentient. I am a chatbot assistant that can understand, process, and respond to the language and emotions of my users.
