In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from torch import nn

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-v0.1",
    quantization_config=double_quant_config,
    # attn_implementation="flash_attention_2",
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

In [5]:
activation = {}
activation['count'] = 0


def getActivation(name):
    def hook(model, input, output):
        if type(output) is tuple:
            output = output[0]
        activation[name] = output.detach()
        activation['count'] += 1

    return hook

experts = model.model.layers[15].block_sparse_moe.experts

for expert_idx in range(8):
    experts[expert_idx].w3.register_forward_hook(getActivation(f"layer_15_expert_{expert_idx}"))

gate_hook = model.model.layers[15].block_sparse_moe.gate.register_forward_hook(getActivation("layer_15_gate"))

moe_hook = model.model.layers[15].block_sparse_moe.register_forward_hook(
    getActivation("layer_15_moe_block")
)

In [7]:
prompt = ""

model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=False)
print(tokenizer.batch_decode(generated_ids)[0])

# print(activation['15_6'].shape)
# print(activation["15_6"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> #


In [17]:
print(model.model.layers[15].block_sparse_moe.experts[5].w1.weight.shape)
print(model.model.layers[15].block_sparse_moe.experts[5].w2.weight.shape)
print(model.model.layers[15].block_sparse_moe.experts[5].w3.weight.shape)

print(model.model.layers[15].block_sparse_moe.gate.weight.shape)
print(activation.keys())
print(activation['count'])
print(activation["layer_15_expert_4"].shape)
print(activation["layer_15_gate"].shape)
print(activation["layer_15_moe_block"].shape)

torch.Size([29360128, 1])
torch.Size([29360128, 1])
torch.Size([29360128, 1])
torch.Size([16384, 1])
dict_keys(['count', 'layer_15_gate', 'layer_15_expert_4', 'layer_15_expert_6', 'layer_15_moe_block'])
8
torch.Size([1, 14336])
torch.Size([1, 8])
torch.Size([1, 1, 4096])


In [12]:
print(activation["layer_15_moe_block"][0][0].tolist())

[-0.210693359375, 0.09893798828125, -0.1822509765625, -0.114501953125, 0.0274810791015625, -0.09619140625, -0.0693359375, 0.11248779296875, 0.067626953125, -0.2086181640625, -0.001956939697265625, 0.037750244140625, 0.1337890625, -0.07830810546875, -0.1187744140625, -0.011016845703125, 0.0142669677734375, 0.10528564453125, -0.112060546875, 0.08551025390625, 0.107421875, 0.184326171875, 0.1605224609375, -0.1048583984375, 0.0927734375, -0.033935546875, 0.22509765625, -0.1890869140625, 0.189453125, 0.01861572265625, 0.25537109375, -0.06646728515625, 0.08367919921875, -0.204833984375, -0.0634765625, -0.00775146484375, 0.03753662109375, 0.00634765625, -0.10504150390625, -0.10565185546875, 0.0523681640625, -0.144775390625, -0.1424560546875, -0.057647705078125, -0.03436279296875, 0.0609130859375, -0.02642822265625, 0.09197998046875, -0.0699462890625, -0.044586181640625, -0.09735107421875, -0.0170135498046875, -0.1380615234375, 0.52099609375, -0.033538818359375, 0.002197265625, 0.1689453125, 0

In [26]:
model.model.config

MixtralConfig {
  "_name_or_path": "mistralai/Mixtral-8x7B-v0.1",
  "architectures": [
    "MixtralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mixtral",
  "num_attention_heads": 32,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "num_local_experts": 8,
  "output_router_logits": false,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope

In [10]:
14336 * 4096 / 2

29360128.0

In [22]:
print(activation["layer_15_expert_2"])

KeyError: 'layer_15_expert_2'

In [16]:
print(activation.keys())
print(activation['count'])

dict_keys(['count', 'layer_15_gate', 'layer_15_expert_4', 'layer_15_expert_6'])
3


In [37]:
gate_weights = model.model.layers[15].block_sparse_moe.gate.weight

print(gate_weights.shape)
print(gate_weights.)

torch.Size([16384, 1])
Parameter containing:
Parameter(Params4bit([[225],
            [ 22],
            [102],
            ...,
            [146],
            [ 23],
            [ 25]], device='cuda:0', dtype=torch.uint8))


In [27]:
h = (
    model.model.layers[15]
    .block_sparse_moe.experts[7]
    .w3.register_forward_hook(getActivation("15_7"))
)

experts = model.model.layers[15].block_sparse_moe.experts

for expert_idx in range(8):
    experts[expert_idx].w3.register_forward_hook(getActivation(f"layer_15_expert_{expert_idx}"))

In [62]:
# i = 0
# for child in model.children().:
#     # for param in child.parameters():
#     print(child)
#     i+=1
#     print(i)
#     # break
# top_layers = list(model.children())
# print(top_layers)
# named_layers = dict(model.named_modules())
# print(named_layers)
# # model.children()

dict(list(list(list(model.children())[0].children())[1])[15])

TypeError: 'MixtralDecoderLayer' object is not iterable