## Resources

The pipeline was optimized for the following config:
- Storage: 2 TB SSD @ 512 MB/s
- RAM: 128 GB DDR4 @ 3600
- CPU: Ryzen 9 3950X, 16 cores / 32 threads
- GPU: 2x RTX 3090, 48 GB GDDR6X VRAM in aggregate

Hence I cannot guarantee that it will work properly on more frugal hardware.
In addition, the two GPUs are not linked through NVLink, so some optimizations involve manually allocating work to one GPU or the other.

## Compute the number of parameters for different pruning sizes

In [None]:
def calc_num_parameters(
    n_routed_experts,
    num_experts_per_tok,
):
    """Print an estimate of the total and per-token activated parameter counts.

    Only the number of routed experts and the number of experts selected per
    token vary; every other architecture dimension is hard-coded below (the
    values appear to be the DeepSeek-V3 ones used by the rest of this
    notebook -- confirm against the model config if it changes).

    Both figures include the embedding and output-head matrices
    (``2 * vocab_size * hidden_size``) and ignore normalisation weights, so
    they are estimates rather than exact checkpoint sizes.

    Args:
        n_routed_experts: Number of routed experts kept in each MoE layer.
        num_experts_per_tok: Number of routed experts used for each token.

    Returns:
        None. The estimate is printed as
        ``"<n_routed_experts> @ <num_experts_per_tok> => <total>B @ <active>B
        parameters"``, both counts rounded to the nearest billion.
    """
    # Layer layout: the first `first_k_dense_replace` layers use a dense MLP,
    # all remaining layers are mixture-of-experts layers.
    num_hidden_layers = 61
    first_k_dense_replace = 3
    num_moe_layer = num_hidden_layers - first_k_dense_replace

    hidden_size = 7168
    intermediate_size = 18432
    moe_intermediate_size = 2048

    # Attention dimensions (low-rank query and key/value projections).
    num_heads = 128
    q_lora_rank = 1536
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64
    kv_lora_rank = 512
    v_head_dim = 128

    n_shared_experts = 1
    vocab_size = 129280

    # Router: one hidden_size-long weight vector per routed expert.
    gate_size = n_routed_experts * hidden_size

    # Each MLP is counted as three hidden_size x intermediate matrices.
    mlp_weights = 3 * hidden_size * intermediate_size
    moe_mlp_weights = 3 * hidden_size * moe_intermediate_size

    # Routed experts stored per MoE layer vs. routed experts used per token.
    moe_total_weight = n_routed_experts * moe_mlp_weights
    moe_active_weight = num_experts_per_tok * moe_mlp_weights

    q_head_dim = qk_nope_head_dim + qk_rope_head_dim
    # Query path: down-projection to q_lora_rank, then an up-projection that
    # produces q_head_dim values for *every* head (hence the num_heads factor).
    q_proj_weight = (
        hidden_size * q_lora_rank
        + q_lora_rank * num_heads * q_head_dim
    )
    # Key/value path: down-projection to the compressed KV plus the shared
    # rotary key, then one up-projection producing the non-rotary key part and
    # the value for every head. Keys and values are both covered by this term,
    # so it is counted once.
    kv_proj_weight = (
        hidden_size * (kv_lora_rank + qk_rope_head_dim)
        + kv_lora_rank * num_heads * (qk_nope_head_dim + v_head_dim)
    )
    o_proj_weight = num_heads * v_head_dim * hidden_size
    attention_weight = q_proj_weight + kv_proj_weight + o_proj_weight

    # Parameters that are always active, independent of expert routing.
    base_weight_per_moe_layer = (
        attention_weight + n_shared_experts * moe_mlp_weights + gate_size
    )
    base_weight_per_mlp_layer = attention_weight + mlp_weights

    base_model_weight = (
        base_weight_per_moe_layer * num_moe_layer
        + base_weight_per_mlp_layer * first_k_dense_replace
        + 2 * vocab_size * hidden_size
    )

    total_expert_weight = moe_total_weight * num_moe_layer
    active_expert_weight = moe_active_weight * num_moe_layer

    active_model_weight = active_expert_weight + base_model_weight
    total_model_weight = total_expert_weight + base_model_weight

    total_billions = int(round(total_model_weight / 1e9, 0))
    active_billions = int(round(active_model_weight / 1e9, 0))
    print(f"{n_routed_experts} @ {num_experts_per_tok} => {total_billions}B @ {active_billions}B parameters")

In [None]:
# Estimate for 256 routed experts with 8 experts selected per token.
n_routed_experts, num_experts_per_tok = 256, 8
calc_num_parameters(n_routed_experts, num_experts_per_tok)

In [None]:
# Configurations to compare, each as (n_routed_experts, num_experts_per_tok).
p = [(256, 8), (22, 8), (16, 8), (8, 8)]

# Print one estimate line per configuration.
for elt in p:
    calc_num_parameters(*elt)

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

import torch
from accelerate import init_empty_weights
from accelerate.utils import load_offloaded_weight
import json
from accelerate import load_checkpoint_in_model, dispatch_model
from datasets import load_dataset
import numpy as np
import gc
import _pickle as pickle
import os
# from Distiller import MOEDistiller, count_parameters

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, AutoConfig

import torch

from memory_utils import load_module_weights_and_freeze_optimized, load_weight_cached, destruct_module_optimized
from Distiller import load_model_config,create_empty_model,create_empty_layer, create_empty_layer_fp8

from liger_kernel.transformers import apply_liger_kernel_to_llama

from copy import deepcopy
from Distiller import MOEDistillerV3
import os
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import torch
import pickle
from pathlib import Path


import os
from modeling_deepseek import _prepare_4d_causal_attention_mask
import bitsandbytes as bnb

# Root directory for local data.
base_path = '/home/golympie/data/'

# Load the checkpoint weight map and the model config, then look up the
# entry recorded for the token-embedding weight.
weight_map, config = load_model_config("deepseek_v3")
weight_file = weight_map['model.embed_tokens.weight']

apply_liger_kernel_to_llama()

# Folder names are derived from the model name.
model_name = "DeepSeek-V3"
offload_folder = f"{model_name}_offload/"
output_directory = f"{model_name}_runner_output/"

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-V3",
    trust_remote_code=True,
)

def memory_cleanup():
    """Run the garbage collector and empty the CUDA cache.

    When CUDA is available, also waits for pending CUDA work to finish
    before returning.
    """
    gc.collect()
    torch.cuda.empty_cache()

    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()


## Save layers to disk for easier loading

In [None]:
# os.makedirs("layers", exist_ok=True)
# model = create_empty_model(config)

In [None]:
# ## Embed
# model.model.embed_tokens = load_module_weights_and_freeze_optimized(
#     model.model.embed_tokens,
#     f"model.embed_tokens",
#     weight_map,
#     "deepseek_v3",
#     max_workers=32,
#     fp8_format="e4m3",
# )

# torch.save(model.model.embed_tokens.state_dict(), 'layers/embed_tokens.pt')

In [None]:
# ## End norm
# model.model.norm = load_module_weights_and_freeze_optimized(
#     model.model.norm,
#     f"model.norm",
#     weight_map,
#     "deepseek_v3",
#     max_workers=32,
#     fp8_format="e4m3",
# )

# torch.save(model.model.norm.state_dict(), 'layers/norm.pt')

In [None]:
# ## Lm head
# model.lm_head = load_module_weights_and_freeze_optimized(
#     model.lm_head,
#     f"lm_head",
#     weight_map,
#     "deepseek_v3",
#     max_workers=32,
#     fp8_format="e4m3",
# )

# torch.save(model.lm_head.state_dict(), 'layers/lm_head.pt')

In [None]:
## Layers
# Load each layer's weights and save its state dict to ./layers/layer_<i>.pt.

# torch.save does not create missing parent directories, so make sure the
# target folder exists before any expensive layer loading starts.
os.makedirs("./layers", exist_ok=True)

# NOTE(review): 62 iterations cover layer indices 0..61, while the parameter
# estimate earlier in this notebook uses num_hidden_layers=61 -- presumably the
# extra index is an additional layer stored in the checkpoint; confirm.
for i in tqdm(range(62)):

    layer = create_empty_layer(config, layer_idx=i)
    layer = load_module_weights_and_freeze_optimized(
        layer,
        f"model.layers.{i}",
        weight_map,
        "deepseek_v3",
        max_workers=16,
        fp8_format="e4m3",
    )
    # Release memory left over from loading before serialising the layer.
    memory_cleanup()

    torch.save(layer.state_dict(), f'./layers/layer_{i}.pt')
    # Tear the layer down before the next iteration loads another one.
    destruct_module_optimized(layer)