In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Let the notebook import packages that live one directory above it
# (the later cells import from `utils`).
sys.path.append('..')

# CUDA debugging switches. They are set in this first cell, before the
# cell that imports torch runs.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_USE_CUDA_DSA': '1',
})


In [2]:
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from tqdm.auto import tqdm
import bitsandbytes as bnb
import torch
import json
import os

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

from peft import LoraConfig, get_peft_model


# from utils.ademamix import AdEMAMix

from bitsandbytes.optim.ademamix import AdEMAMix8bit as AdEMAMix

from utils.config_utils import GenerationParams, PathConfig, DistillationParams
from utils.adapters import DoRAAdapter
from utils.torch_utils import (
    save_quant,
    load_quant,
    destruct_module_optimized,
    memory_cleanup,
    get_nonreasoning_dataset,
    load_weight,
    rsetattr,
    rgetattr,
    load_weights,
    rhasattr,
    count_parameters
)

In [3]:
def prepare_limo(x):
    """Turn one LIMO record into a two-message chat conversation.

    Args:
        x: Mapping with a ``'question'`` and a ``'solution'`` entry.

    Returns:
        A list of two message dicts: the question as the user turn followed
        by the solution as the assistant turn.
    """
    user_turn = {'role': 'user', 'content': x['question']}
    assistant_turn = {'role': 'assistant', 'content': x['solution']}
    return [user_turn, assistant_turn]
    
class HealingDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, padded to a fixed length.

    Args:
        data: Indexable collection of strings.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` whose
            result exposes ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every item is padded or truncated to.

    NOTE(review): the texts fed in by this notebook are already rendered with
    a chat template; the tokenizer is called with its default special-token
    handling, which may add special tokens a second time — confirm against
    the tokenizer in use.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its ids and attention mask.

        Returns:
            Dict with ``"input_ids"`` and ``"attention_mask"``, each with the
            leading batch dimension added by ``return_tensors="pt"`` removed.
        """
        text = self.data[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the batch axis. A bare squeeze() would also collapse the
        # sequence axis when max_length == 1, yielding a 0-d tensor that the
        # default collate function stacks into the wrong shape.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


def load_and_prepare_data(tokenizer, batch_size=8, max_length=512, num_workers=os.cpu_count(), train_sample_limit=None, val_sample_limit=None):
    """Build the training DataLoader from UltraChat conversations plus LIMO.

    The ``train_sft`` split of ``HuggingFaceH4/ultrachat_200k`` (optionally
    truncated to its first ``train_sample_limit`` rows) and every row of the
    ``train`` split of ``GAIR/LIMO`` are rendered to text with the tokenizer's
    chat template, wrapped in a ``HealingDataset`` and served shuffled.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template``; also passed to
            ``HealingDataset`` for tokenization.
        batch_size: Batch size of the returned loader.
        max_length: Fixed token length passed to ``HealingDataset``.
        num_workers: DataLoader worker count. ``None`` (which
            ``os.cpu_count()`` may return) is treated as 0.
        train_sample_limit: If given, keep only the first N UltraChat rows
            (clamped to the dataset size). LIMO rows are always all added.
        val_sample_limit: Currently unused; no validation loader is built.

    Returns:
        ``(train_loader, None)`` — the second element is always ``None``.
    """
    # os.cpu_count() is allowed to return None, which DataLoader rejects.
    if num_workers is None:
        num_workers = 0

    # dataset = load_dataset(
    #     "cognitivecomputations/dolphin-r1", "nonreasoning", cache_dir="../dolphin-r1"
    # )["train"]

    dataset = load_dataset(
        "HuggingFaceH4/ultrachat_200k",
    )["train_sft"]

    dataset_refine = load_dataset(
        "GAIR/LIMO",
    )["train"]

    # def filter_function(example):
    #     if example["overall_quality"] is not None and example["overall_quality"] == 5:
    #         return True
    #     if example["score"] is not None and example["score"] >= 0.16:
    #         return True
    #     return False

    # dataset = dataset.filter(filter_function)

    # Apply the sample limit if provided, never asking for more rows than exist.
    if train_sample_limit is not None:
        row_count = max(0, min(train_sample_limit, len(dataset)))
        train_dataset = dataset.select(range(row_count))  # .select avoids copying rows
    else:
        train_dataset = dataset

    conversations = train_dataset["messages"]

    train_texts = [
        tokenizer.apply_chat_template(elt, tokenize=False, add_generation_prompt=False)
        for elt in tqdm(conversations, desc="Preparing dataset train")
    ]

    # Append the LIMO question/solution pairs, rendered with the same template.
    train_texts.extend(
        tokenizer.apply_chat_template(prepare_limo(elt), tokenize=False, add_generation_prompt=False)
        for elt in dataset_refine
    )

    healing_dataset = HealingDataset(
        train_texts, tokenizer, max_length=max_length
    )
    train_loader = DataLoader(
        healing_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    return train_loader, None


## Configs

In [4]:
# Global float32 matmul precision mode for this session.
torch.set_float32_matmul_precision('medium')

# Device that input batches are moved to.
device = "cuda:0"

# Base checkpoint the pruned model names are derived from.
model_name = "../deepseek_v3_awq"
# model_name = "../deepseek_v2_lite_chat_awq"

# Distillation / pruning settings forwarded to DistillationParams below.
n_epochs, start_layer, end_layer = 1, 1, 61
target_routed_expert = 4
target_active_expert = target_routed_expert
dora_rank = 4

# Integer flag converted to a real boolean.
calibrate_merge = 1
calibrate_merge = bool(calibrate_merge == 1)

pruning_method = "fused"

# Locations of the base model and of the artefact directories, all relative
# to the notebook's parent directory.
path_config = PathConfig(
    model_name=model_name,
    intermediate_states="../data/intermediate_states",
    expert_states="../data/expert_states",
    expert_activations="../data/expert_activations",
    distillation_logs="../distillation_logs",
    moe_states="../moe_states",
)

# Distillation settings; the values defined above are forwarded unchanged.
distillation_config = DistillationParams(
    n_epochs=n_epochs,
    target_routed_expert=target_routed_expert,
    target_active_expert=target_active_expert,
    eval_batches=16,
    gradient_accumulation_steps=4,
    learning_rate=3e-4,
    end_factor=0.2,
    calibrate_merge=calibrate_merge,
    skip_first_tokens=0,  # useful to avoid tuning on early tokens, which carry less information
    pruning_method=pruning_method,  # other values noted by the author: topk, act_cl, state_cl
    dora_rank=dora_rank,
)

# if distillation_config.pruning_method=="progressive":
#     unhealed_name=model_name+f"_{distillation_config.pruning_method}_{distillation_config.target_routed_expert}a{distillation_config.target_active_expert}_unhealed"
# elif distillation_config.pruning_method=="fused":
#     unhealed_name=model_name+f"_fused_{distillation_config.target_routed_expert}_unhealed"
# else:
#     unhealed_name=model_name+f"_{distillation_config.pruning_method}_{distillation_config.target_routed_expert}a{distillation_config.target_active_expert}_{distillation_config.calibrate_merge}_{distillation_config.n_epochs}_unhealed"
 
# unhealed_name=unhealed_name.replace('_awq', '')

# Directory of the pruned-but-not-yet-healed checkpoint: the base model name
# without its '_awq' suffix, tagged with the routed-expert count, and with the
# relative '../' prefix rewritten to an absolute home directory.
unhealed_name = model_name + f"_fused_{distillation_config.target_routed_expert}_unhealed"
unhealed_name = unhealed_name.replace('_awq', '').replace("../", "/home/golympie/")

# Bare directory name (no parent path) used for the adapter output and the
# TensorBoard run name.
healed_name = unhealed_name.split('/')[-1].replace('_unhealed', '')

final_path = "/home/golympie/ai-toolbox/pruned_models/"
# unhealed_name is absolute at this point, and os.path.join discards every
# component that precedes an absolute one, so joining it would silently drop
# final_path. Join the bare directory name instead.
final_name = os.path.join(final_path, healed_name)


# The tokenizer is read from the unhealed checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(unhealed_name, trust_remote_code=True)

In [5]:
# from patched_modules.modeling_deepseek_fused import DeepseekV3ForCausalLM

# bitsandbytes 4-bit settings: fp4 quantization with double quantization,
# bfloat16 for both compute and storage.
# NOTE(review): neither load_in_4bit nor load_in_8bit is passed here — confirm
# that the bnb_4bit_* options take effect as intended.
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# The checkpoint index lists every stored weight name.
with open(unhealed_name+"/model.safetensors.index.json", "r") as f:
    weight_map = json.load(f)['weight_map']

# Split the weights over two GPUs: embeddings, final norm and layers below 30
# on cuda:0; the remaining layers and the LM head on cuda:1.
device_map = {}
for weight_name in tqdm(weight_map):
    if "lm_head" in weight_name:
        target_device = "cuda:1"
    elif "model.embed_tokens" in weight_name or "model.norm" in weight_name:
        target_device = "cuda:0"
    else:
        # Every other name is expected to carry the layer index as its third
        # dot-separated field.
        layer_index = int(weight_name.split('.')[2])
        target_device = "cuda:0" if layer_index < 30 else "cuda:1"
    device_map[weight_name] = target_device
    
# Load the unhealed checkpoint with the per-weight device placement and the
# quantization settings defined above.
model = AutoModelForCausalLM.from_pretrained(
    unhealed_name,
    quantization_config=quant_config,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)

# Module-name suffixes that receive LoRA adapters.
# 'kv_b_proj' was listed twice (copy-paste); the duplicate is removed.
# NOTE(review): the duplicate may have stood in for another projection that
# was meant to be targeted — confirm against the model's module names.
target_modules = [
    'fused_layer',
    # 'gate_proj',
    # 'up_proj',
    # 'down_proj',
    'q_proj',
    'kv_a_proj_with_mqa',
    'kv_b_proj',
    'o_proj',
]

# Rank-4 LoRA with alpha 8 and 10% dropout on the adapter inputs.
peft_config = LoraConfig(
    # use_dora=True,
    target_modules=target_modules,
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
)

# Wrap the base model so that the adapters are injected.
model = get_peft_model(model, peft_config)

# Besides the LoRA adapters, unfreeze every parameter whose name contains one
# of these markers. LM-head parameters are always frozen, even if their name
# also contains a marker.
# NOTE(review): confirm that these extra parameters are persisted by the
# adapter save performed after training; only the save call is visible here.
trainable_markers = ('gate.', 'norm', 'qa_weights', 'qb_weights', 'scaling_factor')

for param_name, param in model.named_parameters():
    if 'lm_head' in param_name:
        param.requires_grad = False
    elif any(marker in param_name for marker in trainable_markers):
        param.requires_grad = True

  0%|          | 0/3925 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [6]:
from torch.utils.tensorboard import SummaryWriter
import torch
from utils.ademamix import AdEMAMix
from torch.optim.lr_scheduler import _LRScheduler
import math

class WarmupCosineAnnealingLR(_LRScheduler):
    """Linear warm-up followed by cosine decay of the learning rate.

    During the first ``warmup_steps`` scheduler steps the learning rate grows
    linearly from 0 to each group's base LR. It then follows a cosine curve
    that reaches ``base_lr * min_lr`` at ``total_steps`` and stays there.

    Args:
        optimizer: Wrapped optimizer.
        warmup_steps: Number of linear warm-up steps.
        total_steps: Step at which the cosine decay reaches its floor.
        min_lr: Floor expressed as a *fraction of the base LR* (it is
            multiplied by the base LR, not used as an absolute value).
    """

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        # These attributes must exist before the base constructor runs,
        # because it performs an initial step that calls get_lr().
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.min_lr = min_lr
        super(WarmupCosineAnnealingLR, self).__init__(optimizer, last_epoch=-1)

    def get_lr(self):
        """Return the learning rate of every parameter group for the current step."""
        step = self.last_epoch
        if step < self.warmup_steps:
            # Linear warmup phase
            return [base_lr * (step / self.warmup_steps) for base_lr in self.base_lrs]

        # Cosine annealing phase.
        # Guard against total_steps <= warmup_steps (division by zero) and
        # clamp the progress so the LR stays at its floor instead of rising
        # again once total_steps has been passed.
        decay_steps = max(1, self.total_steps - self.warmup_steps)
        progress = min(1.0, (step - self.warmup_steps) / decay_steps)
        cosine_decay = 0.5 * (1.0 + math.cos(math.pi * progress))
        decay_factor = (1 - self.min_lr) * cosine_decay + self.min_lr
        return [base_lr * decay_factor for base_lr in self.base_lrs]
            
# Assuming model, tokenizer, and load_and_prepare_data are defined elsewhere

# --- Training hyper-parameters ---
max_length = 128

num_epochs = 1
num_sample = 4096

batch_size = 1
gradient_accumulation_steps = 2
log_interval = 1  # Log every step

lr = 3e-4

# TensorBoard writer; one run directory per healed model name.
writer = SummaryWriter(log_dir=f'runs/{healed_name}')

# val_loader is always None: load_and_prepare_data builds no validation set.
train_loader, val_loader = load_and_prepare_data(
    tokenizer, batch_size=batch_size, max_length=max_length,
    train_sample_limit=num_sample, val_sample_limit=None
)

# Only optimize the parameters that were left trainable.
optimizer = AdEMAMix(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=lr,
    betas=(0.9, 0.999, 0.9999),
    alpha=5.0  # batch size is small, so alpha is increased to smooth the gradient
)

# The scheduler is stepped once per optimizer update, so the cosine period
# must cover the updates of all epochs. max(1, ...) keeps T_max valid when the
# loader is shorter than one accumulation window.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=max(1, num_epochs * (len(train_loader) // gradient_accumulation_steps)),
    eta_min=lr/10
)

model = torch.compile(model)
model.train()  # Ensure the model is in training mode

try:
    for epoch in range(num_epochs):
        progress_bar = tqdm(train_loader, desc=f"Training, epoch {epoch}")
        # Micro-batches whose gradients have not been applied yet.
        pending_micro_batches = 0

        for i, encoding in enumerate(progress_bar):
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            # Every sequence is padded to max_length, so padding positions
            # must not contribute to the language-modelling loss. -100 is the
            # label value the loss ignores.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Forward pass
            output = model(
                input_ids=input_ids,
                labels=labels,
                attention_mask=attention_mask,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False
            )

            # Scale the loss so that the accumulated gradient is the mean over
            # the accumulation window rather than the sum.
            loss = output.loss
            (loss / gradient_accumulation_steps).backward()
            pending_micro_batches += 1

            # Update model parameters and learning rate
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                pending_micro_batches = 0

            # Report the unscaled loss.
            loss_value = loss.item()
            progress_bar.set_postfix(loss=loss_value)

            # Log loss and learning rate to TensorBoard
            if (i + 1) % log_interval == 0:
                global_step = epoch * len(train_loader) + i
                writer.add_scalar('Loss/train', loss_value, global_step)
                writer.add_scalar('Learning Rate', scheduler.get_last_lr()[0], global_step)
            memory_cleanup()

        # Apply gradients left over when the number of batches is not a
        # multiple of the accumulation window. The scheduler is not stepped
        # here so that its step count stays at one per full window.
        if pending_micro_batches:
            optimizer.step()
            optimizer.zero_grad()
finally:
    # Close the writer even if training is interrupted (e.g. out of memory).
    writer.close()

model.save_pretrained(healed_name)

Preparing dataset train:   0%|          | 0/4096 [00:00<?, ?it/s]

Training, epoch 0:   0%|          | 0/4913 [00:09<?, ?it/s]

W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0] Graph break from `Tensor.item()`, consider setting:
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0]     torch._dynamo.config.capture_scalar_outputs = True
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0] or:
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0] to include these operations in the captured graph.
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0] 
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0] Graph break: from user code at:
W0323 16:01:28.775000 228249 site-packages/torch/_dynamo/variables/tensor.py:869] [6/0]   File "/home/golympie/miniconda3/lib/python3.11/site-pa

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 1 has a total capacity of 23.57 GiB of which 138.25 MiB is free. Including non-PyTorch memory, this process has 23.41 GiB memory in use. Of the allocated memory 23.05 GiB is allocated by PyTorch, and 48.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Stand-alone repeat of the save call that ends the training cell.
model.save_pretrained(healed_name)

In [None]:
# Bare expression: the cell output is the model's repr.
model

In [None]:
from peft import AutoPeftModelForCausalLM

# Load the saved adapter on CPU and fold its weights into the base model.
model = AutoPeftModelForCausalLM.from_pretrained(
    healed_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map='cpu',
).merge_and_unload()

# Write the merged model and its tokenizer to the export directory.
model.save_pretrained(final_name)
tokenizer.save_pretrained(final_name)

In [None]:
import shutil

In [None]:
# Copy the patched modelling and configuration files next to the exported
# weights: once under the standard DeepSeek file names and once under their
# original names.
patched_copies = (
    ('modeling_deepseek_fused_v2.py', 'modeling_deepseek.py'),
    ('configuration_deepseek_fused_v2.py', 'configuration_deepseek.py'),
    ('modeling_deepseek_fused_v2.py', 'modeling_deepseek_fused_v2.py'),
    ('configuration_deepseek_fused_v2.py', 'configuration_deepseek_fused_v2.py'),
)
for source_file, target_file in patched_copies:
    shutil.copy(
        os.path.join('../patched_modules/', source_file),
        os.path.join(final_name, target_file),
    )

In [None]:
# Re-read the weight index of the unhealed checkpoint (not used by the load below).
with open(unhealed_name+"/model.safetensors.index.json", "r") as f:
    weight_map = json.load(f)['weight_map']

# Load the exported model onto the single configured device, quantized with
# the same settings as during training.
model = AutoModelForCausalLM.from_pretrained(
    final_name,
    quantization_config=quant_config,
    device_map=device,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)


In [None]:
# Display the final norm weight of the reloaded model.
model.model.norm.weight

In [None]:
from transformers import TextStreamer

# Print generated text as it is produced, without echoing the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# prompt="""Given the following evidences:
# - Henri IV was a famous king of france
# - Kings love to hunt and to joust
# - Hunting horse are always brown or camo
# - Camo is a pattern of color used to hide in plain sight

# Answer the following question:
# - What was the color of Henri IV white horse?"""

prompt = "Implement a basic snake game in python. Start your answer with ```python"

# Render the single-turn conversation to text first, then tokenize it.
# NOTE(review): the chat template may already insert special tokens; the
# second, default tokenizer call could add them again — confirm.
messages = [{"role": "user", "content": prompt}]
chat_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
input_tensor = tokenizer(chat_text, return_tensors="pt").to('cuda:0')

# Short sampled completion, streamed to the cell output.
generation_kwargs = dict(
    streamer=streamer,
    temperature=0.3,
    repetition_penalty=1.0,
    max_new_tokens=64,
    do_sample=True,
)
out = model.generate(**input_tensor, **generation_kwargs)

In [None]:
# Put the model in evaluation mode; the cell output is the returned module.
model.eval()

In [None]:
# Trick question: the answer ("white") is stated in the question itself.
prompt = """Given the following evidences:
- Henri IV was a famous king of france
- Kings love to hunt and to joust
- Hunting horse are always brown or camo
- Camo is a pattern of color used to hide in plain sight

Answer the following question:
- What was the color of Henri IV white horse?"""

# Render the single-turn conversation to text, then tokenize it and move the
# tensors to the configured device.
# NOTE(review): the chat template may already insert special tokens; the
# second, default tokenizer call could add them again — confirm.
messages = [{"role": "user", "content": prompt}]
chat_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
input_tensor = tokenizer(chat_text, return_tensors="pt").to(device)

In [None]:
# Inspection-only forward pass: disable autograd so that no graph or
# intermediate activations are retained for a backward pass that never runs.
with torch.no_grad():
    y = model(**input_tensor).logits

In [7]:
from transformers import TextStreamer

# Print generated text as it is produced, without echoing the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# prompt="""Given the following evidences:
# - Henri IV was a famous king of france
# - Kings love to hunt and to joust
# - Hunting horse are always brown or camo
# - Camo is a pattern of color used to hide in plain sight

# Answer the following question:
# - What was the color of Henri IV white horse?"""

prompt = "how many r's are in mississipi?"

# Render the single-turn conversation to text first, then tokenize it.
# NOTE(review): the chat template may already insert special tokens; the
# second, default tokenizer call could add them again — confirm.
messages = [{"role": "user", "content": prompt}]
chat_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
input_tensor = tokenizer(chat_text, return_tensors="pt").to(device)

# Sampling at a very low temperature, streamed to the cell output.
generation_kwargs = dict(
    streamer=streamer,
    temperature=0.01,
    repetition_penalty=1.0,
    max_new_tokens=512,
    do_sample=True,
)
out = model.generate(**input_tensor, **generation_kwargs)

Here’ a ** ** ** ** 

KeyboardInterrupt: 

distil_output

Guacamole is a rich and flavorful creation of the Mexican cuisine. It's a mix of spices, fruits, and herbs. The perfect cocktail for the drink, its signature is: "Guacamole is a mouth filled with a warm palette of flavor. It's a mix of spices, fruits, and herbs, like avocado, guilla, sour, guel, guam, guz, and guza. The flavors of this dish are intense, rich, vibrant, and sometimes you can cook with something that will you make it a more special, unforgettable, memorable, or unforgettable. The perfect cocktail is paired with a mix of citrus, fruity, sour, and savory, with the essence of spicy, smoky, and earthy flavors. It's a mouth filled with a warm palette of flavor, the taste is unique to create it as an authentic dish.
I've got a little thing that I can cook with this dish. The ingredients are: a combination of tomatoes, potatoes, and carrots, sour, sour, sweet, and guol, gule, guza, guam, guz, and guza, or guel. It's a 

nodist_corr

 The poem contains a poetic reflection of an experience of experimentation, experimentation, and experimentation with an mundane and mundane life of life. Guacamole is a poem that describes a world of culture,