In [1]:
# auto reloads source code modules
%load_ext autoreload

import os
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments

models_root = "/var/model/models"
gib_root    = "/var/model/genie-in-the-box"

print( os.getcwd() )
os.chdir( f"/{gib_root}/src" )
print( os.getcwd() )

import cosa.utils.util          as du
import cosa.utils.util_xml      as dux
import cosa.utils.util_pytorch  as dupt
from cosa.agents.llm            import Llm
from cosa.training.peft_trainer import PeftTrainer

os.environ[ "NCCL_P2P_DISABLE" ] = "1"
os.environ[ "NCCL_IB_DISABLE"  ] = "1"
os.environ[ "WANDB_DISABLED"   ] = "true"

def reset_models( models ):

    for model in models:
        del model

    gc.collect()

def reset_notebook_kernel():

    from IPython import get_ipython
    get_ipython().kernel.do_shutdown( restart=True )


/var/model/genie-in-the-box/src/ephemera/notebooks/mistral
/var/model/genie-in-the-box/src


2025-02-14 02:01:31,317 INFO schemas.py L1274: Include schema from 'file:///usr/local/lib/python3.10/dist-packages/xmlschema/schemas/XSD_1.1/xsd11-extra.xsd'


In [None]:
reset_models( [] )
reset_notebook_kernel()

In [None]:
torch.cuda.is_available()

## Instantiate trainer for Ministral-8B-Instruct-2410

In [None]:
%autoreload

# model_id_or_path = f"{models_root}/Ministral-8B-Instruct-2410-autoround-4-bits-sym.gptq/2025-01-24-at-20-48"
model_id_or_path   = "mistralai/Ministral-8B-Instruct-2410"
model_name         = "Ministral-8B-Instruct-2410"
test_train_path    = f"{gib_root}/src/ephemera/prompts/data"
lora_dir           = f"{models_root}/{model_name}.lora"

trainer = PeftTrainer( model_id_or_path, model_name, test_train_path, lora_dir=lora_dir, debug=True )

trainer.set_hf_env_vars()
trainer.set_gib_env_vars( gib_root=gib_root )
trainer.login_to_hf()

gib_root

### get training prompt stats, _ONCE_

In [None]:
%autoreload
token_stats, _ = trainer.get_training_prompt_stats()

{'min': 373, 'max': 683, 'mean': 589.1221287097386}

## Fine tune it

In [None]:
%autoreload
# baby batch...
trainer.fine_tune( sample_size=0.005, batch_size=2, gradient_accumulation_steps=8, logging_steps=0.05, eval_steps=0.20, device_map="auto", output_dir=lora_dir )
# Half batch (sample_size=0.5) should really be full batch!

In [None]:
trainer.checkpoint_dir = "/var/model/models/Ministral-8B-Instruct-2410.lora/training-2025-02-11-at-18-33/checkpoint-9"

In [None]:
trainer.lora_dir = lora_dir
trainer.lora_dir

## load an quantized model and merge with adapter

In [None]:
%autoreload

# checkpoint_dir = f"{models_root}/Ministral-8B-Instruct-2410.lora/training-2025-02-06-at-21-28/checkpoint-987"
# du.print_simple_file_list( lora_dir )

# we can get the last checkpoint directory only after a fine-tuning run has finished, Leslie:
# checkpoint_dir = trainer.get_last_checkpoint_dir()

trainer.load_and_merge_adapter()

## Save the merged model

In [None]:
%autoreload
merged_adapter_dir = trainer.save_merged_adapter( lora_dir )

In [None]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16

In [None]:
# clear models from memory


In [None]:
%autoreload
merged_adapter_dir = "/var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16"
# release_gpus( [] )
# reset_notebook_kernel()
# trainer.quantize_merged_adapter()
trainer.quantize_merged_adapter( merged_adapter_dir=merged_adapter_dir )

In [None]:
quantized_model_dir = "/var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16/autoround-4-bits-sym.gptq/2025-02-11-at-21-12"
quantized_model_dir

In [None]:
reset_notebook_kernel()

## Validate merged model

In [25]:
%autoreload

# model_id         = "/var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16"
model_id         = "/var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16/autoround-4-bits-sym.gptq/2025-02-11-at-21-12"
model_name       = "Ministral-8B-Instruct-2410"
test_train_path  = f"{gib_root}/src/ephemera/prompts/data"
# lora_dir         = f"{models_root}/{model_name}.lora"

trainer = PeftTrainer( model_id, model_name, test_train_path, debug=True )

trainer.set_gib_env_vars( gib_root=gib_root )

------------------------------------------------------------------------------------------------------------------------
- Initializing PEFT Trainer for Ministral-8B-Instruct-2410
------------------------------------------------------------------------------------------------------------------------

Model ID: /var/model/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-08-at-16-16/autoround-4-bits-sym.gptq/2025-02-11-at-21-12
Path to test/train data: /var/model/genie-in-the-box/src/ephemera/prompts/data


## run validation, using standalone server

In [14]:
from cosa.agents.llm import Llm

In [24]:
%autoreload
# llm = Llm( switch="deepily" )
Llm.DEEPILY_PREFIX

# llm = Llm( default_url="http://blah.blah.com" )
Llm.get_model( "/mnt/foo" )


'Deepily//mnt/foo'

In [26]:
quantized_model_dir = "/mnt/DATA01/include/www.deepily.ai/projects/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-12-at-02-05/autoround-4-bits-sym.gptq/2025-02-12-at-02-27"
model = Llm.get_model( quantized_model_dir )

In [27]:
%autoreload

# this one calls an LLM server, like vLLM or TGI

stats_df = trainer.run_validation_with_server(
    model=model, switch="deepily", device_map="cuda:0", sample_size=100, debug=True, verbose=True
)
print()
stats_df

------------------------------------------------------------------------------------------------------------------------
- Querying an LLM server w/ model [Deepily//mnt/DATA01/include/www.deepily.ai/projects/models/Ministral-8B-Instruct-2410.lora/merged-on-2025-02-12-at-02-05/autoround-4-bits-sym.gptq/2025-02-12-at-02-27]
------------------------------------------------------------------------------------------------------------------------

Updating the prompt field for [100] rows...
Updating the prompt field for [100] rows... Done!
------------------------------------------------------------------------------------------------------------------------
- Validating Ministral-8B-Instruct-2410 w/ 100 samples...
------------------------------------------------------------------------------------------------------------------------

command
go to new tab                                        8
search google new tab                                7
search perplexity new tab                

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=3000): Max retries exceeded with url: /v1/completions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7efaf3157cd0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [None]:
stats_df = trainer.run_validation_in_memory( switch="huggingface", device_map="cuda:0", sample_size=10, debug=False, verbose=False )
stats_df

## run validation, using in memory LLM

In [None]:
%autoreload

# trainer.set_gib_env_vars( gib_root=gib_root )
# checkpoint_dir = "/var/model/models/Ministral-8B-Instruct-2410.lora/training-2025-02-06-at-21-28/checkpoint-987"

# this one calls in LLM in memory
stats_df = trainer.run_validation_in_memory( switch="huggingface", device_map="cuda:0", sample_size=100, debug=True, verbose=False )
stats_df

## run this after tokenizer is initialized below

In [11]:
import pandas as pd
%autoreload

def run_validation( model, tokenizer, model_name="ministral/Ministral-3b-instruct", device="cuda:1", sample_size=1000, debug=False, verbose=False ):

    df = pd.read_json(
        "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True
    ).sample( sample_size, random_state=42 )

    du.print_banner( f"Validating {model_name} w/ {sample_size} samples" )
    # Print value counts for the command column to see how many unique commands we have
    print( df.command.value_counts(), end="\n\n" )

    xml_ftp_generator = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", debug=debug, verbose=verbose )

    df = xml_ftp_generator.generate_responses(
        df, tokenizer=tokenizer, model=model, switch="huggingface", model_name=model_name, device=device, debug=debug, verbose=verbose
    )
    df = xml_ftp_generator.validate_responses( df )

    xml_ftp_generator.print_validation_stats( df, title=f"Validation stats for model {model_name}" )
    
    return df

In [17]:
# ! ls -alh /var/model/models/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/b70aa86578567ba3301b21c8a27bea4e8f6d6d61
! ls -alh /var/model/models/hub

total 48K
drwxrwxr-x 11 1001 1001 4.0K Jan 25 01:52 .
drwxrwxr-x 15 1001 1001 4.0K Jan 25 02:42 ..
drwxrwxr-x  5 1001 1001 4.0K Dec 20 23:09 .venv
drwxrwxr-x  6 1001 1001 4.0K Jan 15 19:14 datasets--NeelNanda--pile-10k
drwxrwxr-x  6 1001 1001 4.0K Dec 20 23:21 models--kaitchup--Phi-4-AutoRound-GPTQ-4bit
drwxrwxr-x  6 1001 1001 4.0K Dec 18 03:51 models--kaitchup--Qwen2.5-Coder-32B-Instruct-AutoRound-GPTQ-4bit
drwxrwxr-x  6 1001 1001 4.0K Jan 15 18:34 models--meta-llama--Llama-3.2-3B-Instruct
drwxrwxr-x  6 1001 1001 4.0K Dec 13 22:17 models--ministral--Ministral-3b-instruct
drwxrwxr-x  6 1001 1001 4.0K Jan 25 01:26 models--mistralai--Ministral-8B-Instruct-2410
drwxrwxr-x  6 1001 1001 4.0K Jan 24  2024 models--mistralai--Mistral-7B-Instruct-v0.2
drwxr-xr-x  2 1001 1001 4.0K Dec 13 19:41 models--mistralai--Mistral-7B-v0.1
-rw-rw-r--  1 1001 1001    1 Dec 18 03:43 version.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Load model and tokenizer Using bits and bites quantization or bfloat16?

In [22]:
# get HF_HOME from environment
os.environ[ "HF_HOME" ]


'/var/model/models'

In [16]:
import torch

def get_base_model_and_tokenizer( model_path=".", tokenizer_path=".", torch_dtype=torch.bfloat16, use_bnb_quantization=False, device_map="auto", cache_dir=f"{models_root}/hub" ):

    compute_dtype = getattr( torch, "float16" )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype
    )
    if use_bnb_quantization:

        print( bnb_config )

        # ¡OJO! Why were we turning off the cash here? It makes a big performance difference: 21 vs 14 tokens per second
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path, quantization_config=bnb_config, device_map=device_map, low_cpu_mem_usage=True, use_cache=True,
            local_files_only=True, cache_dir=cache_dir,
            attn_implementation="flash_attention_2",
            # use_auth_token=auth_token,
            # token=auth_token
        )
    else:
        print( "Loading without BitsAndBytesConfig..." )
        print( "HF_HOME: " + os.environ[ "HF_HOME" ] )
        print( "HF_HUB_ETAG_TIMEOUT: " + os.environ[ "HF_HUB_ETAG_TIMEOUT" ] )
        print( "HF_HUB_DOWNLOAD_TIMEOUT: " + os.environ[ "HF_HUB_DOWNLOAD_TIMEOUT" ] )
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map=device_map, low_cpu_mem_usage=True, use_cache=True,
            torch_dtype=torch_dtype, local_files_only=True, cache_dir=cache_dir,
            attn_implementation="flash_attention_2",
            # use_auth_token=auth_token,
            # token=auth_token
        )

    tokenizer              = AutoTokenizer.from_pretrained( tokenizer_path, force_download=True, from_slow=False )
    tokenizer.pad_token    = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model, tokenizer


In [8]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. PyTorch can use the GPU.")
else:
    print("CUDA is not available. PyTorch will use the CPU.")



CUDA is available. PyTorch can use the GPU.


In [12]:
os.chdir( f"{models_root}/" )

In [34]:
model_path = "mistralai/Ministral-8B-Instruct-2410"

In [None]:
print( os.getcwd() )
base_model, tokenizer = get_base_model_and_tokenizer(
    model_path=model_path,
    tokenizer_path=model_path,
    use_bnb_quantization=False,
    device_map="auto"
)

In [14]:
! pwd

/var/model/models


In [25]:
base_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(131072, 4096)
    (layers): ModuleList(
      (0-35): 36 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
          (down_proj): Linear(in_features=12288, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
    

## Get training dataset

In [16]:
import json

In [17]:
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-train.jsonl"
deepily_dataset_train = du.get_file_as_list( path )#[ 0:10000 ]
deepily_dataset_train = [ json.loads( line ) for line in deepily_dataset_train ]
len( deepily_dataset_train )

31606

In [18]:
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-test.jsonl"
deepily_dataset_test = du.get_file_as_list( path )#[ 0:1000 ]
deepily_dataset_test = [ json.loads( line ) for line in deepily_dataset_test ]
len( deepily_dataset_test )

3951

In [20]:
# for line in prompt_instruction_format( deepily_dataset_test[ 0 ] ).split( "\n" ): print( line )

## Set up training arguments

In [21]:
from peft import LoraConfig, get_peft_config, PeftModel, PeftConfig, get_peft_model, AutoPeftModelForCausalLM

peft_config = LoraConfig(
    r=64, 
    lora_alpha=32, 
    # When target_modules was disabled, it was causing detention layers to be assigned to the CPU, throwing this runtime error:
    # RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! 
    # (when checking argument for argument mat2 in method wrapper_CUDA_mm)
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head" ], 
    lora_dropout=0.10, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [6]:
model_name = "Ministral-8B-Instruct-2410"

os.chdir( f"{models_root}/{model_name}" )
os.getcwd()

'/var/model/models/Ministral-8B-Instruct-2410'

## we need to disable peer to peer communication until the RTX 4090 drivers are updated

In [17]:
os.environ[ "NCCL_P2P_DISABLE" ] = "1"
os.environ[ "NCCL_IB_DISABLE" ] = "1"

In [25]:
from trl import SFTTrainer

# Define the training arguments
trainingArgs = TrainingArguments(
    output_dir="./training-results", # Output directory where the model predictions and checkpoints will be stored
    num_train_epochs=1, # Number of training epochs
    per_device_train_batch_size=2, # Batch size per GPU for training. https://kaitchup.substack.com/p/fine-tune-a-mixture-of-experts-on Says that using even batch size is best
    per_device_eval_batch_size=2,  # Batch size per GPU for evaluation. https://kaitchup.substack.com/p/fine-tune-a-mixture-of-experts-on Says that using even batch size is best
    gradient_accumulation_steps=8, # Number of update steps to accumulate the gradients for
    gradient_checkpointing=True,# Enable gradient checkpointing
    # optim="paged_adamw_32bit", # Optimizer to use: see kaitchup for more details: https://kaitchup.substack.com/p/fine-tuning-llms-with-32-bit-8-bit
    optim="paged_adamw_8bit",
    #save_steps=save_steps,
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    
    # Setting this may help with the warning message: The input hidden states seems to be silently casted in float32, 
    # this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
    fp16=False,
    # Test to confirm that this works!
    # BTW: according to PHIND, this may actually improve fine-tuning performance as well: https://www.phind.com/search?cache=ygn9dbyl0ij4kotmgns2nsrw
    
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    #max_steps=max_steps,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    # report_to="wandb",
    report_to="none",
    seed=42
)
# Create the trainer
trainer = SFTTrainer(
    model=base_model,
    train_dataset=deepily_dataset_train,
    eval_dataset=deepily_dataset_test,
    peft_config=peft_config,
    max_seq_length=2184, # Calculated by get_training_prompt_stats( tokenizer ), max = 728 * 3 # was: 2,048 or 4,096
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [26]:
def print_trainable_parameters( model ):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    
print_trainable_parameters( base_model )
# trainable params: 170,082,304 || all params: 7,411,814,400 || trainable%: 2.29

trainable params: 183,238,656 || all params: 8,203,046,912 || trainable%: 2.23


## Train model

In [18]:
os.chdir( f"{models_root}/{model_name}" )
print( os.getcwd() )

/var/model/models/Ministral-8B-Instruct-2410


In [None]:
trainer.train()

trainer.save_model()

print( "Model saved" )



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 0.6379, 'grad_norm': 3.7572505474090576, 'learning_rate': 6.25e-05, 'epoch': 0.00951927653498334}
{'loss': 0.4318, 'grad_norm': 2.073525905609131, 'learning_rate': 0.000125, 'epoch': 0.01903855306996668}
{'loss': 0.1912, 'grad_norm': 1.2309727668762207, 'learning_rate': 0.0001875, 'epoch': 0.028557829604950024}
{'loss': 0.1159, 'grad_norm': 0.6885170340538025, 'learning_rate': 0.00019996952581438068, 'epoch': 0.03807710613993336}
{'loss': 0.0891, 'grad_norm': 1.2253117561340332, 'learning_rate': 0.0001998457562671611, 'epoch': 0.047596382674916705}
{'loss': 0.0703, 'grad_norm': 0.9658526182174683, 'learning_rate': 0.00019962690449567912, 'epoch': 0.05711565920990005}
{'loss': 0.0631, 'grad_norm': 0.6428528428077698, 'learning_rate': 0.00019931317891052708, 'epoch': 0.06663493574488338}
{'loss': 0.0614, 'grad_norm': 0.7227765917778015, 'learning_rate': 0.0001989048782697851, 'epoch': 0.07615421227986673}
{'loss': 0.0598, 'grad_norm': 0.5716121196746826, 'learning_rate': 0.00019

## RESTART 1st time & load model and tokenizer in FP16

In [16]:
# ! accelerate estimate-memory ministral/Ministral-3b-instruct
! accelerate estimate-memory mistralai/Ministral-8B-Instruct-2410

Traceback (most recent call last):
  File "/usr/local/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/estimate.py", line 286, in estimate_command
    data = gather_data(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/estimate.py", line 253, in gather_data
    model = create_empty_model(
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/estimate.py", line 133, in create_empty_model
    raise ValueError(
ValueError: Library `vllm` is not supported yet, please open an issue on GitHub for us to add support.


In [14]:
os.chdir( f"{models_root}" )
os.getcwd()

'/var/model/models'

In [15]:
! pip show transformers

Name: transformers
Version: 4.46.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: autoawq, peft, trl


In [16]:
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path=model_path,
    tokenizer_path=model_path,
    use_bnb_quantization=False,
    device_map="auto" 
)

Loading without BitsAndBytesConfig...
HF_HOME: /var/model/models
HF_HUB_ETAG_TIMEOUT: 60
HF_HUB_DOWNLOAD_TIMEOUT: 60


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [17]:
os.getcwd()

'/var/model/models'

In [18]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410/training-results-2025.01.09

total 1.7G
drwxr-xr-x 3 root root 4.0K Jan  9 18:37 .
drwxrwxr-x 3 1001 1001 4.0K Jan  9 18:44 ..
-rw-r--r-- 1 root root 5.0K Jan  9 18:37 README.md
-rw-r--r-- 1 root root  824 Jan  9 18:37 adapter_config.json
-rw-r--r-- 1 root root 1.7G Jan  9 18:37 adapter_model.safetensors
drwxr-xr-x 2 root root 4.0K Jan  9 18:37 checkpoint-525
-rw-r--r-- 1 root root  437 Jan  9 18:37 special_tokens_map.json
-rw-r--r-- 1 root root  17M Jan  9 18:37 tokenizer.json
-rw-r--r-- 1 root root 178K Jan  9 18:37 tokenizer_config.json
-rw-r--r-- 1 root root 5.5K Jan  9 18:37 training_args.bin


In [19]:
os.environ[ "HF_HOME" ]

'/var/model/models'

In [20]:
from transformers import logging

logging.set_verbosity_debug()

In [21]:
! echo $HF_HOME 

/var/model/models


In [22]:
! ls -alh /root/.cache/huggingface/hub/

total 20K
drwxr-xr-x 4 root root 4.0K Jan  9 21:09 .
drwxr-xr-x 3 root root 4.0K Jan  9 21:08 ..
drwxr-xr-x 3 root root 4.0K Jan  9 21:09 .locks
drwxr-xr-x 6 root root 4.0K Jan  9 21:09 models--mistralai--Ministral-8B-Instruct-2410
-rw-r--r-- 1 root root    1 Jan  9 21:08 version.txt


In [23]:
from peft import PeftModel, AutoPeftModelForCausalLM

# adapter_plus_model = PeftModel.from_pretrained( base_model, "Mistral-7B-Instruct-v0.2/training-results-2024.02.05/", use_flash_attention_2=True )
# adapter_plus_model = PeftModel.from_pretrained( base_model, "Ministral-3b-instruct/training-results-2024.12.14/", use_flash_attention_2=True )
adapter_plus_model = PeftModel.from_pretrained( base_model, "Ministral-8B-Instruct-2410/training-results-2025.01.09/", use_flash_attention_2=True )

Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1
Detected flash_attn version: 2.7.2.post1


In [24]:
dupt.print_device_allocation( adapter_plus_model )

base_model.model.model.embed_tokens.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: cuda:0
base_model.model.

## TEST model on validation dataset using adapter loaded on top

In [29]:
stats_df = run_validation( adapter_plus_model, tokenizer, sample_size=1000, model_name=model_name, device="cuda:1" )

# Generating responses for 1,000 rows... Done! in 18:09
# [1,089.6] ms per item
#
# ------------------------------------------------------------------------------------------------------------------------
# - Validation stats for model ministral/Ministral-3b-instruct
# ------------------------------------------------------------------------------------------------------------------------
#
#                Is valid xml 100.0%
#         Contains <response> 100.0%
#          Contains <command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.6%
# Response has correct values 99.6%
#          Command is correct 99.9%
#             Args is correct 99.7%

# Generating responses for 1,000 rows... Done! in 34:46
# [2086.8] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 0.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.5%
# Response has correct values 99.5%
#  Browser command is correct 99.6%
#             Args is correct 99.9%

------------------------------------------------------------------------------------------------------------------------
- Validating Ministral-8B-Instruct-2410 w/ 1000 samples
------------------------------------------------------------------------------------------------------------------------

command
search google new tab                                58
search phind current tab                             58
go to current tab                                    57
agent router go to weather                           55
search google scholar current tab                    55
search phind new tab                                 53
go to new tab                                        53
search kagi current tab                              52
agent router go to date and time                     52
search google current tab                            49
search kagi new tab                                  48
agent router go to receptionist                      48
search current tab   

## Perform a 16bit merge & write to disk

In [10]:
import os
merge_date = "2025-01-09"
# merge_date = du.get_current_date()

os.chdir( f"{models_root}/{model_name}" )
merged_path = "./merged-00-" + merge_date
os.getcwd(), merged_path

('/var/model/models/Ministral-8B-Instruct-2410', './merged-00-2025-01-09')

In [31]:
adapter_plus_model = adapter_plus_model.merge_and_unload()
adapter_plus_model.save_pretrained( merged_path, safe_serialization=True )

Configuration saved in ./merged-00-2025-01-09/config.json
Configuration saved in ./merged-00-2025-01-09/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at ./merged-00-2025-01-09/model.safetensors.index.json.


In [32]:
tokenizer.save_pretrained( merged_path, safe_serialization=True )

tokenizer config file saved in ./merged-00-2025-01-09/tokenizer_config.json
Special tokens file saved in ./merged-00-2025-01-09/special_tokens_map.json


('./merged-00-2025-01-09/tokenizer_config.json',
 './merged-00-2025-01-09/special_tokens_map.json',
 './merged-00-2025-01-09/tokenizer.json')

In [39]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410/merged-00-2025-01-09

total 15G
drwxr-xr-x 2 root root 4.0K Jan  9 21:56 .
drwxrwxr-x 4 1001 1001 4.0K Jan  9 21:55 ..
-rw-r--r-- 1 root root  678 Jan  9 21:55 config.json
-rw-r--r-- 1 root root  111 Jan  9 21:55 generation_config.json
-rw-r--r-- 1 root root 4.7G Jan  9 21:55 model-00001-of-00004.safetensors
-rw-r--r-- 1 root root 4.7G Jan  9 21:55 model-00002-of-00004.safetensors
-rw-r--r-- 1 root root 4.7G Jan  9 21:55 model-00003-of-00004.safetensors
-rw-r--r-- 1 root root 1.1G Jan  9 21:55 model-00004-of-00004.safetensors
-rw-r--r-- 1 root root  27K Jan  9 21:55 model.safetensors.index.json
-rw-r--r-- 1 root root  437 Jan  9 21:56 special_tokens_map.json
-rw-r--r-- 1 root root  17M Jan  9 21:56 tokenizer.json
-rw-r--r-- 1 root root 178K Jan  9 21:56 tokenizer_config.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## RESTART 2nd time & load merged model + tokenizer in bfloat16

In [38]:
def reset_kernel( models ):
    
    import gc
    from IPython import get_ipython
    
    for model in models:
        del model
    
    gc.collect()
    get_ipython().kernel.do_shutdown(restart=True)

In [39]:
def reset_environment():
    
    %load_ext autoreload
    %autoreload
    

In [3]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410/

total 20K
drwxrwxr-x 5 1001 1001 4.0K Jan 13 16:29 .
drwxrwxr-x 9 1001 1001 4.0K Jan  9 02:59 ..
drwxr-xr-x 2 root root 4.0K Jan  9 21:56 merged-00-2025-01-09
drwxr-xr-x 2 root root 4.0K Jan 13 16:29 merged-00-2025-01-09.awq
drwxr-xr-x 3 root root 4.0K Jan  9 18:37 training-results-2025.01.09


In [41]:
reset_kernel( [ base_model, tokenizer, adapter_plus_model ] )
reset_environment()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

set_gib_env_vars()
set_hf_env_vars()

os.chdir( f"{models_root}/{model_name}/merged-00-{merge_date}" )
print( os.getcwd() )

merged_model, merged_tokenizer = get_base_model_and_tokenizer( 
    use_bnb_quantization=False, 
    device_map="cuda:1",
    torch_dtype=torch.float16
)


/var/model/models
60
60
/var/model/models/Ministral-8B-Instruct-2410/merged-00-2025-01-09
Loading without BitsAndBytesConfig...
HF_HOME: /var/model/models
HF_HUB_ETAG_TIMEOUT: 60
HF_HUB_DOWNLOAD_TIMEOUT: 60


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Raw merged model in bfloat16
```
Wed Dec 18 15:10:23 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   37C    P8              25W / 450W |      6MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   45C    P8              21W / 450W |   6776MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    1   N/A  N/A     33585      C   /usr/bin/python3                           6766MiB |
+---------------------------------------------------------------------------------------+
```

In [15]:
%autoreload
stats_df = run_validation( merged_model, merged_tokenizer, model_name=model_name, device="cuda:1", sample_size=100 )
stats_df

UsageError: Line magic function `%autoreload` not found.


```
Generating responses for 100 rows... Done! in 50 seconds
[501.0] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation stats for model ministral/Ministral-3b-instruct
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 100.0%
        Contains <response> 100.0%
         Contains <command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
         Command is correct 100.0%
            Args is correct 100.0%

Wed Dec 18 15:27:21 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   36C    P8              25W / 450W |      6MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   43C    P8              21W / 450W |   7020MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    1   N/A  N/A     33585      C   /usr/bin/python3                           7010MiB |
+---------------------------------------------------------------------------------------+
```

```
Generating responses for 100 rows... Done! in 02:19
[1390.6] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 0.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%

Exact same model loaded two different ways:
0: Using TGI with & w/o --dtype bfloat16 flag
   docker run --name huggingface-tgi --gpus all --shm-size 1g -p 3000:3000 -v `pwd`:/data/model  ghcr.io/huggingface/text-generation-inference:1.3.4 --dtype bfloat16 --sharded false --num-shard 1 --port 3000 --model-id /data/model --quantize awq

1: Using jupyter notebook with raw model file: 
   low_cpu_mem_usage=True, 
   use_cache=True, 
   attn_implementation="flash_attention_2",
   torch_dtype=torch.bfloat16

Wed Jan 24 11:27:02 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   39C    P2              69W / 450W |  23146MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   43C    P8              24W / 450W |  15366MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     10768      C   /opt/conda/bin/python3.10                 23136MiB |
|    1   N/A  N/A      7765      C   /usr/bin/python3                          15356MiB |
+---------------------------------------------------------------------------------------+
```

## Run benchmark on TGI service listening on port 3000

In [30]:
reset_kernel( [ merged_model, merged_tokenizer ] )
reset_environment()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
%autoreload

# from ephemera.prompts.xml_fine_tuning_prompt_generator import XmlFineTuningPromptGenerator
import pandas as pd

# tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://192.168.1.21:3000", debug=True )
# tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://172.17.0.3:3000", debug=True )
tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://127.0.0.1:3000/v1", debug=True ) 

# model_name     = "ministral/Ministral-3b-instruct-raw-bfloat16"

sample_size    = 10
validate_df    = pd.read_json( "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True ).sample( sample_size, random_state=42 )
validate_df    = tgi_validator.generate_responses( validate_df, switch="tgi", model_name=model_name )
validate_df    = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats( validate_df, title=f"Validation Stats for {sample_size} rows with `{model_name}` on TGI:3000" )

# Generating responses for 10 rows... Done! in 7 seconds
# [771.2] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for 10 rows with `mistralai/Mistral-7B-Instruct-v0.2-raw-bfloat16` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 100.0%
# Response has correct values 100.0%
#  Browser command is correct 100.0%
#             Args is correct 100.0%

Reusing ConfigurationManager() singleton...
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [74]
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [148]
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [296]
Commands file for command [go to current tab] exists: True
Commands file for command [go to new tab] exists: True
Commands file for command [search current tab] exists: True
Commands file for command [search new tab] exists: True
Commands file for command [search google current tab] exists: True
Commands file for command [search google new tab] exists: True
Commands file for command [search google scholar current tab] exists: True
Commands file for command [search google scholar new tab] exists: True
Commands file for command [search kagi new tab] exists: True
Commands file for command [search kagi current tab] exists: True
Commands file for command [search perplexity current



ConnectionError: (MaxRetryError("HTTPConnectionPool(host='127.0.0.1', port=3000): Max retries exceeded with url: /v1 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f78d1acd1e0>: Failed to establish a new connection: [Errno 111] Connection refused'))"), '(Request ID: a83b0645-60e5-49c4-bde2-5df9b885c13f)')

## Quantize using AutoRound and write to disk

In [24]:
!pip install auto-round

[0m

In [39]:
! pip install --upgrade optimum
# ! pip install --upgrade auto-gptq

[0m

In [36]:
# ! pip show optimum

In [11]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# model_name = "Qwen/Qwen2.5-72B-Instruct"
# model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16 )
# tokenizer = AutoTokenizer.from_pretrained( model_name )

from auto_round import AutoRound

bits, group_size, sym = 4, 128, True

autoround = AutoRound( merged_model, merged_tokenizer, nsamples=128, iters=512, low_gpu_mem_usage=True, batch_size=1, graddient_accumulation_steps=8, bits=bits, group_size=group_size, sym=sym)


autoround.quantize()

2025-01-13 17:39:14,365 INFO utils.py L149: Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-01-13 17:39:14,365 INFO utils.py L162: NumExpr defaulting to 16 threads.
[38;20m2025-01-13 17:39:20 INFO utils.py L577: Using GPU device[0m
[38;20m2025-01-13 17:39:20 INFO autoround.py L230: using torch.float16 for quantization tuning[0m
[38;20m2025-01-13 17:39:20 INFO autoround.py L300: start to cache block inputs[0m
2025-01-13 17:39:21,023 INFO config.py L54: PyTorch version 2.5.1 available.


README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

(…)-00000-of-00001-4746b8785c874cc7.parquet:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

[38;20m2025-01-13 17:39:42 INFO autoround.py L305: caching done[0m
  return F.linear(input, self.weight, self.bias)
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
  return F.linear(input, self.weight, self.bias)
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
  return F.linear(x, weight, bias)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Quantizing model.layers.35: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [21:23<00:00, 35.67s/it][38;20m2025-01-13 18:01:06 INFO autoround.py L340: quantization tuning time 1305.894364118576[0m
[38;20m2025-01-13 18:01:06 INFO autoround.py L356: Summary: quantized 252/253 in the model,  ['lm_head'] have not been quantized[0m
Quantizing model.layers.35: 100%|█████████████████████████████████████████████████████████████

(MistralForCausalLM(
   (model): MistralModel(
     (embed_tokens): Embedding(131072, 4096)
     (layers): ModuleList(
       (0-35): 36 x MistralDecoderLayer(
         (self_attn): MistralFlashAttention2(
           (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
           (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
           (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (rotary_emb): MistralRotaryEmbedding()
         )
         (mlp): MistralMLP(
           (gate_proj): Linear(in_features=4096, out_features=12288, bias=False)
           (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
           (down_proj): Linear(in_features=12288, out_features=4096, bias=False)
           (act_fn): SiLU()
         )
         (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
         (post_attention_layernorm): MistralRMSNorm((4

In [14]:
os.chdir( f"{models_root}/{model_name}" )
os.getcwd()

'/var/model/models/Ministral-8B-Instruct-2410'

In [19]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410/

total 24K
drwxrwxr-x 6 1001 1001 4.0K Jan 13 18:04 .
drwxrwxr-x 9 1001 1001 4.0K Jan  9 02:59 ..
drwxr-xr-x 2 root root 4.0K Jan  9 21:56 merged-00-2025-01-09
drwxr-xr-x 2 root root 4.0K Jan 13 16:29 merged-00-2025-01-09.awq
drwxr-xr-x 2 root root 4.0K Jan 13 18:06 merged-00-2025-01-09.gptq
drwxr-xr-x 3 root root 4.0K Jan  9 18:37 training-results-2025.01.09


In [20]:
# Save quantized model
gptq_path = merged_path + ".gptq"
print( gptq_path)
# autoround.save_quantized( gptq_path, format='auto_gptq', inplace=True )

./merged-00-2025-01-09.gptq


In [23]:
# release memory
reset_kernel( [ merged_model, merged_tokenizer ] )

In [12]:
gptq_path

'./merged-00-2025-01-09.gptq'

In [13]:
# Load quantized model
from transformers import AutoTokenizer, AutoModelForCausalLM

gptq_model = AutoModelForCausalLM.from_pretrained( gptq_path, device_map="cuda:1" )
gptq_tokenizer = AutoTokenizer.from_pretrained( gptq_path )

  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Quantize using AWQ (Adaptive Weight Quantization) and write to disk

In [36]:
stats_df = run_validation( gptq_model, gptq_tokenizer, model_name="ministral/Ministral-8B-Instruct-2410", sample_size=10, debug=True, verbose=True )
stats_df

------------------------------------------------------------------------------------------------------------------------
- Validating ministral/Ministral-8B-Instruct-2410 w/ 10 samples
------------------------------------------------------------------------------------------------------------------------

command
search kagi current tab                              2
search google new tab                                1
agent router go to receptionist                      1
search google current tab                            1
search perplexity new tab                            1
search google scholar using clipboard current tab    1
search phind current tab                             1
agent router go to date and time                     1
go to current tab                                    1
Name: count, dtype: int64

Reusing ConfigurationManager() singleton...
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [74]
Inserting DUPLICATE placehold

KeyboardInterrupt: 

In [40]:
reset_kernel( [ gptq_tokenizer, gptq_model ] )

In [3]:
reset_environment()
import os

In [6]:
# !pip install autoawq

In [11]:
os.chdir( f"{models_root}/Ministral-3b-instruct/" )
print( os.getcwd() )

/var/model/models/Ministral-3b-instruct


In [8]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4 }

# Load model and tokenizer
raw_16bit_model     = AutoAWQForCausalLM.from_pretrained( merged_path, device_map="auto", safetensors=True )
raw_16bit_tokenizer = AutoTokenizer.from_pretrained( merged_path, use_fast=True )

# Quantize
raw_16bit_model.quantize( raw_16bit_tokenizer, quant_config=quant_config )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


README.md:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


val.jsonl.zst:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214670 [00:00<?, ? examples/s]

AWQ: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [10:28<00:00, 17.46s/it]


In [9]:
# Save quantized model
awq_path = merged_path + ".awq"
raw_16bit_model.save_quantized( awq_path, safetensors=True )
raw_16bit_tokenizer.save_pretrained( awq_path )

('./merged-00-2025-01-09.awq/tokenizer_config.json',
 './merged-00-2025-01-09.awq/special_tokens_map.json',
 './merged-00-2025-01-09.awq/tokenizer.json')

## GPU RAM after quantizing model with 4bit AWQ
```
Wed Jan 24 12:06:36 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   37C    P8              27W / 450W |   1320MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   44C    P8              24W / 450W |   2084MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     15181      C   /usr/bin/python3                           1310MiB |
|    1   N/A  N/A     15181      C   /usr/bin/python3                           2074MiB |
+---------------------------------------------------------------------------------------+
```

## Validate AWQ model: In memory loaded by Jupiter notebook

In [12]:
reset_kernel( [ raw_16bit_model, raw_16bit_tokenizer ] )

In [6]:
reset_environment()
set_gib_env_vars()
import os

print( os.getcwd() )
os.chdir( "/var/model/genie-in-the-box/src" )
print( os.getcwd() )
import cosa.utils.util         as du
import cosa.utils.util_xml     as dux
import cosa.utils.util_pytorch as dupt

from cosa.training.xml_fine_tuning_prompt_generator import XmlFineTuningPromptGenerator


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/var/model/genie-in-the-box/src
/var/model/genie-in-the-box/src


In [7]:
import os
os.chdir( f"{models_root}/{model_name}/" )
print( os.getcwd() )

/var/model/models/Ministral-8B-Instruct-2410


In [8]:
! ls -alh /var/model/models/Ministral-8B-Instruct-2410/merged-00-2025-01-09.awq

total 5.4G
drwxr-xr-x 2 root root 4.0K Jan 13 16:29 .
drwxrwxr-x 6 1001 1001 4.0K Jan 13 18:04 ..
-rw-r--r-- 1 root root  843 Jan 13 16:29 config.json
-rw-r--r-- 1 root root  132 Jan 13 16:29 generation_config.json
-rw-r--r-- 1 root root 4.4G Jan 13 16:29 model-00001-of-00002.safetensors
-rw-r--r-- 1 root root 1.1G Jan 13 16:29 model-00002-of-00002.safetensors
-rw-r--r-- 1 root root  67K Jan 13 16:29 model.safetensors.index.json
-rw-r--r-- 1 root root  551 Jan 13 16:29 special_tokens_map.json
-rw-r--r-- 1 root root  17M Jan 13 16:29 tokenizer.json
-rw-r--r-- 1 root root 178K Jan 13 16:29 tokenizer_config.json


In [13]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

awq_path      = f"./merged-00-{merge_date}.awq"
model_aqw     = AutoAWQForCausalLM.from_pretrained( awq_path, device_map="cuda:1", safetensors=True )
tokenizer_awq = AutoTokenizer.from_pretrained( awq_path, use_fast=True )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
stats_df = run_validation( model_aqw, tokenizer_awq, sample_size=100 )
stats_df

------------------------------------------------------------------------------------------------------------------------
- Validating ministral/Ministral-3b-instruct w/ 100 samples
------------------------------------------------------------------------------------------------------------------------

command
go to new tab                                        8
search perplexity new tab                            7
search google new tab                                7
agent router go to calendar                          7
search perplexity current tab                        7
go to current tab                                    6
search google scholar current tab                    6
search kagi new tab                                  6
search kagi current tab                              5
search phind new tab                                 5
search google current tab                            5
agent router go to receptionist                      4
agent router go to date and t

Unnamed: 0,command,instruction,input,output,prompt,gpt_message,response,response_xml_is_valid,contains_response,contains_command,contains_args,response_is_exact,response_has_correct_values,command_is_correct,args_is_correct
3136,search google new tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search google new tab</comm...,True,True,True,True,True,True,True,True
2118,agent router go to receptionist,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>age...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>agent router go to receptio...,True,True,True,True,True,True,True,True
1811,search kagi current tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search kagi current tab</co...,True,True,True,True,True,True,True,True
70,search google current tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search google current tab</...,True,True,True,True,True,True,True,True
2609,search perplexity new tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search perplexity new tab</...,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2528,search new tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search new tab</command><ar...,True,True,True,True,True,True,True,True
3386,search current tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search current tab</command...,True,True,True,True,True,True,True,True
3731,search phind current tab,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>sea...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>search phind current tab</c...,True,True,True,True,True,True,True,True
1006,agent router go to calendar,Your job is to discern the intent of a human v...,\n Below is the raw human voice command...,\n <response>\n <command>age...,### Instruction:\n \n Use the Task a...,"{'messages': [{'role': 'system', 'content': 'I...",<response><command>agent router go to calendar...,True,True,True,True,True,True,True,True


In [18]:
reset_kernel( [ model_aqw, tokenizer_awq ] )

## GPU RAM after loading & validating AWQ model with 4bit AWQ: Device 1
```
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
| 65%   49C    P2              73W / 450W |   2700MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
```

```
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   43C    P8              22W / 450W |   5578MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
```

## - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2: ~40 Tokens/sec
```
Generating responses for 100 rows... Done! in 01:41
[1014.6] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 100.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%
```

## Validate AWQ model: TGI service listening on port 3000

In [11]:
reset_environment()
set_gib_env_vars()
%autoreload

import os
print( os.getcwd() )
os.chdir( "/var/model/genie-in-the-box/src" )
print( os.getcwd() )
import lib.utils.util         as du
import lib.utils.util_xml     as dux
import lib.utils.util_pytorch as dupt

from ephemera.prompts.xml_fine_tuning_prompt_generator import XmlFineTuningPromptGenerator


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/var/model/genie-in-the-box/src
/var/model/genie-in-the-box/src


In [15]:
%autoreload
tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://192.168.1.21:3000", debug=True )

model_name = "Ministral-8B-Instruct-2410"
# model_name     = "ministral/Ministral-3b-instruct-AWQ"
# model_name     = "Phind-CodeLlama-34B-v2 w/ BnB 4nf"

validate_df    = pd.read_json( "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True )

validate_df    = tgi_validator.generate_responses( validate_df, switch="tgi", model_name=model_name )
validate_df    = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats( validate_df, title=f"Validation Stats for `{model_name}` on TGI:3000" )

# Generating responses for 100 rows... Done! in 50 seconds
# [502.1] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-AWQ` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 100.0%
# Response has correct values 100.0%
#  Browser command is correct 100.0%
#             Args is correct 100.0%


# Generating responses for 100 rows... Done! in 01:12
# [722.3] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-BnB-4nf` on TGI:3000 with BnB 4nf 
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%


# Generating responses for 100 rows... Done! in 02:26
# [1461.4] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `Phind-CodeLlama-34B-v2 w/ BnB 4nf` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 42.0%
# Response has correct values 42.0%
#  Browser command is correct 46.0%
#             Args is correct 82.0%
# 
# Mon Jan 22 13:23:25 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
# |-----------------------------------------+----------------------+----------------------+
# | GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
# |                                         |                      |               MIG M. |
# |=========================================+======================+======================|
# |   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
# |  0%   40C    P8              29W / 450W |  18064MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# |   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
# |  0%   45C    P8              22W / 450W |   4994MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# 
# +---------------------------------------------------------------------------------------+
# | Processes:                                                                            |
# |  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
# |        ID   ID                                                             Usage      |
# |=======================================================================================|
# |    0   N/A  N/A     22240      C   /opt/conda/bin/python3.10                 18054MiB |
# |    1   N/A  N/A     23207      C   /usr/bin/python3                           4984MiB |
# # +---------------------------------------------------------------------------------------+

Reusing ConfigurationManager() singleton...
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [74]
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [148]
Inserting DUPLICATE placeholders into the list. Requested length [500] > list length [296]
Commands file for command [go to current tab] exists: True
Commands file for command [go to new tab] exists: True
Commands file for command [search current tab] exists: True
Commands file for command [search new tab] exists: True
Commands file for command [search google current tab] exists: True
Commands file for command [search google new tab] exists: True
Commands file for command [search google scholar current tab] exists: True
Commands file for command [search google scholar new tab] exists: True
Commands file for command [search kagi new tab] exists: True
Commands file for command [search kagi current tab] exists: True
Commands file for command [search perplexity current



ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), '(Request ID: 44d30b0e-26f9-4061-927f-bdb742745517)')

In [19]:
reset_kernel( [ model_aqw, tokenizer_awq ] )

NameError: name 'model_aqw' is not defined

## GPU ram after loading AWQ model: ~83 Tokens/s!
```
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   42C    P2              70W / 450W |  20240MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

Generating responses for 100 rows... Done! in 50 seconds
[502.1] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-AWQ` on TGI:3000
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 100.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%
```

## See: [Phind advice for freeing GPU RAM](https://www.phind.com/search?cache=kh81ys0uelwxs8zpykdzv0d8)
### It worked... Once?!?  

In [5]:
# Accomplishes the same thing

dupt.release_gpu_memory( model_aqw )

# import gc
# import torch
# 
# model_aqw.device = torch.device( "cpu" )
# tokenizer_awq.device = torch.device( "cpu" )
# 
# model_aqw     = None
# tokenizer_awq = None
# 
# gc.collect()
# torch.cuda.empty_cache()