In [5]:
# Load the IPython autoreload extension so that edited project modules can be re-imported without restarting the kernel.
# NOTE(review): no `%autoreload <mode>` is configured in this cell, so reloading presumably only happens when
# `%autoreload` is invoked explicitly in a later cell — confirm.
%load_ext autoreload

In [6]:
import os
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

import json
import wandb
from tqdm import tqdm

In [7]:
from xmlschema import XMLSchema

# Switch into the project source directory before importing the project packages below.
# NOTE(review): presumably required so that `lib` and `ephemera` resolve as top-level packages — confirm.
os.chdir( "/var/model/genie-in-the-box/src" )
print( os.getcwd() )
import lib.utils.util         as du
import lib.utils.util_xml     as dux
import lib.utils.util_pytorch as dupt

from ephemera.prompts.xml_fine_tuning_prompt_generator import XmlFineTuningPromptGenerator


/var/model/genie-in-the-box/src


In [8]:
# Disabled commands that target a different model directory (Phind-CodeLlama-34B-v2), kept for reference:
# !ls -alh /var/model/Phind-CodeLlama-34B-v2
# os.chdir( "/var/model/Phind-CodeLlama-34B-v2" )
# os.getcwd()

# List the contents of the local models directory
! ls -alh /var/model/models

total 28K
drwxrwxr-x  6 1001 1001 4.0K Jan 18 19:13 .
drwxr--r-- 35 1001 1001 4.0K Jan 20 22:24 ..
drwxr-xr-x  3 root root 4.0K Jan 18 15:34 .locks
drwxrwxr-x  8 1001 1001 4.0K Jan 20 23:58 Mistral-7B-Instruct-v0.2
drwxr-xr-x  6 1001 1001 4.0K Jan 18 15:35 models--bigscience--bloom-560m
drwxrwxr-x  6 1001 1001 4.0K Jan 18 15:59 models--mistralai--Mistral-7B-Instruct-v0.2
-rw-r--r--  1 1001 1001    1 Jan 18 15:35 version.txt


## Load model and tokenizer in 16-bit precision (bfloat16)

In [5]:
def get_base_model_and_tokenizer( model_path=".", tokenizer_path=".", use_bnb_cuantization=False, device_map="auto" ):
    """
    Loads a causal language model and its tokenizer.

    Args:
        model_path: Model location or identifier handed to AutoModelForCausalLM.from_pretrained.
        tokenizer_path: Tokenizer location or identifier handed to AutoTokenizer.from_pretrained.
        use_bnb_cuantization: When True, load the model with a 4-bit NF4 BitsAndBytesConfig; otherwise load the
            weights as torch.bfloat16. (The spelling is kept as-is so that existing keyword callers keep working.)
        device_map: Device placement passed straight through to from_pretrained.

    Returns:
        A ( base_model, tokenizer ) tuple. The tokenizer pads on the right and reuses its EOS token as pad token.
    """
    # Arguments common to both loading paths.
    # NOTE: use_cache used to be turned off here; it makes a big performance difference: 21 vs 14 tokens per second
    shared_kwargs = dict(
        device_map=device_map, low_cpu_mem_usage=True, use_cache=True, attn_implementation="flash_attention_2"
    )

    if use_bnb_cuantization:

        # Only build the quantization config when it is actually going to be used.
        # NOTE(review): the compute dtype here is float16 while the unquantized path loads bfloat16 — confirm the
        # difference is intentional.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
        print( bnb_config )

        base_model = AutoModelForCausalLM.from_pretrained(
            model_path, quantization_config=bnb_config, **shared_kwargs
        )
    else:
        print( "Loading without BitsAndBytesConfig..." )
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.bfloat16, **shared_kwargs
        )

    tokenizer              = AutoTokenizer.from_pretrained( tokenizer_path )
    tokenizer.pad_token    = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model, tokenizer
    

In [8]:
# Work from the shared models directory; the relative paths used in later cells are resolved against it
os.chdir( "/var/model/models/" )
print( os.getcwd() )
# Load the unquantized base model (bfloat16 path of the helper) and let `device_map="auto"` decide placement
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path="mistralai/Mistral-7B-Instruct-v0.2", 
    tokenizer_path="mistralai/Mistral-7B-Instruct-v0.2", 
    use_bnb_cuantization=False, 
    device_map="auto" 
)

/var/model/models
Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import pandas as pd

def run_validation(
    model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1",
    validation_path="/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl",
    sample_size=100, random_state=42, path_prefix="/var/model/genie-in-the-box"
):
    """
    Generates responses for a random sample of the validation set and prints validation statistics.

    Args:
        model: In-memory model handed to XmlFineTuningPromptGenerator.generate_responses.
        tokenizer: Tokenizer handed to generate_responses alongside the model.
        model_name: Label passed to generate_responses and used in the title of the printed statistics.
        device: Device string forwarded to generate_responses.
        validation_path: JSON-lines file holding the validation examples.
        sample_size: Maximum number of rows to sample; capped at the number of rows available.
        random_state: Seed for the row sampling, so repeated runs evaluate the same rows.
        path_prefix: Project root handed to XmlFineTuningPromptGenerator.

    Returns:
        None. Progress and statistics are printed.
    """
    df = pd.read_json( validation_path, lines=True )

    # DataFrame.sample raises ValueError when asked for more rows than exist, so cap the request
    df = df.sample( min( sample_size, len( df ) ), random_state=random_state )

    print( "validate_df.shape", df.shape )

    xml_ftp_generator = XmlFineTuningPromptGenerator( path_prefix=path_prefix, debug=True, verbose=False )

    df = xml_ftp_generator.generate_responses(
        df, tokenizer=tokenizer, model=model, switch="huggingface", model_name=model_name, device=device
    )
    df = xml_ftp_generator.validate_responses( df )

    xml_ftp_generator.print_validation_stats( df, title=f"Validation stats for model {model_name}" )

## Set up Weights & Biases (W&B)

In [5]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mricardo-felipe-ruiz[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
%env WANDB_PROJECT="Mistral-7B-Instruct-v0.2"

env: WANDB_PROJECT="Mistral-7B-Instruct-v0.2"


In [9]:
pwd

'/var/model/models'

In [10]:
! ls -alh

total 28K
drwxrwxr-x  6 1001 1001 4.0K Jan 18 19:13 .
drwxr--r-- 35 1001 1001 4.0K Jan 19 15:59 ..
drwxr-xr-x  3 root root 4.0K Jan 18 15:34 .locks
drwxrwxr-x  6 1001 1001 4.0K Jan 19 16:24 Mistral-7B-Instruct-v0.2
drwxr-xr-x  6 1001 1001 4.0K Jan 18 15:35 models--bigscience--bloom-560m
drwxrwxr-x  6 1001 1001 4.0K Jan 18 15:59 models--mistralai--Mistral-7B-Instruct-v0.2
-rw-r--r--  1 1001 1001    1 Jan 18 15:35 version.txt


In [11]:
base_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNor

## TEST model on validation dataset, BEFORE training

In [None]:
run_validation( base_model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2" )


In [18]:
# ------------------------------------------------------------------------------------------------------------------------
# Creates insanely verbose outputs, no need to benchmark any further!
# ------------------------------------------------------------------------------------------------------------------------
# Response: [<response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
#         Explanation:
#         The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
#         1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
#         2. "Google Scholar" is the search engine and the specific search type.
#         3. "URLError" is likely an error message or an argument related to the command.
# 
#         Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".</s> 
# 
#     I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
#     Best regards,
#     Your helpful AI assistant.</s><response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".</s>
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response xmlns="http://www.w3.org/2000/xmlns/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"><browser-command xsi:type="xsd:string">search google scholar current tab</browser-command><args><arg xsi:type="xsd:string">URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError". To ensure well-formed XML, I have added the XML namespaces and types to the response.
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response xmlns="http://www.w3.org/2000/xmlns/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/]

## Get training dataset

In [13]:
# Read at most the first 10,000 lines of the training file and parse each JSON line into a dict
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-train.jsonl"
deepily_dataset_train = [
    json.loads( line ) for line in du.get_file_as_list( path )[ 0:10000 ]
]
len( deepily_dataset_train )

8000

In [14]:
# Read at most the first 1,000 lines of the test file and parse each JSON line into a dict
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-test.jsonl"
deepily_dataset_test = [
    json.loads( line ) for line in du.get_file_as_list( path )[ 0:1000 ]
]
len( deepily_dataset_test )

1000

In [15]:
# Use the Task below and the Input given to write the Response, which is a programmatic instruction that can solve the following Task:
def prompt_instruction_format( sample ):
    """
    Renders one dataset record as the instruction/task/input/response training prompt.

    Args:
        sample: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        The prompt text. Every line after the first carries a four-space indent, and the text ends with an
        indented empty line.
    """
    task     = format( sample[ "instruction" ] )
    question = format( sample[ "input" ] )
    answer   = format( sample[ "output" ] )

    indent = "    "
    prompt_lines = [
        "### Instruction:",
        indent + "Use the Task below and the Input given to write a Response that can solve the following Task:",
        "",
        indent + "### Task:",
        indent + task,
        "",
        indent + "### Input:",
        indent + question,
        "",
        indent + "### Response:",
        indent + answer,
        indent,
    ]
    return "\n".join( prompt_lines )

In [16]:
# Show how the first test record looks once rendered into the training prompt
print( prompt_instruction_format( deepily_dataset_test[ 0 ] ) )

### Instruction:
    Use the Task below and the Input given to write a Response that can solve the following Task:

    ### Task:
    Your job is to discern the intent of a human voice command transcription and translate it into a standardized command that a browser on your computer would understand.

        You will be given a human voice command and a list of possible standardized commands. You must choose the correct standardized command from the following list: `'search new tab', 'search current tab', 'search google new tab', 'search google current tab', 'search google scholar new tab', 'search google scholar current tab' and 'none'`.

        Requirement: You MUST NOT use python code to answer this question.
        Requirement: You MUST use your linguistic knowledge and intuition to answer this question.
        Hint: Anything that isn't a part of the command itself should be treated as arguments related to the command.

    ### Input:
    
        Below is the raw human voice c

## Set up training arguments

In [17]:
from peft import LoraConfig, get_peft_config, PeftModel, PeftConfig, get_peft_model, AutoPeftModelForCausalLM

# LoRA adapter configuration used for fine-tuning
peft_config = LoraConfig(
    r=64, 
    lora_alpha=32, 
    # When target_modules was left out, attention layers ended up assigned to the CPU, which raised this runtime error:
    # RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! 
    # (when checking argument for argument mat2 in method wrapper_CUDA_mm)
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head" ], 
    lora_dropout=0.10, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [18]:
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2" )
os.getcwd()

'/var/model/models/Mistral-7B-Instruct-v0.2'

In [19]:
# Define the training arguments
trainingArgs = TrainingArguments(
    output_dir="./training-results", # Output directory for checkpoints and the saved model; relative to the current working directory
    num_train_epochs=2, # Number of training epochs
    per_device_train_batch_size=5, # Batch size per GPU for training
    gradient_accumulation_steps=5,  # Accumulate 5 batches per optimizer step, i.e. 5 x 5 = 25 sequences per device per update
    gradient_checkpointing=True,# Enable gradient checkpointing; the training log shows use_cache being switched off because of it
    optim="paged_adamw_32bit", # Optimizer to use
    #save_steps=save_steps,
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    
    # Keeping fp16 off may help with the warning message: The input hidden states seems to be silently casted in float32, 
    # this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
    fp16=False,
    # TODO: test to confirm that this works!
    # BTW: according to PHIND, bf16 may actually improve fine-tuning performance as well: https://www.phind.com/search?cache=ygn9dbyl0ij4kotmgns2nsrw
    
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    #max_steps=max_steps,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    report_to="wandb",
    seed=42
)
# Create the trainer
# NOTE(review): an eval_dataset is supplied, but the training log contains no evaluation metrics — presumably
# because no evaluation strategy is set in the arguments above; confirm whether evaluation was intended.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=deepily_dataset_train,
    eval_dataset=deepily_dataset_test,
    peft_config=peft_config,
    max_seq_length=4096, # previously 2048
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)

In [20]:
def print_trainable_parameters( model ):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing `named_parameters()` that yields ( name, parameter ) pairs, where each
            parameter provides `numel()` and a `requires_grad` flag.

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_param        = 0
    for _, param in model.named_parameters():
        count      = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError; report 0% instead
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {trainable_pct:.2f}"
    )
    
print_trainable_parameters( base_model )    

trainable params: 170,082,304 || all params: 7,411,814,400 || trainable%: 2.29


## Train model

In [21]:
# Run the fine-tuning loop configured in the trainer
trainer.train()

# Stop reporting to W&B and close the run
wandb.finish()

# Save the model; no path is given here, and a later cell loads the adapter from the
# "training-results" directory configured as output_dir
trainer.save_model()

print( "Model saved" )



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 0.3592, 'learning_rate': 0.00019876883405951377, 'epoch': 0.16}
{'loss': 0.1313, 'learning_rate': 0.0001913545457642601, 'epoch': 0.32}
{'loss': 0.0829, 'learning_rate': 0.0001777145961456971, 'epoch': 0.48}
{'loss': 0.068, 'learning_rate': 0.00015877852522924732, 'epoch': 0.65}
{'loss': 0.0609, 'learning_rate': 0.00013583679495453, 'epoch': 0.81}
{'loss': 0.0553, 'learning_rate': 0.00011045284632676536, 'epoch': 0.97}




{'loss': 0.0498, 'learning_rate': 8.435655349597689e-05, 'epoch': 1.13}
{'loss': 0.0468, 'learning_rate': 5.9326335692419995e-05, 'epoch': 1.29}
{'loss': 0.0454, 'learning_rate': 3.7067960895016275e-05, 'epoch': 1.45}
{'loss': 0.0443, 'learning_rate': 1.9098300562505266e-05, 'epoch': 1.61}
{'loss': 0.0436, 'learning_rate': 6.6419573502798374e-06, 'epoch': 1.77}
{'loss': 0.0437, 'learning_rate': 5.478104631726711e-07, 'epoch': 1.94}




{'train_runtime': 2989.6932, 'train_samples_per_second': 0.518, 'train_steps_per_second': 0.021, 'train_loss': 0.08454943304100344, 'epoch': 2.0}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▂▃▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▃▄▅▅▆▇▇██
train/learning_rate,██▇▇▆▅▄▃▂▂▁▁
train/loss,█▃▂▂▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,2.0
train/global_step,62.0
train/learning_rate,0.0
train/loss,0.0437
train/total_flos,2.7734386409472e+17
train/train_loss,0.08455
train/train_runtime,2989.6932
train/train_samples_per_second,0.518
train/train_steps_per_second,0.021




Model saved


In [None]:
# wandb.finish()

In [22]:
os.getcwd()

'/var/model/models/Mistral-7B-Instruct-v0.2'

In [28]:
import gc
# Uncomment to drop the references to the models first; without that, their memory stays in use
# base_model = None 
# adapter_plus_model = None

# Collect unreachable Python objects first, so that the CUDA memory they hold goes back to PyTorch's allocator...
collected_count = gc.collect()
# ...and only then release the allocator's unused cached blocks back to the GPU driver
torch.cuda.empty_cache()

# Display the number of unreachable objects found, as the cell did before
collected_count

565

## RESTART kernel (1st time) & load base model and tokenizer in bfloat16

In [6]:
os.chdir( "/var/model/models/" )
os.getcwd()

'/var/model/models'

In [7]:
# Reload the untouched base model (bfloat16 path of the helper) so that the trained adapter can be applied on top of it
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path="mistralai/Mistral-7B-Instruct-v0.2", 
    tokenizer_path="mistralai/Mistral-7B-Instruct-v0.2", 
    use_bnb_cuantization=False, 
    device_map="auto" 
)

Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
os.getcwd()

'/var/model/models'

In [9]:
from peft import PeftModel, AutoPeftModelForCausalLM

# Wrap the freshly loaded base model with the adapter stored under training-results (path is relative to the cwd).
# NOTE(review): `use_flash_attention_2` is passed as an extra keyword here, although the base model was already
# loaded with attn_implementation="flash_attention_2" — confirm that this argument has any effect.
adapter_plus_model = PeftModel.from_pretrained( base_model, "Mistral-7B-Instruct-v0.2/training-results", use_flash_attention_2=True )

In [17]:
# from accelerate import Accelerator
# 
# accelerator = Accelerator()
# 
# adapter_plus_model = accelerator.prepare( adapter_plus_model )


In [10]:
dupt.print_device_allocation( adapter_plus_model )

base_model.model.model.embed_tokens.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: cuda:0
base_model.model.

## TEST model on validation dataset using adapter loaded on top

In [13]:
run_validation( adapter_plus_model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1" )

validate_df.shape (100, 5)
Commands file for [search new tab] exists: True
Commands file for [search current tab] exists: True
Commands file for [search google new tab] exists: True
Commands file for [search google current tab] exists: True
Commands file for [search google scholar new tab] exists: True
Commands file for [search google scholar current tab] exists: True

Generating responses for 100 rows...
Using HuggingFace model_name [mistralai/Mistral-7B-Instruct-v0.2] in memory...

Processing call [001] out of [100] = [1.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 2,250 ms
Tokens per second [65.3]
Response: [<response><browser-command>search new tab</browser-command><args>TabError</args></response>]

Processing call [002] out of [100] = [2.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 2,457 ms
Tokens per second [95.2]
Response: [<response

## Perform a 16bit merge & write to disk

In [14]:
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2" )

In [15]:
# Merge the adapter into the base model and drop the adapter wrappers.
# NOTE(review): the name is rebound to the merged model, so this cell is presumably not safe to run twice in
# the same session — confirm.
adapter_plus_model = adapter_plus_model.merge_and_unload()
# Write the merged weights in safetensors format; the path is relative to the cwd set in the previous cell
adapter_plus_model.save_pretrained( "./merged/", safe_serialization=True )

In [16]:
tokenizer.save_pretrained( "./merged/", safe_serialization=True )

('./merged/tokenizer_config.json',
 './merged/special_tokens_map.json',
 './merged/tokenizer.model',
 './merged/added_tokens.json',
 './merged/tokenizer.json')

## RESTART kernel (2nd time) & load merged model + tokenizer in bfloat16

In [6]:
# NOTE(review): the merge cell writes to "./merged/", while this cell reads "merged-00-2024.01.19" — the output
# directory was presumably renamed by hand in between; confirm.
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/merged-00-2024.01.19" )
print( os.getcwd() )

# model_path and tokenizer_path are left at their "." defaults, so both are loaded from the directory entered above
base_model, tokenizer = get_base_model_and_tokenizer( 
    use_bnb_cuantization=False, 
    device_map="cuda:1" 
)

/var/model/models/Mistral-7B-Instruct-v0.2/merged-00-2024.01.19
Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
%autoreload
run_validation( base_model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1" )

# Generating responses for 100 rows... Done! in 02:15
# [1353.9] ms per item
# ------------------------------------------------------------------------------------------------------------------------
# - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2, raw bfloat16 loaded from w/in Jupiter notebook
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%

# Exact same model loaded two different ways:
# 0: Using TGI with & w/o --dtype bfloat16 flag
#    docker run --name huggingface-tgi --gpus all --shm-size 1g -p 3000:3000 -v `pwd`:/data/model 
#    ghcr.io/huggingface/text-generation-inference:1.3.4 --dtype bfloat16 --sharded false --num-shard 1 --port 3000 
#    --model-id /data/model
# 1: Using jupyter notebook with raw model file: 
#    low_cpu_mem_usage=True, 
#    use_cache=True, 
#    attn_implementation="flash_attention_2",
#    torch_dtype=torch.bfloat16

# +---------------------------------------------------------------------------------------+
# Sat Jan 20 18:12:36 2024: 
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
# |-----------------------------------------+----------------------+----------------------+
# | GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
# |                                         |                      |               MIG M. |
# |=========================================+======================+======================|
# |   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
# |  0%   36C    P8              20W / 450W |  23146MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# |   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
# |  0%   43C    P8              29W / 450W |  15336MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# 
# +---------------------------------------------------------------------------------------+
# | Processes:                                                                            |
# |  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
# |        ID   ID                                                             Usage      |
# |=======================================================================================|
# |    0   N/A  N/A     11326      C   /opt/conda/bin/python3.10                 23136MiB |
# |    1   N/A  N/A      6750      C   /usr/bin/python3                          15326MiB |
# +---------------------------------------------------------------------------------------+

validate_df.shape (100, 5)
Commands file for [search new tab] exists: True
Commands file for [search current tab] exists: True
Commands file for [search google new tab] exists: True
Commands file for [search google current tab] exists: True
Commands file for [search google scholar new tab] exists: True
Commands file for [search google scholar current tab] exists: True

Generating responses for 100 rows...
Using HuggingFace model_name [mistralai/Mistral-7B-Instruct-v0.2] in memory...

Processing call [001] out of [100] = [1.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 2,138 ms
Tokens per second [68.8]
Response: [<response><browser-command>search new tab</browser-command><args>TabError</args></response>]

Processing call [002] out of [100] = [2.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 1,571 ms
Tokens per second [148.9]
Response: [<respons

## Run benchmark on TGI service listening on port 3000

In [14]:
# Validator that sends prompts to a TGI endpoint instead of an in-memory model.
# NOTE(review): the endpoint address is hard-coded — presumably a local Docker network address; confirm before re-running.
tgi_validator  = XmlFineTuningPromptGenerator( tgi_url="http://172.17.0.4:3000", debug=True )

# Label passed to generate_responses and used in the title of the printed statistics
model_name     = "mistralai/Mistral-7B-Instruct-v0.2-raw-bfloat16"

# Same validation file, sample size and seed as run_validation, so the results are comparable
validate_df    = pd.read_json( "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True ).sample( 100, random_state=42 )
validate_df    = tgi_validator.generate_responses( validate_df, switch="tgi", model_name=model_name )
validate_df    = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats( validate_df, title=f"Validation Stats for `{model_name}` on TGI:3000" )

# Generating responses for 100 rows... Done! in 01:18
# [788.9] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2 raw bfloat16` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%

Commands file for [search new tab] exists: False
Commands file for [search current tab] exists: False
Commands file for [search google new tab] exists: False
Commands file for [search google current tab] exists: False
Commands file for [search google scholar new tab] exists: False
Commands file for [search google scholar current tab] exists: False

Generating responses for 100 rows...
Using TGI w/ model_name [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...
Processing call [001] out of [100] = [1.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...

        <response>
            <browser-command>search new tab</browser-command>
            <args>TabError</args>
        </response>
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]... Done! in 682 ms
Tokens per second [51.3]
Token list length [35]
Processing call [002] out of [100] = [2.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...

        <response>
            <browser-command>search google scholar new tab</bro

## Quantize using AWQ (Activation-aware Weight Quantization) and write to disk

In [6]:
# !pip install autoawq

In [3]:
import os
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/" )
print( os.getcwd() )

/var/model/models/Mistral-7B-Instruct-v0.2


In [4]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

# AWQ settings: 4-bit weights, zero-point enabled, group size of 128
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4 }

# Load the merged 16-bit model and its tokenizer (paths are relative to the cwd set in the previous cell)
raw_16bit_model     = AutoAWQForCausalLM.from_pretrained( "./merged-00-2024.01.19/", device_map="auto", safetensors=True )
raw_16bit_tokenizer = AutoTokenizer.from_pretrained( "./merged-00-2024.01.19/", use_fast=True )

# Quantize; the recorded output shows a dataset download (presumably calibration data — confirm) and
# roughly nine minutes of processing for 32 layers
raw_16bit_model.quantize( raw_16bit_tokenizer, quant_config=quant_config )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [09:20<00:00, 17.53s/it]


In [5]:
# Save the quantized model, together with its tokenizer, next to the 16-bit merged model directory.
# NOTE: `raw_16bit_model` is the object that was just passed through quantize(), despite its name.
path = "./merged-00-2024.01.19.awq"
raw_16bit_model.save_quantized( path, safetensors=True )
raw_16bit_tokenizer.save_pretrained( path )



('./merged-00-2024.01.19.awq/tokenizer_config.json',
 './merged-00-2024.01.19.awq/special_tokens_map.json',
 './merged-00-2024.01.19.awq/tokenizer.model',
 './merged-00-2024.01.19.awq/added_tokens.json',
 './merged-00-2024.01.19.awq/tokenizer.json')

## Validate AWQ model: loaded in memory by the Jupyter notebook

In [1]:
import os
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/" )
print( os.getcwd() )

/var/model/models/Mistral-7B-Instruct-v0.2


In [2]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the AWQ checkpoint written by the save cell, pinned to the second GPU.
# NOTE(review): AutoAWQ documents from_quantized() as the loader for checkpoints that
# are already quantized; confirm that from_pretrained() is intended here.
# The name `model_aqw` (sic) is what the validation cell passes on, so it is kept.
model_aqw = AutoAWQForCausalLM.from_pretrained(
    "./merged-00-2024.01.19.awq",
    device_map="cuda:1",
    safetensors=True
)

# Tokenizer files were saved into the same directory as the quantized weights
tokenizer_awq = AutoTokenizer.from_pretrained(
    "./merged-00-2024.01.19.awq/",
    use_fast=True
)

In [9]:
# Validate the in-memory AWQ model and tokenizer loaded in the previous cell.
# run_validation is not defined in this section; it must already exist in the kernel
# (presumably from an earlier cell; note the jump in execution counts).
run_validation( model_aqw, tokenizer_awq )

# Generating responses for 100 rows... Done! in 01:36
# [966] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2, In memory loaded by Jupiter notebook
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%

validate_df.shape (100, 5)
Commands file for [search new tab] exists: True
Commands file for [search current tab] exists: True
Commands file for [search google new tab] exists: True
Commands file for [search google current tab] exists: True
Commands file for [search google scholar new tab] exists: True
Commands file for [search google scholar current tab] exists: True

Generating responses for 100 rows...
Using HuggingFace model_name [mistralai/Mistral-7B-Instruct-v0.2] in memory...

Processing call [001] out of [100] = [1.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 1,150 ms
Tokens per second [127.8]
Response: [<response><browser-command>search new tab</browser-command><args>TabError</args></response>]

Processing call [002] out of [100] = [2.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]...
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2]... Done! in 1,121 ms
Tokens per second [208.7]
Response: [<respon

## Validate AWQ model: TGI service listening on port 3000

In [10]:
# Validator client pointed at the TGI endpoint. Relies on `pd` and
# `XmlFineTuningPromptGenerator` already being imported in the running kernel.
# NOTE(review): the output recorded under this cell reports
# "Commands file for [...] exists: False" for every command; presumably a
# relative-path lookup affected by the earlier os.chdir. Confirm.
tgi_validator = XmlFineTuningPromptGenerator(
    tgi_url="http://172.17.0.4:3000",
    debug=True
)

# Label used both for the generation request and in the stats title
model_name = "mistralai/Mistral-7B-Instruct-v0.2-AWQ"

# Fixed-seed sample of 100 rows from the validation set
validate_df = pd.read_json(
    "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl",
    lines=True
).sample( 100, random_state=42 )

# Generate responses through TGI, then validate them
validate_df = tgi_validator.generate_responses(
    validate_df,
    switch="tgi",
    model_name=model_name
)
validate_df = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats(
    validate_df,
    title=f"Validation Stats for `{model_name}` on TGI:3000"
)

# Generating responses for 100 rows... Done! in 44 seconds
# [442] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-AWQ` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%

# Sat Jan 20 19:40:03 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
# |-----------------------------------------+----------------------+----------------------+
# | GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
# |                                         |                      |               MIG M. |
# |=========================================+======================+======================|
# |   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
# |  0%   37C    P8              22W / 450W |  20240MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# |   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
# |  0%   43C    P8              30W / 450W |   5482MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# 
# +---------------------------------------------------------------------------------------+
# | Processes:                                                                            |
# |  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
# |        ID   ID                                                             Usage      |
# |=======================================================================================|
# |    0   N/A  N/A     27822      C   /opt/conda/bin/python3.10                 20230MiB |
# |    1   N/A  N/A     25717      C   /usr/bin/python3                           5472MiB |
# +---------------------------------------------------------------------------------------+

Commands file for [search new tab] exists: False
Commands file for [search current tab] exists: False
Commands file for [search google new tab] exists: False
Commands file for [search google current tab] exists: False
Commands file for [search google scholar new tab] exists: False
Commands file for [search google scholar current tab] exists: False

Generating responses for 100 rows...
Using TGI w/ model_name [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...
Processing call [001] out of [100] = [1.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...

        <response>
            <browser-command>search new tab</browser-command>
            <args>TabError</args>
        </response>
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]... Done! in 404 ms
Tokens per second [86.6]
Token list length [35]
Processing call [002] out of [100] = [2.0%]... 
Asking LLM [mistralai/Mistral-7B-Instruct-v0.2-AWQ]...

        <response>
            <browser-command>search google scholar new tab</bro