In [2]:
# load auto reload module
%load_ext autoreload

In [3]:
import os
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

import json
import wandb
from tqdm import tqdm

In [4]:
from xmlschema import XMLSchema

os.chdir( "/var/model/genie-in-the-box/src" )
print( os.getcwd() )
import lib.utils.util         as du
import lib.utils.util_xml     as dux
import lib.utils.util_pytorch as dupt

from ephemera.prompts.xml_fine_tuning_prompt_generator import XmlFineTuningPromptGenerator


/var/model/genie-in-the-box/src


In [25]:
import pandas as pd
# Read the training split (JSON Lines: one record per line) into a DataFrame and
# print a summary of its columns, non-null counts and dtypes.
df = pd.read_json( 
        "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-train.jsonl", lines=True 
    )
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11260 entries, 0 to 11259
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   command      11260 non-null  object
 1   instruction  11260 non-null  object
 2   input        11260 non-null  object
 3   output       11260 non-null  object
 4   prompt       11260 non-null  object
 5   gpt_message  11260 non-null  object
dtypes: object(6)
memory usage: 527.9+ KB


In [34]:
def get_prompt( instruction, input, output ):
    """
    Render one example as a single instruction-style prompt string.

    The text is laid out as four sections (Instruction, Task, Input, Response). Every line after
    the first is indented by four spaces and the text ends with a line holding only that indent.

    :param instruction: Text placed under the "### Task:" heading.
    :param input: Text placed under the "### Input:" heading.
    :param output: Text placed under the "### Response:" heading.
    :return: The assembled prompt.
    """
    indent = "    "
    prompt_lines = [
        "### Instruction:",
        f"{indent}Use the Task below and the Input given to write a Response that can solve the following Task:",
        "",
        f"{indent}### Task:",
        f"{indent}{instruction}",
        "",
        f"{indent}### Input:",
        f"{indent}{input}",
        "",
        f"{indent}### Response:",
        f"{indent}{output}",
        # The original template closes on an indented, otherwise empty line
        indent,
    ]
    return "\n".join( prompt_lines )

def get_training_prompt_stats( tokenizer, device="cuda:1", debug=False, path="/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-train.jsonl" ):
    """
    Compute token- and word-count statistics over every prompt in a JSON Lines dataset.

    Each row's `instruction`, `input` and `output` columns are rendered with get_prompt(); the
    prompt is then tokenized to count tokens and split on single spaces to count words.

    :param tokenizer: Callable tokenizer; tokenizer( prompt ) must return a mapping whose
        "input_ids" entry is the sequence of token ids for that single prompt.
    :param device: Unused. Kept only so existing calls that pass it keep working: counting tokens
        needs no tensors, so nothing is moved to an accelerator.
    :param debug: When True, print the word and token count of every prompt instead of a
        progress dot.
    :param path: JSON Lines file to read; defaults to the training split.
    :return: ( token_stats, word_stats, prompt ). Both stats are dicts with "min", "max" and
        "mean" keys; prompt is the last prompt that was rendered.
    :raises ValueError: If the file contains no rows.
    """
    df = pd.read_json( path, lines=True )
    if df.empty:
        # min()/max() on an empty list would fail with a message that hides the real cause
        raise ValueError( f"No rows found in [{path}], cannot compute prompt stats" )

    token_counts = []
    word_counts  = []
    prompt       = None

    for row in df.itertuples():

        prompt       = get_prompt( row.instruction, row.input, row.output )
        # Plain token ids are enough for counting: no tensors and no per-row copy to a GPU
        tokens_count = len( tokenizer( prompt )[ "input_ids" ] )
        word_count   = len( prompt.split( ' ' ) )

        token_counts.append( tokens_count )
        word_counts.append( word_count )

        if debug:
            print( f"  Word count: { word_count }" )
            print( f"Tokens count: { tokens_count }" )
        else:
            print( ".", end="" )

    print()

    def _summarize( counts ):
        # Min / max / arithmetic mean of a non-empty list of counts
        return { "min": min( counts ), "max": max( counts ), "mean": sum( counts ) / len( counts ) }

    return _summarize( token_counts ), _summarize( word_counts ), prompt

# Compute the prompt-length statistics, then display the last rendered prompt together with the
# token and word statistics. `tokenizer` must already exist in the kernel when this cell runs.
token_stats, word_stats, prompt = get_training_prompt_stats( tokenizer, debug=False )
prompt, token_stats, word_stats

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

("### Instruction:\n    Use the Task below and the Input given to write a Response that can solve the following Task:\n\n    ### Task:\n    Your job is to discern the intent of a human voice command transcription and translate it into a standardized command that a browser on your computer would understand.\n\n        You will be given a human voice command and a list of possible standardized commands. You must choose the correct standardized command from the following list:\n        <browser-commands>\n        <command>go to current tab</command>\n        <command>go to new tab</command>\n        <command>search google current tab</command>\n        <command>search google new tab</command>\n        <command>search google scholar current tab</command>\n        <command>search google scholar new tab</command>\n        <command>search current tab</command>\n        <command>search new tab</command>\n        <command>search perplexity current tab</command>\n        <command>search perplexi

In [13]:
import pandas as pd

def run_validation( model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1", sample_size=100 ):
    """
    Run the given model over a fixed random sample of the validation split and print the stats.

    The sample is handed to XmlFineTuningPromptGenerator.generate_responses(), the result to
    validate_responses(), and that result to print_validation_stats().

    :param model: Model object forwarded to generate_responses().
    :param tokenizer: Tokenizer forwarded to generate_responses().
    :param model_name: Name used in the printed banner and stats title; also forwarded to
        generate_responses().
    :param device: Device string forwarded to generate_responses().
    :param sample_size: Number of validation rows to sample.
    :return: None; everything is reported through printed output.
    """
    validation_path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl"
    all_rows        = pd.read_json( validation_path, lines=True )
    # Fixed random_state, so every run draws the same rows
    sampled_rows    = all_rows.sample( sample_size, random_state=42 )

    du.print_banner( f"Validating {model_name} w/ {sample_size} samples" )
    # Show how the sample is distributed over the distinct commands
    print( sampled_rows.command.value_counts(), end="\n\n" )

    generator = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", debug=True, verbose=False )

    responses_df = generator.generate_responses(
        sampled_rows, tokenizer=tokenizer, model=model, switch="huggingface", model_name=model_name, device=device
    )
    validated_df = generator.validate_responses( responses_df )

    generator.print_validation_stats( validated_df, title=f"Validation stats for model {model_name}" )

In [7]:
! ls -alh /var/model/models/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/b70aa86578567ba3301b21c8a27bea4e8f6d6d61

total 24K
drwxrwxr-x 2 1001 1001 4.0K Jan 24 16:05 .
drwxrwxr-x 3 1001 1001 4.0K Jan 18 15:56 ..
lrwxrwxrwx 1 1001 1001   52 Jan 18 15:59 config.json -> ../../blobs/c0519dc5f5cc99c2238a453da18994599c898b66
lrwxrwxrwx 1 root root   52 Jan 24 16:05 generation_config.json -> ../../blobs/cb0c9b6c64cf786052efdd1a4ae597337b2f2708
lrwxrwxrwx 1 1001 1001   76 Jan 18 15:57 model-00001-of-00003.safetensors -> ../../blobs/63654d601820b88b1fa8b4a98df5714f700fbc5b3df2cc4ecbabdced35096d31
lrwxrwxrwx 1 1001 1001   76 Jan 18 15:58 model-00002-of-00003.safetensors -> ../../blobs/a42716540ecb2385d371f2109835921ff535406cac8fe8ff28f2f0b5fc7895bd
lrwxrwxrwx 1 1001 1001   76 Jan 18 15:59 model-00003-of-00003.safetensors -> ../../blobs/5f86e15cb3ed9078e30ae6e72445e109d0e337d9cde59b9aeea4ce8e44e54a5d
lrwxrwxrwx 1 root root   52 Jan 24 16:04 model.safetensors.index.json -> ../../blobs/361fa9d25a7f791e18ab531b3468ff8f2010642e
lrwxrwxrwx 1 1001 1001   52 Jan 18 15:59 special_tokens_map.json -> ../../blo

## Load model and tokenizer in bfloat16?

In [9]:
def get_base_model_and_tokenizer( model_path=".", tokenizer_path=".", use_bnb_quantization=False, device_map="auto", cache_dir="/var/model/models" ):
    """
    Load a causal language model and its tokenizer.

    The model is always loaded with local_files_only=True, flash attention 2 and use_cache=True.
    With use_bnb_quantization it is loaded through a 4-bit NF4 BitsAndBytesConfig (double
    quantization, float16 compute dtype); otherwise it is loaded in bfloat16.

    :param model_path: Model id or path handed to AutoModelForCausalLM.from_pretrained().
    :param tokenizer_path: Tokenizer id or path handed to AutoTokenizer.from_pretrained().
    :param use_bnb_quantization: Load the model 4-bit quantized instead of in bfloat16.
    :param device_map: Forwarded to the model's from_pretrained().
    :param cache_dir: Cache directory used for both the model and the tokenizer.
    :return: ( base_model, tokenizer ); the tokenizer pads on the right with the EOS token.
    """
    # Arguments shared by the quantized and the unquantized load.
    # ¡OJO! use_cache stays True: the original note on this call reports a big throughput
    # difference (21 vs 14 tokens per second) compared to turning the cache off.
    model_kwargs = dict(
        device_map=device_map, low_cpu_mem_usage=True, use_cache=True, attn_implementation="flash_attention_2",
        local_files_only=True, cache_dir=cache_dir
    )

    if use_bnb_quantization:

        # Only build the quantization config when it is actually going to be used
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
        print( bnb_config )
        model_kwargs[ "quantization_config" ] = bnb_config
    else:
        print( "Loading without BitsAndBytesConfig..." )
        model_kwargs[ "torch_dtype" ] = torch.bfloat16

    base_model = AutoModelForCausalLM.from_pretrained( model_path, **model_kwargs )

    # Resolve the tokenizer through the same cache directory as the model, instead of the
    # library's default cache location
    tokenizer              = AutoTokenizer.from_pretrained( tokenizer_path, cache_dir=cache_dir )
    tokenizer.pad_token    = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model, tokenizer
    

In [10]:
# Work from the models directory, then load the unquantized base model and its tokenizer.
# device_map="auto" is forwarded to the model loader unchanged.
os.chdir( "/var/model/models/" )
print( os.getcwd() )
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path="mistralai/Mistral-7B-Instruct-v0.2", 
    tokenizer_path="mistralai/Mistral-7B-Instruct-v0.2", 
    use_bnb_quantization=False, 
    device_map="auto" 
)

/var/model/models
Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [14]:
# Get tokenizer stats
prompt_stats = get_training_prompt_stats( tokenizer )
prompt_stats

### Instruction:
    Use the Task and Input given below to write a Response that can solve the following Task.
    
    ### Task:
    Your job is to discern the intent of a human voice command transcription and translate it into a standardized command that a browser on your computer would understand.

        You will be given a human voice command and a list of possible standardized commands. You must choose the correct standardized command from the following list: `'search new tab', 'search current tab', 'search google new tab', 'search google current tab', 'search google scholar new tab', 'search google scholar current tab' and 'none'`.

        Requirement: You MUST NOT use python code to answer this question.
        Requirement: You MUST use your linguistic knowledge and intuition to answer this question.
        Hint: Anything that isn't a part of the command itself should be treated as arguments related to the command.
    
    ### Input:
        Below is the raw human voice co

{'min': 0, 'max': 1, 'mean': 1.0}

## Set up W & B

In [7]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
%env WANDB_PROJECT="Mistral-7B-Instruct-v0.2"

env: WANDB_PROJECT="Mistral-7B-Instruct-v0.2"


In [11]:
pwd

'/var/model/genie-in-the-box/src'

In [12]:
! ls -alh

total 76K
drwxr--r-- 11 1001 1001 4.0K Jan  9 22:21 .
drwxr--r-- 10 1001 1001 4.0K Jan 23 04:02 ..
-rwxr--r--  1 1001 1001 6.1K Jan 19 14:25 .DS_Store
-rwxr--r--  1 1001 1001 4.0K Mar 21  2023 ._.DS_Store
drwxr--r--  3 1001 1001 4.0K Mar 21  2023 .idea
drwxr--r--  2 1001 1001 4.0K Jan  9 22:21 __pycache__
-rw-rw-r--  1 1001 1001  20K Jan  9 22:21 app.py
drwxr--r--  5 1001 1001 4.0K Jan 22 21:23 conf
drwxr--r--  5 1001 1001 4.0K Dec 15 16:01 ephemera
drwxr--r--  9 1001 1001 4.0K Oct 13 13:43 lib
drwxr--r--  3 1001 1001 4.0K Mar 10  2023 resources
drwxr-xr-x  2 1001 1001 4.0K Dec 15 17:48 scripts
drwxr--r--  2 1001 1001 4.0K Nov 20 21:30 static
drwxr--r--  2 1001 1001 4.0K Sep 26 18:31 templates


In [11]:
base_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNor

## TEST model on validation dataset, BEFORE training

In [None]:
run_validation( base_model, tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2" )


In [18]:
# ------------------------------------------------------------------------------------------------------------------------
# Creates insanely verbose outputs, no need to benchmark any further!
# ------------------------------------------------------------------------------------------------------------------------
# Response: [<response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
#         Explanation:
#         The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
#         1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
#         2. "Google Scholar" is the search engine and the specific search type.
#         3. "URLError" is likely an error message or an argument related to the command.
# 
#         Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".</s> 
# 
#     I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
#     Best regards,
#     Your helpful AI assistant.</s><response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".</s>
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response><browser-command>search google scholar current tab</browser-command><args><arg>URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError".
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response xmlns="http://www.w3.org/2000/xmlns/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"><browser-command xsi:type="xsd:string">search google scholar current tab</browser-command><args><arg xsi:type="xsd:string">URLError</arg></args></response>
# 
# Explanation:
# The human voice command "Here, Google Scholar URLError" can be broken down into the following parts:
# 1. "Here" is likely an indication of the current tab, but it's not a necessary part of the command.
# 2. "Google Scholar" is the search engine and the specific search type.
# 3. "URLError" is likely an error message or an argument related to the command.
# Based on this analysis, the correct standardized command is "search google scholar current tab" with the argument "URLError". To ensure well-formed XML, I have added the XML namespaces and types to the response.
# 
# I hope this explanation is clear and helpful. Let me know if you have any questions or need further clarification.
# 
# Best regards,
# Your helpful AI assistant.</s><response xmlns="http://www.w3.org/2000/xmlns/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/]

## Get training dataset

In [13]:
# Training split: read the JSONL file line by line and parse every line into a dict.
# (Slice the list returned by get_file_as_list, e.g. [ 0:10000 ], to train on a subset.)
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-train.jsonl"
deepily_dataset_train = [ json.loads( raw_line ) for raw_line in du.get_file_as_list( path ) ]
len( deepily_dataset_train )

11260

In [14]:
# Test split: read the JSONL file line by line and parse every line into a dict.
# (Slice the list returned by get_file_as_list, e.g. [ 0:1000 ], to evaluate on a subset.)
path = "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-test.jsonl"
deepily_dataset_test = [ json.loads( raw_line ) for raw_line in du.get_file_as_list( path ) ]
len( deepily_dataset_test )

1407

In [15]:
def prompt_instruction_format( sample ):
    """
    Build the training prompt text for one dataset record.

    :param sample: Mapping providing 'instruction', 'input' and 'output' entries.
    :return: Prompt with Instruction, Task, Input and Response sections; lines after the first
        are indented by four spaces and the text ends with an indent-only line.
    """
    template = (
        "### Instruction:\n"
        "    Use the Task below and the Input given to write a Response that can solve the following Task:\n"
        "\n"
        "    ### Task:\n"
        "    {instruction}\n"
        "\n"
        "    ### Input:\n"
        "    {input}\n"
        "\n"
        "    ### Response:\n"
        "    {output}\n"
        "    "
    )
    return template.format(
        instruction=sample['instruction'],
        input=sample['input'],
        output=sample['output'],
    )

In [16]:
# Preview the formatted prompt for the first test record, printed one line at a time
for line in prompt_instruction_format( deepily_dataset_test[ 0 ] ).split( "\n" ): print( line )

### Instruction:
    Use the Task below and the Input given to write a Response that can solve the following Task:

    ### Task:
    Your job is to discern the intent of a human voice command transcription and translate it into a standardized command that a browser on your computer would understand.

        You will be given a human voice command and a list of possible standardized commands. You must choose the correct standardized command from the following list:
        <browser-commands>
        <command>go to current tab</command>
        <command>go to new tab</command>
        <command>search google current tab</command>
        <command>search google new tab</command>
        <command>search google scholar current tab</command>
        <command>search google scholar new tab</command>
        <command>search current tab</command>
        <command>search new tab</command>
        <command>search perplexity current tab</command>
        <command>search perplexity new tab</command

## Set up training arguments

In [17]:
from peft import LoraConfig, get_peft_config, PeftModel, PeftConfig, get_peft_model, AutoPeftModelForCausalLM

# LoRA adapter configuration; passed to SFTTrainer through its peft_config argument.
peft_config = LoraConfig(
    r=64, 
    lora_alpha=32, 
    # When target_modules was disabled, it was causing attention layers to be assigned to the CPU, throwing this runtime error:
    # RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
    # (when checking argument for argument mat2 in method wrapper_CUDA_mm)
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head" ], 
    lora_dropout=0.10, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [18]:
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2" )
os.getcwd()

'/var/model/models/Mistral-7B-Instruct-v0.2'

In [19]:
# Switch back to the models directory and (re)load the unquantized base model and tokenizer
# that the trainer below is built on.
os.chdir( "/var/model/models/" )
print( os.getcwd() )
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path="mistralai/Mistral-7B-Instruct-v0.2", 
    tokenizer_path="mistralai/Mistral-7B-Instruct-v0.2", 
    use_bnb_quantization=False, 
    device_map="auto" 
)

/var/model/models
Loading without BitsAndBytesConfig...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [20]:
# Define the training arguments
trainingArgs = TrainingArguments(
    output_dir="./training-results", # Output directory where the model predictions and checkpoints will be stored
    num_train_epochs=1, # Number of training epochs
    per_device_train_batch_size=4, # Batch size per GPU for training. https://kaitchup.substack.com/p/fine-tune-a-mixture-of-experts-on Says that using even batch size is best
    per_device_eval_batch_size=4,  # Batch size per GPU for evaluation. https://kaitchup.substack.com/p/fine-tune-a-mixture-of-experts-on Says that using even batch size is best
    gradient_accumulation_steps=8, # Number of update steps to accumulate the gradients for
    gradient_checkpointing=True,# Enable gradient checkpointing
    optim="paged_adamw_32bit", # Optimizer to use
    #save_steps=save_steps,
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    
    # Setting this may help with the warning message: The input hidden states seems to be silently casted in float32, 
    # this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
    fp16=False,
    # Test to confirm that this works!
    # BTW: according to PHIND, this may actually improve fine-tuning performance as well: https://www.phind.com/search?cache=ygn9dbyl0ij4kotmgns2nsrw
    
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    #max_steps=max_steps,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    report_to="wandb",
    seed=42
)
# Create the trainer
trainer = SFTTrainer(
    model=base_model,
    train_dataset=deepily_dataset_train,
    eval_dataset=deepily_dataset_test,
    peft_config=peft_config,
    max_seq_length=2184, # Calculated by get_training_prompt_stats( tokenizer ), max = 728 * 3 # was: 2,048 or 4,096
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
def print_trainable_parameters( model ):
    """
    Print how many of the model's parameters are trainable, as counts and as a percentage.

    :param model: Object exposing named_parameters(), which yields ( name, parameter ) pairs whose
        parameters provide numel() and requires_grad.
    :return: None; the summary line is written to stdout.
    """
    trainable_params = 0
    all_param        = 0
    for _, param in model.named_parameters():
        count      = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {trainable_pct:.2f}"
    )
    
# Report the trainable share of base_model. Value recorded from a previous run of this cell:
print_trainable_parameters( base_model )
# trainable params: 170,082,304 || all params: 7,411,814,400 || trainable%: 2.29

trainable params: 170,082,304 || all params: 7,411,814,400 || trainable%: 2.29


## Train model

In [22]:
# Run the fine-tuning loop configured on `trainer`
trainer.train()

# Close the W&B run so nothing further is reported to it
wandb.finish()

# Save the trained model through the trainer. No path is passed here, so the destination is
# chosen by the trainer -- presumably the output_dir of its TrainingArguments; confirm.
trainer.save_model()

print( "Model saved" )



[34m[1mwandb[0m: Currently logged in as: [33mricardo-felipe-ruiz[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 0.3327, 'learning_rate': 0.0002, 'epoch': 0.07}
{'loss': 0.1317, 'learning_rate': 0.00019941379571543596, 'epoch': 0.13}
{'loss': 0.0618, 'learning_rate': 0.00019766205557100868, 'epoch': 0.2}
{'loss': 0.0494, 'learning_rate': 0.00019476531711828027, 'epoch': 0.27}
{'loss': 0.0436, 'learning_rate': 0.00019075754196709572, 'epoch': 0.33}
{'loss': 0.0404, 'learning_rate': 0.00018568571761675893, 'epoch': 0.4}
{'loss': 0.0383, 'learning_rate': 0.00017960930657056438, 'epoch': 0.47}
{'loss': 0.0357, 'learning_rate': 0.0001725995491923131, 'epoch': 0.53}
{'loss': 0.0339, 'learning_rate': 0.00016473862847818277, 'epoch': 0.6}
{'loss': 0.033, 'learning_rate': 0.00015611870653623825, 'epoch': 0.66}
{'loss': 0.0319, 'learning_rate': 0.00014684084406997903, 'epoch': 0.73}
{'loss': 0.0309, 'learning_rate': 0.00013701381553399145, 'epoch': 0.8}
{'loss': 0.0302, 'learning_rate': 0.00012675283385292212, 'epoch': 0.86}
{'loss': 0.0295, 'learning_rate': 0.0001161781996552765, 'epoch': 0.93}
{



{'loss': 0.0273, 'learning_rate': 9.458610914145826e-05, 'epoch': 1.06}
{'loss': 0.0272, 'learning_rate': 8.382180034472353e-05, 'epoch': 1.13}
{'loss': 0.027, 'learning_rate': 7.324716614707793e-05, 'epoch': 1.2}
{'loss': 0.0263, 'learning_rate': 6.298618446600856e-05, 'epoch': 1.26}
{'loss': 0.0257, 'learning_rate': 5.3159155930021e-05, 'epoch': 1.33}
{'loss': 0.026, 'learning_rate': 4.388129346376178e-05, 'epoch': 1.4}
{'loss': 0.0261, 'learning_rate': 3.5261371521817244e-05, 'epoch': 1.46}
{'loss': 0.0255, 'learning_rate': 2.7400450807686938e-05, 'epoch': 1.53}
{'loss': 0.0254, 'learning_rate': 2.0390693429435627e-05, 'epoch': 1.6}
{'loss': 0.0255, 'learning_rate': 1.4314282383241096e-05, 'epoch': 1.66}
{'loss': 0.0252, 'learning_rate': 9.242458032904311e-06, 'epoch': 1.73}
{'loss': 0.025, 'learning_rate': 5.2346828817197655e-06, 'epoch': 1.8}
{'loss': 0.0251, 'learning_rate': 2.3379444289913342e-06, 'epoch': 1.86}
{'loss': 0.025, 'learning_rate': 5.862042845640403e-07, 'epoch': 1.



{'train_runtime': 7219.219, 'train_samples_per_second': 0.52, 'train_steps_per_second': 0.021, 'train_loss': 0.04464275201161703, 'epoch': 1.99}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/learning_rate,█████▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁
train/loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.99
train/global_step,150.0
train/learning_rate,0.0
train/loss,0.025
train/total_flos,6.702774934831104e+17
train/train_loss,0.04464
train/train_runtime,7219.219
train/train_samples_per_second,0.52
train/train_steps_per_second,0.021




Model saved


In [None]:
# wandb.finish()

In [7]:
os.getcwd()

'/var/model/genie-in-the-box/src'

In [8]:
! ls -alh /var/model/models

total 32K
drwxrwxr-x  7 1001 1001 4.0K Jan 24 00:55 .
drwxr--r-- 35 1001 1001 4.0K Jan 20 22:24 ..
drwxr-xr-x  3 root root 4.0K Jan 18 15:34 .locks
drwxrwxr-x 10 1001 1001 4.0K Jan 24 00:55 Mistral-7B-Instruct-v0.2
drwxr-xr-x  6 1001 1001 4.0K Jan 18 15:35 models--bigscience--bloom-560m
drwxrwxr-x  6 1001 1001 4.0K Jan 18 15:59 models--mistralai--Mistral-7B-Instruct-v0.2
-rw-r--r--  1 1001 1001    1 Jan 18 15:35 version.txt
drwxr-xr-x  3 root root 4.0K Jan 23 22:40 wandb


In [9]:
import gc
# To actually release a model, first drop the notebook's references to it:
# base_model = None
# adapter_plus_model = None

# Collect garbage BEFORE emptying the CUDA cache: tensors only go back to the caching allocator
# once their Python objects are gone, and empty_cache() can only release blocks that are
# already unused at the moment it is called.
gc.collect()
torch.cuda.empty_cache()

2486

## RESTART 1st time & load model and tokenizer in bfloat16

In [8]:
os.chdir( "/var/model/models/" )
os.getcwd()

'/var/model/models'

In [11]:
# Reload the base model and tokenizer after the kernel restart. With use_bnb_quantization=False
# the loader takes its unquantized branch, which requests torch.bfloat16 weights.
base_model, tokenizer = get_base_model_and_tokenizer( 
    model_path="mistralai/Mistral-7B-Instruct-v0.2", 
    tokenizer_path="mistralai/Mistral-7B-Instruct-v0.2", 
    use_bnb_quantization=False, 
    device_map="auto" 
)

Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
from peft import PeftModel, AutoPeftModelForCausalLM

# Wrap the already-loaded base model with the adapter stored under
# Mistral-7B-Instruct-v0.2/training-results-2024.01.23 (relative to the current working
# directory, which the preceding cell set to /var/model/models).
# NOTE(review): `use_flash_attention_2=True` is passed here, after the base model has already
# been constructed — presumably it does not change the attention implementation at this point.
# Confirm, and if flash attention is wanted set it where the base model is loaded.
adapter_plus_model = PeftModel.from_pretrained( base_model, "Mistral-7B-Instruct-v0.2/training-results-2024.01.23", use_flash_attention_2=True )

In [17]:
# from accelerate import Accelerator
# 
# accelerator = Accelerator()
# 
# adapter_plus_model = accelerator.prepare( adapter_plus_model )


In [14]:
dupt.print_device_allocation( adapter_plus_model )

base_model.model.model.embed_tokens.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: cuda:0
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: cuda:0
base_model.model.

## TEST model on validation dataset using adapter loaded on top

In [21]:
# Validate the adapter-on-base model with sample_size=1000; the results of a previous run are
# kept in the comment block below.
# NOTE(review): the device allocation listing printed earlier shows the weights on cuda:0,
# while device="cuda:1" is passed here — confirm how run_validation uses `device`.
run_validation( adapter_plus_model, tokenizer, sample_size=1000, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1" )

# Generating responses for 1,000 rows... Done! in 34:46
# [2086.8] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 0.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.5%
# Response has correct values 99.5%
#  Browser command is correct 99.6%
#             Args is correct 99.9%

------------------------------------------------------------------------------------------------------------------------
- Validating mistralai/Mistral-7B-Instruct-v0.2 w/ 1000 samples
------------------------------------------------------------------------------------------------------------------------

command
search google current tab                            95
search google scholar current tab                    95
go to new tab                                        93
go to current tab                                    87
search new tab                                       85
search google scholar new tab                        82
search google new tab                                76
search current tab                                   68
search phind current tab                             45
search perplexity current tab                        44
search phind new tab                                 38
search perplexity new tab                            30
search using 

## Perform a 16bit merge & write to disk

In [5]:
# Work from the model directory so the relative `merged_path` below resolves inside it.
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2" )
# Output directory for the merged model.  Later cells (model save, tokenizer save and the
# AWQ quantize step) read this variable, so this cell must have run in the same kernel.
merged_path = "./merged-00-2024.01.23"
os.getcwd(), merged_path

('/var/model/models/Mistral-7B-Instruct-v0.2', './merged-00-2024.01.23')

In [23]:
# Rebind `adapter_plus_model` to the result of merge_and_unload(), then write that model to
# `merged_path` in safetensors format.
# NOTE(review): because the same name is rebound, re-running this cell would call
# merge_and_unload() on the already-merged object — presumably that fails; consider binding
# the result to a new name so the cell is idempotent.
adapter_plus_model = adapter_plus_model.merge_and_unload()
adapter_plus_model.save_pretrained( merged_path, safe_serialization=True )

In [24]:
tokenizer.save_pretrained( merged_path, safe_serialization=True )

('./merged-00-2024.01.23/tokenizer_config.json',
 './merged-00-2024.01.23/special_tokens_map.json',
 './merged-00-2024.01.23/tokenizer.model',
 './merged-00-2024.01.23/added_tokens.json',
 './merged-00-2024.01.23/tokenizer.json')

## RESTART 2nd time & load merged model + tokenizer in bfloat16

In [10]:
# Switch into the directory the merged model was written to.
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/merged-00-2024.01.23" )
print( os.getcwd() )

# No model_path / tokenizer_path are passed, so the helper's defaults apply — presumably they
# resolve to the current working directory; TODO confirm against the helper's definition.
# device_map="cuda:1" targets GPU 1 (the nvidia-smi dump that follows shows the memory there).
merged_model, merged_tokenizer = get_base_model_and_tokenizer( 
    use_bnb_quantization=False, 
    device_map="cuda:1" 
)


/var/model/models/Mistral-7B-Instruct-v0.2/merged-00-2024.01.23
Loading without BitsAndBytesConfig...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Raw merged model in bfloat16
```
Wed Jan 24 11:16:23 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   40C    P8              28W / 450W |      6MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   46C    P2              71W / 450W |  14976MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    1   N/A  N/A      7765      C   /usr/bin/python3                          14966MiB |
+---------------------------------------------------------------------------------------+
```

In [11]:
%autoreload
run_validation( merged_model, merged_tokenizer, model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda:1" )

------------------------------------------------------------------------------------------------------------------------
- Validating mistralai/Mistral-7B-Instruct-v0.2 w/ 100 samples
------------------------------------------------------------------------------------------------------------------------

command
go to new tab                                        13
search new tab                                       12
search google scholar current tab                    10
search google new tab                                 8
search google scholar new tab                         8
search google current tab                             7
search phind current tab                              6
search current tab                                    6
go to current tab                                     5
search perplexity current tab                         4
search phind new tab                                  4
search phind using clipboard current tab              3
none          

```
Generating responses for 100 rows... Done! in 02:19
[1390.6] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 0.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%

Exact same model loaded two different ways:
0: Using TGI with & w/o --dtype bfloat16 flag
   docker run --name huggingface-tgi --gpus all --shm-size 1g -p 3000:3000 -v `pwd`:/data/model  ghcr.io/huggingface/text-generation-inference:1.3.4 --dtype bfloat16 --sharded false --num-shard 1 --port 3000 --model-id /data/model

1: Using jupyter notebook with raw model file: 
   low_cpu_mem_usage=True, 
   use_cache=True, 
   attn_implementation="flash_attention_2",
   torch_dtype=torch.bfloat16

Wed Jan 24 11:27:02 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   39C    P2              69W / 450W |  23146MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   43C    P8              24W / 450W |  15366MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     10768      C   /opt/conda/bin/python3.10                 23136MiB |
|    1   N/A  N/A      7765      C   /usr/bin/python3                          15356MiB |
+---------------------------------------------------------------------------------------+
```

## Run benchmark on TGI service listening on port 3000

In [25]:
%autoreload

# Point the prompt generator / validator at the TGI server (container address, port 3000).
tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://172.17.0.4:3000", debug=True )
# tgi_validator  = XmlFineTuningPromptGenerator( tgi_url="http://localhost:3000", debug=True )

# Label handed to generate_responses() and used in the stats title.
model_name     = "mistralai/Mistral-7B-Instruct-v0.2-raw-bfloat16"

# Draw a fixed sample (random_state=42) from the validation set so runs are comparable.
sample_size    = 10
validate_df    = pd.read_json( "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True ).sample( sample_size, random_state=42 )
# `validate_df` is rebound at each stage: sampled rows -> generated responses -> validated responses.
validate_df    = tgi_validator.generate_responses( validate_df, switch="tgi", model_name=model_name )
validate_df    = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats( validate_df, title=f"Validation Stats for {sample_size} rows with `{model_name}` on TGI:3000" )

# Generating responses for 10 rows... Done! in 7 seconds
# [771.2] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for 10 rows with `mistralai/Mistral-7B-Instruct-v0.2-raw-bfloat16` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 100.0%
# Response has correct values 100.0%
#  Browser command is correct 100.0%
#             Args is correct 100.0%

Commands file for command [go to current tab] exists: True
Commands file for command [go to new tab] exists: True
Commands file for command [search google current tab] exists: True
Commands file for command [search google new tab] exists: True
Commands file for command [search google scholar current tab] exists: True
Commands file for command [search google scholar new tab] exists: True
Commands file for command [search current tab] exists: True
Commands file for command [search new tab] exists: True
Commands file for command [search perplexity current tab] exists: True
Commands file for command [search perplexity new tab] exists: True
Commands file for command [search phind current tab] exists: True
Commands file for command [search phind new tab] exists: True

Commands file for command [search using clipboard current tab] exists: True
Commands file for command [search using clipboard new tab] exists: True
Commands file for command [search google using clipboard current tab] exists: T

## Quantize using AWQ (Activation-aware Weight Quantization) and write to disk

In [6]:
# !pip install autoawq

In [6]:
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/" )
print( os.getcwd() )

/var/model/models/Mistral-7B-Instruct-v0.2


In [8]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Quantization settings passed to quantize(): 4-bit weights (w_bit), group size 128, zero point enabled.
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4 }

# Load model and tokenizer
# NOTE: `merged_path` is not defined in this cell; it comes from the earlier merge section
# ("./merged-00-2024.01.23"), so that cell must have been run in this kernel and the working
# directory must be the one that relative path was written under.
raw_16bit_model     = AutoAWQForCausalLM.from_pretrained( merged_path, device_map="auto", safetensors=True )
raw_16bit_tokenizer = AutoTokenizer.from_pretrained( merged_path, use_fast=True )

# Quantize
# The recorded output shows a dataset download before the AWQ progress bar, and the
# 32-step pass taking about 10 minutes.
raw_16bit_model.quantize( raw_16bit_tokenizer, quant_config=quant_config )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [09:46<00:00, 18.33s/it]


In [11]:
# Save quantized model
# The tokenizer is written to the same directory as the quantized weights.
# `awq_path` is relative to the current working directory.
awq_path = "./merged-00-2024.01.23.awq"
raw_16bit_model.save_quantized( awq_path, safetensors=True )
raw_16bit_tokenizer.save_pretrained( awq_path )

('./merged-00-2024.01.23.awq/tokenizer_config.json',
 './merged-00-2024.01.23.awq/special_tokens_map.json',
 './merged-00-2024.01.23.awq/tokenizer.model',
 './merged-00-2024.01.23.awq/added_tokens.json',
 './merged-00-2024.01.23.awq/tokenizer.json')

## GPU RAM after quantizing model with 4bit AWQ
```
Wed Jan 24 12:06:36 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   37C    P8              27W / 450W |   1320MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   44C    P8              24W / 450W |   2084MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     15181      C   /usr/bin/python3                           1310MiB |
|    1   N/A  N/A     15181      C   /usr/bin/python3                           2074MiB |
+---------------------------------------------------------------------------------------+
```

## Validate AWQ model: In memory loaded by Jupyter notebook

In [9]:
import os
os.chdir( "/var/model/models/Mistral-7B-Instruct-v0.2/" )
print( os.getcwd() )

/var/model/models/Mistral-7B-Instruct-v0.2


In [10]:
from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the quantized checkpoint written by the quantize/save cells onto GPU 1.
# NOTE(review): this uses from_pretrained(), the same call the quantize cell uses for the
# un-quantized model; AutoAWQ also provides from_quantized() for already-quantized checkpoints —
# confirm which loader is intended here.
# NOTE(review): the output recorded under this cell is an ImportError asking for `autoawq`,
# i.e. the kernel that last ran it did not have the library installed.
awq_path      = "./merged-00-2024.01.23.awq"
model_aqw     = AutoAWQForCausalLM.from_pretrained( awq_path, device_map="cuda:1", safetensors=True )
tokenizer_awq = AutoTokenizer.from_pretrained( awq_path, use_fast=True )

ImportError: Loading an AWQ quantized model requires auto-awq library (`pip install autoawq`)

In [7]:
run_validation( model_aqw, tokenizer_awq )

------------------------------------------------------------------------------------------------------------------------
- Validating mistralai/Mistral-7B-Instruct-v0.2 w/ 100 samples
------------------------------------------------------------------------------------------------------------------------

command
go to new tab                                        13
search new tab                                       12
search google scholar current tab                    10
search google new tab                                 8
search google scholar new tab                         8
search google current tab                             7
search phind current tab                              6
search current tab                                    6
go to current tab                                     5
search perplexity current tab                         4
search phind new tab                                  4
search phind using clipboard current tab              3
none          

## GPU RAM after loading & validating AWQ model with 4bit AWQ: Device 1
```
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
|  0%   43C    P8              22W / 450W |   5578MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
```

## - Validation stats for model mistralai/Mistral-7B-Instruct-v0.2: ~40 Tokens/s!
```
Generating responses for 100 rows... Done! in 01:41
[1014.6] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation stats for model mistralai/Mistral-7B-Instruct-v0.2
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 100.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%
```

## Validate AWQ model: TGI service listening on port 3000

In [12]:
# 192.168.0.188
# Point the prompt generator / validator at the TGI server (container address, port 3000).
tgi_validator  = XmlFineTuningPromptGenerator( path_prefix="/var/model/genie-in-the-box", tgi_url="http://172.17.0.4:3000", debug=True )

# Label handed to generate_responses() and used in the stats title; the commented-out
# alternative labels a different run whose results are kept further down in this cell.
model_name     = "mistralai/Mistral-7B-Instruct-v0.2-AWQ"
# model_name     = "Phind-CodeLlama-34B-v2 w/ BnB 4nf"

# Fixed 100-row sample (random_state=42) so the runs recorded below are comparable.
validate_df    = pd.read_json( "/var/model/genie-in-the-box/src/ephemera/prompts/data/voice-commands-xml-validate.jsonl", lines=True ).sample( 100, random_state=42 )
# `validate_df` is rebound at each stage: sampled rows -> generated responses -> validated responses.
validate_df    = tgi_validator.generate_responses( validate_df, switch="tgi", model_name=model_name )
validate_df    = tgi_validator.validate_responses( validate_df )

tgi_validator.print_validation_stats( validate_df, title=f"Validation Stats for `{model_name}` on TGI:3000" )

# Generating responses for 100 rows... Done! in 50 seconds
# [502.1] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-AWQ` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 100.0%
# Response has correct values 100.0%
#  Browser command is correct 100.0%
#             Args is correct 100.0%


# Generating responses for 100 rows... Done! in 01:12
# [722.3] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-BnB-4nf` on TGI:3000 with BnB 4nf 
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 99.0%
# Response has correct values 99.0%
#  Browser command is correct 100.0%
#             Args is correct 99.0%


# Generating responses for 100 rows... Done! in 02:26
# [1461.4] ms per item
# 
# ------------------------------------------------------------------------------------------------------------------------
# - Validation Stats for `Phind-CodeLlama-34B-v2 w/ BnB 4nf` on TGI:3000
# ------------------------------------------------------------------------------------------------------------------------
# 
#                Is valid xml 100.0%
#           Contains response 100.0%
#  Contains <browser-command> 100.0%
#             Contains <args> 100.0%
#           Response is exact 42.0%
# Response has correct values 42.0%
#  Browser command is correct 46.0%
#             Args is correct 82.0%
# 
# Mon Jan 22 13:23:25 2024
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
# |-----------------------------------------+----------------------+----------------------+
# | GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
# |                                         |                      |               MIG M. |
# |=========================================+======================+======================|
# |   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
# |  0%   40C    P8              29W / 450W |  18064MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# |   1  NVIDIA GeForce RTX 4090        On  | 00000000:02:00.0 Off |                  Off |
# |  0%   45C    P8              22W / 450W |   4994MiB / 24564MiB |      0%      Default |
# |                                         |                      |                  N/A |
# +-----------------------------------------+----------------------+----------------------+
# 
# +---------------------------------------------------------------------------------------+
# | Processes:                                                                            |
# |  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
# |        ID   ID                                                             Usage      |
# |=======================================================================================|
# |    0   N/A  N/A     22240      C   /opt/conda/bin/python3.10                 18054MiB |
# |    1   N/A  N/A     23207      C   /usr/bin/python3                           4984MiB |
# +---------------------------------------------------------------------------------------+

Commands file for command [go to current tab] exists: True
Commands file for command [go to new tab] exists: True
Commands file for command [search google current tab] exists: True
Commands file for command [search google new tab] exists: True
Commands file for command [search google scholar current tab] exists: True
Commands file for command [search google scholar new tab] exists: True
Commands file for command [search current tab] exists: True
Commands file for command [search new tab] exists: True
Commands file for command [search perplexity current tab] exists: True
Commands file for command [search perplexity new tab] exists: True
Commands file for command [search phind current tab] exists: True
Commands file for command [search phind new tab] exists: True

Commands file for command [search using clipboard current tab] exists: True
Commands file for command [search using clipboard new tab] exists: True
Commands file for command [search google using clipboard current tab] exists: T

## GPU RAM after loading AWQ model: ~83 Tokens/s!
```
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   42C    P2              70W / 450W |  20240MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

Generating responses for 100 rows... Done! in 50 seconds
[502.1] ms per item

------------------------------------------------------------------------------------------------------------------------
- Validation Stats for `mistralai/Mistral-7B-Instruct-v0.2-AWQ` on TGI:3000
------------------------------------------------------------------------------------------------------------------------

               Is valid xml 100.0%
          Contains response 100.0%
 Contains <browser-command> 100.0%
            Contains <args> 100.0%
          Response is exact 100.0%
Response has correct values 100.0%
 Browser command is correct 100.0%
            Args is correct 100.0%
```

## See: [Phind advice for freeing GPU RAM](https://www.phind.com/search?cache=kh81ys0uelwxs8zpykdzv0d8)
### It worked!  

In [5]:
# Accomplishes the same thing as the manual cleanup kept commented out below
# (null the references, gc.collect(), torch.cuda.empty_cache()).

dupt.release_gpu_memory( model_aqw )

# import gc
# import torch
# 
# model_aqw.device = torch.device( "cpu" )
# tokenizer_awq.device = torch.device( "cpu" )
# 
# model_aqw     = None
# tokenizer_awq = None
# 
# gc.collect()
# torch.cuda.empty_cache()

In [7]:
%autoreload

import os

from awq          import AutoAWQForCausalLM
from transformers import AutoTokenizer

os.chdir( "/var/model/genie-in-the-box/src" )
print( os.getcwd() )
import lib.utils.util_pytorch as dupt

print( os.getcwd() )

# Module-level slots that load_llm_once() fills in.  Re-running this cell resets both to None,
# so the load_llm_once() call at the bottom of the cell will load the model again.
model_aqw     = None
tokenizer_awq = None

def load_commands_llm( model_dir="/var/model/models/Mistral-7B-Instruct-v0.2/", awq_path="./merged-00-2024.01.23.awq", device_map="cuda:1" ):
    """
    Load the AWQ commands model and its tokenizer from disk.

    The three settings that used to be hard-coded in the body are now keyword
    parameters whose defaults are the previous values, so calling this function
    with no arguments behaves exactly as before.

    Side effect: the process working directory is changed to `model_dir` and is
    NOT restored afterwards, so a relative `awq_path` is resolved against it.

    :param model_dir:  Directory to chdir into before loading.
    :param awq_path:   Path of the AWQ checkpoint directory (relative paths are
                       resolved against `model_dir`).
    :param device_map: Device map handed to AutoAWQForCausalLM.from_pretrained.
    :return: Tuple of ( model, tokenizer ).
    """
    print( "Loading AWQmodel..." )

    # TODO: Pull the defaults above from a configuration object
    os.chdir( model_dir )

    # Locals are deliberately not named like the module-level `model_aqw` /
    # `tokenizer_awq` slots, to make clear this function does not assign them.
    model     = AutoAWQForCausalLM.from_pretrained( awq_path, device_map=device_map, safetensors=True )
    tokenizer = AutoTokenizer.from_pretrained( awq_path, use_fast=True )

    return model, tokenizer


def load_llm_once():
    """
    Populate the module-level `model_aqw` / `tokenizer_awq` pair on first use.

    Whether a load is needed is decided solely by `model_aqw` being None; when it
    is already set, a notice is printed and nothing else happens.
    """
    global model_aqw, tokenizer_awq

    # Guard clause: a model is already held, so there is nothing to load.
    if model_aqw is not None:
        print( "Model already loaded!" )
        return

    model_aqw, tokenizer_awq = load_commands_llm()

load_llm_once()
        

/var/model/genie-in-the-box/src
/var/model/genie-in-the-box/src
Loading AWQmodel...


In [6]:
dupt.release_gpu_memory( model_aqw )
dupt.release_gpu_memory( tokenizer_awq )

In [8]:
load_llm_once()

Model already loaded!
