In [15]:
import os

# Optional: uncomment the next line to quiet TensorFlow's C++ logging.
# NOTE(review): it sits before the TensorFlow import, presumably because the
# variable has to be set before TensorFlow loads -- confirm.
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
# Enable memory growth on every GPU TensorFlow reports; nothing happens when
# the list is empty.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

import torch
torch.cuda.empty_cache()   # Release PyTorch's cached CUDA memory to free space on the GPU
# NOTE(review): this value is discarded -- it is not the last expression of
# the cell, so it is never displayed.
torch.cuda.is_available()

from keras import backend as K
K.clear_session()   # Same idea for Keras: clear its session state

2024-05-24 14:11:26.601095: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 14:11:29.922229: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-24 14:11:29.922911: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platf

Model loading

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoConfig
# Local snapshot directory of the nvidia/Llama3-ChatQA-1.5-8B checkpoint.
model_path = "./model/Llama3/models--nvidia--Llama3-ChatQA-1.5-8B/snapshots/2a579cf6db7bbf49b138d4026dae6c8f822fc3de/"

# The tokenizer and the weights are both read from that same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.36s/it]


Vocabulary size and context length

In [17]:
# Vocabulary size is taken as len() of the tokenizer.
vocab_size = len(tokenizer)
# Context length is read from the model configuration.
context_length = model.config.max_position_embeddings

print(f"The vocabulary size is: {vocab_size}")
print(f"The context length is: {context_length} tokens")

The vocabulary size is: 128256
The context length is: 8192 tokens


Custom functions

In [18]:
# Function that allows user to modify the AI's default instruction
def set_instruction(custom_instruction=None):
    """Return the instruction text that is prepended to the user's question.

    Args:
        custom_instruction: Replacement instruction. Any falsy value
            (``None``, ``""``) selects the built-in default instead.

    Returns:
        ``custom_instruction`` when it is truthy, otherwise the default
        instruction string.
    """
    return custom_instruction or 'Please give a full and complete answer for the question'

# Show the instruction that is used when nothing is overridden.
print(set_instruction())

# Function that allows user to modify the system context
def set_system(custom_system=None):
    """Return the system preamble placed at the top of the prompt.

    Args:
        custom_system: Replacement preamble. Any falsy value (``None``,
            ``""``) selects the built-in default instead.

    Returns:
        ``custom_system`` when it is truthy, otherwise the default preamble.
    """
    fallback = "System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context."
    return custom_system or fallback

# Show the preamble that is used when nothing is overridden.
print(set_system())

# Function that reads, if necessary, a document from a file and returns its text for the AI to read; the default code is 'document=""" This is a document """'
# The document is passed to the prompt via formatted_input = get_formatted_input(messages, document)
def set_document(custom_doc=None):
    """Load the context document, if one was requested.

    Args:
        custom_doc: Path of a text file to read. Any falsy value (``None``,
            ``""``) means "no document".

    Returns:
        The full text of the file, or an empty string when no path is given.
    """
    if not custom_doc:
        return ""
    with open(custom_doc, 'r') as handle:
        return handle.read()

# With no path given this prints an empty line.
print(set_document())

# Function that handles user messages.
def create_message(message):
    """Wrap one user utterance in the chat-history structure.

    Args:
        message: Text typed by the user.

    Returns:
        A new one-element list holding a dict with ``"role"`` set to
        ``"user"`` and ``"content"`` set to ``message``.
    """
    return [dict(role="user", content=message)]

Please give a full and complete answer for the question
System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.



In [19]:
# Quick check of the structure create_message produces.
create_message(message="Hello darkness my old friend")

[{'role': 'user', 'content': 'Hello darkness my old friend'}]

Main code: inference

In [20]:
# Test
message = "Hello there"

# Wrap the raw user input in the chat-history structure.
messages = create_message(message=message)

# No path is passed, so the context document is the empty string.
document = set_document(custom_doc=None)
def get_formatted_input(messages, context, system=None, instruction=None):
    """Build the plain-text prompt from the chat history and a context document.

    The result is the system preamble, the context and the conversation,
    each separated by a blank line. The conversation lists every turn as
    "User: ..." or "Assistant: ..." and ends with an open "Assistant:" cue
    for the model to complete.

    The instruction is prepended to the first user turn only. It is added
    to a local copy of the text: the dicts in ``messages`` are never
    modified, so the same history can be formatted repeatedly (for example
    after appending new turns) without the instruction piling up.

    Args:
        messages: Chat history as a list of dicts with "role" and "content"
            keys. Any role other than "user" is rendered as the assistant.
        context: Document text placed between the preamble and the
            conversation; may be an empty string.
        system: System preamble. ``None`` (the default) uses
            ``set_system()``.
        instruction: Instruction for the first user turn. ``None`` (the
            default) uses ``set_instruction()``.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If a message lacks the "role" or "content" key.
    """
    if system is None:
        system = set_system()
    if instruction is None:
        instruction = set_instruction()

    turns = []
    instruction_pending = True
    for item in messages:
        content = item["content"]
        if item["role"] == "user":
            if instruction_pending:
                # Only the first user turn carries the instruction.
                content = instruction + " " + content
                instruction_pending = False
            speaker = "User: "
        else:
            speaker = "Assistant: "
        turns.append(speaker + content)

    conversation = "\n\n".join(turns) + "\n\nAssistant:"
    return system + "\n\n" + context + "\n\n" + conversation

# Build the prompt text and tokenise it on the model's device.
formatted_input = get_formatted_input(messages, document)
tokenized_prompt = tokenizer(tokenizer.bos_token + formatted_input, return_tensors="pt").to(model.device)

# Generation stops on either of these token ids.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids=tokenized_prompt.input_ids,
    attention_mask=tokenized_prompt.attention_mask,
    max_new_tokens=128,
    eos_token_id=terminators,
    # Without this, generate() falls back to the EOS id on its own and logs
    # "Setting `pad_token_id` to `eos_token_id`" on every call; passing the
    # same id explicitly makes that choice visible in the code.
    pad_token_id=tokenizer.eos_token_id,
)

# Drop the prompt tokens so only the newly generated answer is decoded.
response = outputs[0][tokenized_prompt.input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 Hello there! How can I help you?
