## Download Model

In [None]:
# %%bash
# # Disabled alternative download: same repository, but pinned with --revision
# # to what appears to be a full commit hash, and saved under a different
# # local directory name (Llama-3-8B-Instruct-exl2) than the model_path the
# # generator configs in this notebook point at.
# huggingface-cli download turboderp/Llama-3-8B-Instruct-exl2 \
#     --revision 97d15f8fb9808afd51d7bccc3c3204ef3714f65a \
#     --local-dir ~/gai/models/Llama-3-8B-Instruct-exl2 \
#     --local-dir-use-symlinks False


In [None]:
%%bash
# Download the "4.0bpw" revision of the model repository into the local
# models directory, storing real files rather than symlinks.
REPO_ID="turboderp/Llama-3-8B-Instruct-exl2"
REVISION="4.0bpw"
TARGET_DIR=~/gai/models/Meta-Llama-3-8B-Instruct-EXL2

huggingface-cli download "$REPO_ID" \
    --revision "$REVISION" \
    --local-dir "$TARGET_DIR" \
    --local-dir-use-symlinks False


## 1. Generating

In [None]:
from gai.gen.ttt.ExLlamav2_TTT import ExLlamav2_TTT

# Settings handed to the ExLlamav2_TTT generator for a single, non-streamed
# completion capped at 100 new tokens.
config = {
    "model_name": "ttt-exllama2-llama3",
    "type": "ttt",
    "engine": "ExLlamaV2_TTT",
    # Relative path; presumably resolved by gai against its own base
    # directory -- TODO confirm.
    "model_path": "models/Meta-Llama-3-8B-Instruct-EXL2",
    "model_basename": "model",
    "max_seq_len": 8192,
    "prompt_format": "llama3",
    "stop_conditions": ["<|eot_id|>"],
    "hyperparameters": {
        "temperature": 0.85,
        "top_p": 0.8,
        "top_k": 50,
        "max_new_tokens": 100,
    },
    "no_flash_attn": True,
    "seed": None,
}

# The conversation ends with an empty assistant turn; presumably this is the
# slot the generated reply is written into -- TODO confirm against gai.
messages = [
    {'role':'system','content':'You are a helpful assistant that can generate short stories. You can generate a short story based on a prompt.'},
    {'role':'user','content':'Tell me a one paragraph short story.'},
    {'role':'assistant','content':''},
]

gen = ExLlamav2_TTT(config)
gen.load()
try:
    response = gen.create(messages=messages, stream=False)
finally:
    # Always unload, even when create() raises, so a failed run does not
    # leave the model loaded in this kernel.
    gen.unload()

# Last expression of the cell: display the completion object.
response

## 2. Streaming

In [None]:
from gai.gen.ttt.ExLlamav2_TTT import ExLlamav2_TTT

# Settings handed to the ExLlamav2_TTT generator for a streamed completion
# capped at 500 new tokens.
config = {
    "model_name": "ttt-exllama2-llama3",
    "type": "ttt",
    "engine": "ExLlamaV2_TTT",
    # Relative path; presumably resolved by gai against its own base
    # directory -- TODO confirm.
    "model_path": "models/Meta-Llama-3-8B-Instruct-EXL2",
    "model_basename": "model",
    "max_seq_len": 8192,
    "prompt_format": "llama3",
    "stop_conditions": ["<|eot_id|>"],
    "hyperparameters": {
        "temperature": 0.85,
        "top_p": 0.8,
        "top_k": 50,
        "max_new_tokens": 500,
    },
    "no_flash_attn": True,
    "seed": None,
}

# The conversation ends with an empty assistant turn; presumably this is the
# slot the generated reply is written into -- TODO confirm against gai.
messages = [
    {'role':'system','content':'You are a helpful assistant that can generate short stories. You can generate a short story based on a prompt.'},
    {'role':'user','content':'Tell me a one paragraph short story.'},
    {'role':'assistant','content':''},
]

gen = ExLlamav2_TTT(config)
gen.load()
try:
    response = gen.create(messages=messages, stream=True)
    # Each streamed chunk carries its text in choices[0].delta.content;
    # chunks with empty/None content are skipped.
    for chunk in response:
        delta_text = chunk.choices[0].delta.content
        if delta_text:
            print(delta_text, end='', flush=True)
finally:
    # Unload only after the stream has been consumed, and also when
    # generation fails part-way, so the model is never left loaded.
    gen.unload()

---
## JSON Mode

In [None]:
from pydantic import BaseModel

from gai.gen.ttt.ExLlamav2_TTT import ExLlamav2_TTT

# Settings handed to the ExLlamav2_TTT generator for a schema-constrained
# completion capped at 400 new tokens.
config = {
    "model_name": "ttt-exllama2-llama3",
    "type": "ttt",
    "engine": "ExLlamaV2_TTT",
    # Relative path; presumably resolved by gai against its own base
    # directory -- TODO confirm.
    "model_path": "models/Meta-Llama-3-8B-Instruct-EXL2",
    "model_basename": "model",
    "max_seq_len": 8192,
    "prompt_format": "llama3",
    "stop_conditions": ["<|eot_id|>"],
    "hyperparameters": {
        "temperature": 0.85,
        "top_p": 0.8,
        "top_k": 50,
        "max_new_tokens": 400,
    },
    "no_flash_attn": True,
    "seed": None,
}


# Define Schema
class Book(BaseModel):
    """Fields to extract from a free-text description of a book."""
    title: str
    summary: str
    author: str
    published_year: int


# Pydantic v2 exposes the JSON schema as `model_json_schema()` and deprecates
# `schema()`; fall back to `schema()` so the cell still runs on Pydantic v1.
if hasattr(Book, "model_json_schema"):
    book_schema = Book.model_json_schema()
else:
    book_schema = Book.schema()

text = """Foundation is a science fiction novel by American writer
Isaac Asimov. It is the first published in his Foundation Trilogy (later
expanded into the Foundation series). Foundation is a cycle of five
interrelated short stories, first published as a single book by Gnome Press
in 1951. Collectively they tell the early story of the Foundation,
an institute founded by psychohistorian Hari Seldon to preserve the best
of galactic civilization after the collapse of the Galactic Empire.
"""

gen = ExLlamav2_TTT(config)
gen.load()
try:
    # The passage is sent as the user turn, followed by an empty assistant
    # turn; the JSON schema of Book is passed via `schema`.
    response = gen.create(
        messages=[{'role':'user','content':text},{'role':'assistant','content':''}],
        schema=book_schema,
    )
finally:
    # Always unload, even when create() raises, so a failed run does not
    # leave the model loaded in this kernel.
    gen.unload()

# Last expression of the cell: display the completion object.
response

---
## Function Calling

In [1]:
from gai.common.notebook import highlight
from gai.gen.ttt.ExLlamav2_TTT import ExLlamav2_TTT

# Settings handed to the ExLlamav2_TTT generator for the tool-calling demo
# (temperature 0, up to 1000 new tokens).
config = {
    "model_name": "ttt-exllama2-llama3",
    "type": "ttt",
    "engine": "ExLlamaV2_TTT",
    # Relative path; presumably resolved by gai against its own base
    # directory -- TODO confirm.
    "model_path": "models/Meta-Llama-3-8B-Instruct-EXL2",
    "model_basename": "model",
    "max_seq_len": 8192,
    "prompt_format": "llama3",
    "stop_conditions": ["<|eot_id|>"],
    "hyperparameters": {
        "temperature": 0,
        "top_p": 0.1,
        "top_k": 50,
        "max_new_tokens": 1000,
    },
    "no_flash_attn": True,
    "seed": None,
}

# A single tool definition ("google") with one required string parameter.
tools = [
    {
        "type": "function",
        "function": {
            "name": "google",
            "description": "The 'google' function is a powerful tool that allows the AI to gather external information from the internet using Google search. It can be invoked when the AI needs to answer a question or provide information that requires up-to-date, comprehensive, and diverse sources which are not inherently known by the AI. For instance, it can be used to find current date, current news, weather updates, latest sports scores, trending topics, specific facts, or even the current date and time. The usage of this tool should be considered when the user's query implies or explicitly requests recent or wide-ranging data, or when the AI's inherent knowledge base may not have the required or most current information. The 'search_query' parameter should be a concise and accurate representation of the information needed.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The search query to search google with. For example, to find the current date or time, use 'current date' or 'current time' respectively."
                    }
                },
                "required": ["search_query"]
            }
        }
    }
]

# (heading, user prompt) pairs: one prompt that should trigger the tool and
# one that should be answered without it.
demo_prompts = [
    ("Model decided to use tool: ", "What time is it in Singapore right now?"),
    ("Model decided not to use tool: ", "Tell me a one paragraph story."),
]

gen = ExLlamav2_TTT(config)
gen.load()
try:
    for heading, user_prompt in demo_prompts:
        highlight(heading)
        response = gen.create(
            messages=[
                {'role':'user','content':user_prompt},
                {'role':'assistant','content':''}],
            tools=tools,
            stream=False,
            max_new_tokens=1000)
        print(response)
finally:
    # create() can raise (e.g. when no valid JSON could be generated for the
    # schema); unload in `finally` so the model is released in that case too.
    # The exception itself still propagates to the notebook.
    gen.unload()


2024-06-30 18:05:22 INFO gai.gen.ttt.ExLlamav2_TTT:[32mExLlama_TTT2.load: Loading model from /home/roylai/gai/models/Meta-Llama-3-8B-Instruct-EXL2[0m


2024-06-30 18:05:34 INFO gai.gen.ttt.ExLlamav2_TTT:[32mExLlama_TTT2.create:
	prompt=`<|begin_of_text|><|start_header_id|>system<|end_header_id|>


            1. Review the <tools> below and assess if any of them is suitable for responding to the user's message.

                [{'type': 'function', 'function': {'name': 'google', 'description': "The 'google' function is a powerful tool that allows the AI to gather external information from the internet using Google search. It can be invoked when the AI needs to answer a question or provide information that requires up-to-date, comprehensive, and diverse sources which are not inherently known by the AI. For instance, it can be used to find current date, current news, weather updates, latest sports scores, trending topics, specific facts, or even the current date and time. The usage of this tool should be considered when the user's query implies or explicitly requests recent or wide-ranging data, or when the AI's inherent knowledge bas

ChatCompletion(id='chatcmpl-6e97190b-86e5-49ba-9df0-07f1a648f7fe', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_eedd6b78-c865-4d1b-acce-644484787909', function=Function(arguments='{"search_query": "current time in Singapore"}', name='google'), type='function')]))], created=1719741938, model='ttt-exllama2-llama3', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=29, prompt_tokens=444, total_tokens=473))


2024-06-30 18:05:38 INFO gai.gen.ttt.ExLlamav2_TTT:[32mExLlama_TTT2.create:
	prompt=`<|begin_of_text|><|start_header_id|>system<|end_header_id|>


            1. Review the <tools> below and assess if any of them is suitable for responding to the user's message.

                [{'type': 'function', 'function': {'name': 'google', 'description': "The 'google' function is a powerful tool that allows the AI to gather external information from the internet using Google search. It can be invoked when the AI needs to answer a question or provide information that requires up-to-date, comprehensive, and diverse sources which are not inherently known by the AI. For instance, it can be used to find current date, current news, weather updates, latest sports scores, trending topics, specific facts, or even the current date and time. The usage of this tool should be considered when the user's query implies or explicitly requests recent or wide-ranging data, or when the AI's inherent knowledge bas

Exception: Failed to generate JSON for schema after maximum number of retries.