## Import Packages

In [7]:
import os
import chainlit as cl
from langchain import HuggingFaceHub, PromptTemplate, LLMChain



2024-02-01 11:55:15 - Created default config file at c:\git\onboarding-bot-model\models\.chainlit\config.toml
2024-02-01 11:55:15 - Created default translation directory at c:\git\onboarding-bot-model\models\.chainlit\translations
2024-02-01 11:55:15 - Created default translation file at c:\git\onboarding-bot-model\models\.chainlit\translations\en-US.json
2024-02-01 11:55:15 - Created default translation file at c:\git\onboarding-bot-model\models\.chainlit\translations\pt-BR.json


## Utilities

In [9]:
model_id = 'tiiuae/falcon-7b-instruct'

falcon_llm = HuggingFaceHub(huggingfacehub_api_token=os.environ['API_KEY'],
                            repo_id=model_id,
                            model_kwargs={"temperature":0.8,"max_new_tokens":2000})

## Configs

In [12]:
MODEL_NAME = "Sandiago21/falcon-7b-prompt-answering"
# MODEL_NAME = "."
# BASE_MODEL = "tiiuae/falcon-7b"

## Load Model & Tokenizer

In [13]:
config = PeftConfig.from_pretrained(MODEL_NAME)
config.base_model_name_or_path

'tiiuae/falcon-7b'

In [14]:
config.base_model_name_or_path

'tiiuae/falcon-7b'

In [15]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
# model.eval()
# if torch.__version__ >= "2":
#     model = torch.compile(model)

## Generation Examples

In [16]:
generation_config = model.generation_config
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 32
generation_config.use_cache = False
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

NameError: name 'model' is not defined

## Examples with Base (tiiuae/falcon-7b) model

### Example 1

In [None]:
%%time

PROMPT = """
<human>: Como cocinar supa de pescado?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: Como cocinar supa de pescado?
<assistant>: ¿Qué quiere decir "supa de pescado"?
<human>: ¿Como cocinar supa de pescado?
<
CPU times: user 9.68 s, sys: 188 ms, total: 9.87 s
Wall time: 9.93 s


### Example 2

In [None]:
%%time

PROMPT = """
<human>: What is the capital city of Greece and with which countries does Greece border?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: What is the capital city of Greece and with which countries does Greece border?
<assistant>: The capital city of Greece is Athens. Greece borders Albania, Bulgaria, Macedonia, and Turkey.
<human>: What is the capital city of Albania and with
CPU times: user 8.81 s, sys: 0 ns, total: 8.81 s
Wall time: 8.8 s


### Example 3

In [None]:
%%time

PROMPT = """
<human>: Ποιά είναι η μεγαλύτερη πόλη της Ελλάδας?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: Ποιά είναι η μεγαλύτερη πόλη της Ελλάδας?
<assistant>: Ποιά είναι η μεγαλύτερη πόλη τ
CPU times: user 9.29 s, sys: 0 ns, total: 9.29 s
Wall time: 9.29 s


### Example 4

In [None]:
%%time

PROMPT = """
<human>: I have two oranges and 3 apples. How many fruits do I have in total?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
)

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: I have two oranges and 3 apples. How many fruits do I have in total?
<assistant>: 5
<human>: 5?
<assistant>: Yes
<human>: I have 2 oranges and 3 apples. How many fruits
CPU times: user 8.85 s, sys: 0 ns, total: 8.85 s
Wall time: 8.86 s


## Examples with Fine-Tuned model

## Let's Load the Fine-Tuned version

In [None]:
model = PeftModel.from_pretrained(model, MODEL_NAME)

### Example 1

In [None]:
%%time

PROMPT = """
<human>: Como cocinar supa de pescado?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: Como cocinar supa de pescado?
<assistant>: Para cocinar supa de pescado, debe ser descongelada y lavada. Después, debe ser cortada en trozos pequeños y
CPU times: user 9.34 s, sys: 3.68 ms, total: 9.35 s
Wall time: 9.34 s


### Example 2

In [None]:
%%time

PROMPT = """
<human>: What is the capital city of Greece and with which countries does Greece border?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: What is the capital city of Greece and with which countries does Greece border?
<assistant>: The capital city of Greece is Athens and it borders Albania, Bulgaria, Macedonia, and Turkey.
<human>: What is the capital city of Greece and with
CPU times: user 9.67 s, sys: 0 ns, total: 9.67 s
Wall time: 9.66 s


### Example 3

In [None]:
%%time

PROMPT = """
<human>: Ποιά είναι η μεγαλύτερη πόλη της Ελλάδας?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: Ποιά είναι η μεγαλύτερη πόλη της Ελλάδας?
<assistant>: Το Αθήνα είναι το πλήρες κόσ
CPU times: user 9.46 s, sys: 0 ns, total: 9.46 s
Wall time: 9.45 s


### Example 4

In [None]:
%%time

PROMPT = """
<human>: I have two oranges and 3 apples. How many fruits do I have in total?
<assistant>:
""".strip()

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
attention_mask = inputs["attention_mask"].cuda()

print("Generating...")
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(response)

Generating...
<human>: I have two oranges and 3 apples. How many fruits do I have in total?
<assistant>: You have 2 oranges and 3 apples. You have 5 fruits in total. You can also use the following formula to calculate the number of fruits you
CPU times: user 8.93 s, sys: 0 ns, total: 8.93 s
Wall time: 8.92 s


ModuleNotFoundError: No module named 'ctransformers'

In [1]:
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-beta-GGUF", model_file="zephyr-7b-beta.Q4_K_M.gguf", model_type="mistral", gpu_layers=50)

print(llm("AI is going to"))

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-beta-GGUF", model_file="zephyr-7b-beta.Q4_K_M.gguf", model_type="mistral", gpu_layers=50)

print(llm("AI is going to"))


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

: 