In [13]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import `haystack` modules

In [18]:
import os

# Install HuggingFace Datasets using "pip install datasets"
from datasets import load_dataset
from haystack import Document, Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders import ChatPromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.dataclasses import ChatMessage

# Import LlamaCppChatGenerator
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

Import my custom modules

In [19]:
from modules import utils

Load environment variables

In [20]:
# Load settings from a local env file through the project helper.
# NOTE(review): the file format expected by `utils.load_env_file` is not
# visible from this notebook — check `modules/utils.py`.
env_file_path = "./secrets/env"
env_variables = utils.load_env_file(env_file_path=env_file_path)
# NOTE(review): `api_key` is not referenced by any later cell shown here (the
# pipeline below runs a local llama.cpp model) — confirm it is still needed.
api_key = env_variables["OPENAI_SECRET_KEY"]

Load the first 100 records of the dataset's validation split, convert them to `Document`s, and preview them

In [21]:
# Only the first 100 rows of the validation split are requested (`validation[:100]`).
dataset = load_dataset("pszemraj/simple_wikipedia", split="validation[:100]")

In [24]:
def _as_document(record):
    """Wrap one dataset row in a `Document`, keeping its title and URL as metadata."""
    return Document(
        content=record["text"],
        meta={"title": record["title"], "url": record["url"]},
    )


# One `Document` per dataset row, in dataset order.
docs = list(map(_as_document, dataset))

In [8]:
# Preview every record: title, URL and the first 100 characters of its text,
# each record followed by a blank line.
for doc in dataset:
    preview_lines = (
        f"Title: {doc['title']}",
        f"URL: {doc['url']}",
        f"Text: {doc['text'][:100]}...",
        "",
    )
    print(*preview_lines, sep="\n")

Title: List of wars involving Azerbaijan
URL: https://simple.wikipedia.org/wiki/List%20of%20wars%20involving%20Azerbaijan
Text: This is a list of wars involving the Republic of Azerbaijan and its predecessor states, the Azerbaij...

Title: József Sas
URL: https://simple.wikipedia.org/wiki/J%C3%B3zsef%20Sas
Text: Jozsef Sas (born Jozsef Polacsek; 3 January 1939 - 17 January 2021) was a Hungarian actor, comedian ...

Title: Thomas Bickel
URL: https://simple.wikipedia.org/wiki/Thomas%20Bickel
Text: Thomas Bickel (born 6 October 1963) is a former Swiss football player. He played for Switzerland nat...

Title: Kevin Beattie
URL: https://simple.wikipedia.org/wiki/Kevin%20Beattie
Text: Thomas Kevin Beattie (18 December 1953 - 16 September 2018) was an English footballer. He played as ...

Title: Gloster Gladiator
URL: https://simple.wikipedia.org/wiki/Gloster%20Gladiator
Text: The Gloster Gladiator is a biplane fighter aircraft of the World War II. It was built in UK and used...

Title: Ville

Index documents

In [47]:
# In-memory store configured to compare embeddings with cosine similarity.
doc_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# Install sentence transformers using "pip install sentence-transformers"
doc_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
doc_writer = DocumentWriter(document_store=doc_store)

# Indexing pipeline: the embedder's output is fed into the writer, which
# stores the documents in `doc_store`.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("DocEmbedder", doc_embedder)
indexing_pipeline.add_component("DocWriter", doc_writer)
indexing_pipeline.connect("DocEmbedder", "DocWriter")

# Kept as the cell's last expression so the run summary is displayed.
indexing_inputs = {"DocEmbedder": {"documents": docs}}
indexing_pipeline.run(indexing_inputs)

Batches: 100%|██████████| 4/4 [00:02<00:00,  1.45it/s]


{'DocWriter': {'documents_written': 100}}

In [48]:
# Jinja template for the system message: it loops over `documents` and inlines
# each document's content as context for the model.
SYSTEM_TEMPLATE = """
    Answer the question using the provided context.
    Context:
    {% for doc in documents %}
        {{ doc.content }}
    {% endfor %}
    """
# Template for the user turn; `question` is supplied when the pipeline is run.
USER_TEMPLATE = "Question: {{question}}"
# Fixed assistant turn placed last in the message list.
ASSISTANT_PREFIX = "Answer: "

system_message = ChatMessage.from_system(SYSTEM_TEMPLATE)
user_message = ChatMessage.from_user(USER_TEMPLATE)
# NOTE: the variable name is misspelled ("assistent"); it is kept unchanged
# because it is a notebook-level name that other cells may reference.
assistent_message = ChatMessage.from_assistant(ASSISTANT_PREFIX)

# Message order handed to the prompt builder: system, user, assistant.
chat_template = [system_message, user_message, assistent_message]

Use a local llama.cpp model (OpenChat 3.5, GGUF) as the LLM

In [49]:
# Query pipeline: embed the question, retrieve the closest documents, build the
# chat prompt, generate with the local llama.cpp model, and wrap the reply in
# an answer object together with the retrieved documents.
rag_pipeline = Pipeline()

# Same embedding model as the document embedder used for indexing, so query
# and document embeddings come from the same model.
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# Load the LLM using LlamaCppChatGenerator.
# The GGUF file location is machine-specific: set the LLAMA_MODEL_PATH
# environment variable to override it; the original path remains the default.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/mnt/kalista/models/llm/openchat-3.5-1210.Q3_K_S.gguf",
)
# Fail here with a clear message instead of when the model is first loaded.
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        f"LLM model file not found: {model_path!r}. "
        "Set the LLAMA_MODEL_PATH environment variable to the location of the GGUF model."
    )
# n_ctx / n_batch are forwarded to llama.cpp as the context window and batch size.
generator = LlamaCppChatGenerator(model=model_path, n_ctx=4096, n_batch=128)

rag_pipeline.add_component(instance=text_embedder, name="text_embedder")
# The retriever returns the 3 best-matching documents from `doc_store`.
rag_pipeline.add_component(
    instance=InMemoryEmbeddingRetriever(document_store=doc_store, top_k=3),
    name="retriever",
)
rag_pipeline.add_component(instance=ChatPromptBuilder(template=chat_template), name="prompt_builder")
rag_pipeline.add_component(instance=generator, name="llm")
rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")

rag_pipeline.connect("text_embedder", "retriever")
# Retrieved documents feed both the prompt and the final answer object.
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
rag_pipeline.connect("llm", "answer_builder")
# Kept as the cell's last expression so the pipeline summary is displayed.
rag_pipeline.connect("retriever", "answer_builder.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7d05faeff580>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: ChatPromptBuilder
  - llm: LlamaCppChatGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - retriever.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.messages (List[ChatMessage])
  - llm.replies -> answer_builder.replies (List[ChatMessage])

In [50]:
question = "Which year did the Joker movie release?"

# The same question is sent to the embedder (retrieval), the prompt builder
# (template variable) and the answer builder (`query`).
generation_kwargs = {"max_tokens": 128, "temperature": 0.1}
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": generation_kwargs},
    "answer_builder": {"query": question},
}
result = rag_pipeline.run(pipeline_inputs)

# Print the text of the first answer returned by the answer builder.
answers = result["answer_builder"]["answers"]
generated_answer = answers[0]
print(generated_answer.data)

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /mnt/kalista/models/llm/openchat-3.5-1210.Q3_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = openchat_openchat-3.5-1210
llama_model_loader: - kv   2:                       llama.context_length u32              = 8192
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:          

 The Joker movie was released on October 4, 2019.


In [56]:
# Print the answer text currently stored in `generated_answer`
# (set by whichever query cell was executed last).
print(generated_answer.data)

 The Joker movie was released on October 4, 2019.


In [57]:
question = "Who is John Edward James?"

# Per-component inputs for this run; the question is reused by three components.
generation_kwargs = {"max_tokens": 128, "temperature": 0.1}
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": generation_kwargs},
    "answer_builder": {"query": question},
}
result = rag_pipeline.run(pipeline_inputs)

# Print the text of the first answer returned by the answer builder.
answers = result["answer_builder"]["answers"]
generated_answer = answers[0]
print(generated_answer.data)

Batches: 100%|██████████| 1/1 [00:00<00:00, 23.20it/s]
Llama.generate: 27 prefix-match hit, remaining 511 prompt tokens to eval
llama_perf_context_print:        load time =  195759.14 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   511 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   46208.43 ms /   526 tokens


 There isn't enough information provided to determine who John Edward James is.


In [59]:
question = "What year does the anime Detective Conan end?"

# Per-component inputs for this run; the question is reused by three components.
generation_kwargs = {"max_tokens": 128, "temperature": 0.1}
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": generation_kwargs},
    "answer_builder": {"query": question},
}
result = rag_pipeline.run(pipeline_inputs)

# Print the text of the first answer returned by the answer builder.
answers = result["answer_builder"]["answers"]
generated_answer = answers[0]
print(generated_answer.data)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.72it/s]
Llama.generate: 577 prefix-match hit, remaining 25 prompt tokens to eval
llama_perf_context_print:        load time =  195759.14 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    25 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    22 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    6853.05 ms /    47 tokens


 The anime Detective Conan does not have a definitive end date yet, as it is still ongoing.


In [60]:
question = "Could you compare the performance of Macbook pro m1 and Macbook pro m2?"

# Per-component inputs for this run; the question is reused by three components.
# Generation is capped at 128 new tokens via `max_tokens`.
generation_kwargs = {"max_tokens": 128, "temperature": 0.1}
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": generation_kwargs},
    "answer_builder": {"query": question},
}
result = rag_pipeline.run(pipeline_inputs)

# Print the text of the first answer returned by the answer builder.
answers = result["answer_builder"]["answers"]
generated_answer = answers[0]
print(generated_answer.data)

Batches: 100%|██████████| 1/1 [00:00<00:00, 94.55it/s]
Llama.generate: 28 prefix-match hit, remaining 618 prompt tokens to eval
llama_perf_context_print:        load time =  195759.14 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   618 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   127 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   78730.64 ms /   745 tokens


 The MacBook Pro with M1 and M2 chips are both powerful and efficient Apple processors, but they have some differences in performance and capabilities.

The M1 chip, introduced in November 2020, is a 8-core CPU with 4 performance cores and 4 efficiency cores, and an 8-core GPU. It has a 16-core Neural Engine. The M1 chip has shown impressive performance in various benchmarks, offering excellent performance-per-watt ratios and excellent performance in tasks that benefit from its integrated GPU.

The M2 chip,
