In [1]:
from outlines import models, generate, samplers
from transformers import BitsAndBytesConfig

# bitsandbytes settings requesting 8-bit weight loading; this object is passed
# as the `quantization_config` entry of `model_kwargs` when the model is loaded.
qt8 = BitsAndBytesConfig(load_in_8bit=True)

In [11]:
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever

# Tiny in-memory knowledge base: three unrelated facts, so that a query should
# match exactly one of them. Similarity between embeddings is cosine.
fact_texts = (
    "There are over 7,000 languages spoken around the world today.",
    "Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.",
    "In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.",
)

document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
documents = [Document(content=text) for text in fact_texts]

In [12]:
# Embed the knowledge-base documents and index them in the store.
document_embedder = SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2")
document_embedder.warm_up()  # loads the embedding model before the first run()

documents_with_embeddings = document_embedder.run(documents)["documents"]

# Write with OVERWRITE so this cell can be re-run against the same store:
# with the default policy, writing documents whose ids are already present
# raises DuplicateDocumentError. The call is the last expression so the
# number of written documents is still shown as the cell output.
document_store.write_documents(documents_with_embeddings, policy=DuplicatePolicy.OVERWRITE)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

3

In [16]:
# Query-time pipeline: embed the question text, then look up the nearest
# documents in the in-memory store.
text_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
# Feed the embedder's output vector into the retriever's query input.
# Kept as the last expression so the pipeline summary is displayed.
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

<haystack.core.pipeline.pipeline.Pipeline object at 0x000001B72F545310>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

In [28]:
# Load the instruction-tuned checkpoint on the GPU with the 8-bit
# quantization config, and set up multinomial sampling at temperature 1.0.
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
model = models.transformers(
    checkpoint,
    device="cuda",
    model_kwargs=dict(quantization_config=qt8),
)
sampler = samplers.MultinomialSampler(temperature=1.0)

In [43]:
# Run the retrieval pipeline for a single question and show the raw result.
question = "How many languages are there?"
pipeline_inputs = {"text_embedder": {"text": question}}
result = query_pipeline.run(pipeline_inputs)
print(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'retriever': {'documents': [Document(id=cfe93bc1c274908801e6670440bf2bbba54fad792770d57421f85ffa2a4fcc94, content: 'There are over 7,000 languages spoken around the world today.', score: 0.7557791896534849), Document(id=6f20658aeac3c102495b198401c1c0c2bd71d77b915820304d4fbc324b2f3cdb, content: 'Elephants have been observed to behave in a way that indicates a high level of self-awareness, such ...', score: 0.04221236301567667), Document(id=7f225626ad1019b273326fbaf11308edfca6d663308a4a3533ec7787367d59a2, content: 'In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the ph...', score: -0.0016677476447660253)]}}


In [44]:
# Build the generation prompt from the first retrieved document.
#
# The first document is used as context only when its similarity score clears
# the threshold; otherwise the context is left empty and the model has to
# answer from the question alone.
MIN_CONTEXT_SCORE = 0.2  # minimum retrieval score for a document to be used as context

retrieved = result['retriever']['documents']
# Guard against an empty retrieval result (e.g. nothing indexed yet) instead
# of failing with IndexError on [0].
top_document = retrieved[0] if retrieved else None

if (
    top_document is not None
    and top_document.score is not None
    and top_document.score > MIN_CONTEXT_SCORE
):
    # `or ""` keeps a missing content from being rendered as the text "None".
    best = top_document.content or ""
else:
    best = ""

prompt = f"""<|system|>Answer the following question.
{best}<|question|>{question}<|end|>
<|answer|>"""

In [50]:
# Generate a short free-text answer. Generation ends after 20 tokens or at
# the first stop sequence: sentence end, a prompt-tag fragment, or a newline.
stop_sequences = ['.', '!', '<|', '|>', '\n']

generator = generate.text(model, sampler)
answer = generator(prompt, max_tokens=20, stop_at=stop_sequences)
print(answer)

This is a well-known fact, as <|


In [19]:
# Report the loaded model's memory footprint in gigabytes (bytes / 1e9).
footprint_gb = model.model.get_memory_footprint() / 1e9
print(f'{footprint_gb:.2f} GB')

1.81 GB
