In [2]:
import json
import re
from typing import List, Optional

import wikipedia

from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.writers import DocumentWriter
from haystack.dataclasses import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

In [3]:
from haystack_integrations.components.generators.ollama import OllamaGenerator

@component
class QueryExpander:
    """Expand one user query into several similar queries with an Ollama-served LLM.

    Internally this wraps a two-step pipeline: a ``PromptBuilder`` renders the
    expansion prompt and an ``OllamaGenerator`` produces the reply.
    """

    def __init__(self, prompt: Optional[str] = None, model: str = "llama3.2"):
        """Build the internal prompt -> LLM pipeline.

        :param prompt: Template for the expansion prompt. It is rendered with the
            variables ``query`` and ``number``. When ``None``, the built-in
            few-shot prompt below is used.
        :param model: Model name handed to ``OllamaGenerator``.
        """

        self.query_expansion_prompt = prompt
        self.model = model
        if prompt is None:
          self.query_expansion_prompt = """
          You are part of an information system that processes users queries.
          You expand a given query into {{ number }} queries that are similar in meaning.
          
          Structure:
          Follow the structure shown below in examples to generate expanded queries.
          Examples:
          1. Example Query 1: "climate change effects"
          Example Expanded Queries: ["impact of climate change", "consequences of global warming", "effects of environmental changes"]
          
          2. Example Query 2: ""machine learning algorithms""
          Example Expanded Queries: ["neural networks", "clustering", "supervised learning", "deep learning"]
          
          Your Task:
          Query: "{{query}}"
          Example Expanded Queries:
          """
        builder = PromptBuilder(self.query_expansion_prompt)
        llm = OllamaGenerator(model = self.model)
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=builder)
        self.pipeline.add_component(name="llm", instance=llm)
        self.pipeline.connect("builder", "llm")

    @staticmethod
    def _parse_queries(reply: str) -> List[str]:
        """Pull the expanded queries out of a raw LLM reply.

        Two reply shapes are recognised: a JSON array of strings (the format
        used in the prompt's examples) and a numbered list of double-quoted
        items. An empty list is returned when neither shape is found.
        """
        # Greedy match from the first '[' to the last ']' so any text the
        # model adds around the array is ignored.
        bracketed = re.search(r"\[.*\]", reply, re.DOTALL)
        if bracketed:
            try:
                parsed = json.loads(bracketed.group(0))
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, list):
                from_json = [item for item in parsed if isinstance(item, str)]
                if from_json:
                    return from_json
        # Fallback for replies formatted as: 1. "first query"
        return re.findall(r'\d+\.\s+"([^"]+)"', reply)

    @component.output_types(queries=List[str])
    def run(self, query: str, number: int = 5):
        """Generate ``number`` queries similar in meaning to ``query``.

        :param query: The query to expand.
        :param number: How many expanded queries the prompt asks for.
        :returns: A dict with ``queries`` (the parsed list of expanded queries,
            empty when the reply could not be parsed) plus every key of the raw
            inner-pipeline result, so ``result['llm']['replies']`` remains
            available to existing callers.
        """
        result = self.pipeline.run({'builder': {'query': query, 'number': number}})
        # Guard against a missing or empty reply list instead of raising.
        replies = result.get('llm', {}).get('replies') or []
        queries = self._parse_queries(replies[0]) if replies else []
        return {**result, 'queries': queries}

In [4]:
# Run the expander once on a sample query, asking for four variations.
expander = QueryExpander()
expansion_result = expander.run(query="open source nlp frameworks", number=4)

print(expansion_result['llm']['replies'])
# Keep only the first reply's text under `text`; the result dict keeps its own
# name so `text` always refers to a string.
text = expansion_result['llm']['replies'][0]


['Example Expanded Queries:\n\n1. "open source natural language processing frameworks"\n2. "free open source nlp libraries"\n3. "open source machine learning for nlp"\n4. "community-driven open source nlp tools"']


In [5]:
# print() renders the reply's newlines, unlike the escaped list repr shown earlier.
print(text)

Example Expanded Queries:

1. "open source natural language processing frameworks"
2. "free open source nlp libraries"
3. "open source machine learning for nlp"
4. "community-driven open source nlp tools"


In [8]:
import re

# Matches numbered list entries such as: 1. "some query"
# and captures the text between the double quotes.
pattern = r'\d+\.\s+"([^"]+)"'

# Scan the reply once; both names refer to the same list and are always
# defined, even when nothing matches.
matches = re.findall(pattern, text)
extracted_items = matches

if extracted_items:
    print(extracted_items)
else:
    print("No match found.")


['open source natural language processing frameworks', 'free open source nlp libraries', 'open source machine learning for nlp', 'community-driven open source nlp tools']
