In [4]:
import wikipedia
import json

from typing import List, Optional
from haystack import Pipeline, component

from haystack.components.builders import PromptBuilder

from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.writers import DocumentWriter

from haystack.dataclasses import Document

from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

In [23]:
from haystack_integrations.components.generators.ollama import OllamaGenerator

@component
class QueryExpander:

    def __init__(self, prompt: Optional[str] = None, model: str = "llama3.2"):

        self.query_expansion_prompt = prompt
        self.model = model
        if prompt == None:
          self.query_expansion_prompt = """
          You are part of an information system that processes users queries.
          You expand a given query into {{ number }} queries that are similar in meaning.
          
          Structure:
          Follow the structure shown below in examples to generate expanded queries.
          Examples:
          1. Example Query 1: "climate change effects"
          Example Expanded Queries: ["impact of climate change", "consequences of global warming", "effects of environmental changes"]
          
          2. Example Query 2: ""machine learning algorithms""
          Example Expanded Queries: ["neural networks", "clustering", "supervised learning", "deep learning"]
          
          Your Task:
          Query: "{{query}}"
          Example Expanded Queries:
          """
        builder = PromptBuilder(self.query_expansion_prompt)
        llm = OllamaGenerator(model = self.model)
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=builder)
        self.pipeline.add_component(name="llm", instance=llm)
        self.pipeline.connect("builder", "llm")

    @component.output_types(queries=List[str])
    def run(self, query: str, number: int = 5):
        result = self.pipeline.run({'builder': {'query': query, 'number': number}})
        return result

In [None]:
expander = QueryExpander()
text= expander.run(query="open source nlp frameworks", number=4)

print(text['llm']['replies'])
text  = text['llm']['replies'][0]


['Example Expanded Queries:\n\n["open source natural language processing frameworks", "free machine learning libraries for NLP", "public domain NLP tools and models", "community-driven NLP software libraries"]']
Example Expanded Queries:

["open source natural language processing frameworks", "free machine learning libraries for NLP", "public domain NLP tools and models", "community-driven NLP software libraries"]


In [34]:
print(text)

Example Expanded Queries:

["open source natural language processing frameworks", "free machine learning libraries for NLP", "public domain NLP tools and models", "community-driven NLP software libraries"]


In [35]:
import re


# Regular expression to extract the list items
pattern = r'\["([^"]+)", "([^"]+)", "([^"]+)", "([^"]+)"\]'

# Find the match
match = re.search(pattern, text)

# Extract and print the list items if a match is found
if match:
    items = match.groups()
    print(list(items))
else:
    print("No match found.")

['open source natural language processing frameworks', 'free machine learning libraries for NLP', 'public domain NLP tools and models', 'community-driven NLP software libraries']


In [26]:
import re

# text = """{'queries': ['Here are the expanded queries:\\n\\n1. "open source nlp frameworks"\\nExpanded Queries:\\n["open source natural language processing tools", "free nlp libraries for python", "open source machine learning nlp frameworks", "nlp open source software"]\\n\\n2. "machine learning algorithms"\\nExpanded Queries:\\n["neural networks for classification", "decision trees in machine learning", "random forests for regression", "support vector machines"]\\n\\n3. "data science courses online"\\nExpanded Queries:\\n["online data science certifications", "free data science tutorials", "data science boot camps with certification", "online courses for data science with hands on experience"]',
#   ['open source nlp frameworks']]}"""

# Regular expression to extract the expanded queries lists
pattern = r'Expanded Queries:\\n(\[[^\]]*\])'

# Find all matches
matches = re.findall(pattern, text, re.DOTALL)

# Print the extracted lists
for match in matches:
    print(match)

TypeError: expected string or bytes-like object, got 'dict'