# Generator Filter

## Creating Filter with Generator
Membuat filter retrieval dengan menggunakan generator dan custom component di Haystack.

In [1]:
import json
import os
import re
from getpass import getpass
from typing import List

from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
from pymongo import MongoClient

membuat custom component untuk mengambil list material dan category dari database

In [None]:
# Prompt only when the variable is missing or empty, so a kernel that already has
# it configured can "Restart & Run All" without blocking on interactive input.
if not os.environ.get('MONGO_CONNECTION_STRING'):
    os.environ['MONGO_CONNECTION_STRING'] = getpass("Enter your MongoDB connection string: ")

In [None]:
# Prompt only when the variable is missing or empty, so a kernel that already has
# it configured can "Restart & Run All" without blocking on interactive input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

In [5]:
class MongoDBAtlas:
    """Read-only accessor for the ``depato_store`` MongoDB database.

    Exposes the ``name`` values stored in the ``materials`` and ``categories``
    collections as plain lists of strings.
    """

    def __init__(self, mongo_connection_string: str):
        """Connect to MongoDB and bind the two lookup collections.

        Args:
            mongo_connection_string: Connection URI handed to ``MongoClient``.
        """
        self.client = MongoClient(mongo_connection_string)
        self.db = self.client.depato_store
        self.material_collection = self.db.materials
        self.category_collection = self.db.categories

    @staticmethod
    def _names(collection) -> List[str]:
        """Return the non-empty ``name`` value of every document in ``collection``."""
        # Project only ``name`` so whole documents are not transferred just to
        # read a single field.
        cursor = collection.find({}, {"name": 1, "_id": 0})
        # Documents without a usable name are skipped instead of raising KeyError.
        return [doc["name"] for doc in cursor if doc.get("name")]

    def get_materials(self) -> List[str]:
        """Return the names found in the ``materials`` collection."""
        return self._names(self.material_collection)

    def get_categories(self) -> List[str]:
        """Return the names found in the ``categories`` collection."""
        return self._names(self.category_collection)

In [6]:
@component
class GetMaterials:
    """Haystack component that emits the material names stored in MongoDB."""

    def __init__(self):
        """Build the database accessor from the ``MONGO_CONNECTION_STRING`` env var."""
        connection_string = os.environ['MONGO_CONNECTION_STRING']
        self.db = MongoDBAtlas(connection_string)

    @component.output_types(materials=List[str])
    def run(self):
        """Return the material names under the ``materials`` output key."""
        return {"materials": self.db.get_materials()}

In [7]:
@component
class GetCategories:
    """Haystack component that emits the category names stored in MongoDB."""

    def __init__(self):
        """Build the database accessor from the ``MONGO_CONNECTION_STRING`` env var."""
        connection_string = os.environ['MONGO_CONNECTION_STRING']
        self.db = MongoDBAtlas(connection_string)

    @component.output_types(categories=List[str])
    def run(self):
        """Return the category names under the ``categories`` output key."""
        return {"categories": self.db.get_categories()}

membuat pipeline

In [8]:
# Jinja2 prompt template. It is rendered with three variables: `input` (the user
# request), `materials` and `categories` (the allowed values for those fields).
# Every worked example is wrapped in a complete ```json ... ``` fence so the
# examples cannot bleed into each other.
TEMPLATE = """
You are a json generator that have a job to generate json based on the input.
The return json should be in the format:
```json
{
    "operator": "AND",
    "conditions":[
        {"field": "meta.category", "operator":"==", "value": <category>},
        {"field": "meta.material", "operator":"==", "value": <material>},
        {"field": "meta.gender", "operator":"==", "value" : <male|female|unisex>},
        {"field": "meta.price", "operator":<"<="|">="|"==">, "value": <price>}
    ]
}
```
The json key above can be omitted if the value is not provided in the input, so please make sure to only return the keys that are provided in the input.

For the material and category, you can only use the material and category that are provided below:
Materials: [ {% for material in materials %} {{ material }} {% if not loop.last %}, {% endif %} {% endfor %} ]

Categories: [ {% for category in categories %} {{ category }} {% if not loop.last %}, {% endif %} {% endfor %} ]

if the input does not contain any of the keys above, you should return an empty json object like this:
```json
{}
```
Sometimes the material and category can be negated, so you should also handle that by using the operator "!=" for material and category.

Sometimes the material and category is not explicitly mentioned, you should analyze which material and category is the most suitable based on the input, and return the json with the material and category that you think is the most suitable.

Nested conditions are allowed, for nested conditions, you can use "OR" and "AND" as the operator, and the conditions should be in the "conditions" array.

The example of the result are expected to be like this:

1. Input: "can you give me a dress with cotton material?"
   output:
```json
{
    "operator": "AND",
    "conditions": [
        {"field": "meta.material", "operator": "==", "value": "Cotton"},
        {"field": "meta.category", "operator": "==", "value": "Dresses/Jumpsuits"}
    ]
}
```

2. Input: "Give me Shirt that is not made of cotton and has a price less than $100"
output:
```json
{
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "Tops"},
        {"field": "meta.material", "operator": "!=", "value": "Cotton"},
        {"field": "meta.price", "operator": "<=", "value": 100}
    ]
}
```

3. Input: "I want a dress that is not hot and has a price greater than $50"
output:
```json
{
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "Dresses/Jumpsuits"},
        {"field": "meta.price", "operator": ">=", "value": 50},
        {
            "operator": "OR",
            "conditions": [
                {"field": "meta.material", "operator": "==", "value": "Cotton"},
                {"field": "meta.material", "operator": "==", "value": "Polyester"}
            ]
        }
    ]
}
```

4. Input: "i want tops that have price between $20 and $50"
output:
```json
{
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "Tops"},
        {
            "operator": "AND",
            "conditions":[
                {"field": "meta.price", "operator": ">=", "value": 20},
                {"field": "meta.price", "operator": "<=", "value": 50}
            ]
        }
    ]
}
```
5. Input: {{input}}
output:

```

"""

In [9]:
# Assemble the pipeline: two lookup components supply the allowed materials and
# categories, the prompt builder renders TEMPLATE, and the generator calls OpenAI.
pipeline = Pipeline()
pipeline.add_component("materials", GetMaterials())
pipeline.add_component("categories", GetCategories())
pipeline.add_component(
    "prompt_builder",
    PromptBuilder(
        template=TEMPLATE,
        required_variables=["input", "materials", "categories"],
    ),
)
pipeline.add_component(
    "generator",
    OpenAIGenerator(
        model="gpt-4.1",
        # Reference the environment variable instead of embedding the raw key:
        # token-based secrets cannot be serialized with the pipeline.
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
    ),
)

In [10]:
# Feed both lookup lists into the prompt, then the rendered prompt into the LLM.
# `connect` returns the pipeline, so the calls chain and the cell still displays it.
(
    pipeline
    .connect("materials.materials", "prompt_builder.materials")
    .connect("categories.categories", "prompt_builder.categories")
    .connect("prompt_builder.prompt", "generator.prompt")
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000020073253620>
🚅 Components
  - materials: GetMaterials
  - categories: GetCategories
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
🛤️ Connections
  - materials.materials -> prompt_builder.materials (List[str])
  - categories.categories -> prompt_builder.categories (List[str])
  - prompt_builder.prompt -> generator.prompt (str)

In [11]:
# Free-text request that is passed to the prompt builder as its `input` variable.
user_input = "I want to find an Outerwear that is not make me hot"

In [12]:
# Only the prompt builder takes external input; the materials and categories
# components have parameterless `run` methods.
run_inputs = {"prompt_builder": {"input": user_input}}
response = pipeline.run(run_inputs)

In [13]:
response

{'generator': {'replies': ['```json\n{\n    "operator": "AND",\n    "conditions": [\n        {"field": "meta.category", "operator": "==", "value": "Outerwear"},\n        {\n            "operator": "OR",\n            "conditions": [\n                {"field": "meta.material", "operator": "==", "value": "Cotton"},\n                {"field": "meta.material", "operator": "==", "value": "Polyester"}\n            ]\n        }\n    ]\n}\n```'],
  'meta': [{'model': 'gpt-4.1-2025-04-14',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 101,
     'prompt_tokens': 977,
     'total_tokens': 1078,
     'completion_tokens_details': {'accepted_prediction_tokens': 0,
      'audio_tokens': 0,
      'reasoning_tokens': 0,
      'rejected_prediction_tokens': 0},
     'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}}]}}

mengubah response menjadi JSON

In [14]:
def extract_filter(reply: str) -> dict:
    """Extract the JSON filter object from a generator reply.

    Accepts a reply wrapped in a Markdown code fence (with or without the
    ``json`` tag) as well as a bare JSON object.

    Args:
        reply: Raw text returned by the generator.

    Returns:
        The parsed JSON object as a dict (possibly empty).

    Raises:
        ValueError: If the reply contains no parseable JSON object.
    """
    # Prefer the contents of a fenced block; otherwise search the whole reply.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", reply, re.DOTALL)
    candidate = fenced.group(1) if fenced else reply
    # Slice from the first "{" to the last "}" so stray text or a lone fence
    # around the object does not break parsing.
    start, end = candidate.find("{"), candidate.rfind("}")
    if start == -1 or end < start:
        raise ValueError("No JSON found.")
    try:
        parsed = json.loads(candidate[start:end + 1])
    except json.JSONDecodeError as err:
        raise ValueError("No JSON found.") from err
    if not isinstance(parsed, dict):
        raise ValueError("No JSON found.")
    return parsed


response_selected = response['generator']['replies'][0]
try:
    data = extract_filter(response_selected)
except ValueError as err:
    # Report the problem but still define `data` (as "no filter") so the
    # following cells do not fail with a NameError.
    print(err)
    data = {}

In [15]:
data

{'operator': 'AND',
 'conditions': [{'field': 'meta.category',
   'operator': '==',
   'value': 'Outerwear'},
  {'operator': 'OR',
   'conditions': [{'field': 'meta.material',
     'operator': '==',
     'value': 'Cotton'},
    {'field': 'meta.material', 'operator': '==', 'value': 'Polyester'}]}]}