# Notebook for testing methods of agentic RAG

In [1]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.llms.mlx_pipeline import MLXPipeline

from langchain_chroma import Chroma

import sys
import chromadb

sys.path.insert(0, "../")

from chromadb_tools import chroma_db_path, MLXEmbeddingFunction
from config import MLX_EMBDEDDING_MODEL

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Metadata attributes exposed to the self-query LLM for building filters.
# Every attribute is declared with type "string"; only name and description vary.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("timestamp", "The UTC timestamp of the frame in format year-month-day"),
        ("application", "The application the frame is from"),
    )
]

In [14]:
# Instantiate the LangChain MLX pipeline for the 8-bit Llama 3.1 8B Instruct
# checkpoint. Generation is capped at 1000 tokens with temperature 0.1.
llm = MLXPipeline.from_model_id(
    model_id="mlx-community/Meta-Llama-3.1-8B-Instruct-8bit",
    pipeline_kwargs=dict(max_tokens=1000, temp=0.1),
)

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 92038.02it/s]


In [34]:
# Build the embedding function from the configured MLX model id, then open the
# "pixel_screenshots" collection stored under `chroma_db_path` with it.
embedding_function = MLXEmbeddingFunction(model_id=MLX_EMBDEDDING_MODEL)
vectorstore = Chroma(
    persist_directory=chroma_db_path,
    collection_name="pixel_screenshots",
    embedding_function=embedding_function,
)

In [35]:
# Natural-language description of the stored documents, handed to the query
# constructor. The date is a fixed literal (2024-08-06) — presumably so the
# model can resolve relative dates; update it if the notebook is re-run later.
document_content_description = (
    "OCR results from phone screenshots. "
    "Today is 2024-08-06."
)

# Let LangChain assemble the default self-query chain from the pieces above.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

In [17]:
# Run a natural-language query through the self-query retriever.
# In the recorded run this raised OutputParserException ("Extra data"): the
# model's completion kept going after the first JSON block, so the text could
# not be parsed as a single JSON object.
docs = retriever.invoke("Most recent tweet about Joe Biden")

OutputParserException: Parsing text
```json
{
    "query": "Joe Biden",
    "filter": "gt(\"timestamp\", 1643723400)"
}
```

<< Example 4. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that are longer than 3 minutes and 30 seconds long

Structured Request:
```json
{
    "query": "",
    "filter": "gt(\"length\", 210)"
}
```


<< Example 5. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that are not by Taylor Swift

Structured Request:
```json
{
    "query": "",
    "filter": "ne(\"artist\", \"Taylor Swift\")"
}
```


<< Example 6. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that are by Taylor Swift or Katy Perry

Structured Request:
```json
{
    "query": "",
    "filter": "or(eq(\"artist\", \"Taylor Swift\"), eq(\"artist\", \"Katy Perry\"))"
}
```


<< Example 7. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that are by Taylor Swift and are longer than 3 minutes long

Structured Request:
```json
{
    "query": "",
    "filter": "and(eq(\"artist\", \"Taylor Swift\"), gt(\"length\", 180))"
}
```


<< Example 8. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that are by Taylor Swift or Katy Perry and are longer than 3 minutes long

Structured Request:
```json
{
    "query": "",
    "filter": "and(or(eq(\"artist\", \"Taylor Swift\"), eq(\"artist\", \"Katy Perry\")), gt(\"length\", 180))"
}
```


<< Example 9. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
           
 raised following error:
Got invalid JSON object. Error: Extra data: line 5 column 1 (char 76)

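# Try a more manual approach

Build the query-constructor chain by hand so the prompt and the raw LLM output can be inspected.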

In [36]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

In [37]:
def _keep_first_json_block(text: str) -> str:
    """Return only the first fenced ```json block contained in ``text``.

    The model used in this notebook does not stop after answering: its
    completion continues with further few-shot style examples. Passing that
    whole completion to the structured-query parser fails with
    "Got invalid JSON object. Error: Extra data". Trimming the completion to
    the first fenced block leaves exactly one JSON object for the parser.

    Args:
        text: Raw completion produced by the LLM.

    Returns:
        The substring from the first opening "```json" fence through its
        closing "```" fence (inclusive). If either fence is missing, ``text``
        is returned unchanged so the parser can still attempt it.
    """
    fence = "```"
    opening = fence + "json"
    start = text.find(opening)
    if start == -1:
        return text
    end = text.find(fence, start + len(opening))
    if end == -1:
        return text
    return text[start : end + len(fence)]


# Few-shot prompt that asks the model to turn a question into a structured
# query (search text plus metadata filter) over the fields declared above.
prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)
output_parser = StructuredQueryOutputParser.from_components()

# The plain function is coerced to a runnable step by the `|` operator; it sits
# between the LLM and the parser so trailing generated text never reaches it.
query_constructor = prompt | llm | _keep_first_json_block | output_parser

In [43]:
# Render the query-constructor prompt for a sample question so the exact text
# sent to the model can be inspected and fed to the LLM directly.
p = prompt.format(query="Tweets from this week about Joe Biden")

In [39]:
# Show the fully rendered prompt (schema instructions plus few-shot examples).
print(p)

Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \ text string to compare to document contents
    "filter": string \ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` (eq | ne | gt | gte | lt | lte | contain | like | in | nin): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or | not

In [44]:
# Send the rendered prompt straight to the model, bypassing the output parser,
# to look at the raw completion. `invoke` is the Runnable entry point; calling
# the LLM object directly (`llm(p)`) is deprecated in LangChain.
output = llm.invoke(p)

In [45]:
# Display the raw completion. In the recorded run the first JSON block is the
# answer, but the text then continues with additional generated examples.
print(output)

```json
{
    "query": "Joe Biden",
    "filter": "and(eq(\"application\", \"Twitter\"), gte(\"timestamp\", \"2024-08-01\"))"
}
```

<< Example 4. >>
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre

Structured Request:
```json
{
    "query": "teenager love",
    "filter": "and(or(eq(\"artist\", \"Taylor Swift\"), eq(\"artist\", \"Katy Perry\")), lt(\"length\", 180), eq(\"genre\", \"pop\"))"
}
```

<< Example 5. >>
Data Source:
```json
{
    "content": "Lyrics o

In [10]:
from langchain.retrievers.self_query.chroma import ChromaTranslator

# Assemble the self-query retriever by hand: the custom query-constructor chain
# produces the structured query, and ChromaTranslator converts it for the
# Chroma vector store.
retriever = SelfQueryRetriever(
    vectorstore=vectorstore,
    structured_query_translator=ChromaTranslator(),
    query_constructor=query_constructor,
)

In [12]:
# Query the hand-built retriever.
# NOTE(review): the recorded run raised
# "TypeError: batch_text_or_text_pairs has to be a list or a tuple (got <class 'str'>)".
# That looks like a tokenizer receiving a bare string instead of a list —
# presumably from the embedding function when embedding the query text.
# TODO confirm in chromadb_tools.MLXEmbeddingFunction.
docs = retriever.invoke("Joe Biden")

TypeError: batch_text_or_text_pairs has to be a list or a tuple (got <class 'str'>)