In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Dict, List
from langchain_core.documents.base import Document

In [3]:
import tomllib

with open('../.tokens.toml', 'rb') as f:
    _TOKENS = tomllib.load(f)

with open('../.config.toml', 'rb') as f:
    _CONFIGS = tomllib.load(f)

In [4]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=_TOKENS['huggingface'], 
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1"
)

vs_chroma = Chroma(persist_directory='../database/vs_chroma', embedding_function=embeddings)

In [5]:
# # chroma applies filter before semantic sesarch
# vs_chroma.similarity_search_with_score(
#     '谁说过陌生贵己？', 
#     filter={
#         'author': '【中】冯友兰',
#     },
#     k=2,
# )

In [6]:
metadata = vs_chroma.get(include=["metadatas"])

metadata_set = set()

for x in metadata['metadatas']:
    metadata_set = metadata_set.union(list(x.keys()))

metadata_set

{'author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags'}

In [7]:
metadata = _CONFIGS['attributes']
metadata

{'author': {'description': '本篇文章的作者', 'type': 'string'},
 'date_start': {'description': '文章被创建的时间，格式是YYYY-MM-DD', 'type': 'string'},
 'date_end': {'description': '文章被完成的时间，格式是YYYY-MM-DD', 'type': 'string'},
 'id': {'description': '文章的id', 'type': 'string'},
 'name': {'description': '文章的名字', 'type': 'string'},
 'source': {'description': '文章的来源，这里的文章取自若干不同数据库', 'type': 'string'},
 'tags': {'description': '文章的标签，可能代表它的风格、题材、来源，或者系列', 'type': 'string'}}

In [8]:
# ensure there's no more undocumented metadata
assert metadata_set.union(metadata.keys()) == metadata_set

In [9]:
metadata_set

{'author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags'}

In [10]:
from langchain_community.llms import LlamaCpp

llm = LlamaCpp(
    model_path=_CONFIGS['model_path']+'/'+'qwen1_5-7b-chat-q4_0.gguf',
    name='Qwen/Qwen1.5-7B-Chat', 
    **_CONFIGS['llm']
)

                conversation was transferred to model_kwargs.
                Please confirm that conversation is what you intended.
llama_model_loader: loaded meta data with 21 key-value pairs and 387 tensors from /Users/fred/Documents/models/qwen1_5-7b-chat-q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.name str              = Qwen1.5-7B-Chat-AWQ-fp16
llama_model_loader: - kv   2:                          qwen2.block_count u32              = 32
llama_model_loader: - kv   3:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv   4:                     qwen2.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  qwen2.feed_forward_length u32              = 11008
ll

In [11]:
from langchain.chains.query_constructor.base import AttributeInfo

attribute_info = list()

for k, v in metadata.items():
    attribute_info.append(
        AttributeInfo(
            name=k,
            description=v['description'],
            type=v['type']
        )
    )

attribute_info

[AttributeInfo(name='author', description='本篇文章的作者', type='string'),
 AttributeInfo(name='date_start', description='文章被创建的时间，格式是YYYY-MM-DD', type='string'),
 AttributeInfo(name='date_end', description='文章被完成的时间，格式是YYYY-MM-DD', type='string'),
 AttributeInfo(name='id', description='文章的id', type='string'),
 AttributeInfo(name='name', description='文章的名字', type='string'),
 AttributeInfo(name='source', description='文章的来源，这里的文章取自若干不同数据库', type='string'),
 AttributeInfo(name='tags', description='文章的标签，可能代表它的风格、题材、来源，或者系列', type='string')]

# Construct customized self-query retriever

Q: Why not using the standard?
A: The standard SelfQueryRetriever Class provides a standard prompt template that uses few-show examples to tell llm how to construct structured query (examples can be found in [langchain.chains.query_constructor.prompt](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/chains/query_constructor/prompt.py). Most examples inside uses EQ (=) comparator, which isn't suitable for our use cases (mostly fuzzy matches). Therefore, we will reconstruct the self-query retriever using a customized few-shot prompt teamplate.

Q: why do we copied the `get_query_constructor_prompt` provided?
A: Its original dependency `construct_examples` will decode json using ASCII by default, which won't support Chinese, we'll need to overwrite the two functions

```python
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vs_chroma,
    document_contents='Articles and excerpts.',
    metadata_field_info=metadata_field_info,
)
```

References: 
https://python.langchain.com/docs/modules/data_connection/retrievers/self_query/#constructing-from-scratch-with-lcel

In [29]:
from pprint import pprint

with open('../self_query_examples.toml', 'rb') as f:
    self_query_examples = tomllib.load(f)

pprint(self_query_examples['example'][0])

{'structured_request': {'filter': 'or(like("source", "笑死"), in("source", '
                                  '"笑死"), like("tags", "笑死"), in("tags", '
                                  '"笑死"))',
                        'query': '人生有几个不捡'},
 'user_query': '人生有几个不捡？仅从“笑死”中找答案。'}


In [30]:
with open('../self_query_template_chinese.txt', 'r') as f:
    self_query_template = "\n".join(f.readlines())

# with open('../self_query_template.txt', 'r') as f:
#     self_query_template = "\n".join(f.readlines())

In [31]:
from typing import Sequence, Union, Tuple
import json
from langchain.chains.query_constructor.base import _format_attribute_info, get_query_constructor_prompt
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain.chains.query_constructor.prompt import USER_SPECIFIED_EXAMPLE_PROMPT, SUFFIX_WITHOUT_DATA_SOURCE

def _format_attribute_info(info: Sequence[Union[AttributeInfo, dict]]) -> str:
    info_dicts = {}
    for i in info:
        i_dict = dict(i)
        info_dicts[i_dict.pop("name")] = i_dict
    # return json.dumps(info_dicts, indent=4, ensure_ascii=False).replace("{", "{{").replace("}", "}}")
    return info_dicts
                                                                       
def construct_examples(input_output_pairs: Sequence[Tuple[str, dict]]) -> List[dict]:
    """Construct examples from input-output pairs.

    Adapted from: https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/chains/query_constructor/base.py
    """
    examples = []
    for i, (_input, output) in enumerate(input_output_pairs):
        structured_request = (
            json.dumps(output, indent=4, ensure_ascii=False).replace("{", "{{").replace("}", "}}")
        )
        example = {
            "i": i + 1,
            "user_query": _input,
            "structured_request": structured_request,
        }
        examples.append(example)
    return examples

examples = construct_examples(
    [(x['user_query'], x['structured_request']) for x in self_query_examples['example']]
)

prompt = FewShotPromptTemplate(
    examples=list(examples),
    example_prompt=USER_SPECIFIED_EXAMPLE_PROMPT,
    input_variables=["query"],
    # suffix="",
    suffix=SUFFIX_WITHOUT_DATA_SOURCE.format(i=len(examples) + 1),
    prefix=self_query_template.format(
        content_and_attributes=json.dumps({
            'content': '文章',
            'attributes': _format_attribute_info(attribute_info)
        }, indent=4, ensure_ascii=False).replace("{", "{{").replace("}", "}}"),
        attributes_set=str(list(metadata_set))
    )
)

In [32]:
prompt.pretty_print()

你的目标是将用户的查询结构化，以匹配下面提供的请求模式。



<< 结构化请求模式 >>

在回复时，请使用一个Markdown代码片段，其中包含一个按照以下模式格式化的JSON对象：



```json

{

    "query": string \ 用于与文档内容进行比较的文本字符串

    "filter": string \ 用于过滤文档的逻辑条件语句

}

```



查询字符串应仅包含与文档内容匹配的文本。在查询中不应提及任何过滤条件。



逻辑条件语句由一个或多个比较和逻辑操作语句组成。



比较语句采用以下形式：`comp(attr, val)`：

- comp（eq | ne | gt | gte | lt | lte | contain | like | in | nin）：比较器

- attr（字符串）：要应用比较的属性名称

- val（字符串）：比较值



逻辑操作语句采用以下形式 op(statement1, statement2, ...)：



- op（and | or | not）：逻辑运算符

- statement1，statement2，...（比较语句或逻辑操作语句）：要应用操作的一个或多个语句



确保仅使用上述比较器和逻辑运算符，不使用其他任何内容。

确保过滤器仅引用数据源中存在的属性。

确保过滤器仅使用带有其函数名称的属性名称（如果对其应用了函数）。

确保过滤器仅在处理日期数据类型值时使用 YYYY-MM-DD 格式。

确保过滤器考虑到属性的描述，并仅进行与存储的数据类型相符的比较。可用的属性有：['author', 'name', 'tags', 'source', 'date_end', 'date_start', 'id']。禁止擅自添加其他属性。

除非是非确定，否则不要使用 eq 进行比较，鼓励多使用 like 或 in 的语句进行模糊匹配，并尽量多地将模糊匹配应用到所有属性中。

仅在需要时使用过滤器。如果没有可应用的过滤器，请为过滤器值返回 "NO_FILTER"。



<< 数据源 >>

```json

{
    "content": "文章",
    "attributes": {
        "author": {
            "des

In [33]:
from langchain.chains.query_constructor.base import StructuredQueryOutputParser
output_parser = StructuredQueryOutputParser.from_components()

In [34]:
%%time

import langchain
langchain.debug = True

query_constructor = prompt | llm | output_parser

query_constructor.invoke(
    {
        "query": "“每个人都以为他自己至少有一种主要的美德。”是出自哪里？请从“读书笔记（文学）”中找到答案。"
    }
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "“每个人都以为他自己至少有一种主要的美德。”是出自哪里？请从“读书笔记（文学）”中找到答案。"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "“每个人都以为他自己至少有一种主要的美德。”是出自哪里？请从“读书笔记（文学）”中找到答案。"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:FewShotPromptTemplate] [0ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain",
    "prompts",
    "base",
    "StringPromptValue"
  ],
  "kwargs": {
    "text": "你的目标是将用户的查询结构化，以匹配下面提供的请求模式。\n\n\n\n<< 结构化请求模式 >>\n\n在回复时，请使用一个Markdown代码片段，其中包含一个按照以下模式格式化的JSON对象：\n\n\n\n```json\n\n{\n\n    \"query\": string \\ 用于与文档内容进行比较的文本字符串\n\n    \"filter\": string \\ 用于过滤文档的逻辑条件语句\n\n}\n\n```\n\n\n\n查询字符串应仅包含与文档内容匹配的文本。在查询中不应提及任何过滤条件。\n\n\n\n逻辑条件语句由一个或多个比较和逻辑操作语句组成。\n\n\n\n比较语句采用以下形式：`comp(attr, val)`：\n\n- comp（eq | ne | gt | gte

Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:LlamaCpp] [9.64s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "```json\n{ \n    \"query\": \"每个人都以为他自己至少有一种主要的美德。\", \n    \"filter\": \"or(like(\\\"tags\\\", \\\"读书笔记（文学）\\\"), in(\\\"tags\\\", \\\"读书笔记（文学）\\\")), like(\\\"source\\\", \\\"读书笔记（文学）\\\"))\" \n} \n```",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 4:parser:StructuredQueryOutputParser] Entering Parser run with input:
[0m{
  "input": "```json\n{ \n    \"query\": \"每个人都以为他自己至少有一种主要的美德。\", \n    \"filter\": \"or(like(\\\"tags\\\", \\\"读书笔记（文学）\\\"), in(\\\"tags\\\", \\\"读书笔记（文学）\\\")), like(\\\"source\\\", \\\"读书笔记（文学）\\\"))\" \n} \n```"
}
[31;1m[1;3m[chain/error][0m [1m[1:chain:RunnableSequence > 4:parser:StructuredQueryOutputParser] [1ms] Parser run errored with error:
[0m"Output


llama_print_timings:        load time =    6662.11 ms
llama_print_timings:      sample time =      26.98 ms /    70 runs   (    0.39 ms per token,  2594.61 tokens per second)
llama_print_timings: prompt eval time =    4765.89 ms /   306 tokens (   15.57 ms per token,    64.21 tokens per second)
llama_print_timings:        eval time =    4517.55 ms /    69 runs   (   65.47 ms per token,    15.27 tokens per second)
llama_print_timings:       total time =    9639.78 ms /   375 tokens


OutputParserException: Parsing text
```json
{ 
    "query": "每个人都以为他自己至少有一种主要的美德。", 
    "filter": "or(like(\"tags\", \"读书笔记（文学）\"), in(\"tags\", \"读书笔记（文学）\")), like(\"source\", \"读书笔记（文学）\"))" 
} 
```
 raised following error:
Unexpected token Token('COMMA', ',') at line 1, column 53.
Expected one of: 
	* $END


In [23]:
metadata_set

{'author', 'date_end', 'date_start', 'id', 'name', 'source', 'tags'}

In [16]:
%%time

import langchain
langchain.debug = True

query_constructor = prompt | llm | output_parser

query_constructor.invoke(
    {
        "query": "人生有几个不捡？仅从“笑死”中找答案。"
    }
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "人生有几个不捡？仅从“笑死”中找答案。"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "人生有几个不捡？仅从“笑死”中找答案。"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:FewShotPromptTemplate] [0ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain",
    "prompts",
    "base",
    "StringPromptValue"
  ],
  "kwargs": {
    "text": "Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents\n    \"filter\": string \\ logical condition statement for filtering documents\n    \"limit\": int \\ the numb

Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:LlamaCpp] [9.34s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "```json\n{  \n    \"query\": \"人生有几个不捡？\",  \n    \"filter\": \"or(like(\\\"source\\\",\\\"笑死\\\"), in(\\\"source\\\",\\\"笑死\\\"), like(\\\"tags\\\",\\\"笑死\\\"), in(\\\"tags\\\",\\\"笑死\\\"))\"  \n}  \n```\n\nIn this example, the user query is asking for a count of items that follow a certain pattern. The structured request filters the data to only include those where the source or tags contain the string \"笑死\". This ensures that the count being returned is specifically related to the user's question.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 4:parser:StructuredQueryOutputParser] Entering Parser run with input:
[0m{
  "input": "```json\n{  \n    \"query\": \"人生有几个不捡？\",  \n    \"f


llama_print_timings:        load time =    7569.75 ms
llama_print_timings:      sample time =      50.45 ms /   121 runs   (    0.42 ms per token,  2398.41 tokens per second)
llama_print_timings: prompt eval time =     884.49 ms /    19 tokens (   46.55 ms per token,    21.48 tokens per second)
llama_print_timings:        eval time =    7830.46 ms /   120 runs   (   65.25 ms per token,    15.32 tokens per second)
llama_print_timings:       total time =    9330.40 ms /   139 tokens


StructuredQuery(query='人生有几个不捡？', filter=Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.LIKE: 'like'>, attribute='source', value='笑死'), Comparison(comparator=<Comparator.IN: 'in'>, attribute='source', value='笑死'), Comparison(comparator=<Comparator.LIKE: 'like'>, attribute='tags', value='笑死'), Comparison(comparator=<Comparator.IN: 'in'>, attribute='tags', value='笑死')]), limit=None)

In [15]:
%%time 

import langchain
langchain.debug = True

retriever.invoke('人生有几个不捡？仅从“笑死”中找答案。')
# retriever.invoke('什么是我国第一部编年国别史？')

NameError: name 'retriever' is not defined

In [None]:
# from notion_agent import chatbot

# llm = chatbot(
#     'Qwen/Qwen1.5-7B-Chat', 
#     _CONFIGS['model_path']+'/'+'qwen1_5-7b-chat-q4_0.gguf', 
#     **_CONFIGS['llm']
# )