Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion app/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ class ElasticsearchSettings(BaseModel):
"""Elasticsearch 相关配置"""

url: str
number_of_shards: int
number_of_replicas: int
index_max_result_window: int
index_refresh_interval: str
index_option_type: str
index_option_m: int
index_option_ef_construction: int
metadata_index_suffix: str
chunk_index_suffix: str
request_timeout: int = 15
Expand All @@ -44,7 +51,6 @@ class EmbedderSettings(BaseModel):
model_name: str
dimensions: int
similarity_metric: str
index_type: str


class RerankerSettings(BaseModel):
Expand Down Expand Up @@ -82,9 +88,16 @@ class RetrievalSettings(BaseModel):

multiplier: int = Field(5, description="召回倍数配置")
vector_weight: float = Field(2.0, description="向量搜索权重")
vector_similarity: float = Field(0.7, description="相似度")
text_weight: float = Field(1.0, description="文本搜索权重")


class SearchSettings(BaseModel):
    """Search-related configuration.

    Attributes:
        max_top_k: Largest number of results a single search request may
            ask for. Defaults to 50 and must be at least 1.
    """

    # The request model bounds ``top_k`` with ``ge=1`` and ``le=max_top_k``;
    # a configured limit below 1 would make every request fail validation,
    # so reject it here, when the configuration is loaded.
    max_top_k: int = Field(50, ge=1, description="最大top_k值限制")


class TencentOssSettings(BaseModel):
"""
腾讯云对象存储相关配置。
Expand Down Expand Up @@ -114,6 +127,7 @@ class Settings(BaseSettings):
storage: StorageSettings
upload: UploadSettings
retrieval: RetrievalSettings
search: SearchSettings

@property
def cos_config(self) -> CosConfig:
Expand Down
70 changes: 36 additions & 34 deletions app/service/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ def _ensure_metadata_index_exists(self, metadata_index: str) -> None:
if not self._client.indices.exists(index=metadata_index):
body = {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_shards": self._settings.elasticsearch.number_of_shards,
"number_of_replicas": self._settings.elasticsearch.number_of_replicas,
"index": {
"max_result_window": 10000,
"refresh_interval": "1s",
"max_result_window": self._settings.elasticsearch.index_max_result_window,
"refresh_interval": self._settings.elasticsearch.index_refresh_interval,
},
},
"mappings": {
Expand Down Expand Up @@ -138,11 +138,11 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
if not self._client.indices.exists(index=chunk_index):
body = {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_shards": self._settings.elasticsearch.number_of_shards,
"number_of_replicas": self._settings.elasticsearch.number_of_replicas,
"index": {
"max_result_window": 10000,
"refresh_interval": "1s",
"max_result_window": self._settings.elasticsearch.index_max_result_window,
"refresh_interval": self._settings.elasticsearch.index_refresh_interval,
},
},
"mappings": {
Expand All @@ -159,9 +159,9 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
"similarity": self._embedder.similarity_metric,
"index": True,
"index_options": {
"type": self._settings.embedder.index_type,
"m": 32,
"ef_construction": 100,
"type": self._settings.elasticsearch.index_option_type,
"m": self._settings.elasticsearch.index_option_m,
"ef_construction": self._settings.elasticsearch.index_option_ef_construction,
},
},
"chunk_index": {"type": "integer"},
Expand Down Expand Up @@ -207,7 +207,9 @@ def store_for_vector_hybrid_search(self, document: Document) -> str:
metadata_index, chunk_index = self._ensure_indexes_exist(
document.index_prefix
)

logger.info(
f"向量混合搜索: 元数据索引名={metadata_index} 分片索引名={chunk_index}"
)
metadata_id = self._create_metadata(metadata_index, document)
document.id = metadata_id # 确保 document 对象持有 ID
logger.info(f"元数据占位符创建成功,ID: {metadata_id}")
Expand Down Expand Up @@ -359,10 +361,11 @@ def search(self, parameters: SearchParameters) -> SearchResult:
)

# 执行ES搜索
logger.info(f"在 {parameters.index_name} 上执行查询: {search_body}")
response = self._client.search(
index=parameters.index_name, body=search_body
)

logger.info(f"查询结果: {response}")
# 计算搜索耗时
search_time_ms = int((time.time() - start_time) * 1000)

Expand Down Expand Up @@ -416,57 +419,57 @@ def _build_hybrid_search_body(
ES查询体
"""
# 获取文本查询进行向量化
text_query: str | None = None
for condition in search_conditions["vector"]:
if isinstance(condition.value, str):
text_query = condition.value
if not text_query:
raise ValueError("向量混合搜索需要文本查询内容")
text_query = cast("str", search_conditions["vector"][0].value)

# 生成查询向量
query_vector = self._embedder.embed_documents([text_query])[0]

# 计算召回数量(用于后续重排序)
retrieval_size = parameters.limit * self._settings.retrieval.multiplier
k = parameters.limit * self._settings.retrieval.multiplier
vector_similarity = self._settings.retrieval.vector_similarity

# 获取权重配置
vector_weight = self._settings.retrieval.vector_weight
text_weight = self._settings.retrieval.text_weight

# Ensure num_candidates is at least 2 * k or 100, whichever is larger
num_candidates = max(k * 2, 100)

# 构建混合搜索查询体
search_body: dict[str, Any] = {
"size": retrieval_size,
"size": parameters.limit,
"_source": ["content", "file_metadata_id"], # 只返回需要的字段
"knn": {
"field": "content_vector", # 固定向量字段
"query_vector": query_vector,
"k": retrieval_size,
"num_candidates": 100,
"k": k,
"num_candidates": num_candidates,
"boost": vector_weight,
"similarity": vector_similarity,
},
"query": {
"bool": {
"should": [
# 普通匹配
"must": [
{
"match": {
"content": {
"query": text_query,
"boost": text_weight * 0.5,
"boost": text_weight * 0.7, # 基础匹配权重
}
}
},
# 短语匹配
}
],
"should": [
{
"match_phrase": {
"content": {
"query": text_query,
"boost": text_weight * 0.3,
"boost": text_weight * 0.3, # 短语匹配加分
}
}
},
}
],
"minimum_should_match": 0,
"minimum_should_match": 0, # should是纯加分项
}
},
}
Expand Down Expand Up @@ -554,7 +557,7 @@ def _convert_to_search_result(
# 根据搜索类型处理结果
if is_hybrid_search:
documents = self._process_hybrid_search_results(
cast("str", search_conditions["vector"][0].value), hits, limit
cast("str", search_conditions["vector"][0].value), hits
)
else:
documents = self._process_structured_search_results(hits)
Expand All @@ -569,7 +572,6 @@ def _process_hybrid_search_results(
self,
text_query: str,
hits: list[dict[str, Any]],
limit: int,
) -> list[DocumentResult]:
"""
处理混合搜索结果:去重 + 重排序
Expand Down Expand Up @@ -603,7 +605,7 @@ def _process_hybrid_search_results(
unique_chunks.append(chunk)

# 重排
return self._reranker.rerank(text_query, unique_chunks)[:limit]
return self._reranker.rerank(text_query, unique_chunks)

@staticmethod
def _process_structured_search_results(
Expand Down
12 changes: 7 additions & 5 deletions app/utils/converters/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ def request_vo_to_domain(request: SearchRequest) -> SearchParameters:
conditions = [
SearchCondition(
field_name=cond.field,
mode=SearchMode.TERM
if cond.op == ConditionOperator.TERM
else SearchMode.MATCH,
mode=(
SearchMode.TERM
if cond.op == ConditionOperator.TERM
else SearchMode.MATCH
),
value=cond.value,
)
for cond in request.query.conditions
Expand All @@ -83,7 +85,7 @@ def result_domain_to_vo(
if search_type == SearchType.VECTOR_HYBRID:
results = [
VectorHybridSearchResult(
text=doc.content.get("text", ""),
text=doc.content.get("content", ""),
file_metadata_id=doc.content.get("file_metadata_id", ""),
score=doc.score,
)
Expand All @@ -100,4 +102,4 @@ def result_domain_to_vo(
if doc.id
]

return SearchResponse(type=search_type, results=results)
return SearchResponse(results=results)
5 changes: 5 additions & 0 deletions app/web/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pathlib import Path
from urllib.parse import urlparse

from elasticsearch import NotFoundError
from fastapi import (
APIRouter,
BackgroundTasks,
Expand Down Expand Up @@ -426,6 +427,10 @@ async def search(self, request: SearchRequest) -> SearchResponse:
f"✅ 搜索完成, 返回{len(domain_response.documents)}条结果"
)
return resp
except NotFoundError as e:
raise HTTPException(
status_code=404, detail=f"索引 {request.query.index} 不存在"
) from e
except Exception as e:
logger.error(f"❌ 搜索失败: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="搜索处理失败") from e
Expand Down
24 changes: 19 additions & 5 deletions app/web/vo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from pydantic import BaseModel, Field, HttpUrl, Json, field_validator
from pydantic_core.core_schema import ValidationInfo

from app.config.settings import settings


class FileUploadResponse(BaseModel):
"""文件上传后的标准响应模型"""
Expand Down Expand Up @@ -75,6 +77,16 @@ class Condition(BaseModel):
..., description="字段值,支持多种类型"
)

@field_validator("value")
@classmethod
def validate_value_not_empty_string(
    cls, v: str | int | float | bool
) -> str | int | float | bool:
    """Reject string values that are empty or contain only whitespace.

    Args:
        v: The condition value being validated.

    Returns:
        The value unchanged when it is not a blank string.

    Raises:
        ValueError: If ``v`` is a string that is empty after stripping
            surrounding whitespace.
    """
    # Non-string values (int, float, bool) are never treated as blank.
    is_blank_text = isinstance(v, str) and not v.strip()
    if is_blank_text:
        raise ValueError("字符串类型的查询值不能为空")
    return v


class Query(BaseModel):
"""查询对象"""
Expand All @@ -93,13 +105,16 @@ class SearchRequest(BaseModel):

type: SearchType = Field(..., description="搜索类型")
query: Query = Field(..., description="查询条件")
top_k: int = Field(..., ge=1, description="返回结果数量,至少为1")
top_k: int = Field(
...,
ge=1,
le=settings.search.max_top_k,
description="返回结果数量 1 <= top_k <= 配置文件中的max_top_k",
)

@field_validator("query")
@classmethod
def validate_query_for_search_type(
cls, v: Query, info: ValidationInfo
) -> Query:
def validate_query(cls, v: Query, info: ValidationInfo) -> Query:
"""根据搜索类型验证查询条件"""
search_type = info.data.get("type")

Expand Down Expand Up @@ -134,7 +149,6 @@ class StructuredSearchResult(BaseModel):
class SearchResponse(BaseModel):
"""搜索响应"""

type: SearchType = Field(..., description="搜索类型") # 保持一致性
results: list[VectorHybridSearchResult | StructuredSearchResult] = Field(
default_factory=list, description="搜索结果"
)
Expand Down
14 changes: 13 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
elasticsearch:
url: "http://localhost:9200"
number_of_shards: 1
number_of_replicas: 0
index_max_result_window: 10000
index_refresh_interval: 1s
index_option_type: "int8_hnsw"
index_option_m: 32 # 控制HNSW图中每个节点可以连接的最大邻居节点数量
index_option_ef_construction: 100 # 索引构建时每个节点考虑的候选邻居数量,影响索引质量。
metadata_index_suffix: "_metadatas"
chunk_index_suffix: "_chunks"
request_timeout: 60
Expand All @@ -26,7 +33,12 @@ upload:
- ".pdf"
- ".md"
- ".txt"

retrieval:
multiplier: 5 # 召回倍数配置
vector_weight: 2.0 # 向量搜索权重
text_weight: 1.0 # 文本搜索权重
vector_similarity: 0.1 # 向量搜索相似度阈值
text_weight: 1.0 # 文本搜索权重

search:
max_top_k: 50 # 最大top_k值限制
Loading