ecodeclub · longyue0521 · Sep 25, 2025 · Sep 24, 2025
diff --git a/app/config/settings.py b/app/config/settings.py
@@ -33,6 +33,13 @@ class ElasticsearchSettings(BaseModel):
     """Elasticsearch 相关配置"""
 
     url: str
+    number_of_shards: int
+    number_of_replicas: int
+    index_max_result_window: int
+    index_refresh_interval: str
+    index_option_type: str
+    index_option_m: int
+    index_option_ef_construction: int
     metadata_index_suffix: str
     chunk_index_suffix: str
     request_timeout: int = 15
@@ -44,7 +51,6 @@ class EmbedderSettings(BaseModel):
     model_name: str
     dimensions: int
     similarity_metric: str
-    index_type: str
 
 
 class RerankerSettings(BaseModel):
@@ -82,9 +88,16 @@ class RetrievalSettings(BaseModel):
 
     multiplier: int = Field(5, description="召回倍数配置")
     vector_weight: float = Field(2.0, description="向量搜索权重")
+    vector_similarity: float = Field(0.7, description="相似度")
     text_weight: float = Field(1.0, description="文本搜索权重")
 
 
+class SearchSettings(BaseModel):
+    """Search settings; max_top_k caps a request's top_k and must be >= 1."""
+
+    max_top_k: int = Field(50, ge=1, description="最大top_k值限制")
+
+
 class TencentOssSettings(BaseModel):
     """
     腾讯云对象存储相关配置。
@@ -114,6 +127,7 @@ class Settings(BaseSettings):
     storage: StorageSettings
     upload: UploadSettings
     retrieval: RetrievalSettings
+    search: SearchSettings
 
     @property
     def cos_config(self) -> CosConfig:

diff --git a/app/service/elasticsearch.py b/app/service/elasticsearch.py
@@ -97,11 +97,11 @@ def _ensure_metadata_index_exists(self, metadata_index: str) -> None:
         if not self._client.indices.exists(index=metadata_index):
             body = {
                 "settings": {
-                    "number_of_shards": 1,
-                    "number_of_replicas": 0,
+                    "number_of_shards": self._settings.elasticsearch.number_of_shards,
+                    "number_of_replicas": self._settings.elasticsearch.number_of_replicas,
                     "index": {
-                        "max_result_window": 10000,
-                        "refresh_interval": "1s",
+                        "max_result_window": self._settings.elasticsearch.index_max_result_window,
+                        "refresh_interval": self._settings.elasticsearch.index_refresh_interval,
                     },
                 },
                 "mappings": {
@@ -138,11 +138,11 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
         if not self._client.indices.exists(index=chunk_index):
             body = {
                 "settings": {
-                    "number_of_shards": 1,
-                    "number_of_replicas": 0,
+                    "number_of_shards": self._settings.elasticsearch.number_of_shards,
+                    "number_of_replicas": self._settings.elasticsearch.number_of_replicas,
                     "index": {
-                        "max_result_window": 10000,
-                        "refresh_interval": "1s",
+                        "max_result_window": self._settings.elasticsearch.index_max_result_window,
+                        "refresh_interval": self._settings.elasticsearch.index_refresh_interval,
                     },
                 },
                 "mappings": {
@@ -159,9 +159,9 @@ def _ensure_chunk_index_exists(self, chunk_index: str) -> None:
                             "similarity": self._embedder.similarity_metric,
                             "index": True,
                             "index_options": {
-                                "type": self._settings.embedder.index_type,
-                                "m": 32,
-                                "ef_construction": 100,
+                                "type": self._settings.elasticsearch.index_option_type,
+                                "m": self._settings.elasticsearch.index_option_m,
+                                "ef_construction": self._settings.elasticsearch.index_option_ef_construction,
                             },
                         },
                         "chunk_index": {"type": "integer"},
@@ -207,7 +207,9 @@ def store_for_vector_hybrid_search(self, document: Document) -> str:
         metadata_index, chunk_index = self._ensure_indexes_exist(
             document.index_prefix
         )
-
+        logger.info(
+            f"向量混合搜索： 元数据索引名={metadata_index} 分片索引名={chunk_index}"
+        )
         metadata_id = self._create_metadata(metadata_index, document)
         document.id = metadata_id  # 确保 document 对象持有 ID
         logger.info(f"元数据占位符创建成功，ID: {metadata_id}")
@@ -359,10 +361,11 @@ def search(self, parameters: SearchParameters) -> SearchResult:
             )
 
         # 执行ES搜索
+        logger.info(f"在 {parameters.index_name} 上执行查询: {search_body}")
         response = self._client.search(
             index=parameters.index_name, body=search_body
         )
-
+        logger.info(f"查询结果: {response}")
         # 计算搜索耗时
         search_time_ms = int((time.time() - start_time) * 1000)
 
@@ -416,57 +419,57 @@ def _build_hybrid_search_body(
             ES查询体
         """
         # 获取文本查询进行向量化
-        text_query: str | None = None
-        for condition in search_conditions["vector"]:
-            if isinstance(condition.value, str):
-                text_query = condition.value
-        if not text_query:
-            raise ValueError("向量混合搜索需要文本查询内容")
+        text_query = cast("str", search_conditions["vector"][0].value)
 
         # 生成查询向量
         query_vector = self._embedder.embed_documents([text_query])[0]
 
         # 计算召回数量（用于后续重排序）
-        retrieval_size = parameters.limit * self._settings.retrieval.multiplier
+        k = parameters.limit * self._settings.retrieval.multiplier
+        vector_similarity = self._settings.retrieval.vector_similarity
 
         # 获取权重配置
         vector_weight = self._settings.retrieval.vector_weight
         text_weight = self._settings.retrieval.text_weight
 
+        # Keep num_candidates at no less than max(2 * k, 100)
+        num_candidates = max(k * 2, 100)
+
         # 构建混合搜索查询体
         search_body: dict[str, Any] = {
-            "size": retrieval_size,
+            "size": parameters.limit,
             "_source": ["content", "file_metadata_id"],  # 只返回需要的字段
             "knn": {
                 "field": "content_vector",  # 固定向量字段
                 "query_vector": query_vector,
-                "k": retrieval_size,
-                "num_candidates": 100,
+                "k": k,
+                "num_candidates": num_candidates,
                 "boost": vector_weight,
+                "similarity": vector_similarity,
             },
             "query": {
                 "bool": {
-                    "should": [
-                        # 普通匹配
+                    "must": [
                         {
                             "match": {
                                 "content": {
                                     "query": text_query,
-                                    "boost": text_weight * 0.5,
+                                    "boost": text_weight * 0.7,  # 基础匹配权重
                                 }
                             }
-                        },
-                        # 短语匹配
+                        }
+                    ],
+                    "should": [
                         {
                             "match_phrase": {
                                 "content": {
                                     "query": text_query,
-                                    "boost": text_weight * 0.3,
+                                    "boost": text_weight * 0.3,  # 短语匹配加分
                                 }
                             }
-                        },
+                        }
                     ],
-                    "minimum_should_match": 0,
+                    "minimum_should_match": 0,  # should是纯加分项
                 }
             },
         }
@@ -554,7 +557,7 @@ def _convert_to_search_result(
         # 根据搜索类型处理结果
         if is_hybrid_search:
             documents = self._process_hybrid_search_results(
-                cast("str", search_conditions["vector"][0].value), hits, limit
+                cast("str", search_conditions["vector"][0].value), hits
             )
         else:
             documents = self._process_structured_search_results(hits)
@@ -569,7 +572,6 @@ def _process_hybrid_search_results(
         self,
         text_query: str,
         hits: list[dict[str, Any]],
-        limit: int,
     ) -> list[DocumentResult]:
         """
         处理混合搜索结果：去重 + 重排序
@@ -603,7 +605,7 @@ def _process_hybrid_search_results(
                 unique_chunks.append(chunk)
 
         # 重排
-        return self._reranker.rerank(text_query, unique_chunks)[:limit]
+        return self._reranker.rerank(text_query, unique_chunks)
 
     @staticmethod
     def _process_structured_search_results(

diff --git a/app/utils/converters/search.py b/app/utils/converters/search.py
@@ -58,9 +58,11 @@ def request_vo_to_domain(request: SearchRequest) -> SearchParameters:
             conditions = [
                 SearchCondition(
                     field_name=cond.field,
-                    mode=SearchMode.TERM
-                    if cond.op == ConditionOperator.TERM
-                    else SearchMode.MATCH,
+                    mode=(
+                        SearchMode.TERM
+                        if cond.op == ConditionOperator.TERM
+                        else SearchMode.MATCH
+                    ),
                     value=cond.value,
                 )
                 for cond in request.query.conditions
@@ -83,7 +85,7 @@ def result_domain_to_vo(
         if search_type == SearchType.VECTOR_HYBRID:
             results = [
                 VectorHybridSearchResult(
-                    text=doc.content.get("text", ""),
+                    text=doc.content.get("content", ""),
                     file_metadata_id=doc.content.get("file_metadata_id", ""),
                     score=doc.score,
                 )
@@ -100,4 +102,4 @@ def result_domain_to_vo(
                 if doc.id
             ]
 
-        return SearchResponse(type=search_type, results=results)
+        return SearchResponse(results=results)
diff --git a/app/web/document.py b/app/web/document.py
@@ -19,6 +19,7 @@
 from pathlib import Path
 from urllib.parse import urlparse
 
+from elasticsearch import NotFoundError
 from fastapi import (
     APIRouter,
     BackgroundTasks,
@@ -426,6 +427,10 @@ async def search(self, request: SearchRequest) -> SearchResponse:
                 f"✅ 搜索完成, 返回{len(domain_response.documents)}条结果"
             )
             return resp
+        except NotFoundError as e:
+            raise HTTPException(
+                status_code=404, detail=f"索引 {request.query.index} 不存在"
+            ) from e
         except Exception as e:
             logger.error(f"❌ 搜索失败: {e}", exc_info=True)
             raise HTTPException(status_code=500, detail="搜索处理失败") from e

diff --git a/app/web/vo.py b/app/web/vo.py
@@ -20,6 +20,8 @@
 from pydantic import BaseModel, Field, HttpUrl, Json, field_validator
 from pydantic_core.core_schema import ValidationInfo
 
+from app.config.settings import settings
+
 
 class FileUploadResponse(BaseModel):
     """文件上传后的标准响应模型"""
@@ -75,6 +77,16 @@ class Condition(BaseModel):
         ..., description="字段值，支持多种类型"
     )
 
+    @field_validator("value")
+    @classmethod
+    def validate_value_not_empty_string(
+        cls, v: str | int | float | bool
+    ) -> str | int | float | bool:
+        """Reject empty or whitespace-only strings; return other values as-is."""
+        if not isinstance(v, str) or v.strip():
+            return v
+        raise ValueError("字符串类型的查询值不能为空")
+
 
 class Query(BaseModel):
     """查询对象"""
@@ -93,13 +105,16 @@ class SearchRequest(BaseModel):
 
     type: SearchType = Field(..., description="搜索类型")
     query: Query = Field(..., description="查询条件")
-    top_k: int = Field(..., ge=1, description="返回结果数量，至少为1")
+    top_k: int = Field(
+        ...,
+        ge=1,
+        le=settings.search.max_top_k,
+        description="返回结果数量 1 <= top_k <= 配置文件中的max_top_k",
+    )
 
     @field_validator("query")
     @classmethod
-    def validate_query_for_search_type(
-        cls, v: Query, info: ValidationInfo
-    ) -> Query:
+    def validate_query(cls, v: Query, info: ValidationInfo) -> Query:
         """根据搜索类型验证查询条件"""
         search_type = info.data.get("type")
 
@@ -134,7 +149,6 @@ class StructuredSearchResult(BaseModel):
 class SearchResponse(BaseModel):
     """搜索响应"""
 
-    type: SearchType = Field(..., description="搜索类型")  # 保持一致性
     results: list[VectorHybridSearchResult | StructuredSearchResult] = Field(
         default_factory=list, description="搜索结果"
     )

diff --git a/config.yaml b/config.yaml
@@ -1,5 +1,12 @@
 elasticsearch:
   url: "http://localhost:9200"
+  number_of_shards: 1
+  number_of_replicas: 0
+  index_max_result_window: 10000
+  index_refresh_interval: 1s
+  index_option_type: "int8_hnsw"
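+  index_option_m: 32 # Max number of neighbor nodes each node may connect to in the HNSW graph
+  index_option_ef_construction: 100 # Candidate neighbors considered per node while building the index; affects index quality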
   metadata_index_suffix: "_metadatas"
   chunk_index_suffix: "_chunks"
   request_timeout: 60
@@ -26,7 +33,12 @@ upload:
     - ".pdf"
     - ".md"
     - ".txt"
+
 retrieval:
   multiplier: 5        # 召回倍数配置
   vector_weight: 2.0   # 向量搜索权重
-  text_weight: 1.0     # 文本搜索权重
+  vector_similarity: 0.1 # 向量搜索相似度阈值
+  text_weight: 1.0     # 文本搜索权重
+
+search:
+  max_top_k: 50        # 最大top_k值限制