In [ ]:
# ✅ 安装依赖（本地环境可跳过）
# !pip install pymysql pymilvus FlagEmbedding


In [ ]:
import os

import pymysql
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
from FlagEmbedding import BGEM3FlagModel
import numpy as np

# ✅ Connect to MySQL.
# The password is read from the MYSQL_PASSWORD environment variable so that a
# real credential never has to be committed with the notebook; the original
# placeholder is kept as the fallback for backward compatibility.
conn = pymysql.connect(
    host="localhost",
    user="root",
    password=os.environ.get("MYSQL_PASSWORD", "your_password"),
    database="xunfei",
    charset="utf8mb4",
    # DictCursor makes every fetched row a dict keyed by column name.
    cursorclass=pymysql.cursors.DictCursor
)

cursor = conn.cursor()


In [ ]:
# ✅ Maps each MySQL table name to the source_type label stored with its rows.
# Tables are grouped by category; insertion order matches the processing order.
table_mapping = {
    **dict.fromkeys(
        ("上海政府采购公告", "上海政府采购中标结果", "采购合同"),
        "政府采购",
    ),
    **dict.fromkeys(
        ("招标公告和资格预审公告", "中标候选人公示", "中标结果公示"),
        "工程建设",
    ),
}


In [ ]:
def record_to_text(record: dict) -> str:
    """Flatten one row into a single string of ``key：value`` clauses.

    Clauses are joined with, and the result is terminated by, the full stop
    ``。``. Only genuinely missing values are skipped: ``None`` and blank
    strings. Falsy but meaningful values such as ``0`` or ``False`` are kept,
    so that e.g. a zero amount is not silently lost from the text.

    Args:
        record: Mapping of column name to value for one row.

    Returns:
        The flattened text, or ``""`` when no field has a usable value.
    """
    parts = []
    for key, value in record.items():
        if value is None:
            continue
        if isinstance(value, str) and not value.strip():
            continue
        parts.append(f"{key}：{value!s}")

    if not parts:
        # Avoid returning a lone "。" that carries no information.
        return ""
    return "。".join(parts) + "。"


In [ ]:
# ✅ Load the BGE-M3 embedding model with fp16 inference turned off.
model = BGEM3FlagModel(
    "BAAI/bge-m3",
    use_fp16=False,
)


In [ ]:
# ✅ Connect to Milvus and open the collection (it is created on first run only).
connections.connect("default", host="localhost", port="19530")

fields = [
    FieldSchema(name="project_id", dtype=DataType.VARCHAR, max_length=100, is_primary=True),
    FieldSchema(name="source_type", dtype=DataType.VARCHAR, max_length=20),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="dense_vec", dtype=DataType.FLOAT_VECTOR, dim=1024)
]

schema = CollectionSchema(fields)
collection_name = "structured_policy_rag"

# `Collection` has no `list()` method; existence is checked through the
# `utility` module instead.
if utility.has_collection(collection_name):
    collection = Collection(collection_name)
else:
    collection = Collection(name=collection_name, schema=schema)

# Milvus needs an index on the vector field before the collection can be
# loaded, so build one when it is missing.
# NOTE(review): IP is chosen on the assumption that BGE-M3 dense vectors are
# L2-normalised (making IP equivalent to cosine) — confirm before changing.
if not collection.has_index():
    collection.create_index(
        field_name="dense_vec",
        index_params={
            "index_type": "HNSW",
            "metric_type": "IP",
            "params": {"M": 16, "efConstruction": 200},
        },
    )

collection.load()


In [ ]:
# ✅ Process every mapped table and write its rows into Milvus.
BATCH_SIZE = 64          # rows encoded and inserted per round trip
TEXT_MAX_BYTES = 2048    # keep in sync with max_length of the "text" field


def _truncate_utf8(text: str, max_bytes: int) -> str:
    """Return *text* cut to at most *max_bytes* UTF-8 bytes on a character boundary."""
    encoded = text.encode("utf-8")
    if len(encoded) <= max_bytes:
        return text
    # errors="ignore" drops a multi-byte character split by the cut.
    return encoded[:max_bytes].decode("utf-8", errors="ignore")


for table_name, source_type in table_mapping.items():
    cursor.execute(f"SELECT * FROM `{table_name}`")
    rows = cursor.fetchall()
    print(f"读取表 {table_name}：{len(rows)} 条")

    # Rows without a project number cannot be keyed in Milvus, so skip them.
    valid_rows = [row for row in rows if row.get("项目编号")]

    written = 0
    for start in range(0, len(valid_rows), BATCH_SIZE):
        chunk = valid_rows[start:start + BATCH_SIZE]
        texts = [record_to_text(row) for row in chunk]

        # Encode a list (not a single string) so "dense_vecs" is always a 2-D
        # array with one embedding per row; indexing the single-string result
        # with [0] would produce a scalar instead of a vector.
        dense = model.encode(texts, return_dense=True)["dense_vecs"]
        vectors = np.asarray(dense, dtype=np.float32).tolist()

        # Insert chunk by chunk to keep each request small. The embedding uses
        # the full text; only the stored copy is trimmed to fit the schema.
        collection.insert([
            [str(row["项目编号"]) for row in chunk],
            [source_type] * len(chunk),
            [_truncate_utf8(text, TEXT_MAX_BYTES) for text in texts],
            vectors,
        ])
        written += len(chunk)

    if written:
        print(f"✅ 已写入 {written} 条记录到 Milvus")

# Seal the inserted data so it is persisted and visible to later searches.
collection.flush()
