In [1]:
import pymysql
import numpy as np

# ✅ 连接 MySQL
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="1234",
    database="xunfei",
    charset="utf8mb4",
    cursorclass=pymysql.cursors.DictCursor
)

cursor = conn.cursor()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 表名与source_type映射
table_mapping = {
    "上海政府采购公告": "政府采购",
    "上海政府采购中标结果": "政府采购",
    "采购合同": "政府采购",
    "招标公告和资格预审公告": "工程建设",
    "中标候选人公示": "工程建设",
    "中标结果公示": "工程建设"
}


In [3]:
def record_to_text(record: dict) -> str:
    return "。".join([f"{key}：{str(value)}" for key, value in record.items() if value]) + "。"


In [8]:
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel('BAAI/bge-m3',
                      use_fp16=False,
                      pooling_method='cls',
                      devices=['cuda:0'])

def get_embeddings(text):
    embeddings = model.encode(
        text,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=False
    )
    return embeddings

Fetching 30 files: 100%|████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 30002.17it/s]


In [32]:
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
connections.connect("default", host="localhost", port="19530")

fields = [
    FieldSchema(name="project_id", dtype=DataType.VARCHAR, max_length=100, is_primary=True),
    FieldSchema(name="source_type", dtype=DataType.VARCHAR, max_length=20),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR)
]

schema = CollectionSchema(fields, description="Policy Paragraph Embeddings")
collection_name = "AllBiddings"

if utility.has_collection(collection_name):
    Collection(collection_name).drop()
collection = Collection(collection_name, schema, consistency_level="Strong")

dense_index = {"index_type": "HNSW", "metric_type": "L2"}
collection.create_index("dense_vector", dense_index)
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
collection.create_index("sparse_vector", sparse_index)
# col = Collection(col_name)
collection.load()

In [34]:
from tqdm import tqdm
max_length = 8192
for table_name, source_type in table_mapping.items():
    if table_name == "采购合同":
        continue
    cursor.execute(f"SELECT * FROM `{table_name}`")
    rows = cursor.fetchall()
    print(f"读取表 {table_name}：{len(rows)} 条")

    project_ids = []
    source_types = []
    texts = []
    dense_vectors = []
    sparse_vectors = []
    BATCH_SIZE = 100
    
    for row in tqdm(rows, desc='向量生成中...'):
        text = record_to_text(row).replace('"', '')
        if len(text.encode("utf-8")) > 8192:
            continue

        # print(text)
        # break
        try:
            project_ids.append(row["项目编号"])
        except:
            project_ids.append(row["招标项目编号"])
        vector = get_embeddings(text)
        dense_vector = vector['dense_vecs']
        sparse_vector = vector['lexical_weights']
        source_types.append(source_type)
        texts.append(text)
        dense_vectors.append(dense_vector)
        sparse_vectors.append(sparse_vector)
        try:
            if len(project_ids) > BATCH_SIZE:
                collection.insert([project_ids, source_types, texts, dense_vectors, sparse_vectors])
                project_ids = []
                source_types = []
                texts = []
                dense_vectors = []
                sparse_vectors = []
        except:
            continue
    if source_types:
        collection.insert([project_ids, source_types, texts, dense_vectors, sparse_vectors])

print("生成已完成。")

读取表 上海政府采购公告：4096 条


向量生成中...: 100%|███████████████████████████████████████████████████████████████| 4096/4096 [08:42<00:00,  7.85it/s]


读取表 上海政府采购中标结果：4469 条


向量生成中...: 100%|███████████████████████████████████████████████████████████████| 4469/4469 [05:56<00:00, 12.52it/s]


读取表 招标公告和资格预审公告：1088 条


向量生成中...: 100%|███████████████████████████████████████████████████████████████| 1088/1088 [02:18<00:00,  7.88it/s]


读取表 中标候选人公示：999 条


向量生成中...: 100%|█████████████████████████████████████████████████████████████████| 999/999 [01:09<00:00, 14.45it/s]


读取表 中标结果公示：999 条


向量生成中...: 100%|█████████████████████████████████████████████████████████████████| 999/999 [01:14<00:00, 13.42it/s]

生成已完成。





In [35]:
import re
from tqdm import tqdm

pattern = r'项目编号：(.*)'
for table_name, source_type in table_mapping.items():
    if table_name != "采购合同":
        continue
    cursor.execute(f"SELECT * FROM `{table_name}`")
    rows = cursor.fetchall()
    print(f"读取表 {table_name}：{len(rows)} 条")

    project_ids = []
    source_types = []
    texts = []
    dense_vectors = []
    sparse_vectors = []
    BATCH_SIZE = 100
    
    for row in tqdm(rows, desc='向量生成中...'):
        text = record_to_text(row).replace('"', '')
        if len(text.encode("utf-8")) > 8192:
            continue

        # print(text)
        project_id = re.search(pattern, text)
        if not project_id:
            continue
        project_id = project_id.group(1)
        project_ids.append(project_id)
        vector = get_embeddings(text)
        dense_vector = vector['dense_vecs']
        sparse_vector = vector['lexical_weights']
        source_types.append(source_type)
        texts.append(text)
        dense_vectors.append(dense_vector)
        sparse_vectors.append(sparse_vector)
        try:
            if len(project_ids) > BATCH_SIZE:
                collection.insert([project_ids, source_types, texts, dense_vectors, sparse_vectors])
                project_ids = []
                source_types = []
                texts = []
                dense_vectors = []
                sparse_vectors = []
        except:
            continue
    if source_types:
        collection.insert([project_ids, source_types, texts, dense_vectors, sparse_vectors])

print("生成已完成。")

读取表 采购合同：952 条


向量生成中...: 100%|█████████████████████████████████████████████████████████████████| 952/952 [01:56<00:00,  8.19it/s]

生成已完成。



