In [1]:
!pip3 install milvus==2.3.3 pymilvus==2.3.3 sentence-transformers

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
# # # # Preferred: fetch the corpus with git clone
# # # !git clone https://github.com/Ac-heron/luxun/

# # # # If GitHub is unreachable, manually upload the zip downloaded from the repository, then extract it
# # # !apt-get update
# # # !apt-get install unzip
# !unzip luxun-master.zip

In [3]:
# Load the Lu Xun corpus: one [book_name, chapter_file_name, full_text] record per chapter file.
import os

# file_path = 'luxun/全集' # use this path after `git clone`
file_path = 'luxun-master/全集' # use this path when the zip archive was extracted
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `data` (and the primary keys later derived from it) stable across runs.
book_names = sorted(os.listdir(file_path))

data = []

for book_name in book_names:
    book_dir = os.path.join(file_path, book_name)
    # Skip hidden entries and anything that is not a directory, so a stray
    # file next to the book folders does not crash the inner listdir().
    if book_name.startswith('.') or not os.path.isdir(book_dir):
        continue
    chapter_names = sorted(os.listdir(book_dir))
    for chapter_name in chapter_names:
        chapter_path = os.path.join(book_dir, chapter_name)
        # Same filtering one level down: hidden entries and non-files are ignored.
        if chapter_name.startswith('.') or not os.path.isfile(chapter_path):
            continue
        with open(chapter_path, 'r', encoding='utf-8') as f:
            data.append([book_name, chapter_name, f.read()])
            

In [4]:
# Download the BGE embedding model from ModelScope and load it
from modelscope import snapshot_download
from sentence_transformers import SentenceTransformer

# snapshot_download returns the local path of the model files; 'bge' is passed as its cache directory.
model_path = snapshot_download(model_id='Xorbits/bge-large-zh-v1.5', cache_dir='bge')
model = SentenceTransformer(model_name_or_path=model_path)

# Prefix that retrieval() puts in front of every query before encoding it;
# passages are encoded without this prefix.
retrieval_instruction = "为这个句子生成表示以用于检索相关文章："

2023-12-10 18:08:33,609 - modelscope - INFO - PyTorch version 2.0.1+cu118 Found.
2023-12-10 18:08:33,612 - modelscope - INFO - TensorFlow version 2.13.0 Found.
2023-12-10 18:08:33,613 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2023-12-10 18:08:33,657 - modelscope - INFO - Loading done! Current index file version is 1.9.5, with md5 569b36a1f8cf6226713a963a2d3762a4 and a total number of 945 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Build the vector search engine on Milvus
from milvus import default_server
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection

# milvus-lite usage (no Docker installation needed, convenient inside a Jupyter environment)
# The server keeps its files under the local 'milvus_data' directory.
default_server.set_base_dir('milvus_data')
# NOTE(review): cleanup() presumably removes data left by a previous run under the base dir — confirm against the milvus-lite docs
default_server.cleanup()
default_server.start()

# Connect to the local server on the port reported by default_server
connections.connect(host='127.0.0.1', port=default_server.listen_port)

In [6]:
# Milvus: declare the schema and create the "luxun" collection
EMBEDDING_DIM = 1024  # bge small: 512, base: 768, large: 1024

# VARCHAR columns and their max_length, in schema order.
varchar_limits = {"book": 200, "chapter": 200, "passage": 4096}

fields = (
    # Primary key is supplied by the caller (auto_id disabled).
    [FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False)]
    + [
        FieldSchema(name=column, dtype=DataType.VARCHAR, max_length=limit)
        for column, limit in varchar_limits.items()
    ]
    + [FieldSchema(name="passage_embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)]
)
schema = CollectionSchema(fields=fields, description="luxun collection")
luxun_milvus = Collection(name="luxun", schema=schema)

In [7]:
# passage切分函数
import re

def prune_text(text):
    """Clean one raw paragraph before it is embedded.

    Args:
        text: A paragraph string as produced by splitting a chapter.

    Returns:
        '' when the paragraph starts with '〔' or with a ``` fence, so a
        later minimum-length filter can drop it. Otherwise the paragraph with
        inline 〔…〕 markers (presumably annotation/footnote references) and
        all newlines, tabs, ASCII spaces and ideographic spaces (U+3000)
        removed.
    """
    # startswith() is safe on the empty string and also catches a paragraph
    # that is exactly "```", which the former `len(text) > 3` check let through.
    if text.startswith(('〔', '```')):
        return ''

    # Remove 〔x〕 where x is any single character (as before) or a run of
    # digits, so multi-digit markers such as 〔12〕 are stripped as well.
    text = re.sub(r'〔(?:\d+|.)〕', '', text)
    # Strip every kind of whitespace handled previously, in a single pass.
    return re.sub(r'[\n\t\u3000 ]', '', text)

def parse_luxun(text, max_len=1024, min_len=10):
    """Split one chapter into cleaned passages.

    The chapter is split wherever a newline is followed by two ideographic
    spaces (U+3000), each piece is cleaned with prune_text(), pieces shorter
    than `min_len` characters after cleaning are dropped, and the remaining
    pieces are cut to at most `max_len` characters.

    Args:
        text: Full text of one chapter.
        max_len: Maximum number of characters kept per passage.
        min_len: Minimum cleaned length for a passage to be kept
            (defaults to 10, the previously hard-coded threshold).

    Returns:
        List of cleaned passage strings, in document order.
    """
    passages = []
    for raw_passage in text.split('\n\u3000\u3000'):
        cleaned = prune_text(raw_passage)
        # The length filter is applied before truncation.
        if len(cleaned) >= min_len:
            passages.append(cleaned[:max_len])
    return passages
    

In [8]:
# Milvus: embed every passage and insert the rows into the collection
import os

from tqdm import tqdm

# One list per schema column, in schema order: pk, book, chapter, passage, passage_embedding.
pk_col, book_col, chapter_col, passage_col, embedding_col = [], [], [], [], []
for book_name, chapter_file, text in tqdm(data):
    # Strip the file extension whatever its length (the previous `[:-3]`
    # slice was only right for 3-character extensions such as ".md").
    chapter_name = os.path.splitext(chapter_file)[0]
    passages = parse_luxun(text)
    if not passages:
        continue
    # Encode all passages of the chapter in a single call instead of one call
    # per passage; rows come back in input order (expected to match the
    # per-passage vectors up to floating-point noise).
    vectors = model.encode(passages, normalize_embeddings=True)
    # Primary keys are consecutive integers starting at 0, in insertion order.
    next_pk = len(pk_col)
    pk_col.extend(range(next_pk, next_pk + len(passages)))
    book_col.extend([book_name] * len(passages))
    chapter_col.extend([chapter_name] * len(passages))
    passage_col.extend(passages)
    embedding_col.extend(vectors)

insert_result = luxun_milvus.insert([pk_col, book_col, chapter_col, passage_col, embedding_col])
# After final entity is inserted, it is best to call flush to have no growing segments left in memory
luxun_milvus.flush()

100%|██████████| 309/309 [02:48<00:00,  1.84it/s]


In [9]:
# Milvus: build the index on the embedding column
# Index type FLAT with inner-product (IP) metric; the vectors are encoded with
# normalize_embeddings=True, so IP on them equals cosine similarity.
index = dict(index_type="FLAT", metric_type="IP")
luxun_milvus.create_index(field_name="passage_embedding", index_params=index)

Status(code=0, message=)

In [41]:
# Milvus: load the collection into memory so it can be searched
luxun_milvus.load()

def retrieval(query, top_k=10):
    """Search the collection for the passages most similar to `query`.

    The query is prefixed with `retrieval_instruction`, encoded with the same
    model (normalized embedding) and matched against the "passage_embedding"
    field using the inner-product metric.

    Args:
        query: Query text.
        top_k: Maximum number of hits to return (defaults to 10, the
            previously hard-coded limit).

    Returns:
        The result object of `luxun_milvus.search` for the single query; each
        hit carries the "pk", "book", "chapter" and "passage" fields.
    """
    q_emb = model.encode(retrieval_instruction + query, normalize_embeddings=True)
    search_params = {"metric_type": "IP"}
    return luxun_milvus.search(
        [q_emb],
        "passage_embedding",
        search_params,
        limit=top_k,
        output_fields=["pk", "book", "chapter", "passage"],
    )

In [55]:
# Run a sample search and print the hits
query = '中国文化问题'
print(f'query:{query}\n')

res = retrieval(query)

for hits in res:
    for i, hit in enumerate(hits):
        # A single print call emits the four lines of one hit:
        # rank, "book-chapter", passage text, and a blank separator line.
        print(
            f'top-{i+1}',
            f'''{hit.entity.get('book')}-{hit.entity.get('chapter')}''',
            hit.entity.get('passage'),
            '',
            sep='\n',
        )

query:中国文化问题

top-1
华盖集续编-马上支日记
中国人总不肯研究自己。从小说来看民族性，也就是一个好题目。此外，则道士思想（不是道教，是方士）与历史上大事件的关系，在现今社会上的势力；孔教徒怎样使“圣道”变得和自己的无所不为相宜；战国游士说动人主的所谓“利”“害”是怎样的，和现今的政客有无不同；中国从古到今有多少文字狱；历来“流言”的制造散布法和效验等等……

top-2
热风-三十八
中国人向来有点自大。——只可惜没有“个人的自大”，都是“合群的爱国的自大”。这便是文化竞争失败之后，不能再见振拔改进的原因。

top-3
三闲集-现今的新文学的概观
中国的文化，便是怎样的爱国者，恐怕也大概不能不承认是有些落后。新的事物，都是从外面侵入的。新的势力来到了，大多数的人们还是莫名其妙。北平还不到这样，譬如上海租界，那情形，外国人是处在中央，那外面，围着一群翻译，包探，巡捕，西崽……之类，是懂得外国话，熟悉租界章程的。这一圈之外，才是许多老百姓。

top-4
而已集-当陶元庆君的绘画展览时
中国现今的一部份人，确是很有些苦闷。我想，这是古国的青年的迟暮之感。世界的时代思潮早已六面袭来，而自己还拘禁在三千年陈的桎梏里。于是觉醒，挣扎，反叛，要出而参与世界的事业--我要范围说得小一点：文艺之业。倘使中国之在世界上不算在错，则这样的情形我以为也是对的。

top-5
而已集-略谈香港
若夫“香江”（案：盖香港之雅称）之于国粹，则确是正在大振兴而特振兴。如六月二十五日《循环日报》“昨日下午督宪府茶会”条下，就说：“（上略）赖济熙太史即席演说，略谓大学堂汉文专科异常重要，中国旧道德与乎国粹所关，皆不容缓视，若不贯彻进行，深为可惜，（中略）周寿臣爵士亦演说汉文之宜见重于当世，及汉文科学之重要，关系国家与个人之荣辱等语，后督宪以华语演说，略谓华人若不通汉文为第一可惜，若以华人而中英文皆通达，此后中英感情必更融洽，故大学汉文一科，非常重要，未可以等闲视之云云。（下略）”

top-6
坟-论睁了眼看
中国的文人，对于人生，——至少是对于社会现象，向来就多没有正视的勇气。我们的圣贤，本来早已教人“非礼勿视”的了；而这“礼”又非常之严，不但“正视”，连“平视”“斜视”也不许。现在青年的精神未可知，在体质，却大半还是弯腰曲背，低眉顺眼，表示着老牌的老成的子弟，驯良的百姓，——

In [54]:
# Shut down the Milvus server
default_server.stop()