In [None]:
# 任务：下载文档
file="https://arxiv.org/pdf/2307.09288.pdf"
!mkdir -p data && wget --user-agent "Mozilla" {file} -O "./data/llama2.pdf"

In [None]:
# 任务：下载文档
file="https://ir.manutd.com/~/media/Files/M/Manutd-IR/documents/manu-20f-2022-09-24.pdf"
!mkdir -p data && wget --user-agent "Mozilla" {file} -P "./data/"

In [None]:
# Task: download the model (q4-quantized variant).
# Reference: (size: 7.73G) https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/tree/main
# Note (from the original author): local_dir is populated through symlinks.
from huggingface_hub import hf_hub_download

# Keep the download coordinates together so they are easy to read and tweak.
llama2_13b_chat_q4 = {
    "repo_id": "TheBloke/Llama-2-13B-chat-GGUF",
    "filename": "llama-2-13b-chat.Q4_0.gguf",
    "local_dir": "./models/",
}

# Last expression of the cell: the value returned by hf_hub_download is displayed.
hf_hub_download(**llama2_13b_chat_q4)

In [None]:
# Task: download the model (q4-quantized variant).
# Note (from the original author): llama.cpp fails to load this model; it errors out.
from huggingface_hub import hf_hub_download

# Keep the download coordinates together so they are easy to read and tweak.
chinese_llama2_7b_q4 = {
    "repo_id": "soulteary/Chinese-Llama-2-7b-ggml-q4",
    "filename": "Chinese-Llama-2-7b-ggml-q4.bin",
    "local_dir": "./models/",
}

# Last expression of the cell: the value returned by hf_hub_download is displayed.
hf_hub_download(**chinese_llama2_7b_q4)

In [None]:
# Task: (local microservice) start the embedded Milvus instance.
# Startup time recorded by the original author: 1m19.3s
from src.local_milvus import LocalMilvus

# LocalMilvus.instance() is a project helper; presumably it returns a shared
# singleton — verify in src/local_milvus.py.
localMilvus = LocalMilvus.instance()
localMilvus.start()

In [None]:
# Task: stop the embedded Milvus instance.
from src.local_milvus import LocalMilvus

# LocalMilvus.instance() is a project helper; presumably it returns the same
# object that the start cell used — verify in src/local_milvus.py.
localMilvus = LocalMilvus.instance()
localMilvus.stop()

In [None]:
# Task: start Milvus in debug mode, storing its data under ./db/milvus_data.
# Reference: https://github.com/milvus-io/milvus-lite
!milvus-server --data "./db/milvus_data" --debug

In [None]:
# Task: explore the Milvus API (connections, databases, collections, indexes).
from pymilvus import connections, utility, Collection, db

print("==list_connections:", connections.list_connections())
alias = connections.list_connections()[0][0]
print(alias)

# NOTE(review): the server address is hard-coded here; consider moving it to
# configuration so the notebook can be pointed at another deployment.
connections.connect(
  "default",
  host='39.104.228.125',
  port='19530'
)

# List databases; per the original author, a cluster supports at most 64 databases.
print("==list_database:", db.list_database())

print("==get_server_version:", utility.get_server_version())

# Select a specific database; if none is selected, collections use the "default" database.
# db.using_database("default")

# List the collections of the default database. Per the original author: a
# collection consists of several partitions (tables) and gets a default
# partition named _default; a collection can also be spread over several
# shards (different hardware nodes, to improve write throughput).
print("==list_collections:", utility.list_collections())

gptcache_exists = utility.has_collection("gptcache")
print("==has_collection:", gptcache_exists)

# Only inspect the collection when the server actually has it; constructing
# Collection("gptcache") for a missing collection (with no schema) raises.
if gptcache_exists:
    collection = Collection("gptcache")
    print("==schema", collection.schema)
    print("==description", collection.description)
    print("==name", collection.name)
    print("==is_empty", collection.is_empty)
    print("==num_entities", collection.num_entities)
    print("==primary_field", collection.primary_field)
    print("==partitions", collection.partitions)
    print("==indexes", collection.indexes)
    # print("==properties", collection.properties)

    # Drop the collection
    # utility.drop_collection("gptcache")

    # Index build progress (vector index, scalar index)
    print("==index_building_progress", utility.index_building_progress("gptcache"))
else:
    print("==collection 'gptcache' not found; skipping collection inspection")

In [None]:
# Task: drop the specified collection.
# NOTE: `utility` is not imported in this cell; it relies on the
# `from pymilvus import ... utility ...` import (and the connection opened)
# in the earlier Milvus API cell, so run that cell first.
utility.drop_collection("LangChainCollection")

In [None]:

# Task: inspect the SQLite database used by GPTCache
from sqlalchemy import create_engine, inspect, func, select
from gptcache.manager import CacheBase
from sqlalchemy.ext.serializer import loads, dumps
from sqlalchemy.orm import load_only, aliased

# Create the database connection (per the original author, this includes a connection pool).
cacheBase = CacheBase('sqlite', sql_url="sqlite:///./db/sqlite.db")

# Reflect metadata through a SQLAlchemy inspector bound to the cache engine.
ins = inspect(cacheBase._engine)

# Schema (database) names
schema_names = ins.get_schema_names()
print(schema_names)

# Table names
table_names = ins.get_table_names()
print(table_names)

# Column definitions of the question table
question_columns = ins.get_columns('gptcache_question')
print(question_columns)

# Obtain a session
session = cacheBase.Session()

# Number of rows in the question table
question_count_query = session.query(func.count(cacheBase._ques.id))
count = question_count_query.scalar()
print("count=", count)

# Alternative: fetch every row of the cacheBase._ques table
# list_item = session.query(cacheBase._ques).all()
# for item in list_item:
#     print(item.__dict__)

# Alternative: paged query
# list_item = session.scalars(select(cacheBase._ques).limit(1).offset(0))
# for item in list_item:
#     print(item.__dict__)

def as_dict(obj):
    """Convert a mapped row object into a plain ``dict`` of strings.

    Args:
        obj: Any object exposing ``__table__.columns``; for every column,
            the attribute named after ``column.name`` is read from ``obj``.

    Returns:
        dict: ``{column name: str(attribute value)}`` for every column.
        Values always go through ``str``, so ``None`` becomes ``"None"``.
    """
    return {
        column.name: str(getattr(obj, column.name))
        for column in obj.__table__.columns
    }

# Alternative: paged query converted with as_dict
# list_item = [as_dict(item) for item in session.scalars(select(cacheBase._ques).limit(1).offset(0))]
# print(list_item)

# Alternative: paged query with a join
# list_item = [as_dict(item) for item in session.scalars(
#        select(cacheBase._answer).join(cacheBase._ques, cacheBase._answer.question_id == cacheBase._ques.id).limit(1).offset(0))]
# print(list_item)

# id =  aliased(cacheBase._answer.id, name="id")

# Paged query selecting the answer id, the answer and its question, matched on
# answer.question_id == question.id. The session is closed once the rows have
# been materialised so its connection is released instead of leaking.
try:
    answer_rows = session.query(
        cacheBase._answer.id, cacheBase._answer.answer, cacheBase._ques.question
    ).filter(
        cacheBase._answer.question_id == cacheBase._ques.id
    ).limit(1).offset(0)
    list_item = [item._mapping for item in answer_rows]
finally:
    session.close()
print(list_item)


In [None]:
# Task: load external data (the PDF at data/llama2.pdf) and split it into chunks.
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/llama2.pdf")
pages = loader.load_and_split()
# Show the number of chunks and only the first one; printing the whole list
# floods the cell output and bloats the saved notebook.
print("pages=", len(pages))
print(pages[:1])

In [None]:
# Task: RAG (retrieval-augmented generation)
import os
from langchain.chains import StuffDocumentsChain, LLMChain
from gptcache.embedding import Huggingface
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.document_transformers import (
    LongContextReorder,
)
# from langchain.vectorstores import Chroma
from langchain.vectorstores import Milvus
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()


# Default prompt: the stuffed documents are injected as {context} and the
# user question as {query}.
stuff_prompt_override = """Given this text extracts:
-----
{context}
-----
Please answer the following question:
{query}"""
prompt = PromptTemplate(
    template=stuff_prompt_override, input_variables=["context", "query"]
)

# Fail early with a readable message when the key is missing. The previous
# `os.environ[K] = os.getenv(K)` did nothing when the variable was set and
# raised an opaque TypeError (assigning None) when it was not.
if os.getenv('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to .env or export it before running this cell."
    )
llm = OpenAI()
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Per-document prompt override: each document is rendered as its raw page_content.
document_prompt = PromptTemplate(
    input_variables=["page_content"], template="{page_content}"
)
# Name of the prompt variable that receives the combined documents.
document_variable_name = "context"
chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name,
)


# The question to ask
query = "What can you tell me about the Celtics?"

# The sample sentences to embed, kept in insertion order.
_SAMPLE_SENTENCES = (
    "Basquetball is a great sport.",
    "Fly me to the moon is one of my favourite songs.",
    "The Celtics are my favourite team.",
    "This is a document about the Boston Celtics",
    "I simply love going to the movies",
    "The Boston Celtics won the game by 20 points",
    "This is just a random text.",
    "Elden Ring is one of the best games in the last 15 years.",
    "L. Kornet is one of the best Celtics players.",
    "Larry Bird was an iconic NBA player.",
)
texts = list(_SAMPLE_SENTENCES)

# Build a retriever
# NOTE(review): the model name ends in "zh" while the sample texts are English —
# confirm this embedding model is the intended one.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-zh-v1.5")

# Manual embedding checks
# emb = embeddings.client.encode("你好，世界!")
# print("emb1=", emb)

# emb2 = Huggingface(model="BAAI/bge-small-zh-v1.5").to_embeddings("你好，世界!")
# print("emb2=", emb2)

# Fail early with a readable message when the variable is missing. The previous
# `os.environ[K] = os.getenv(K)` did nothing when the variable was set and
# raised an opaque TypeError (assigning None) when it was not.
if os.getenv('MILVUS_HOST') is None:
    raise RuntimeError(
        "MILVUS_HOST is not set; add it to .env or export it before running this cell."
    )
# NOTE(review): from_texts is called without connection_args, so this cell never
# passes MILVUS_HOST to the vector store explicitly — confirm the value is
# picked up elsewhere, or pass it in connection_args.
retriever = Milvus.from_texts(texts, embedding=embeddings).as_retriever(
    search_kwargs={"k": 10}
)
docs = retriever.get_relevant_documents(query)
print("docs=", docs)

# Reorder the retrieved documents. Per the original author, the more relevant
# ones end up at the beginning and end, the less relevant ones in the middle.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)
print("reordered_docs=", reordered_docs)

# Run the stuff-documents chain on the reordered documents and the question.
run_inputs = {"input_documents": reordered_docs, "query": query}
result = chain.run(**run_inputs)
print(result)