In [None]:
!pip install llama-index llama-index-llms-ollama llama-index-embeddings-huggingface   vector_stores

In [None]:
!pip install llama-index-vector-stores-milvus

In [None]:
!pip install nest_asyncio

In [None]:
# Patch asyncio via nest_asyncio so event loops can be nested inside the loop
# that the notebook kernel is already running.
# NOTE(review): presumably required by the async code paths of the llama-index
# calls in later cells — confirm which cell actually needs it.
import nest_asyncio

nest_asyncio.apply()

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Register the notebook-wide defaults on the global Settings object so later
# cells do not have to pass an LLM explicitly.

# Generation model served through Ollama; request_timeout is set to 60.0.
ollama_llm = Ollama(request_timeout=60.0, model="qwen2.5:7b")
Settings.llm = ollama_llm

# Embedding model loaded through the HuggingFace integration.
bge_embedder = HuggingFaceEmbedding(model_name="BAAI/bge-m3")
Settings.embed_model = bge_embedder

In [None]:
# Build a single document with customised metadata formatting
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

doc_metadata = {
    "file_name": "super_secret_document.txt",
    "category": "finance",
    "author": "LlamaIndex",
}

document = Document(
    text="This is a super-customized document",
    metadata=doc_metadata,
    # file_name is listed as excluded for the LLM metadata view only.
    excluded_llm_metadata_keys=["file_name"],
    # (sic) keyword kept exactly as spelled in the original call.
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

# Render the same document once per consumer:
# first the content as the LLM reads it, then as the embedding model reads it.
rendered_views = (
    ("The LLM sees this: \n", MetadataMode.LLM),
    ("The Embedding model sees this: \n", MetadataMode.EMBED),
)
for banner, mode in rendered_views:
    print(banner, document.get_content(metadata_mode=mode))

In [None]:
# Index local files and query them
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# The local Milvus database file used later in this notebook is stored under
# this same directory (tmp/db/milvus.db). Because the read is recursive, a
# re-run would otherwise pick that binary file up as if it were a source
# document, so *.db files are excluded.
documents = SimpleDirectoryReader(
    input_dir="/Users/luxun/workspace/ai/mine/mlearn/tmp",
    recursive=True,
    exclude=["*.db"],
).load_data()

# Build an in-memory vector index; the embedding model is passed explicitly
# even though it is the same object registered on Settings.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=Settings.embed_model,
)

# Run each sample query and print its answer; `response` ends up holding the
# result of the last query, as before.
query_engine = index.as_query_engine()
for question in ("aiohttp==", "decorator="):
    response = query_engine.query(question)
    print(response)


In [None]:
# Initialise the vector database
from llama_index.vector_stores.milvus import MilvusVectorStore

# Path of the local database file backing the store.
MILVUS_DB_URI = "/Users/luxun/workspace/ai/mine/mlearn/tmp/db/milvus.db"
# NOTE(review): should equal the vector size produced by Settings.embed_model
# (BAAI/bge-m3) — confirm.
EMBEDDING_DIM = 1024

vector_store = MilvusVectorStore(
    uri=MILVUS_DB_URI,
    dim=EMBEDDING_DIM,
    overwrite=False,
)

In [None]:
# Process documents with an ingestion pipeline
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline

# Pipeline stages, in the order they are listed: sentence splitting, title
# extraction, then the embedding model registered on Settings.
ingest_steps = [
    SentenceSplitter(chunk_overlap=0, chunk_size=25),
    TitleExtractor(),
    Settings.embed_model,
]

pipeline = IngestionPipeline(
    vector_store=vector_store,
    transformations=ingest_steps,
)

# Ingest directly into the vector db, using the library's built-in example document.
# NOTE(review): the store was opened with overwrite=False, so re-running this
# cell presumably inserts the same nodes again — confirm.
sample_docs = [Document.example()]
pipeline.run(documents=sample_docs)


In [None]:
# Build an index on top of the existing vector DB and query it
from llama_index.core import VectorStoreIndex

# No documents are passed here: the index is created from the vector store
# object that the ingestion cell wrote into.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
query_engine = index.as_query_engine()

question = "LLM"
response = query_engine.query(question)
print(response)