In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.extractors import SummaryExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.llms.llm import LLM
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document, MetadataMode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from ReadLoad import read_jsonl, write_jsonl
from tqdm import tqdm

import nest_asyncio

# Patch asyncio so that nested event loops are allowed (Jupyter already runs one).
# NOTE(review): presumably needed by async calls made later in this notebook — confirm.
nest_asyncio.apply()

### load document

In [12]:
# read data
def read_data(path: str = "data") -> list[Document]:
    """Load every ``.txt`` file found under ``path``, searching sub-directories too.

    Args:
        path: Root directory to scan. Defaults to ``"data"``.

    Returns:
        The documents returned by ``SimpleDirectoryReader.load_data()``.
    """
    wanted_extensions = [".txt"]
    directory_reader = SimpleDirectoryReader(
        input_dir=path, recursive=True, required_exts=wanted_extensions
    )
    documents = directory_reader.load_data()
    return documents


# Load the challenge dataset documents (no splitting happens here).
data = read_data("./aiops2024-challenge-dataset/data")

In [16]:
# Inspect the metadata attached to the first loaded document.
data[0].metadata

{'file_path': '/mnt/workspace/aiops2024-challenge-dataset/data/director/License申请操作指南/1602298756694.txt',
 'file_name': '1602298756694.txt',
 'file_type': 'text/plain',
 'file_size': 5087,
 'creation_date': '2024-06-18',
 'last_modified_date': '2024-03-12'}

### Setup Embedding

In [6]:
# Embedding model: load "bge-small-zh-v1.5" through HuggingFaceEmbedding and register it
# as the default embedding model on the global Settings object.
embeding = HuggingFaceEmbedding(
        model_name="bge-small-zh-v1.5",
        cache_folder="./",  # NOTE(review): presumably where the model files are cached — confirm
        embed_batch_size=512,
    )
Settings.embed_model = embeding

### Setup LLM

In [24]:
from zhipuLLM import GLM

# Create a single GLM instance and share it: `Settings.llm` registers it as the global
# default, while `llm` is the handle passed explicitly to the metadata extractors.
# (Previously two separate GLM objects were constructed for the same purpose.)
llm = GLM()
Settings.llm = llm

### Setup metadata

In [67]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor
)
from llama_index.core.node_parser import SentenceSplitter

# Text chunking is currently disabled; un-comment the splitter to chunk documents
# before metadata extraction.
# text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=128)

# LLM-backed metadata extractors, listed in the order they are handed to the pipeline.
extractors = [
    # The title is located on the first page, so only 1 node is passed.
    TitleExtractor(nodes=1, llm=llm),
    # Extract 3 questions for each node (customizable).
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    # Summaries for both the previous node and the current node.
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    # Extract 10 keywords for each node.
    KeywordExtractor(keywords=10, llm=llm),
]

# No splitter is configured, so the extractors are the pipeline's only transformations.
transformations = extractors

from llama_index.core.ingestion import IngestionPipeline
pipeline = IngestionPipeline(transformations=transformations)

# Only the first two documents are run through the pipeline here.
nodes = pipeline.run(documents=data[:2])

# for file in data:
#     file.metadata['file_name'] = file.text.splitlines()[0]
#     file.metadata['document'] = file.metadata['file_path'].split('/')[5]
# data[0].metadata



100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
100%|██████████| 2/2 [00:21<00:00, 10.88s/it]
100%|██████████| 2/2 [00:31<00:00, 15.76s/it]
100%|██████████| 2/2 [00:08<00:00,  4.35s/it]


In [73]:
# Inspect the metadata of the first node after the extractors have run.
nodes[0].metadata

{'file_path': '/mnt/workspace/aiops2024-challenge-dataset/data/director/License申请操作指南/1602298756694.txt',
 'file_name': '1602298756694.txt',
 'file_type': 'text/plain',
 'file_size': 5087,
 'creation_date': '2024-06-18',
 'last_modified_date': '2024-03-12',
 'document_title': '"Comprehensive Guide: Applying for a TECS Director License - Step-by-Step Process, Eligibility Requirements, and Procedures"',
 'questions_this_excerpt_can_answer': '1. What are the specific steps required to apply for a TECS Director License, and which browsers are recommended for this process?\n\n   Higher-level summary: The context provides a detailed step-by-step guide on how to apply for a TECS Director License and suggests compatible browsers for the application process.\n\n2. How does one acquire the License application excel template for TECS Director, and what are the differences between the available versions of the TECS Director License?\n\n   Higher-level summary: The document outlines the method to o

### Index

In [8]:
# Build the vector index from the nodes produced by the ingestion pipeline.
# The BM25 retriever and the query cells refer to this object as `vector_index`, so it
# must be bound under that name (otherwise they raise NameError on a fresh kernel).
# `index` is kept as an alias for cells that use the shorter name.
# Alternative (index the raw documents, without the extracted metadata):
# vector_index = VectorStoreIndex.from_documents(data)
vector_index = VectorStoreIndex(nodes=nodes)
index = vector_index

### Retriever

In [11]:
#!pip install llama-index-retrievers-bm25
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
import jieba


def chinese_tokenizer(text: str) -> list[str]:
    """Split ``text`` into a list of tokens using ``jieba.lcut``."""
    # TODO: keep multi-word phrases intact instead of splitting them
    # TODO: remove stopwords
    return jieba.lcut(text)

# BM25 retriever built over the document store held by the vector index.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=vector_index.docstore,  # use the document store obtained from the VectorStoreIndex
    similarity_top_k=3,        # return the 3 most similar documents
    tokenizer=chinese_tokenizer # tokenize with the Chinese tokenizer (jieba)
)


In [None]:
# Import from the `llama_index.core` package layout used by the rest of this notebook;
# the legacy top-level `llama_index.*` paths do not match that layout.
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters

# build retriever
# `filters` must be a MetadataFilters object wrapping the individual filters, not a bare list.
# NOTE(review): the filter and the query below look like leftovers from the LlamaIndex docs
# example; the node metadata shown in this notebook has no "name" key, so this filter
# probably matches nothing — confirm and adapt before relying on this cell.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
    vector_store_query_mode="default",
    filters=MetadataFilters(
        filters=[ExactMatchFilter(key="name", value="paul graham")]
    ),
    alpha=None,
    doc_ids=None,
)

# build query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=get_response_synthesizer()
)

# query
response = query_engine.query("what did the author do growing up?")

### Retriever result

In [None]:
# `display_source_node` was used without being imported anywhere in the notebook.
from llama_index.core.response.notebook_utils import display_source_node

# Use a dedicated name for the retrieval hits so the ingestion output `nodes`
# (consumed when building the index) is not overwritten when this cell runs.
retrieved_nodes = bm25_retriever.retrieve("虚拟资源包括哪些种类")
for retrieved_node in retrieved_nodes:
    display_source_node(retrieved_node)

### Query Engine

In [79]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores.types import MetadataFilters


def perform_query(vector_index, query_content, current_keyword):
    """Answer ``query_content`` with a query engine built on the module-level BM25 retriever.

    Args:
        vector_index: Not used by the current implementation.
        query_content: The question text handed to the query engine.
        current_keyword: Not used by the current implementation; the metadata
            filter that would consume it is disabled (see below).

    Returns:
        The value returned by ``RetrieverQueryEngine.query`` for the question.
    """
    # A new engine is created on every call, wrapping the shared `bm25_retriever`.
    # Disabled metadata filtering, kept for reference:
    #     filters=MetadataFilters(
    #         filters=[{"key":"document",
    #                   "value":current_keyword}]
    #     ),
    #     similarity_top_k=3
    engine = RetrieverQueryEngine(retriever=bm25_retriever)
    # Run the query.
    answer = engine.query(query_content)
    return answer

In [80]:
# Read the questions, then answer each one; `answer` holds the raw response object
# returned by perform_query.
question = read_jsonl('./aiops2024-challenge-dataset/question.jsonl')
results = [
    {"id": item['id'], "query": item['query'], "answer": perform_query(vector_index, item['query'], item['document'])}
    for item in tqdm(question)
]

100%|██████████| 103/103 [14:13<00:00,  8.28s/it]


In [81]:
# Flatten each response object to its `.response` text, then write the records out.
final_result = [
    {"id": record['id'], "query": record['query'], "answer": record['answer'].response}
    for record in tqdm(results)
]
write_jsonl(final_result)

100%|██████████| 103/103 [00:00<00:00, 444001.35it/s]


In [82]:
import os
from getpass import getpass

from submit import submit

# Credentials must not be committed with the notebook: the ticket is read from the
# AIOPS_TICKET environment variable, falling back to an interactive prompt.
# The contest id can be overridden through AIOPS_CONTEST.
contest_id = os.environ.get("AIOPS_CONTEST", "1780211530478944282")
judge_ticket = os.environ.get("AIOPS_TICKET") or getpass("AIOps challenge ticket: ")

submission_id = submit(
    final_result,
    judge_server="http://judge.aiops-challenge.com",
    contest=contest_id,
    ticket=judge_ticket,
)
if submission_id:
    print("提交成功！提交 ID: ", submission_id)
else:
    print("提交失败")

# Previous submission ids:
# 1718515109355
# 1718542978458 vector query
# 1718642577909 bm25
# CLI equivalent: python submit.py -c <contest id> -k <ticket> -i <submission id>

提交成功！提交 ID:  ('1718642577909', 146)
