In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.extractors import SummaryExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.llms.llm import LLM
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document, MetadataMode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from ReadLoad import read_jsonl, write_jsonl
from tqdm import tqdm

import nest_asyncio

nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


### load document

In [2]:
# read data
def read_data(path: str = "data") -> list["data"]:
    reader = SimpleDirectoryReader(
        input_dir=path,
        recursive=True,
        required_exts=[
            ".txt",
        ],
    )
    return reader.load_data()

#data_splite
data = read_data("./aiops2024-challenge-dataset/data")

In [3]:
data = [ d for d in data if len(d.text) > 100]
len(data)

20733

In [3]:
#print(nodes[0].get_content(metadata_mode="all"))

### Setup embeding

In [4]:
# embeding 
embeding = HuggingFaceEmbedding(
        model_name="bge-small-zh-v1.5",
        cache_folder="./",
        embed_batch_size=512,
    )
Settings.embed_model = embeding

### Setup LLM

In [5]:
from zhipuLLM import GLM
glm =  GLM()
Settings.llm = glm

In [6]:
import jieba
import jieba.analyse
import json
# 打开并读取JSON文件
# with open('dictionary.json', 'r', encoding='utf-8') as f:
#     dictionary = json.load(f)
# dictionary =  {k.lower(): v for k, v in dictionary.items()}

jieba.load_userdict("words.txt")
def chinese_tokenizer(text: str) -> list[str]:
    tokens = jieba.lcut(text)
    # TOOD: 短语不可分割
    # TODO: remove stopwords
    return tokens

def expand_abbreviations(sentence):
    # 将句子分割成单词列表
    words = jieba.lcut(sentence)
    
    # 遍历单词列表
    for i, word in enumerate(words):
        # 检查单词是否在字典的键中
        word = word.lower()
        if word in dictionary:
            # 替换缩写为全写
            words[i] = f"{words[i]}({dictionary[word]})"
    
    # 将修改后的单词列表重新组合成一个句子
    expanded_sentence = ''.join(words)
    
    return expanded_sentence

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.644 seconds.
Prefix dict has been built successfully.


### Metadata exctraction

In [12]:
for file in tqdm(data):
    l = file.metadata['file_path'].split('/')
    file.metadata['product_name'] = l[5]
    file.metadata['document_name'] = l[6]
    file.metadata['Topic'] = file.text.splitlines()[0]
    # file.metadata['keywords'] = jieba.analyse.extract_tags(file.text, topK=5)

100%|██████████| 20733/20733 [00:00<00:00, 226476.75it/s]


In [13]:
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes = [2048,1024,256])
nodes = node_parser.get_nodes_from_documents(data)

# splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=32)
# nodes = splitter.get_nodes_from_documents(data)

In [14]:
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes
leaf_nodes = get_leaf_nodes(nodes)
root_nodes = get_root_nodes(nodes)

In [8]:
from llama_index.core.schema import IndexNode
sub_chunk_sizes = [256, 512]
sub_node_parsers = [
    SentenceSplitter(chunk_size=c, chunk_overlap=20) for c in sub_chunk_sizes
]

all_nodes = []
for base_node in tqdm(nodes):
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # also add original node to node
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

100%|██████████| 31377/31377 [02:16<00:00, 230.18it/s]


In [9]:
all_nodes_dict = {n.node_id: n for n in all_nodes}

### Indexing

In [15]:
#vector_index = VectorStoreIndex(nodes)
# import torch
# del my_tensor
# torch.cuda.empty_cache()
#vector_index = VectorStoreIndex.from_documents(data)
#vector_index = VectorStoreIndex(nodes=nodes)
#vector_index = VectorStoreIndex(nodes=all_nodes) # Recursive Retriever
vector_index = VectorStoreIndex(nodes=leaf_nodes) # AutoMergingRetriever

### Storing

In [22]:
# save index to disk
vector_index.set_index_id("vector_index")
vector_index.storage_context.persist("./storage")

NameError: name 'StorageContext' is not defined

### Retriever

In [16]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

#vector_retriever = vector_index.as_retriever(similarity_top_k=10,similarity_threshold=0.6)
vector_retriever = vector_index.as_retriever(similarity_top_k=10)
bm25_retriever = BM25Retriever.from_defaults(nodes=leaf_nodes, similarity_top_k=10,tokenizer=chinese_tokenizer)
# recursive_retriever = RecursiveRetriever(
#     "vector",
#     retriever_dict={"vector": vector_retriever},
#     node_dict=all_nodes_dict,
#     similarity_threshold=0.5,
#     #verbose=True,
# )
AutoMerging_retriever = AutoMergingRetriever(vector_retriever,storage_context)

In [26]:
from llama_index.core.response.notebook_utils import display_source_node
query = "发布虚机时最多可以为虚机分配几块网卡？"
#test_nodes = bm25_retriever.retrieve(query)
#test_nodes = vector_retriever.retrieve(query)
#test_nodes = hybrid_retriever.retrieve(query)
#test_nodes = vector_retriever.retrieve(query)
#test_nodes = recursive_retriever.retrieve(query)
test_nodes = AutoMerging_retriever.retrieve(query)
print(query)

for node in test_nodes:
    display_source_node(node)


发布虚机时最多可以为虚机分配几块网卡？


**Node ID:** 9e4cbee1-4fbe-4101-bd93-af57b50b84a3<br>**Similarity:** 0.669169815873576<br>**Text:** 图5 网络配置（虚机）

说明：

虚机场景注意事项：

       * 运维网络”VIP填写运维网络浮动IP，地址池填写预置虚机所有节点的真实运维IP地址（固化到虚机网卡文件的net_iap...<br>

**Node ID:** 2a55564c-79d1-4696-82a0-0909495c18ce<br>**Similarity:** 0.6642504539058531<br>**Text:** 表7 网络平面规划数据（非SDN组网）VM| 网络平面  
---|---  
OMU| ZTE_EMS_NET  
ZTE_AMF_x_MGT_INT_NET  
ZTE_AMF_x_SERV...<br>

**Node ID:** 305006b1-f4ae-413b-8f0c-305f4d3cf2d8<br>**Similarity:** 0.6562133250682154<br>**Text:** C100013001 虚机网卡数目

  * C100013002 虚机最大网卡数目

  * C100013003 虚机最小网卡数目

  * C100013004 虚机自启动以来虚机运行时长...<br>

**Node ID:** 6d4616b1-b808-470b-ab35-fa5ee52c72a6<br>**Similarity:** 0.6542932158055011<br>**Text:** 表2 IPU虚机网卡配置参数说明 参数名称 | 参数含义  
---|---  
关联网络名称 | 此参数表示IPU虚机关联的网络平面名称，IPU需要关联的网络平面如下。

  * ZTE_AM...<br>

**Node ID:** 0eaa1002-76d8-4f11-baf6-52f3eefc8cf8<br>**Similarity:** 0.6530379905498399<br>**Text:** IMU虚机配置

概述

在部署了安全网关功能的情况下，才需要配置此种类型的虚机。

本节只介绍与IMU虚机相关的主要参数，其它的没有说明的参数参见“OMU虚机配置”。

虚机网卡配置

各种类...<br>

**Node ID:** 7fc490a8-0864-4dba-bf23-20a4d59cb247<br>**Similarity:** 0.6490676265024378<br>**Text:** IAU虚机配置

概述

在部署了安全网关功能的情况下，才需要配置此种类型的虚机。

本节只介绍与IAU虚机相关的主要参数，其它的没有说明的参数参见“OMU虚机配置”。

虚机网卡配置

各种类...<br>

**Node ID:** 7972571e-c0b8-43d2-83ee-420ec41f1356<br>**Similarity:** 0.6458370164628259<br>**Text:** 登录到虚机节点，执行ifconfig -a，可查看虚机网卡的mac地址，如图26所示。

图26 查看虚机网卡的mac地址

通过mac地址就可以确定虚机网卡挂载的网络信息。

  2. 固化n...<br>

**Node ID:** b40d10f8-f9ec-467c-9839-e8c7640fd2bc<br>**Similarity:** 0.644548537394399<br>**Text:** 图1 CDU虚机网卡配置

CDU虚机网卡配置主要参数说明参见表2。

表2 CDU虚机网卡配置参数说明 参数名称 | 参数含义  
---|---  
关联网络名称 | 此参数表示CDU虚机关...<br>

**Node ID:** c92d22ea-6580-467d-acd0-8b2bd5c15c85<br>**Similarity:** 0.6431587099688861<br>**Text:** 表8 OMU虚机网卡配置参数说明参数名称| 参数含义  
---|---  
关联网络名称| 该参数的取值有两种情况：

  * 设置为通用页面中，配置的网络平面。<br>

**Node ID:** 2ca5b08b-3b2b-4f43-b37c-62bf718db437<br>**Similarity:** 0.6416023963862393<br>**Text:** * 在计算资源充足的情况下，虚拟机必须部署在不同主机（计算节点）上。
    * 在计算资源不足情况下，尽可能将这些虚拟机部署在不同的主机（计算节点）上，避免剩余的虚机无法部署。

  
  
...<br>

In [18]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.retrievers import QueryFusionRetriever

class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever #vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes
        
#vector_index.as_retriever(similarity_top_k=10)
hybrid_retriever = HybridRetriever(AutoMerging_retriever, bm25_retriever)
# QueryFusionretriever = QueryFusionRetriever(
#     [vector_retriever, bm25_retriever],
#     similarity_top_k=2,
#     num_queries=4,  # set this to 1 to disable query generation
#     mode="reciprocal_rerank",
#     use_async=True,
#     verbose=True,
#     # query_gen_prompt="...",  # we could override the query generation prompt here
# )

### ReRank

In [20]:
from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(top_n=5, model="bge-reranker-large")

In [21]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
def rerank_nodes(retriever,reranker,query):

    retrieved_nodes = retriever.retrieve(query)
    reranked_nodes = reranker.postprocess_nodes(
        retrieved_nodes,
        query_bundle=QueryBundle(query),
    )
    return reranked_nodes


### Setup prompt

In [22]:
from llama_index.core import PromptTemplate

new_text_qa_template_str = (
    """\
    上下文信息如下：
    ----------
    {context_str}
    ----------
    根据上下文信息而非先验知识，构建一个经过严谨思考且内容详实的答案，来回答问题。
    充分运用上下文信息来支撑你的答案，并确保回答符合人类的偏好以及遵循指示的原则。
    如果上下文信息没有相关知识，可以回答不确定，不要复述上下文信息。
    
    问题：{query_str}
    回答：\
    """
)

text_qa_template_str = (
    "你是一名中兴通讯网络运维专家\n"
    "以下是上下文信息：\n"
    "---------------------\n{context_str}\n---------------------\n"
    "运用上下文信息并结合你自己的知识，回答以下问题。\n"
    "如果上下文信息没有帮助，你也可以凭自己的知识来回答问题。\n"
    "请以专业得风格回答问题\n"
    "问题：{query_str}\n"
    "回答："
)

text_qa_template = PromptTemplate(new_text_qa_template_str)

refine_template_str = (
    """\
    原始问题如下：
    ----------
    {query_str}
    ----------
    我们已经给出一个现有的答案：
    ----------
    {existing_answer}
    ----------
    现在，我们有机会根据以下附加信息来优化这个现有答案（仅在必要时进行）。
    ----------
    {context_msg}
    ----------
    考虑到新提供的信息，请对原始答案进行改进，以更准确地回答提问。如果新增的信息没有帮助，则直接返回原始答案。
    优化后的答案：\
    """
)
refine_template = PromptTemplate(refine_template_str)

d ={"rcp": "Resource Control Platform资源控制平台",
    "umac": "unified Mobile Access Controller统一的移动性接入控制器",
    "emsplus": "Element Management System网元管理系统",
    "director": "TECS Director ICT融合的电信级云管理平台"}
    

### Query

In [39]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters,MetadataFilter

def hybrid_query2(current_keyword):
    query_engine = RetrieverQueryEngine.from_args(
        retriever = AutoMerging_retriever, 
        llm=glm,
        text_qa_template=text_qa_template,
        refine_template=refine_template,
        response_mode="compact",
        filters = MetadataFilters(
            filters=[ExactMatchFilter(key="product_name", value=current_keyword)]
        ),
        node_postprocessors=[reranker],
        )
    return query_engine

### Test

In [40]:
#!pip install deepeval
# # text
question = read_jsonl('./aiops2024-challenge-dataset/question.jsonl')
#query_engine = perform_query(vector_index,question[3]['document'])
query = question[10]['query']
current_keyword = question[10]['document']
query_engine = hybrid_query2(current_keyword)
response = query_engine.query(query)

# print(response.response)
# print("========================================")
# for source in response.source_nodes:
#     print(source.text)
#     print("========================================")
# def display_prompt_dict(prompts_dict):
#     for k, p in prompts_dict.items():
#         text_md = f"**Prompt Key**: {k}" f"**Text:** "
#         display(Markdown(text_md))
#         print(p.get_template())
#         display(Markdown(""))

# prompts_dict = query_engine.get_prompts()
# display_prompt_dict(prompts_dict)

'director'

In [41]:
print(query,current_keyword)
print(response.response)
response.metadata

发布虚机时最多可以为虚机分配几块网卡？ director
根据提供的上下文信息，关于虚机网卡的分配，我们在“性能计数器参考”文档中找到了相关的信息。具体来说，性能计数器C100013002提到了“虚机最大网卡数目”。然而，文档中并没有给出具体的数字，只是定义了这一性能计数器的用途。

因此，基于上下文信息，我无法给出一个具体的数值来回答“发布虚机时最多可以为虚机分配几块网卡”的问题。实际能够分配的最大网卡数目可能取决于具体的系统配置、虚机类型以及所使用的管理软件等因素。

如果需要得到这一具体数值，建议查阅具体的系统配置文档或相关的管理软件手册。


{'305006b1-f4ae-413b-8f0c-305f4d3cf2d8': {'file_path': '/mnt/workspace/aiops2024-challenge-dataset/data/director/性能计数器参考/1626748224563.txt',
  'file_name': '1626748224563.txt',
  'file_type': 'text/plain',
  'file_size': 6067,
  'creation_date': '2024-07-04',
  'last_modified_date': '2024-03-12',
  'product_name': 'director',
  'document_name': '性能计数器参考',
  'Topic': '虚机'},
 '2a55564c-79d1-4696-82a0-0909495c18ce': {'file_path': '/mnt/workspace/aiops2024-challenge-dataset/data/umac/软件安装（MANO）/23-OMU虚机配置(AMF).txt',
  'file_name': '23-OMU虚机配置(AMF).txt',
  'file_type': 'text/plain',
  'file_size': 27307,
  'creation_date': '2024-07-04',
  'last_modified_date': '2024-05-11',
  'product_name': 'umac',
  'document_name': '软件安装（MANO）',
  'Topic': 'OMU虚机配置'},
 '2ca5b08b-3b2b-4f43-b37c-62bf718db437': {'file_path': '/mnt/workspace/aiops2024-challenge-dataset/data/umac/软件安装（MANO）/30-IPU虚机配置(AMF).txt',
  'file_name': '30-IPU虚机配置(AMF).txt',
  'file_type': 'text/plain',
  'file_size': 2776,
  'creatio

### Evaluation

In [72]:
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator,
    ContextRelevancyEvaluator
)
def context_evaluator(query,response):
    evaluator = ContextRelevancyEvaluator(llm=glm)
    try:
        eval_result = evaluator.evaluate_response(query=query, response=response)
        return {
                "Query": str(query),
                "Response": response.response,
                "Context": '\n==========================\n'.join(eval_result.contexts),
                "Score": eval_result.score,
                "Evaluation Result": eval_result.passing,
                "Reasoning": eval_result.feedback
                #"eval_result":eval_result
            }
    except Exception as e:  # Add exception handling for clarity and control
        print(f"An error occurred during evaluation: {e}")
        context_l = [node.text for node in response.source_nodes]
    return {
        "Query": str(query),
        "Response": response.response,
        "Context": '\n==========================\n'.join(context_l),
        "Score": "",
        "Evaluation Result": "",
        "Reasoning": ""
        #"eval_result":eval_result
    }

### Display prompts

In [None]:
from IPython.display import Markdown, display
def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}" f"**Text:** "
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown(""))
# prompts_dict = query_engine.get_prompts()
# display_prompt_dict(prompts_dict)

### Query_process

In [42]:
import concurrent.futures
#读取问题
questions = read_jsonl('./aiops2024-challenge-dataset/question.jsonl')

def process_question(q):
    #query_engine = perform_query(vector_index,q['document'])
    query_engine = hybrid_query(q['document'])
    #缩略短语换全写
    #query_content = expand_abbreviations(q['query'])
    return {
        "id": q['id'],
        "query": q['query'],
        "answer": query_engine.query(q['query'])
    }

# 并行批量处理
def batch_process_questions(questions, max_workers=5):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_question, q) for q in questions]
        results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures), total=len(questions))]
    return results

results = batch_process_questions(questions)

100%|██████████| 103/103 [05:57<00:00,  3.47s/it]


### Evaluation

In [84]:
#并行批量处理
def batch_process_evaluation(results, max_workers=5):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(context_evaluator, result["query"],result["answer"]) for result in results]
        results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures), total=len(results))]
    return results

eval_results = batch_process_evaluation(results)

 43%|████▎     | 44/103 [01:39<02:20,  2.38s/it]

An error occurred during evaluation: could not convert string to float: '3.0.'


 62%|██████▏   | 64/103 [02:20<01:17,  1.99s/it]

An error occurred during evaluation: could not convert string to float: '4.0.'


 77%|███████▋  | 79/103 [02:53<01:11,  2.96s/it]

An error occurred during evaluation: could not convert string to float: '4.0.'


 91%|█████████▏| 94/103 [03:25<00:25,  2.79s/it]

An error occurred during evaluation: could not convert string to float: '3.0.'


100%|██████████| 103/103 [03:53<00:00,  2.27s/it]


In [86]:
import pandas as pd
df = pd.DataFrame(eval_results)
df.to_excel("1719549918569_0.66.xlsx",index=False)

### Submission

In [44]:
final_result = [
    {
        "id": q['id'],
        "query": q['query'],
        "answer": q['answer'].response
    } for q in tqdm(results)
]
from submit import submit
submission_id = submit(final_result,
    judge_server = "http://judge.aiops-challenge.com",
    contest = "1780211530478944282",
    ticket = "1799659042575011879")
if submission_id:
    print("提交成功！提交 ID: ", submission_id)
    print("查阅成绩：python submit.py -c 1780211530478944282 -k 1799659042575011879 -i", submission_id[0])
    write_jsonl(final_result, submission_id[0])
else:
    print("提交失败")
    
# 1718515109355
# 1718542978458
# 1718515109355
# 1718813723742 metadata vector query 0.59
# 1718874184956 中文提示词 0.66
# 1718875566919 批量处理
# 1718876357723 修改了模型参数 温度为0.1
# 1718876982203
# 1718881451217
# 1718882867809
# 1718895050114
# 1718898534468
# 1718983016865
# 1718983745050
# 1719145584515
# 1719241616258
# python submit.py -c 1780211530478944282 -k 1799659042575011879 -i 1719042989786

100%|██████████| 103/103 [00:00<00:00, 383330.36it/s]


[Error 429] Too many requests (wait 587 seconds)
提交失败


In [None]:
write_jsonl(final_result)

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator

# create llm
llm = glm

# define evaluator
evaluator = FaithfulnessEvaluator(llm=llm)

# query index
question = questions[5]
query_engine = perform_query(vector_index,question['document'])
response = query_engine.query(
    question['query']
)
eval_result = evaluator.evaluate_response(response=response)
print(str(eval_result.passing))

In [None]:
eval_result

In [None]:
# git clone https://www.modelscope.cn/Xorbits/bge-small-zh-v1.5.git
# git clone https://www.aiops.cn/gitlab/aiops-challenge/aiops-2024-submit.git
# git clone https://www.modelscope.cn/datasets/issaccv/aiops2024-challenge-dataset.git
# git clone https://www.modelscope.cn/Xorbits/bge-reranker-base.git
# pip install -r requirements.txt
# unzip data.zip

In [None]:
# print(results[1]['query']+"\n",results[1]['answer'])
# results[0]['answer'].metadata