In [1]:
import os
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.indices.loading import load_index_from_storage
from llama_index import StorageContext

from llama_index.prompts.base import PromptTemplate
from dotenv import load_dotenv
load_dotenv()

from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    get_response_synthesizer,
)
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import OpenAI

from read import get_filelisform

In [2]:
def load_pdf(pdf_file):
    from pathlib import Path
    from llama_index import download_loader

    PDFReader = download_loader("PDFReader")

    loader = PDFReader()
    documents = loader.load_data(file=Path('./data/自律修炼手册.pdf'))

In [3]:
def get_docs(directory,format):
    pdf_files = get_filelisform(directory,format)
    # Load all wiki documents
    city_docs = []
    for file in pdf_files:
        docs = SimpleDirectoryReader(
            input_files=[file]
        ).load_data()
        title = file.split(':')[0]
        docs[0].doc_id = title
        city_docs.extend(docs)
    return city_docs,pdf_files

In [None]:
api_base1 = os.environ['openai_api_base1']
api_key1 = os.environ['openai_api_key1']

In [34]:
api_key1

'sk-A2IJsOOcVEjwYlph31BaB9B48cBf459bA4D3212b48D2Ed83'

In [42]:
api_base1

'https://aigc789.top/v1'

In [35]:


# LLM (gpt-3.5-turbo)
models= ["gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4-1106-preview", "gpt-3.5-turbo-16k", "gpt-4-0613"]
system_prompt = "Always respond in Chinese"
chatgpt = OpenAI(temperature=0, model=models[1], api_base=api_base1, api_key=api_key1,
                system_prompt=system_prompt)

persist_dir = "index-pdf-zilv"
context_window= 16385

In [39]:
chatgpt = OpenAI(temperature=0, model=models[1], api_base=api_base1, api_key=api_key1,
                system_prompt=system_prompt)

In [None]:
city_docs,pdf_files = get_docs('./data/表达','.pdf')

In [40]:

service_context = ServiceContext.from_defaults(llm=chatgpt,
                                                chunk_size=1024,
                                                context_window = context_window)

In [41]:
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=16385, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=OpenAIEmbedding(model_name='text-embedding-ada-002', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f58f8639750>, additional_kwargs={}, api_key='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', api_base='https://api.openai.com/v1', api_version='', max_retries=10, timeout=60.0, default_headers=None, reuse_client=True), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f58f8639750>, id_func=<function default_id_func at 0x7f590192e3b0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex

In [43]:
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
doc_summary_index = load_index_from_storage(storage_context=storage_context,service_context=service_context)
        

### High-level Querying

In [44]:
query_engine = doc_summary_index.as_query_engine(
            response_mode="tree_summarize", use_async=True
        )
question = "如何进行有效的表达"

response = query_engine.query(question)

In [45]:
print(response)

通过设问法，可以让听众主动靠近你、接纳你。首先抛出一个跟现场听众都有关的问题，然后再说你要讲的内容。这样可以让听众更容易理解并接受你的表达。


### LLM-based Retrieval

In [56]:


from llama_index.indices.document_summary import (
    DocumentSummaryIndexLLMRetriever,
)

retriever = DocumentSummaryIndexLLMRetriever(
    doc_summary_index,
    # choice_select_prompt=None,
    # choice_batch_size=10,
    # choice_top_k=1,
    # format_node_batch_fn=None,
    # parse_choice_select_answer_fn=None,
    service_context=service_context,
)


In [57]:
retrieved_nodes = retriever.retrieve(question)

IndexError: list index out of range

In [None]:

len(retrieved_nodes)

print(retrieved_nodes[0].node.get_text())

# use retriever as part of a query engine
from llama_index.query_engine import RetrieverQueryEngine

# configure response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.query("What are the sports teams in Toronto?")
print(response)

In [47]:
print(doc_summary_index.service_context)

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=16385, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=OpenAIEmbedding(model_name='text-embedding-ada-002', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f58f8639750>, additional_kwargs={}, api_key='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', api_base='https://api.openai.com/v1', api_version='', max_retries=10, timeout=60.0, default_headers=None, reuse_client=True), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f58f8639750>, id_func=<function default_id_func at 0x7f590192e3b0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex

In [None]:
print(retrieved_nodes = retriever.retrieve("如何进行有效的表达"))

In [None]:


print(len(retrieved_nodes))

print(retrieved_nodes[0].score)
print(retrieved_nodes[0].node.get_text())



In [None]:
# use retriever as part of a query engine
from llama_index.query_engine import RetrieverQueryEngine

# configure response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.query("What are the sports teams in Toronto?")
print(response)

### Embedding-based Retrieval

In [50]:
from llama_index.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    # similarity_top_k=1,
)

In [None]:
retrieved_nodes = retriever.retrieve(question)

In [55]:
len(retrieved_nodes)

1

In [52]:


print(retrieved_nodes[0].node.get_text())

去讨好别人？我可不干， 而且我也不会说那些虚头巴脑的话。 ”  还
有同学私下会嘀咕： “我又不是他们肚子里的蛔虫， 没那么强的同
理心，怎么办啊？”  
  
放心，这些我都想到了，所以我不是教你怎么去迎合讨好，而是
教你一套具体的工具，让你的听众主动靠近你、接纳你。  
  
果核：设问法  
  
请注意，这一讲的果核是：好的当众表达，不是迎合讨好听众，
而是先抛出一个跟现场听众都有关的问题， 然后再说你要讲的内
容。我给这个方法起了个名字叫 ——设问法。  
举个例子，比如明天要开周例会，你重点想汇报两个工作。一开
始可能是这么说的：  
1.我做了上半年行业优秀案例分析。  
2.还做了用户需求调研和分析。  
虽然听着挺专业，也知道你在说什么。但听的人并不知道跟自己
有什么关系，我为什么要听？那如果试着把他们变成设问，你再
听听看。  
我上周做了不少工作，今天主要想汇报三个重点内容：  
首先，上半年咱们的几个友商都成了国潮网红，销售额大增，怎
么学习他们的经验？为此我专门做了上半年行业内优秀案例分


In [53]:
# use retriever as part of a query engine
from llama_index.query_engine import RetrieverQueryEngine

# configure response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [54]:

# query
response = query_engine.query(question)
print(response)

通过使用设问法可以进行有效的表达。设问法是一种技巧，通过先提出一个与听众相关的问题，然后再介绍要讲的内容。这样可以引起听众的兴趣和共鸣，使他们更愿意倾听你的讲话。在进行表达时，可以将要传达的信息转化为问题，并在介绍时提出这些问题，以吸引听众的注意力和参与度。这种方法可以帮助你与听众建立更好的连接，使他们更主动地靠近你并接受你的观点。


通过使用设问法可以进行有效的表达。设问法是一种技巧，通过先提出一个与听众相关的问题，然后再介绍要讲的内容。这样可以引起听众的兴趣和共鸣，使他们更愿意倾听你的讲话。在进行表达时，可以将要传达的信息转化为问题，并在介绍时提出这些问题，以吸引听众的注意力和参与度。这种方法可以帮助你与听众建立更好的连接，使他们更主动地靠近你并接受你的观点。
