In [1]:
# !pip install faiss-cpu

In [2]:
# !pip install -U langchain  nest_asyncio httpx

In [3]:
# from langchain.agents import ConversationalAgent
# dir(ConversationalAgent)

In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import os
import torch
from dotenv import load_dotenv

from transformers import pipeline, TextStreamer

from llama_index.indices.composability import ComposableGraph
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.langchain_helpers.memory_wrapper import GPTIndexChatMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, LlamaIndexTool
from llama_index import download_loader, SummaryPrompt, LLMPredictor, GPTListIndex, PromptHelper, load_index_from_storage, StorageContext, ServiceContext, LangchainEmbedding

from langchain.agents import Tool
from langchain.llms.base import LLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.agents import initialize_agent
from langchain.embeddings import HuggingFaceEmbeddings

load_dotenv()

INFO:numexpr.utils:Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


True

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [4]:
# define prompt helper
# set maximum input size
max_input_size = 512
# set number of output tokens
num_output = 128
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

In [5]:
generate_text = pipeline(model="databricks/dolly-v2-12b", torch_dtype=torch.bfloat16,
                         trust_remote_code=True, device_map="auto", return_full_text=True)

hf_pipeline = HuggingFacePipeline(pipeline=generate_text)
llm_predictor = LLMPredictor(llm=hf_pipeline)

In [6]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Use pytorch device: cuda


In [7]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=embed_model)

## Load data from disk

In [9]:
storage_context = StorageContext.from_defaults(persist_dir='./storage')
rss_feed_index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
storage_context = StorageContext.from_defaults(persist_dir='./pytorch_vector')
pytorch_index = load_index_from_storage(storage_context=storage_context, service_context=service_context)

INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


In [10]:
indices = []
indices.append(rss_feed_index)
indices.append(pytorch_index)
# indices.append(github_site_index)

In [22]:
%%time

prompt = f"""Write a concise summary for the following question:


{question}
"""
indices_summary = []
for index in indices:
    index_engine = index.as_query_engine(response_mode="tree_summarize")
    summary = index_engine.query(
        f"What is a summary of this document?")
    indices_summary.append(str(summary))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 1 chunks
> Building index from nodes: 1 chunks
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 308 tokens
> [get_response] Total LLM token usage: 308 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1065 tokens
> [get_response] Total LLM token usage: 1065 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 t

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 1 chunks
> Building index from nodes: 1 chunks




INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 363 tokens
> [get_response] Total LLM token usage: 363 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1128 tokens
> [get_response] Total LLM token usage: 1128 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
CPU times: user 1min 49s, sys: 1.03 s, total: 1min 50s
Wall time: 1min 50s


In [23]:
print(indices_summary)

['\nThe summary of this document is at the level of a high school student, but it is clear and contains the main ideas from the conversation.\n\nThe patient was diagnosed with a number of medical issues (Heart Valve problem, hyperthyroidism, joint problems, and sleep apnea) and the doctor recommended that the patient seek additional medical care for these issues.', '\nThe document explains two main NLP components that play a role in automating the creation of clinical documentation. The first component, Automatic Speech Recognition (ASR), is used to translate speech into text. It takes the audio recording of the encounter and generates a conversation transcription (cf. Figure 2). The second component, Automatic Text Summarization, helps generate summaries from large text documents. This component is responsible for understanding and capturing the nuances and most essential aspects from the transcribed conversation into a final report in narrative form (cf. Figure 3), structured form, o

In [24]:
graph = ComposableGraph.from_indices(
    GPTListIndex,
    indices,
    index_summaries=indices_summary,
    service_context=service_context,
)
root_id = graph.root_id

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [25]:
# define a decompose transform
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
decompose_transform = DecomposeQueryTransform(
    llm_predictor, verbose=True
)

# define custom retrievers
from llama_index.query_engine.transform_query_engine import TransformQueryEngine


In [26]:
custom_query_engines = {}
for index in indices:
    query_engine = index.as_query_engine()
    query_engine = TransformQueryEngine(
        query_engine,
        query_transform=decompose_transform,
        transform_extra_info={'index_summary': index.index_struct.summary},
    )
    custom_query_engines[index.index_id] = query_engine
custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    response_mode='tree_summarize',
    verbose=True,
)

# construct query engine
graph_query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)

In [27]:
# tool config
graph_config = IndexToolConfig(
    query_engine=graph_query_engine,
    name=f"Graph Index",
    description="useful for when you want to answer queries that require analyzing multiple documents.",
    tool_kwargs={"return_direct": True}
)

In [28]:
# index configs
index_configs = []
for index in indices:
    query_engine = index.as_query_engine(
        similarity_top_k=3,
    )
    tool_config = IndexToolConfig(
        query_engine=query_engine, 
        name=f"Vector Index",
        description=f"useful for when you want to answer queries",
        tool_kwargs={"return_direct": True, "return_sources": True},
    )
    index_configs.append(tool_config)

toolkit = LlamaToolkit(
    index_configs=index_configs + [graph_config],
    
)


In [29]:
template = """You are a helpful assistant than answers questions related to PyTorch.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

{chat_history}
Human: {input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "input"], 
    template=template
)

In [30]:
memory = GPTIndexChatMemory(
    index=GPTListIndex([], service_context=service_context), 
    memory_key="chat_history", 
    query_kwargs={"response_mode": "compact"},
    # return_source returns source nodes instead of querying index
    return_source=True,
    # return_messages returns context in message format
    return_messages=True
)

llm=hf_pipeline
agent_chain = create_llama_chat_agent(
    toolkit,
    llm=HuggingFacePipeline(pipeline=generate_text),
    prompt=prompt,
    memory=memory,
    verbose=True
)


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [31]:
text_input = input()
try:
    response = agent_chain.run(input=text_input)
except Exception as e:
    response = str(e)
    if not response.startswith("Could not parse LLM output: `"):
        raise e
    response = response.removeprefix("Could not parse LLM output: `").removesuffix("`")
print("=======================================")
print(str(response))
print("=======================================")


how to check if pytorch is using gpu
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 0 tokens
> [get_response] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


[1m> Entering new AgentExecutor chain...[0m

Use `detectron2 lsp detect` to check pytorch model details.

Detection result:
- using gpu: yes
- available gpus: 0
- cpu: unknown
- memory usage: unknown
- disk usage: unknown
- register usage: unknown
- multiprocessing: unknown
- experiment configuration: unknown
- optimizer: unknown
- compile status: unknown
- training status: unknown
- test status: unknown

> use detectron2 lsp detect

Detection result:
- using gpu: yes
- available gpus: 0
- cpu: unknown
- memory usage: unknown
- disk usage: unknown
- register usage: unknown
- multiprocessing: unknown
- experiment configuration: unknown
- optimizer: unkno