In [1]:
import os
import torch
import uuid
from typing import Any, List, Mapping, Optional
from dotenv import load_dotenv
import langchain 
from threading import Thread
from tqdm import tqdm
from IPython.display import Markdown, display
from peft import PeftModel


from transformers import pipeline, TextStreamer, TextIteratorStreamer, LlamaTokenizer, LlamaForCausalLM
from llama_index.prompts.prompts import SimpleInputPrompt

from llama_index.indices.composability import ComposableGraph
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.langchain_helpers.memory_wrapper import GPTIndexChatMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, LlamaIndexTool, create_llama_agent
from llama_index import load_index_from_storage, download_loader, SummaryPrompt, LLMPredictor, GPTListIndex, GPTVectorStoreIndex, PromptHelper, StorageContext, ServiceContext, LangchainEmbedding, SimpleDirectoryReader
from llama_index.langchain_helpers.text_splitter import SentenceSplitter, TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser, NodeParser

from langchain.agents import Tool
from langchain import LLMChain
from langchain.llms.base import LLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.callbacks import tracing_enabled
from langchain.agents import Agent, BaseSingleActionAgent, LLMSingleActionAgent, initialize_agent, StructuredChatAgent, ConversationalChatAgent, ConversationalAgent, AgentExecutor, ZeroShotAgent, AgentType
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains.conversation.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.memory.chat_memory import ChatMessageHistory
from langchain.memory.chat_message_histories import RedisChatMessageHistory
from langchain.cache import RedisSemanticCache
import huggingface_hub as hf_hub

load_dotenv()


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)


True

In [2]:
# api = ""

# tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
# streamer = TextStreamer(tokenizer, skip_prompt=True, Timeout=5)

# import huggingface_hub as hf_hub

# hf_hub.login(token=api)

# ## loading llama base model and configuring it with adapter

# base_model_name = 'decapoda-research/llama-7b-hf'

# base_model = LlamaForCausalLM.from_pretrained(
#             base_model_name,
#             torch_dtype=torch.float16,
#             device_map="auto",
#         )

# model = LlamaForCausalLM.from_pretrained("shrinath-suresh/alpaca-lora-all-7B",low_cpu_mem_usage=True, use_auth_token=api,device_map="auto")

# class CustomLLM(LLM):
#     model_name = 'shrinath-suresh/alpaca-lora-all-7B'
#     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
#         print(":::::::::::::::::::::::::::", prompt)
#         inputs = tokenizer([prompt], return_tensors="pt")

#         # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
#         response = model.generate(**inputs, streamer=streamer, top_p=0.75, max_new_tokens=512)
#         response = tokenizer.decode(response[0])
#         return response

#     @property
#     def _identifying_params(self) -> Mapping[str, Any]:
#         return {"name_of_model": self.model_name}

#     @property
#     def _llm_type(self) -> str:
#         return "custom"

# llm_predictor = LLMPredictor(llm=CustomLLM())


In [3]:
token = ""
hf_hub.login(token=token)

model_id = "jagadeesh/vicuna-13b"
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False)
streamer = TextStreamer(tokenizer, skip_prompt=True, Timeout=5)
model = LlamaForCausalLM.from_pretrained(model_id, load_in_8bit=False, low_cpu_mem_usage=True, device_map="auto", max_memory={0:"18GB",1:"18GB",2:"18GB",3:"18GB","cpu":"10GB"}, torch_dtype=torch.float16)
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, streamer=streamer, max_new_tokens=512, device_map="auto"
)
hf_pipeline = HuggingFacePipeline(pipeline=pipe)
llm_predictor = LLMPredictor(llm=hf_pipeline)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
HF_Embed_model = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
embed_model = LangchainEmbedding(HF_Embed_model)

load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model)

In [6]:
# docs = SimpleDirectoryReader('./data').load_data()

In [7]:
# index = GPTVectorStoreIndex.from_documents(docs, service_context=service_context)

In [8]:
# index.storage_context.persist(persist_dir="./index")

In [9]:
index_dir="~/pytorch_docs_512"

In [105]:
import llama_index
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore.from_persist_dir(persist_dir=index_dir),
    vector_store=SimpleVectorStore.from_persist_dir(persist_dir=index_dir),
    index_store=SimpleIndexStore.from_persist_dir(persist_dir=index_dir),
)
index = load_index_from_storage(storage_context, service_context=service_context)

index_tools = [
    LlamaIndexTool.from_tool_config(
        IndexToolConfig(
            name = "PyTorch Index",
            query_engine=index.as_query_engine(similarity_top_k=3, response_mode="simple_summarize", service_context=service_context),
            description=f"useful for answering questions related to pytorch. Do not use this tool for fetching dataset. Do not use this tool with the same input/query",
            tool_kwargs={"return_direct": True, "verbose": True}
        )
    ),
]

ALL_TOOLS = index_tools

In [106]:
ALL_TOOLS

[LlamaIndexTool(name='PyTorch Index', description='useful for answering questions related to pytorch. Do not use this tool for fetching dataset. Do not use this tool with the same input/query', args_schema=None, return_direct=True, verbose=True, callbacks=None, callback_manager=None, query_engine=<llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x7f87c5eb28f0>, return_sources=False)]

In [107]:
query_engine=index.as_query_engine(similarity_top_k=3, response_mode="simple_summarize", service_context=service_context),

In [108]:
from langchain.vectorstores import FAISS
from langchain.schema import Document

docs = [Document(page_content=t.description, metadata={"index": i}) for i, t in enumerate(ALL_TOOLS)]

vector_store = FAISS.from_documents(docs, HF_Embed_model)

retriever = vector_store.as_retriever()

def get_tools(query):
    docs = retriever.get_relevant_documents(query)
    return [ALL_TOOLS[d.metadata["index"]] for d in docs]

get_tools("give me the supported datasets for the {\"model\":\"bert-base-uncased\", \"task\":\"fill-mask\"}")


[LlamaIndexTool(name='PyTorch Index', description='useful for answering questions related to pytorch. Do not use this tool for fetching dataset. Do not use this tool with the same input/query', args_schema=None, return_direct=True, verbose=True, callbacks=None, callback_manager=None, query_engine=<llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x7f87c5eb28f0>, return_sources=False)]

In [109]:
# langchain.llm_cache = RedisSemanticCache(
#     redis_url="redis://localhost:6379",
#     embedding=HuggingFaceEmbeddings()
# )
langchain.llm_cache = None

In [133]:
import re
from langchain.agents import Tool, AgentOutputParser
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish

class CustomOutputParser(AgentOutputParser):
    
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
output_parser = CustomOutputParser()

In [144]:
from langchain.prompts import StringPromptTemplate
from typing import Callable

# Set up the base template
template_with_history = """You are an expert PyTorch Assistant and provide answers to questions from the developer community.

Given the context and the conversation history, try to answer the question. Use only the information provided in the context. Do not use any external knowledge beyond the given conversation.
If you think you don't know the answer say "I am not sure about this, can you post the question on pytorch-discuss channel", don't make up an answer if you don't know.

Use the following format:

Question: the input question you must answer
Thought: use PyTorch Index tool
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat 1 times)
Final Answer: the final answer to the original input question

Previous conversation history:
{history}

New question: {input}
{agent_scratchpad}
"""

class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools_getter: Callable
    
    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        ############## NEW ######################
        tools = self.tools_getter(kwargs["input"])
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in tools])
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
        return self.template.format(**kwargs)

prompt_with_history = CustomPromptTemplate(
    template=template_with_history,
    tools_getter=get_tools,
    # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
    # This includes the `intermediate_steps` variable because that is needed
    input_variables=["input", "intermediate_steps", "history"]
)


In [145]:
tools = get_tools("")
tool_names = [tool.name for tool in tools]

llm = hf_pipeline

llm_chain = LLMChain(llm=llm, prompt=prompt_with_history)
# llm_chain = LLMChain(llm=CustomLLM(), prompt=prompt_with_history)

agent = LLMSingleActionAgent(llm_chain=llm_chain, output_parser=output_parser, stop=["</s><s>"], allowed_tools=tool_names)

In [146]:
session_id = uuid.uuid4()

message_history = RedisChatMessageHistory(str(session_id), 'redis://localhost:6379/0', ttl=600)
memory = ConversationBufferWindowMemory(k=2, memory_key="history", return_messages=True, chat_memory=message_history)

agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, streaming=True, tools=tools, verbose=True, memory=memory)

In [147]:
queries = ["PyTorch preferred way to copy a tensor",
    "Pytorch tensor to numpy array",
    "Pytorch, what are the gradient arguments",
    "How to fix RuntimeError 'Expected object of scalar type Float but got scalar type Double for argument'?",
    "What does the gather function do in pytorch in layman terms?",
    "How to avoid ""CUDA out of memory"" in PyTorch",
]

In [155]:
query = "Explain about distributed test"
response = agent_chain.run(input=query)
# response = agent_chain.run(input="What is MPI in distributed training")
print("=========================================")
print(response)
print("=========================================")
# print(query_engine.query("What are the different pytorch distributed techniques?"))



[1m> Entering new AgentExecutor chain...[0m
::::::::::::::::::::::PROMPT::::::::::::::::::
You are an expert PyTorch Assistant and provide answers to questions from the developer community.

Given the context and the conversation history, try to answer the question. Use only the information provided in the context. Do not use any external knowledge beyond the given conversation.
If you think you don't know the answer say "I am not sure about this, can you post the question on pytorch-discuss channel", don't make up an answer if you don't know.

Use the following format:

Question: the input question you must answer
Thought: use PyTorch Index tool
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat 1 times)
Final Answer: the final answer to the original input question

Previous conversation history:
[HumanMessage(content='Pytorch tensor to numpy array', additional_kwargs={}, example=False), AIMessag

In [153]:
# for query in queries:
#     response = agent_chain.run(input=query)
#     print("=========================================")
#     print(response)
#     print("=========================================")
