In [22]:
# !pip install -U langchain  nest_asyncio httpx redis[hiredis] redisearch > 2.4 bitsandbytes

In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
import os
import torch
import uuid
from typing import Any, List, Mapping, Optional
from dotenv import load_dotenv
import langchain 
from threading import Thread
from tqdm import tqdm
from IPython.display import Markdown, display
from peft import PeftModel


from transformers import pipeline, TextStreamer, TextIteratorStreamer, LlamaTokenizer, LlamaForCausalLM
from llama_index.prompts.prompts import SimpleInputPrompt

from llama_index.indices.composability import ComposableGraph
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.langchain_helpers.memory_wrapper import GPTIndexChatMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, LlamaIndexTool, create_llama_agent
from llama_index import load_index_from_storage, download_loader, SummaryPrompt, LLMPredictor, GPTListIndex, GPTVectorStoreIndex, PromptHelper, StorageContext, ServiceContext, LangchainEmbedding, SimpleDirectoryReader
from llama_index.langchain_helpers.text_splitter import SentenceSplitter, TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser, NodeParser

from langchain.agents import Tool
from langchain import OpenAI, LLMChain
from langchain.llms.base import LLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.callbacks import tracing_enabled
from langchain.agents import Agent, initialize_agent, StructuredChatAgent, ConversationalChatAgent, ConversationalAgent, AgentExecutor, ZeroShotAgent, AgentType
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains.conversation.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.memory.chat_memory import ChatMessageHistory
from langchain.memory.chat_message_histories import RedisChatMessageHistory
from langchain.cache import RedisSemanticCache

load_dotenv()

INFO:numexpr.utils:Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


  warn(


CUDA SETUP: CUDA path found: /usr/local/cuda/lib64/libcudart.so
CUDA_SETUP: Detected CUDA version 118
CUDA_SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


True

In [4]:
import nest_asyncio
nest_asyncio.apply()

In [5]:
# define prompt helper
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 2048
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

### HF Pipeline vicuna 13b

In [None]:
model_id = "jagadeesh/vicuna-13b"
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False)
streamer = TextStreamer(tokenizer)
model = LlamaForCausalLM.from_pretrained(model_id, load_in_8bit=False, low_cpu_mem_usage=True, device_map="auto", max_memory={0:"18GB",1:"18GB",2:"18GB",3:"18GB","cpu":"10GB"}, torch_dtype=torch.float16)
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, streamer=streamer, max_new_tokens=num_output, device_map="auto"
)
hf_pipeline = HuggingFacePipeline(pipeline=pipe)
llm_predictor = LLMPredictor(llm=hf_pipeline)

### Custom LLM

In [None]:
# api = "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"

# tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
# streamer = TextStreamer(tokenizer, skip_prompt=True, Timeout=5)

# import huggingface_hub as hf_hub

# hf_hub.login(token=api)

# ## loading llama base model and configuring it with adapter

# base_model_name = 'decapoda-research/llama-7b-hf'

# base_model = LlamaForCausalLM.from_pretrained(
#             base_model_name,
#             torch_dtype=torch.float16,
#             device_map="auto",
#         )

# model = PeftModel.from_pretrained(
#             base_model,
#             'shrinath-suresh/alpaca-lora-7b-answer-summary',
# #             'shrinath-suresh/alpaca-lora-all-7b-delta',
#             torch_dtype=torch.float16,
#             load_in_8bit=True
#         )
# class CustomLLM(LLM):
#     model_name = 'shrinath-suresh/alpaca-lora-7b-answer-summary'
#     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
#         inputs = tokenizer([prompt], return_tensors="pt")

#         # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
#         response = model.generate(**inputs, streamer=streamer, top_p=0.75, max_new_tokens=num_output)
#         response = tokenizer.decode(response[0])
#         return response

#     @property
#     def _identifying_params(self) -> Mapping[str, Any]:
#         return {"name_of_model": self.model_name}

#     @property
#     def _llm_type(self) -> str:
#         return "custom"

# llm_predictor = LLMPredictor(llm=CustomLLM())

In [None]:
# embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
embed_model = LangchainEmbedding(HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl"))

In [None]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=embed_model)

## Generate Index

In [None]:
# sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=20)
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=200)
# sentence_parser = SimpleNodeParser(text_splitter=sentence_splitter)
text_parser = SimpleNodeParser(text_splitter=text_splitter)

In [None]:
def set_metadata(filename):
    return {"source": filename}

In [None]:
# PyTorch Docs
from llama_index.readers.file.markdown_reader import MarkdownReader
docs = SimpleDirectoryReader(input_dir="/home/ubuntu/text", recursive=True, file_extractor={".txt": MarkdownReader()}, file_metadata=set_metadata).load_data()

In [None]:
# display(Markdown(docs[6].text))

In [67]:
text_nodes = text_parser.get_nodes_from_documents(docs)
# sentence_nodes = sentence_parser.get_nodes_from_documents([docs[1]])

In [68]:
index = GPTVectorStoreIndex(text_nodes, service_context=service_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 2795091 tokens
> [build_index_from_nodes] Total embedding token usage: 2795091 tokens


In [69]:
index.storage_context.persist('./pytorch_docs_1024')

## Load data from disk

In [70]:
import llama_index
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore.from_persist_dir(persist_dir="./pytorch_docs_1024"),
    vector_store=SimpleVectorStore.from_persist_dir(persist_dir="./pytorch_docs_1024"),
    index_store=SimpleIndexStore.from_persist_dir(persist_dir="./pytorch_docs_1024"),
)
new_index = load_index_from_storage(storage_context, service_context=service_context)

INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


### Retriever

In [72]:
from llama_index.vector_stores.types import (
    MetadataFilters,
    VectorStoreQuery,
    VectorStoreQueryMode,
)

prompt =f"{queries[3]}"
retriever_engine=new_index.as_retriever(similarity_top_k=3, service_context=service_context, response_mode='simple_summarize')
response = retriever_engine.retrieve(prompt)
response

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 10 tokens
> [retrieve] Total embedding token usage: 10 tokens


[NodeWithScore(node=Node(text="default_weight_observer\n***********************\n\ntorch.quantization.observer.default_weight_observer\n\n   alias of functools.partial(<class\n   'torch.ao.quantization.observer.MinMaxObserver'>,\n   dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){}\n", doc_id='fe31b4c6-5566-4109-ab8a-5d2ddbb74336', embedding=None, doc_hash='7d61a55edccae685f62924d4cb75ca8f905e58c64f074891ebfdd7856edbbbda', extra_info={'source': '/home/ubuntu/text/generated/torch.quantization.observer.default_weight_observer.txt'}, node_info={'start': 0, 'end': 252}, relationships={<DocumentRelationship.SOURCE: '1'>: 'd99196ae-b1e2-430f-b3c3-617384f6cc82'}), score=0.8169193434662836),
 NodeWithScore(node=Node(text='Linear\n******\n\nclass torch.ao.nn.qat.Linear(in_features, out_features, bias=True, qconfig=None, device=None, dtype=None)\n\n   A linear module attached with FakeQuantize modules for weight, used\n   for quantization aware training.\n\n   We adopt the same interface 

## Tools

In [162]:
from llama_index.langchain_helpers.agents import LlamaIndexTool, IndexToolConfig

tool_config = IndexToolConfig(
    query_engine=new_index.as_query_engine(similarity_top_k=6, vector_store_query_mode="svm", service_context=service_context, response_mode='simple_summarize'), 
    name=f"PyTorch Docs",
    description=f"useful for when you want to answer queries about pytorch.",
    tool_kwargs={"return_direct": False, "return_sources": True},
)

toolkit = LlamaToolkit(
    index_configs=[tool_config],
)




### Enable caching

In [163]:
# langchain.llm_cache = RedisSemanticCache(
#     redis_url="redis://localhost:6379",
#     embedding=HuggingFaceEmbeddings()
# )
langchain.llm_cache = None

### Output Parser

In [164]:
# from langchain.output_parsers import PydanticOutputParser
# from pydantic import BaseModel, Field, validator
# from typing import List

# class Result(BaseModel):
#     answer: str = Field(description="Answer to the question")
#     source: str = Field(description="list of names of films they starred in")
        
# actor_query = "Generate the filmography for a random actor."

# parser = PydanticOutputParser(pydantic_object=Result)

# from langchain.output_parsers import OutputFixingParser
# new_parser = OutputFixingParser.from_llm(parser=parser, llm=hf_pipeline)


# Prompt

In [None]:
PREFIX = """Act like you are an expert PyTorch Engineer and provide answers to these questions from the developer community. If you don't know the answer say "I am not sure about this, can you post the question on pytorch-discuss channel", don't make up an answer if you don't know.

TOOLS:
------

You have access to the following tools:"""
FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:

```
Thought: Do I need to use a tool? Yes
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
```

When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:

```
Thought: Do I need to use a tool? No
{ai_prefix}: [your response here]
```"""

SUFFIX = """Given the context information and not prior knowledge, answer the question

Previous conversation history:
{chat_history}

{human_prefix}: {input}
{ai_prefix}:

{agent_scratchpad}"""

prompt = ConversationalAgent.create_prompt(
    tools=toolkit.get_tools(),
#     prefix=PREFIX,
#     format_instructions=FORMAT_INSTRUCTIONS,
#     suffix=SUFFIX,
    input_variables=["agent_scratchpad", "chat_history", "input"]
)

### Callback

In [167]:
# from langchain.callbacks.base import BaseCallbackHandler

# class MyCustomHandler(BaseCallbackHandler):
#     def on_llm_new_token(self, token: str, **kwargs) -> None:
#         print(f"My custom handler, token: {token}")

### Create llm chain and agent

In [183]:
# llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
llm_chain = LLMChain(llm=CustomLLM(), prompt=prompt)

agent = StructuredChatAgent(llm_chain=llm_chain, tools=toolkit.get_tools(), verbose=False)

### Create agent chain and session history 

In [185]:
session_id = uuid.uuid4()

message_history = RedisChatMessageHistory(str(session_id), 'redis://localhost:6379/0', ttl=600)
convo_memory = ConversationBufferWindowMemory(k=3, memory_key="chat_history", return_messages=False, chat_memory=message_history)

# agent_chain = create_llama_chat_agent(toolkit, llm=hf_pipeline, memory=convo_memory, return_sources=True, streaming=True, verbose=False)
agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=toolkit.get_tools(),max_iterations=1, early_stopping_method="generate", verbose=False, return_sources=True, memory=convo_memory)

### Run query

In [186]:
queries = ["How do I check if PyTorch is using the GPU?",
           "How do I save a trained model in PyTorch?",
           "What does .view() do in PyTorch?",
           "Why do we need to call zero_grad() in PyTorch?",
           "How do I print the model summary in PyTorch?",
           "How do I initialize weights in PyTorch?",
           "What does model.eval() do in pytorch?",
           "What's the difference between reshape and view in pytorch?",
           "What does model.train() do in PyTorch?",
           "What does .contiguous() do in PyTorch?"]

In [194]:
response = agent_chain.run(input=queries[3])
display(Markdown(response))

Assistant is a large language model trained by OpenAI.

Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics.

Overall, Assistant is a powerful tool that can help with a wide range of tasks 



In [None]:
from llama_index.evaluation import ResponseEvaluator
evaluator = ResponseEvaluator(service_context=service_context)

eval_result = evaluator.evaluate(response)
eval_result = evaluator.evaluate_source_nodes(response)
