In [1]:
# One-time environment setup: uncomment and run on first use.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -U llama-index

In [2]:
import logging
import sys

# Route INFO-and-above log records to stdout so library logging appears in the
# cell output.
# basicConfig() already attaches a stdout StreamHandler to the root logger, so
# adding a second one by hand made every record print twice (and once more for
# each re-run of this cell).  force=True replaces whatever handlers an earlier
# run left behind, which keeps the cell idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [1]:
import os
import json
import uuid
import torch
import requests
from typing import Any, List, Mapping, Optional
from dotenv import load_dotenv
import langchain 
from threading import Thread
from tqdm import tqdm
from peft import PeftModel
from IPython.display import Markdown, display

from pydantic import BaseModel, Field
from transformers import pipeline, TextStreamer, TextIteratorStreamer, LlamaTokenizer, LlamaForCausalLM
import huggingface_hub as hf_hub

from llama_index.indices.composability import ComposableGraph
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.langchain_helpers.memory_wrapper import GPTIndexChatMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, LlamaIndexTool
from llama_index import download_loader, SummaryPrompt, LLMPredictor, GPTListIndex, GPTVectorStoreIndex, PromptHelper, load_index_from_storage, StorageContext, ServiceContext, LangchainEmbedding, SimpleDirectoryReader
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser, NodeParser
from llama_index.vector_stores import ChromaVectorStore


from langchain.tools import BaseTool, StructuredTool, tool
from langchain.prompts import MessagesPlaceholder
from langchain.agents import Tool, AgentOutputParser
from langchain import OpenAI, LLMChain
from langchain.llms.base import LLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.callbacks import tracing_enabled
from langchain.agents import initialize_agent, LLMSingleActionAgent, StructuredChatAgent, ConversationalChatAgent, ConversationalAgent, AgentExecutor, ZeroShotAgent, AgentType
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains.conversation.memory import ConversationBufferMemory, ConversationStringBufferMemory, ConversationBufferWindowMemory
from langchain.memory.chat_memory import ChatMessageHistory
from langchain.memory import ConversationKGMemory
from langchain.memory.chat_message_histories import RedisChatMessageHistory
from langchain.cache import RedisSemanticCache

# Load environment variables from a local .env file, if one is present, so that
# later cells can read configuration and secrets from os.environ.
load_dotenv()


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)


True

In [2]:
# Jupyter already runs an asyncio event loop; nest_asyncio patches asyncio so
# that code which starts its own loop can still run inside the notebook.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# --- Hugging Face Hub authentication -----------------------------------------
# SECURITY: an access token used to be hardcoded on this line.  Anything saved
# in a notebook must be treated as leaked, so that token has to be revoked and a
# fresh one supplied through the environment (for example via the .env file that
# load_dotenv() reads).
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
if not token:
    raise RuntimeError(
        "Hugging Face token not found: set HF_TOKEN (or HUGGING_FACE_HUB_TOKEN) "
        "in the environment or in your .env file."
    )
hf_hub.login(token=token)

# --- Model and tokenizer -------------------------------------------------------
model_id = "jagadeesh/vicuna-13b"
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False)

# TextStreamer echoes generated text to stdout while generation is running.
# Extra keyword arguments are forwarded to tokenizer.decode(); the previous
# `Timeout=5` is not a TextStreamer option (a `timeout` exists only on
# TextIteratorStreamer), so it is dropped.  skip_special_tokens=True keeps
# markers such as </s><s> out of the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Load the weights in fp16 and let the loader place them across devices
# (device_map="auto"), bounded by the per-device budgets given in max_memory
# (integer keys are GPU indices).
model = LlamaForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    device_map="auto",
    max_memory={0: "18GB", 1: "18GB", 2: "18GB", 3: "18GB", "cpu": "10GB"},
    torch_dtype=torch.float16,
)

# --- Text-generation pipeline and llama_index wrapper --------------------------
# The model object is already instantiated and placed on its devices above, so
# no device_map is passed to pipeline() here.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    streamer=streamer,
    max_new_tokens=512,
)
hf_pipeline = HuggingFacePipeline(pipeline=pipe)
llm_predictor = LLMPredictor(llm=hf_pipeline)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Embedding model used for documents and queries.
# Alternative kept for reference:
#   HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
HF_embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
)
# Wrap the LangChain embedding object so llama_index can consume it.
embed_model = LangchainEmbedding(HF_embed_model)

load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
# Bundle the local LLM predictor and the embedding model into one context object
# that the index, query engine and chat engine below all share.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm_predictor=llm_predictor,
)

In [6]:
import llama_index
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore
# Directory that holds the persisted doc store, vector store and index store.
# It was previously hardcoded three times as an absolute path; it is now a
# single constant that can be overridden with the PYTORCH_DOCS_INDEX_DIR
# environment variable (the default is the original location).
PERSIST_DIR = os.environ.get("PYTORCH_DOCS_INDEX_DIR", "/home/ubuntu/pytorch_docs")

# Rebuild the storage context from the three persisted stores, then load the
# index from it using the shared service context.
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore.from_persist_dir(persist_dir=PERSIST_DIR),
    vector_store=SimpleVectorStore.from_persist_dir(persist_dir=PERSIST_DIR),
    index_store=SimpleIndexStore.from_persist_dir(persist_dir=PERSIST_DIR),
)
index = load_index_from_storage(storage_context, service_context=service_context)

In [7]:
# Query engine over the loaded index: retrieve the 3 most similar nodes and
# answer using the "simple_summarize" response mode.
engine = index.as_query_engine(
    service_context=service_context,
    response_mode="simple_summarize",
    similarity_top_k=3,
)

In [8]:
from llama_index.prompts  import Prompt
from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine

# Template used to condense (chat history + follow-up message) into a single
# standalone question.  It is written as adjacent string literals so that every
# newline and the trailing space after "<Chat History>" are explicit.
custom_prompt = Prompt(
    "Given a conversation (between Human and Assistant) and a follow up message from Human, "
    "rewrite the message to be a standalone question that captures all relevant context "
    "from the conversation.\n"
    "\n"
    "<Chat History> \n"
    "{chat_history}\n"
    "\n"
    "<Follow Up Message>\n"
    "{question}\n"
    "\n"
    "<Standalone question>\n"
)

# Seed conversation: a list of (human_message, ai_message) pairs.
custom_chat_history = [
    (
        "Hello assistant, we are having a insightful discussion about PyTorch.",
        "Okay, sounds good.",
    ),
]

# Chat engine that first condenses each message with the prompt above and then
# sends the resulting question to the query engine.  verbose=True prints the
# condensed question.
chat_engine = CondenseQuestionChatEngine.from_defaults(
    verbose=True,
    chat_history=custom_chat_history,
    condense_question_prompt=custom_prompt,
    service_context=service_context,
    query_engine=engine,
)

In [9]:
# Sample questions to put to the chat engine.
queries = [
    "What's the difference between reshape and view in pytorch?",
    "What does model.train() do in PyTorch?",
    "What does .contiguous() do in PyTorch?",
    # Single-quoted so the inner double quotes survive.  The earlier
    # "... ""pack"" ..." form was implicit string concatenation, which silently
    # dropped the quotes around the word pack.
    'Why do we "pack" the sequences in PyTorch?',
    "Check the total number of parameters in a PyTorch model",
]

In [10]:
# Ask the first sample question and print the answer between separator rules.
# To run every question instead, iterate over `queries` and repeat the same
# chat-and-print steps for each item.
separator = "==========================="
response = chat_engine.chat(queries[0])
print(separator)
print(response.response)
print(separator)



Can you explain the difference between reshape and view in PyTorch, and when each should be used in a neural  network?</s><s>
Querying with: Can you explain the difference between reshape and view in PyTorch, and when each should be used in a neural network?

Yes, I can explain the difference between reshape and view in PyTorch, and when each should be used in a neural network.

Reshape and view are both methods in PyTorch that allow you to modify the shape of a tensor. However, they differ in how they approach this task.

Reshape creates a new tensor with a modified shape, while view modifies the shape of the existing tensor. When you use reshape, PyTorch creates a new tensor with the specified shape and copies the data from the original tensor to the new one. This can be slow and memory-intensive, especially for large tensors. In contrast, view modifies the shape of the existing tensor without copying the data, which is faster and more memory-efficient.

Reshape should be used when y