In [1]:
import logging
import sys

# Route INFO-and-above log records to stdout so they appear in the notebook output.
# basicConfig already installs a StreamHandler on the root logger; attaching a
# second StreamHandler for the same stream made every record print twice.
# force=True drops any handlers already on the root logger first, so re-running
# this cell never stacks additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
import os
import torch
from dotenv import load_dotenv

from transformers import pipeline, TextStreamer

from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.langchain_helpers.memory_wrapper import GPTIndexChatMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, LlamaIndexTool
from llama_index import download_loader, SummaryPrompt, LLMPredictor, GPTListIndex, PromptHelper, load_index_from_storage, StorageContext, ServiceContext, LangchainEmbedding

from langchain.agents import Tool
from langchain.llms.base import LLM
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.agents import initialize_agent
from langchain.embeddings import HuggingFaceEmbeddings

# Read key/value pairs from a .env file into the process environment (python-dotenv).
# As the last expression of the cell, its boolean return value is shown as the cell output.
load_dotenv()

INFO:numexpr.utils:Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


True

In [3]:
# The notebook kernel already runs an asyncio event loop; nest_asyncio patches
# asyncio so that code which starts its own loop can run inside it.
# NOTE(review): presumably required by async calls made by llama_index/langchain — confirm.
import nest_asyncio
nest_asyncio.apply()

In [4]:
# Token budget used by llama_index when it assembles prompts.
max_input_size = 512    # maximum input size
num_output = 128        # number of output tokens
max_chunk_overlap = 20  # maximum chunk overlap

# The helper takes the three limits positionally, in this order.
prompt_helper = PromptHelper(
    max_input_size,
    num_output,
    max_chunk_overlap,
)

In [5]:
# Transformers pipeline around the Dolly v2 12B checkpoint, loaded in bfloat16
# with device placement left to device_map="auto".
# NOTE: trust_remote_code=True allows Python code shipped with the model
# repository to run locally.
generate_text = pipeline(
    model="databricks/dolly-v2-12b",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    return_full_text=True,
)

# Wrap the pipeline as a LangChain LLM, then hand that to llama_index.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)
llm_predictor = LLMPredictor(llm=hf_pipeline)

In [6]:
# Embedding model used for retrieval. The model name is pinned explicitly
# (it is the one the load log reports for the default) instead of relying on
# the library default, so a dependency upgrade cannot silently switch models.
# NOTE(review): this must be the same model that produced the vectors stored
# in the persisted index — confirm against the notebook that built it.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Use pytorch device: cuda


In [7]:
# Bundle the LLM wrapper, prompt budget and embedding model into the single
# context object that index loading takes as an argument.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    embed_model=embed_model,
)

## Load data from disk

In [9]:
# Point a storage context at the directory holding the persisted index ...
storage_context = StorageContext.from_defaults(
    persist_dir='./pytorch_vector',
)

# ... and rebuild the index from it, using the service context defined earlier.
pytorch_index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
)

INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


In [10]:
%%time

# Build a query engine over the loaded index using the "compact" response mode.
index_engine = pytorch_index.as_query_engine(
    response_mode="compact",
)

# Ask one question and print the response text (print applies str() itself).
summary = index_engine.query(
    "How to check if pytorch is using gpu?"
)
print(summary)