In [None]:
#!pip3 install langchain pinecone-client "unstructured[local-inference]"	# install if needed
#!pip3 install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader

# The loader's default strategy is "hi_res": better extraction quality, but it requires
# detectron2 and can be slow. For a quick and simple load, pass strategy="fast" instead.
loader = UnstructuredPDFLoader("./em_lab.pdf")
data = loader.load()

# Split the loaded document(s) into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(data)

# Summary: number of documents, size of the first one, and how many chunks were produced.
print(
    f'Loaded {len(data)} document(s) with {len(data[0].page_content)} characters, and split into {len(texts)} split-documents.'
)

100%|██████████| 9/9 [00:00<00:00, 41.35it/s]
100%|██████████| 9/9 [00:00<00:00, 25.70it/s]
100%|██████████| 2/2 [00:00<00:00, 1307.25it/s]
100%|██████████| 8/8 [00:00<00:00, 35.36it/s]
100%|██████████| 9/9 [00:00<00:00, 31.75it/s]
100%|██████████| 5/5 [00:00<00:00, 206.17it/s]
100%|██████████| 10/10 [00:00<00:00, 29.27it/s]
100%|██████████| 3/3 [00:00<00:00, 68.39it/s]
100%|██████████| 10/10 [00:00<00:00, 75.91it/s]
100%|██████████| 9/9 [00:00<00:00, 109.27it/s]
100%|██████████| 4/4 [00:00<00:00, 311.75it/s]
100%|██████████| 2/2 [00:00<00:00, 184.48it/s]

Loaded 1 document(s) with 18443 characters, and split into 26 split-documents.





In [6]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

import os

# Credentials are read from environment variables so that real keys never have to be
# typed into (and saved with) the notebook. The literal after each variable name is
# only the fallback used when that environment variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR OPENAI API KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'YOUR PINECONE API KEY')  # look for it at app.pinecone.io
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east4-gcp')           # next to pinecone api key in console

# OpenAI embeddings object, authenticated with the OpenAI key set above.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize pinecone with the API key and environment set above.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

In [11]:
# Name of the index, as it is stored on pinecone.io.
index_name = "langchain2"

# Build the Pinecone vector store from the page_content of every split-document.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [12]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
# Chat model that will answer the questions (gpt-3.5-turbo, temperature 0).
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain of type "stuff" built on that model.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
query = "What is magnetic field and has the document discussed anything about it?"

# Narrow the input down to the stored texts most similar to the query;
# passing fewer documents to the chain can speed up the response.
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

# The chain's answer is the cell output.
chain.run(input_documents=docs, question=query)

In [None]:
query = "What is the key idea of the context?"

# If you already know which parts of 'texts' are most relevant, select them by index here.
# If the file is not too large, you can also pass all of 'texts' without slicing.
chain.run(input_documents=texts[:5], question=query)