In [1]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# create a loader
loader = PyPDFLoader("Prot_SAP_000.pdf")

# load your data
data = loader.load()

In [4]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 94 document(s) in your data
There are 1227 characters in your document


In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

Now you have 320 documents


In [7]:
# import libraries
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [9]:
import keyring
import openai

In [16]:
openai.organization = "org-EEUV4gnLyXDJe82NmHgNkyo4"
OPENAI_API_KEY = keyring.get_password("openai", openai.organization)

PINECONE_API_ENV = "gcp-starter"
PINECONE_API_KEY = keyring.get_password('pinecone', 'gcp-starter')

In [17]:
# create embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [18]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to API key in console
)

index_name = "pdf-index" # put in the name of your pinecone index here

In [19]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [None]:
# If already have index
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [20]:
query = "What is the complexity of this clinical trial in terms of treatment arms?"
docs = docsearch.similarity_search(query)

In [22]:
docs

[Document(page_content='population of subjects is 44%. The alternative hypothesis is that the PFS6 rate is 60%. Forty- two \npatients per arm, expected to give 30 events, (this number includes the first 6 patients upon which safety was established) provide 81% power per arm for a log rank test, to detect this increase in PFS6, at a two -sided 5% significance level, assuming an exponential distribution and \nthat en rollment will continue for 36 months with an additional 6 months of follow-up prior to the \nfinal statistical analysis. Secondary outcomes include overall survival (OS) and objective response rate (ORR); median OS and 95% confidence interval will be estimated by the Kaplan - \nMeier method and ORR will be an estimated percentage with 95% confidence interval. Toxicities will be graded and tabulated and Grade 3 or higher toxicity rates will be calculated. With 42 patients per arm, we have 88% power to detect any toxicity occurring at a rate of 5% or more.', metadata={}),
 Doc

In [23]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.7, openai_api_key = OPENAI_API_KEY)

In [24]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=docsearch.as_retriever()
)

In [25]:
result = qa_chain({"query": query})
result["result"]

'This clinical trial has two treatment arms: Arm A and Arm B.'