In [None]:
import os

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
# nbstripout is a tool to remove the output from Jupyter notebooks
!nbstripout --install
!export PYTHONWARNINGS="ignore:NotOpenSSLWarning"
load_dotenv()

In [None]:
# Load the source PDF and split it into document chunks.
# NOTE(review): `load_and_split` applies the loader's default splitting, so the
# resulting items may not map one-to-one onto PDF pages -- confirm if it matters.
pdf_path = "pdfs/10.1002@bscb.19810900913.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [None]:
# Embedding model authenticated with the key from the environment
# (raises KeyError if OPENAI_API_KEY is not set).
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# Embed every loaded document and build a FAISS vector store from them.
faiss = FAISS.from_documents(documents=pages, embedding=embeddings)

In [None]:
# Natural-language query used to retrieve the most relevant chunks.
question = "What is the molecule of the paper?"

# Ask the vector store for the closest matches and report how many came back.
top_k = 3
docs_db = faiss.similarity_search(query=question, k=top_k)
n_hits = len(docs_db)
print(n_hits)

In [None]:
# Show the second retrieved document (index 1) to eyeball retrieval quality.
second_hit = docs_db[1]
print(second_hit)

In [None]:
# Exploratory introspection: list the attribute names available on the search
# result container itself.
# NOTE(review): this inspects `docs_db` (presumably a list), not the documents
# inside it -- likely leftover debugging; consider removing before sharing.
dir(docs_db)

In [None]:
from openai import OpenAI

# Hand the key to the constructor instead of patching `client.api_key` after
# the client has already been built.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# NOTE(review): the example in this prompt is a top-level JSON *array*, while
# `response_format={"type": "json_object"}` asks for a JSON *object* -- confirm
# which shape is wanted. The prompt also refers to "user's questions", but
# `question` is never sent; only the retrieved text is.
system_prompt = "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""

stream = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=[
        {"role": "system", "content": system_prompt},
        # Send the chunk's text only; str(Document) would also embed the field
        # names and metadata in the prompt.
        {"role": "user", "content": docs_db[0].page_content},
    ],
    response_format={"type": "json_object"},
    stream=True,
)

# Print the answer incrementally as tokens arrive.
for chunk in stream:
    # Skip chunks that carry no choices rather than failing on the [0] index.
    if not chunk.choices:
        continue
    # `delta.content` can be None on some chunks, hence the `or ""`.
    print(chunk.choices[0].delta.content or "", end="")