In [3]:
import os

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

load_dotenv()

True

In [4]:
!export PYTHONWARNINGS="ignore:NotOpenSSLWarning"

In [5]:
# Transform loader to pages 
loader = PyPDFLoader("pdfs/10.1002@bscb.19810900913.pdf")
pages = loader.load_and_split()

In [6]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
faiss = FAISS.from_documents(pages, embeddings)

In [8]:
question = "What is the molecule of the paper?"

docs_db = faiss.similarity_search(question, k=3)
print(len(docs_db))

3


In [9]:
print(docs_db[1])

page_content="seEaflrion-pf-The_-s_Te_Nl-lc_e_T~Te_s_ \n103 mg of the steryl acetates were chromatographed several times on columns \nof silica gel impregnated with 15% AgN03 using a gradient elution of benzene \n(from 3 to 50%) in hexane). Eluted fractions were monitored by GC analysis. \nCombined fractions were finally purified by preparative tlc (eluent : hexane- \nbenzene 1:l) : the silica band corresponding to Rf 0.4 to 0.6 was extracted by \na 1:l mixture of CHC13 and AcOEt. This yielded 3 mg of 2-1 (IIIb), 77 mg Of 2-2 \n(In) and 8 mg Of 2-3 (Vb). \nCompound IIIb : : 95% pure, RRT = 1.00, co-chromatographs with authentic \n6 \n: 1730, 1640, 1250 and 895 IIIb. : M+-60 = 368, 353 (368-CH;) ,... \nCompound IV b : : 98% pure, RRT = 1.31. m.p. = 133-1340 (lit. 134-1350) . \n[alD = -42.80 (lit. -46.7?)6. \ncm-'. !g = M+-60 = 380, 365 (380-CH3), 296 (Mac Lafferty from \n380), 281, 255, 253 (380-side chain + 2H), 228 and 213. 'H NMR \n(CDC13) : 0.65 (3H s C-18) - 0.95 (3H d J=6.5 HZ C-2

In [10]:
dir(docs_db)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [14]:
from openai import OpenAI

client = OpenAI()

client.api_key = os.environ["OPENAI_API_KEY"]
stream = client.chat.completions.create(
    messages=[{"role": "system",
               "content": "You are a chemist expert in natural products. You give the answer in JSON format. Answer user's questions utilizing your background knowledge or the information given below"},
              {"role": "user", "content": str(docs_db[0])}],
    stream=True,
    response_format={"type": "json_object"},
    model = "gpt-4-1106-preview",
)

for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")

{
    "MeltingPoint": {
        "Instrument": "Büchi SMP-20",
        "CorrectionStatus": "uncorrected"
    },
    "OpticalRotation": {
        "Instrument": "Perkin-Elmer 241 Polarimeter",
        "MeasurementWavelength_nm": 259,
        "Solvent": "CHCl3"
    },
    "IR_Spectra": {
        "Instrument": "Perkin-Elmer 7358 Spectrophotometer",
        "Solvent": "CHCl3"
    },
    "MassSpectrometry": {
        "Instrument": "Micromass MM12F",
        "Detail": "Fragment intensities are expressed as percentages of the base peak"
    },
    "NMR_Spectra": {
        "Instruments": [
            "Varian XL-100",
            "Varian EM 390"
        ],
        "EquippedWith": "Varian 3930 Spin Decoupler System",
        "Solvent": "CDCl3",
        "InternalReference": "TMS",
        "ShiftScale": "δ",
        "ShiftAbbreviations": {
            "b": "broad",
            "d": "doublet",
            "m": "multiplet",
            "s": "singlet"
        }
    },
    "GasChromatography": {
      

In [12]:

print(stream.response.read().json(
))

NameError: name 'stream' is not defined

In [54]:
# nbstripout is a tool to remove the output from Jupyter notebooks
!nbstripout --install