In [None]:
import json
import re  # Import the regular expressions library
from collections import Counter


def aggregate_json_values(json_array):
    """Collapse a list of flat JSON records into one record, field by field.

    For every field the values seen across all records are counted. Values that
    are missing (``None``) or contain no ASCII letter or digit are ignored.
    The winning value is chosen as follows:

    * only one distinct value was seen, or the most frequent value occurs more
      than once -> that value is used (on a tie the value seen first wins);
    * otherwise every value occurs exactly once -> all distinct values are
      sorted and joined with ``","``.

    Args:
        json_array: A list of dicts, or a JSON string encoding one. A single
            dict (or a JSON object) is treated as a one-element list.

    Returns:
        A JSON string (``ensure_ascii=False``) holding the aggregated record.
        Fields with no usable value are omitted.

    Raises:
        json.JSONDecodeError: If ``json_array`` is a string that is not valid JSON.
    """
    records = json.loads(json_array) if isinstance(json_array, str) else json_array
    if isinstance(records, dict):
        records = [records]

    field_counters = {}
    for record in records or []:
        # Model output is not guaranteed to be a list of objects; skip strays
        # (plain strings, numbers, nested lists) instead of failing on .items().
        if not isinstance(record, dict):
            continue
        for field, value in record.items():
            if value is None:
                continue
            if not isinstance(value, str):
                # re.search() only accepts strings; numbers and the like are
                # counted by their string form rather than raising TypeError.
                value = str(value)
            if not re.search('[a-zA-Z0-9]', value):
                continue  # empty or punctuation-only placeholder
            field_counters.setdefault(field, Counter())[value] += 1

    result = {}
    for field, counter in field_counters.items():
        top_value, top_count = counter.most_common(1)[0]
        if len(counter) == 1 or top_count > 1:
            result[field] = top_value
        else:
            # Every candidate appeared once: keep them all, in a stable order.
            result[field] = ",".join(sorted(counter))

    return json.dumps(result, ensure_ascii=False)


# GPT call

In [None]:
# nbstripout is a tool to remove the output from Jupyter notebooks
#!nbstripout --install
import os
import warnings

# A `!export ...` line only sets the variable inside a throwaway subshell, and
# PYTHONWARNINGS is read at interpreter start-up anyway, so it can never silence
# anything in this kernel. Filter urllib3's NotOpenSSLWarning in-process instead,
# matching on its message so the filter is in place before urllib3 is imported.
warnings.filterwarnings("ignore", message=r"urllib3 v2.* only supports OpenSSL")

# Everything the cells below rely on is imported here, so the notebook survives
# Restart Kernel -> Run All.
import pandas as pd
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

# Load environment variables (e.g. the OPENAI_API_KEY read by later cells) from a .env file.
load_dotenv()

In [None]:
# iterate over the pdfs in the folder pdf
import os
from dotenv import load_dotenv

load_dotenv()


def call_openai_api(path):
    """Ask GPT-4 which molecule the PDF at ``path`` describes.

    The PDF is split into chunks, the chunks are embedded into an in-memory
    FAISS index, and the three chunks most similar to a fixed question are sent
    to the chat model as context.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The object returned by ``client.chat.completions.create`` with
        ``stream=True``; callers iterate it and read
        ``chunk.choices[0].delta.content``.

    Raises:
        ValueError: If the loader returns no chunks for the PDF.
    """
    pages = PyPDFLoader(path).load_and_split()
    if not pages:
        # Nothing to index or to send; fail with a readable message.
        raise ValueError(f"No text could be extracted from {path!r}")
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    index = FAISS.from_documents(pages, embeddings)
    question = "This is a scientific Paper from the field of chemistry. What is the molecule described in the paper?"
    docs_db = index.similarity_search(question, k=3)
    # Previously only str(docs_db[0]) was sent: two of the three retrieved chunks
    # were thrown away and the remaining one went out as a stringified Document
    # object instead of its text.
    context = "\n\n".join(doc.page_content for doc in docs_db)

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    return client.chat.completions.create(
        messages=[{"role": "system",
                   "content": "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""},
                  {"role": "user", "content": context}],
        stream=True,
        response_format={"type": "json_object"},
        model="gpt-4-1106-preview",
    )


# save the output in a pandas dataframe
import pandas as pd

# One record per PDF; the frame is built once after the loop. DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []
# Sorted so the row order is the same on every run; non-PDF entries (for
# example CSV exports written next to the papers) are skipped.
for pdf in sorted(os.listdir("pdfs")):
    if not pdf.endswith(".pdf"):
        continue
    stream = call_openai_api(f"pdfs/{pdf}")
    pieces = []
    for chunk in stream:
        text = chunk.choices[0].delta.content or ""
        pieces.append(text)
        print(text, end="")
    output = "".join(pieces)
    records.append({"pdf": pdf, "output": output})

df = pd.DataFrame(records, columns=["pdf", "output"])
df.to_csv("output.csv")


In [None]:
def call_openai_api(path):
    """Ask GPT-4 for the molecule of the PDF at ``path`` using its top-3 chunks.

    The PDF is chunked, embedded into an in-memory FAISS index and queried with
    a fixed question; the text of the three best-matching chunks is the user
    message.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The object returned by ``client.chat.completions.create`` with
        ``stream=True``.

    Raises:
        ValueError: If the loader returns no chunks for the PDF.
    """
    pages = PyPDFLoader(path).load_and_split()
    if not pages:
        raise ValueError(f"No text could be extracted from {path!r}")
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    index = FAISS.from_documents(pages, embeddings)
    question = "What is the molecule of the paper?"
    docs_db = index.similarity_search(question, k=3)
    # Send the text of every retrieved chunk; before, only the stringified first
    # Document was sent and the other two hits were discarded.
    context = "\n\n".join(doc.page_content for doc in docs_db)

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    return client.chat.completions.create(
        messages=[{"role": "system",
                   "content": "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""},
                  {"role": "user", "content": context}],
        stream=True,
        response_format={"type": "json_object"},
        model="gpt-4-1106-preview",
    )

#stream = call_openai_api("pdfs/10.1002@bscb.19810900913.pdf")
#for chunk in stream:
#    print(chunk.choices[0].delta.content or "", end="")

In [None]:
def call_openai_api(path):
    """Stream GPT-4's JSON description of the molecule in the PDF at ``path``.

    Retrieval: the PDF is chunked, embedded into an in-memory FAISS index and
    searched with a fixed question (``k=3``). The text of all three hits is
    passed to the model as the user message.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The object returned by ``client.chat.completions.create`` with
        ``stream=True``.

    Raises:
        ValueError: If the loader returns no chunks for the PDF.
    """
    pages = PyPDFLoader(path).load_and_split()
    if not pages:
        raise ValueError(f"No text could be extracted from {path!r}")
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    index = FAISS.from_documents(pages, embeddings)
    question = "What is the molecule of the paper?"
    docs_db = index.similarity_search(question, k=3)
    # All three hits are used; sending str(docs_db[0]) alone wasted two of them
    # and wrapped the text in the Document's string form.
    context = "\n\n".join(doc.page_content for doc in docs_db)

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    return client.chat.completions.create(
        messages=[{"role": "system",
                   "content": "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""},
                  {"role": "user", "content": context}],
        stream=True,
        response_format={"type": "json_object"},
        model="gpt-4-1106-preview",
    )


import os
import pandas as pd

# List all PDF files in the directory and limit to the first five. The names
# are sorted because os.listdir() returns them in arbitrary order, which would
# make "the first five" differ between machines.
pdf_files = sorted(pdf for pdf in os.listdir("pdfs") if pdf.endswith('.pdf'))[:5]

# One record per PDF; the frame is built once after the loop instead of calling
# the private DataFrame._append for every row.
records = []
for pdf in pdf_files:
    stream = call_openai_api(f"pdfs/{pdf}")
    pieces = []
    for chunk in stream:
        # The last streamed chunk carries no text (content is None).
        text = chunk.choices[0].delta.content or ""
        pieces.append(text)
        print(text, end="")
    output = "".join(pieces)
    records.append({"pdf": pdf, "output": output})

df = pd.DataFrame(records, columns=["pdf", "output"])

# Save the DataFrame to a CSV file
df.to_csv("output.csv")

In [None]:
df

In [None]:
def call_openai_api(path):
    """Query GPT-4 about the single best-matching chunk of the PDF at ``path``.

    A FAISS index is built over the chunks of the PDF and searched with a fixed
    question using ``k=1``; the string form of that one hit is the user message.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The object returned by ``client.chat.completions.create`` with
        ``stream=True``.
    """
    chunks = PyPDFLoader(path).load_and_split()
    store = FAISS.from_documents(
        chunks,
        OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),
    )
    hits = store.similarity_search("What is the molecule of the paper?", k=1)

    system_prompt = "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""

    client = OpenAI()
    client.api_key = os.environ["OPENAI_API_KEY"]
    request = dict(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        stream=True,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": str(hits[0])},
        ],
    )
    return client.chat.completions.create(**request)


import os
import pandas as pd

# List all PDF files in the directory and limit to the first five. The names
# are sorted because os.listdir() returns them in arbitrary order, which would
# make "the first five" differ between machines.
pdf_files = sorted(pdf for pdf in os.listdir("pdfs") if pdf.endswith('.pdf'))[:5]

# One record per PDF; the frame is built once after the loop instead of calling
# the private DataFrame._append for every row.
records = []
for pdf in pdf_files:
    stream = call_openai_api(f"pdfs/{pdf}")
    pieces = []
    for chunk in stream:
        # The last streamed chunk carries no text (content is None).
        text = chunk.choices[0].delta.content or ""
        pieces.append(text)
        print(text, end="")
    output = "".join(pieces)
    records.append({"pdf": pdf, "output": output})

df2 = pd.DataFrame(records, columns=["pdf", "output"])

# Save the DataFrame to a CSV file
df2.to_csv("k1.csv")

In [None]:
df2

In [None]:
def call_openai_api(path):
    """Run up to three chained GPT-4 extractions over the best chunks of a PDF.

    The PDF at ``path`` is chunked and indexed with FAISS, six chunks are
    retrieved for a fixed question, and the first three are sent to the model
    one after another. Each request also carries the text of all previous
    answers so the model can refine them.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The text of the last response (an empty string if no chunk was
        retrieved).

    Side effects:
        Prints the streamed text and rebinds the global ``response_string``.
    """
    global response_string
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    faiss = FAISS.from_documents(pages, embeddings)
    question = "What is the the Compound of the paper? Find name, bioactivity, species, collection site, isolation type."
    docs_db = faiss.similarity_search(question, k=6)

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    response_list = ""
    response_string = ""
    # Slicing instead of indexing 0..2 avoids an IndexError when the PDF yields
    # fewer than three chunks.
    for doc in docs_db[:3]:
        response = client.chat.completions.create(
            messages=[{"role": "system",
                       "content": "You are a chemist expert in natural products. You give the answer in JSON format: [{\"compoundName\": \"Example Compound Name\", \"bioactivity\": \"Example Bioactivity\", \"species\": \"Example Species\", \"collectionSite\": \"Example Collection Site\", \"isolationType\": \"Example Isolation Type\"}]. NEVER CHANGE THE JSON FORMAT!. Answer user's questions utilizing your background knowledge or the information given below if its not specified leave it empty like \"\""},
                      {"role": "user", "content": str(
                          doc) + "PLEASE WATCH INFORMATION FROM THE LAST CALL AND USE THE COMPUND " + str(
                          response_list)}],
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4-1106-preview",
        )
        response_string = ""
        for chunk in response:
            piece = chunk.choices[0].delta.content or ""
            response_string += piece
            print(piece, end="")
            # `a + str(x) or ""` parses as `(a + str(x)) or ""`, which appended
            # the literal text "None" for the content-less final chunk.
            response_list += piece

    return response_string


import os
import pandas as pd

# List all PDF files in the directory and keep only the first one. The names are
# sorted because os.listdir() returns them in arbitrary order.
pdf_files = sorted(pdf for pdf in os.listdir("pdfs") if pdf.endswith('.pdf'))[:1]

# One record per PDF; the frame is built once after the loop instead of calling
# the private DataFrame._append for every row.
records = []
for pdf in pdf_files:
    # This version of call_openai_api returns the final answer as a string.
    stream = call_openai_api(f"pdfs/{pdf}")
    # remove \n from the string
    stream = stream.replace("\n", "")
    records.append({"pdf": pdf, "output": stream})

df2 = pd.DataFrame(records, columns=["pdf", "output"])

# Save the DataFrame to a CSV file
df2.to_csv("k1.csv")

In [None]:
df2

In [None]:
"This is a scientific Paper from the field of chemistry. What is the molecule described in the paper?"

In [None]:
def call_openai_api(path):
    """Run one chained GPT-4 extraction per retrieved chunk of a PDF (up to 10).

    The PDF at ``path`` is chunked and indexed with FAISS, and up to ``k``
    chunks are retrieved for a fixed question. Each chunk is sent in its own
    request together with the text of all previous answers.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The text of the last response (an empty string if no chunk was
        retrieved).

    Side effects:
        Prints the streamed text and rebinds the globals ``response_string``,
        ``response_res`` and ``json_array_one``. ``json_array_one`` receives
        every streamed text fragment of every response, in order; it is raw
        text, not a list of parsed records.
    """
    k = 10
    global response_string, response_res, json_array_one
    json_array_one = []
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    faiss = FAISS.from_documents(pages, embeddings)
    question = "This is a scientific Paper from the field of chemistry. What is the molecule described in the paper? Find name, bioActivity, collectionSpecies, collectionSite, isolationType."
    docs_db = faiss.similarity_search(question, k=k)

    system_message = (
        "You are a chemist specializing in natural products. Your task is to analyze a scientific paper and identify the molecule described. "
        "Provide your answer in JSON format, including the molecule's name, bioactivity, collection species, site, and isolation type. No more! If no Information available leave it empty \"\"  NO ARRAYS IN THE ATTRIBUTES. NEVER CHANGE THE JSON FORMAT! JUST THE 5 FIELDS! "
        "Use clear and concise language. Focus on accuracy and detail."
    )

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    response_list = ""
    response_res = ""
    response_string = ""
    # Iterate over what was actually retrieved: a short PDF can yield fewer
    # than k chunks, and docs_db[i] would then raise IndexError.
    for doc in docs_db:
        user_message = (
            f"Based on the document content: {doc}, and considering previous analyses: {response_list}, "
            "identify the described molecule. If specific information is not available, leave the field empty."
        )
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4-1106-preview",
        )
        response_res = ""
        response_string = ""
        for chunk in response:
            piece = chunk.choices[0].delta.content or ""
            json_array_one.append(piece)
            response_res += piece
            response_string += piece
            print(piece, end="")
            # `a + str(x) or ""` parses as `(a + str(x)) or ""`, which appended
            # the literal text "None" for the content-less final chunk.
            response_list += piece

    return response_res


import os
import pandas as pd

# Name the analysed file once so the CSV row records the paper that was really
# processed. The row used to read `pdf`, a loop variable left over from an
# earlier cell (or undefined on a fresh kernel).
pdf = "pdfs/10.1016@j.biortech.2010.01.041.pdf"

# This version of call_openai_api returns the final answer as a string.
stream = call_openai_api(pdf)
# remove \n from the string
stream = stream.replace("\n", "")
df2 = pd.DataFrame([{"pdf": pdf, "output": stream}], columns=["pdf", "output"])

# Save the DataFrame to a CSV file
df2.to_csv("k1.csv")

In [None]:
# json_array_one holds the raw streamed text fragments of every response, not
# parsed records, so it cannot be aggregated as-is. Glue the fragments back
# together and decode the consecutive JSON documents one by one.
_decoder = json.JSONDecoder()
_raw = "".join(json_array_one)
_records, _pos = [], 0
while _pos < len(_raw):
    if _raw[_pos].isspace():
        # raw_decode() does not skip leading whitespace itself.
        _pos += 1
        continue
    try:
        _obj, _pos = _decoder.raw_decode(_raw, _pos)
    except json.JSONDecodeError:
        # Truncated or malformed tail: keep what was decoded so far.
        break
    if isinstance(_obj, dict):
        _records.append(_obj)

jarray = json.dumps(_records)
print(aggregate_json_values(jarray))

In [None]:
# List all PDF files in the directory and keep only the first one (sorted,
# because os.listdir() returns names in arbitrary order)
pdf_files = sorted(pdf for pdf in os.listdir("pdfs") if pdf.endswith('.pdf'))[:1]

for pdf in pdf_files:
    # The loop had no body, which made the whole cell a SyntaxError. Show the
    # selected file so the cell runs.
    # TODO: replace with the intended per-file processing.
    print(pdf)

In [None]:
def call_openai_api(path):
    """Ask GPT-4 three times to extract and refine the molecule of a whole PDF.

    Every request contains the string form of all chunks of the PDF at ``path``
    plus the text of all previous answers, which the model is asked to verify
    and enrich.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The text of all ``k`` responses concatenated in order (``response_res``
        is not reset between rounds).
        NOTE(review): the concatenation of several JSON documents is not itself
        valid JSON - confirm whether only the last answer was intended.

    Side effects:
        Prints the streamed text and rebinds the globals ``response_string``,
        ``response_res`` and ``json_array_one`` (every streamed text fragment).
    """
    k = 3
    global response_string, response_res, json_array_one
    json_array_one = []
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    system_message = (
        "You are a chemist specializing in natural products. Your task is to analyze a scientific paper and identify the molecule described. "
        "Provide your answer in JSON format, including the molecule's IUPAC_nomenclature, bioActivity, collectionSpecies, collectionSite, and collectionType. No more! If no Information available leave it empty \"\"  NO ARRAYS IN THE ATTRIBUTES. NEVER CHANGE THE JSON FORMAT! JUST THE 5 FIELDS! MAKE THE INFORMATION AS SPECIFIC AS POSSIBLE. "
    )

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    response_list = ""
    response_res = ""
    response_string = ""

    for _ in range(k):
        user_message = (
            f"Based on the document content: {pages}, and previous analyses: {response_list} but check if its right otherwise change it, "
            "identify the described molecule enrich it with information from the above. If specific information is not available, leave the field empty."
        )
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4-1106-preview",
        )

        response_string = ""
        for chunk in response:
            piece = chunk.choices[0].delta.content or ""
            json_array_one.append(piece)
            response_res += piece
            response_string += piece
            print(piece, end="")
            # `a + str(x) or ""` parses as `(a + str(x)) or ""`, which appended
            # the literal text "None" for the content-less final chunk.
            response_list += piece

    return response_res


import pandas as pd

pdf = "pdfs/10.1016@j.biortech.2010.01.041.pdf"

# This version of call_openai_api returns the answer text as a string.
stream = call_openai_api(pdf)
# remove \n from the string
stream = stream.replace("\n", "")
# Build the one-row frame directly rather than going through the private
# DataFrame._append on an empty frame.
df2 = pd.DataFrame([{"pdf": pdf, "output": stream}], columns=["pdf", "output"])

# Save the DataFrame to a CSV file next to the PDF
df2.to_csv(pdf + ".csv")

In [None]:
df2

In this scenario, you are a chemist with a focus on natural products, tasked with analyzing a scientific paper to identify a specific molecule. Your response must be structured in JSON format, capturing key information about the molecule in question. This includes:
- `IUPAC_nomenclature`: This field represents the molecule's name according to the International Union of Pure and Applied Chemistry's systematic naming conventions. For example, "3,4-dihydroxybenzoic acid geranyl ester" clearly describes the chemical structure of the compound in a standardized way.
- `bioActivity`: Here, you detail the molecule's biological effect or function. "Inhibition of Protease" means the compound prevents or reduces the activity of protease enzymes, crucial for understanding its potential therapeutic uses.
- `collectionSpecies`: This specifies the biological source or species from which the molecule was isolated, such as "Piper crassinervium (Piperaceae)," pointing to a specific plant within the Piperaceae family.
- `collectionSite`: Indicates the geographical origin where the compound was collected or the organism was found. "Araraquara/SP" refers to a location in São Paulo, Brazil, providing context for the environmental conditions of the source.
- `collectionType`: Describes the origin or process through which the compound was obtained, such as "Biotransformation Product," indicating the compound results from a biological organism chemically modifying a precursor compound.
Your analysis should be precise, adhering to the JSON format provided, with each field filled according to the information available from the paper. If certain details are not mentioned, leave the fields empty with `""`. This structured approach ensures clarity and specificity in documenting the molecule's characteristics.

# Best one 

In [None]:
def call_openai_api(path):
    """Extract every molecule described in a PDF with one seeded GPT-4 request.

    The request contains the string form of all chunks of the PDF at ``path``
    and a system prompt that spells out the five expected JSON fields. The loop
    is written for ``k`` refinement rounds but ``k`` is 1 here.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        The text of all ``k`` responses concatenated in order (a single
        response while ``k`` is 1).

    Side effects:
        Prints the streamed text and rebinds the globals ``response_string``,
        ``response_res`` and ``json_array_one`` (every streamed text fragment).
    """
    k = 1
    global response_string, response_res, json_array_one
    json_array_one = []
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    # The empty-string example below must be written \"\": a bare "" inside the
    # literal ends one string and starts the next, so the quotes vanished from
    # the prompt and the model was told to use `` instead.
    system_message = (
        "In this scenario, you are a chemist with a focus on natural products, tasked with analyzing a scientific paper to identify a specific molecule. Your response must be structured in JSON format, capturing key information about the molecules in question. This includes:"
        "- `IUPAC_nomenclature`: This field represents the molecule's name according to the International Union of Pure and Applied Chemistry's systematic naming conventions. For example, \"3,4-dihydroxybenzoic acid geranyl ester\" clearly describes the chemical structure of the compound in a standardized way."
        "- `bioActivity`: Here, you detail the molecule's biological effect or function. \"Inhibition of Protease\" means the compound prevents or reduces the activity of protease enzymes, crucial for understanding its potential therapeutic uses."
        "- `collectionSpecies`: This specifies the biological source or species from which the molecule was isolated, such as \"Piper crassinervium (Piperaceae)\", pointing to a specific plant within the Piperaceae family."
        "- `collectionSite`: Indicates the geographical origin where the compound was collected or the organism was found. \"Araraquara/SP\" refers to a location in São Paulo, Brazil, providing context for the environmental conditions of the source."
        "- `collectionType`: Describes the origin or process through which the compound was obtained, such as \"Biotransformation Product\", indicating the compound results from a biological organism chemically modifying a precursor compound."
        "Your analysis should be precise, adhering to the JSON format provided, with each field filled according to the information available from the paper. If certain details are not mentioned, leave the fields empty with `\"\"`. This structured approach ensures clarity and specificity in documenting the molecule's characteristics."
        "If there are multiple molecules, provide information for each one separately. The keys are the same for each molecule, but the values will differ based on the information available in the document. Ensure that the JSON format is maintained for each molecule. Name the molecules as Molecule_1, Molecule_2, and so on."
    )

    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()
    response_list = ""
    response_res = ""
    response_string = ""

    for _ in range(k):
        user_message = (
            f"Based on the document content: {pages}, and previous analyses: {response_list} but check if its right otherwise change it, "
            "identify the described molecule enrich it with information from the above. If specific information is not available, leave the field empty."
        )
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            seed=920987036854775807,
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4-1106-preview",
        )

        response_string = ""
        for chunk in response:
            piece = chunk.choices[0].delta.content or ""
            json_array_one.append(piece)
            response_res += piece
            response_string += piece
            print(piece, end="")
            # `a + str(x) or ""` parses as `(a + str(x)) or ""`, which appended
            # the literal text "None" for the content-less final chunk.
            response_list += piece

    return response_res


import pandas as pd

pdf = "pdfs/10.1016@0031-9422(73)85034-4.pdf"

# This version of call_openai_api returns the answer text as a string.
stream = call_openai_api(pdf)
stream = stream.replace("\n", "")
# Build the one-row frame directly rather than going through the private
# DataFrame._append on an empty frame.
df2 = pd.DataFrame([{"pdf": pdf, "output": stream}], columns=["pdf", "output"])

# The CSV is written next to the PDF.
df2.to_csv(pdf + ".csv")

In [None]:
import json
import pandas as pd


def call_openai_api(path):
    """Ask GPT-4 ``k`` times for the molecules of a PDF and collect the JSON answers.

    Every request contains the string form of all chunks of the PDF at ``path``.
    Each streamed response is reassembled into one string and parsed as a
    single JSON document.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        A JSON string encoding a list with one parsed answer per successful
        request. Answers that are not valid JSON are reported on stdout and
        left out.
    """
    # NOTE(review): all k requests are identical (same content, same seed), so
    # the answers are expected to be near-duplicates - confirm k > 1 is wanted.
    k = 3
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()

    # List to accumulate one JSON object per response
    compounds_info = []
    # System message describing the task, kept constant for each API call
    # NOTE(review): the trailing `` looks like an empty-string example that lost
    # its quotes - confirm the intended wording.
    system_message = ("In this scenario, you are a chemist with a focus on natural products, tasked with analyzing "
                      "a scientific paper to identify a specific molecule. Your response must be structured in JSON format, "
                      "capturing key information about the molecules in question. If certain details are not mentioned, "
                      "leave the fields empty with ``.")

    # The placeholder "Simulated page content" message was dead code: it was
    # overwritten by this one before every request.
    user_message = (
        f"Based on the document content: {pages} "
        "identify the described molecule enrich it with information from the above. If specific information is not available, leave the field empty."
    )

    for _ in range(k):
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            seed=920987036854775807,
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4-1106-preview",
        )

        # A streamed chunk is only a fragment of the answer, so json.loads() on
        # individual chunks can never succeed; join them first, parse once.
        raw = "".join(chunk.choices[0].delta.content or "" for chunk in response)
        if not raw:
            continue
        try:
            compounds_info.append(json.loads(raw))
        except json.JSONDecodeError:
            print("Error decoding JSON from response")

    # Return the list of compounds information as JSON
    return json.dumps(compounds_info)


# Run the extraction for one paper and store the raw JSON answer in a CSV that
# sits next to the PDF.
pdf_path = "pdfs/10.1016@j.cbi.2010.08.008.pdf"
csv_file_path = f"{pdf_path}.csv"

json_output = call_openai_api(pdf_path)
# Parsed copy of the answer, kept around for interactive inspection.
compounds_info_list = json.loads(json_output)

# Single-row frame: the file that was analysed and the unparsed JSON text.
df = pd.DataFrame({"pdf": [pdf_path], "output": [json_output]})
df.to_csv(csv_file_path, index=False)



In [None]:
import json
import pandas as pd


def call_openai_api(path):
    """Extract all molecules described in a PDF with a single seeded GPT-4 request.

    The text of every chunk of the PDF at ``path`` is joined into one string
    and sent to the model; the streamed answer is reassembled and parsed as one
    JSON document.

    Args:
        path: Filesystem path of the PDF to analyse.

    Returns:
        A JSON string with the parsed answer, or ``"[]"`` when the answer is
        empty or not valid JSON (the latter is reported on stdout).
    """
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()
    # OpenAI() reads OPENAI_API_KEY from the environment itself.
    client = OpenAI()

    # System message describing the task, kept constant for the API call
    # NOTE(review): the trailing `` looks like an empty-string example that lost
    # its quotes - confirm the intended wording.
    system_message = ("In this scenario, you are a chemist with a focus on natural products, tasked with analyzing "
                      "a scientific paper to identify specific molecules. Your response must be structured in JSON format, "
                      "capturing key information about the molecules in question. If certain details are not mentioned, "
                      "leave the fields empty with ``.")

    # Join the text of the chunks. " ".join(str(pages)) iterated over the
    # characters of the list's repr and put a space between every one of them.
    document_content = " ".join(page.page_content for page in pages)

    user_message = (
        f"Based on the document content: {str(document_content)}, identify all the described molecules and enrich them with "
        "information accordingly. If specific information is not available, leave the field empty.")

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        seed=920987036854775807,
        stream=True,
        response_format={"type": "json_object"},
        model="gpt-4-1106-preview",
    )

    # With stream=True the call returns an iterable of chunks, not a completed
    # response, so there is no response.choices to read; collect the fragments.
    raw = "".join(chunk.choices[0].delta.content or "" for chunk in response)

    compounds_info = []
    if raw:
        try:
            compounds_info = json.loads(raw)
        except json.JSONDecodeError:
            print("Error decoding JSON from response")

    # Return the compounds information as JSON
    return json.dumps(compounds_info)


# Run the extraction for one paper, store the raw JSON answer in a CSV next to
# the PDF and show the parsed result.
pdf_path = "pdfs/10.1016@j.cbi.2010.08.008.pdf"
csv_file_path = f"{pdf_path}.csv"

json_output = call_openai_api(pdf_path)
compounds_info_list = json.loads(json_output)

# Single-row frame: the file that was analysed and the unparsed JSON text.
df = pd.DataFrame({"pdf": [pdf_path], "output": [json_output]})
df.to_csv(csv_file_path, index=False)
print(compounds_info_list)

In [None]:
# nbstripout is a tool to remove the output from Jupyter notebooks
#!nbstripout --install
import os
import warnings

# A `!export ...` line only sets the variable inside a throwaway subshell, and
# PYTHONWARNINGS is read at interpreter start-up anyway, so it can never silence
# anything in this kernel. Filter urllib3's NotOpenSSLWarning in-process instead,
# matching on its message so the filter is in place before urllib3 is imported.
warnings.filterwarnings("ignore", message=r"urllib3 v2.* only supports OpenSSL")

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file.
load_dotenv()
