In [None]:
# nbstripout is a tool to remove the output from Jupyter notebooks.
# The install command below is kept commented out; uncomment it to run it from this notebook.
#!nbstripout --install
# NOTE(review): `!` executes the command in a separate shell process, so this
# `export` presumably does not change the environment of the running kernel.
# `%env PYTHONWARNINGS=...` may be what was intended -- confirm.
!export PYTHONWARNINGS="ignore:NotOpenSSLWarning"

import os

import pandas as pd
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from openai import OpenAI

load_dotenv()


In [None]:
def call_openai_api(path):
    """Extract molecule metadata from one PDF by prompting an OpenAI chat model.

    The PDF is loaded and split with ``PyPDFLoader(path).load_and_split()``.
    The resulting list is interpolated (via its ``str()`` form) into the user
    prompt, next to a fixed system prompt that asks for a JSON answer. The
    completion is requested in streaming mode and the streamed text fragments
    are concatenated.

    Side effects:
        Rebinds three module-level globals so they can be inspected from
        other notebook cells after the call:
        ``json_array_one`` (list of streamed text fragments),
        ``response_string`` (text of the most recent pass) and
        ``response_res`` (text accumulated over all passes).

    Args:
        path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        str: The concatenated streamed text (the same value as ``response_res``).

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
        Exception: Errors raised by the PDF loader or the OpenAI client are
            not caught here and propagate to the caller.
    """
    global response_string, response_res, json_array_one

    # Number of request rounds. From the second round on, the text collected
    # so far is fed back into the prompt as "previous analyses".
    passes = 1

    json_array_one = []
    response_res = ""
    previous_analyses = ""

    pages = PyPDFLoader(path).load_and_split()

    # Written as adjacent literals so the prompt contains no raw line break,
    # and with the empty-string example escaped so the two quote characters
    # actually reach the model instead of terminating the literal.
    system_message = (
        "In this scenario, you are a chemist with a focus on natural products, "
        "tasked with analyzing a scientific paper to identify a specific molecule. "
        "Your response must be structured in JSON format, capturing key information "
        "about the molecules in question. This includes: "
        "`IUPAC_nomenclature`: This field represents the molecule's name according to "
        "the International Union of Pure and Applied Chemistry's systematic naming "
        "conventions. For example, \"3,4-dihydroxybenzoic acid geranyl ester\" clearly "
        "describes the chemical structure of the compound in a standardized way. "
        "`bioActivity`: Here, you detail the molecule's biological effect or function. "
        "\"Inhibition of Protease\" means the compound prevents or reduces the activity "
        "of protease enzymes, crucial for understanding its potential therapeutic uses. "
        "`collectionSpecies`: This specifies the biological source or species from "
        "which the molecule was isolated, such as \"Piper crassinervium (Piperaceae)\", "
        "pointing to a specific plant within the Piperaceae family. "
        "`collectionSite`: Indicates the geographical origin where the compound was "
        "collected or the organism was found. \"Araraquara/SP\" refers to a location in "
        "São Paulo, Brazil, providing context for the environmental conditions of the "
        "source. `collectionType`: Describes the origin or process through which the "
        "compound was obtained, such as \"Biotransformation Product\", indicating the "
        "compound results from a biological organism chemically modifying a precursor "
        "compound. Your analysis should be precise, adhering to the JSON format "
        "provided, with each field filled according to the information available from "
        "the paper. If certain details are not mentioned, leave the fields empty with "
        "`\"\"`. If there are multiple molecules, provide information for each one "
        "separately. The keys are the same for each molecule, but the values will "
        "differ based on the information available in the document. Ensure that the "
        "JSON format is maintained for each molecule. \n"
        "Name the molecules as Molecule_1, Molecule_2, and so on. Please find as much "
        "information as possible about each molecule in the document."
    )

    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    for _ in range(passes):
        user_message = f"Based on the document content: {pages}, and previous analyses: {previous_analyses} but check if its right otherwise change it, identify the described molecule enrich it with information from the above. If specific information is not available, leave the field empty."
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            seed=920987036854775807,  # fixed seed, reused for every request
            stream=True,
            response_format={"type": "json_object"},
            model="gpt-4o",
        )

        response_string = ""
        for chunk in response:
            # A stream chunk may carry no choices at all; skip it instead of
            # failing on choices[0].
            if not chunk.choices:
                continue
            # delta.content is None on chunks without text; normalise it once
            # so the literal text "None" can never leak into the accumulators.
            fragment = chunk.choices[0].delta.content or ""
            json_array_one.append(fragment)
            response_string += fragment
            response_res += fragment
            previous_analyses += fragment

    return response_res



# Trial run on a single paper: extract the molecules and store the result
# next to the PDF as "<pdf>.csv".
pdf = "pdfs/10.1016@0031-9422(73)85034-4.pdf"

stream = call_openai_api(pdf)
stream = stream.replace("\n", "")  # keep the JSON answer on one CSV line

# Build the one-row frame directly instead of going through the private
# DataFrame._append helper.
df2 = pd.DataFrame([{"pdf": pdf, "output": stream}], columns=["pdf", "output"])

# index=False matches the per-PDF CSVs written by process_pdfs_in_folder,
# which targets the same "<pdf>.csv" path pattern.
df2.to_csv(pdf + ".csv", index=False)

In [None]:
import time
import os
import pandas as pd


def process_pdfs_in_folder(folder_path):
    """Run ``call_openai_api`` on every ``*.pdf`` file in ``folder_path``.

    For each PDF that is processed successfully, a one-row CSV named
    ``<pdf file name>.csv`` is written into ``folder_path``. After all files
    have been attempted, the successful rows are written together to
    ``consolidated_output.csv`` in the same folder. Files are handled in
    sorted name order so the output order is reproducible.

    A failure on one PDF is printed and followed by a 60 second pause (to
    avoid hitting the OpenAI API rate limit); processing then continues with
    the next file.

    Args:
        folder_path: Directory that contains the PDFs and receives the CSVs.

    Returns:
        None
    """
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pdf'))

    # Collect plain records and build the consolidated frame once at the end,
    # rather than concatenating onto an (initially empty) DataFrame per file.
    records = []

    for position, pdf_file in enumerate(pdf_files, start=1):
        print(f"Processing PDF {position}/{len(pdf_files)}: {pdf_file}")
        pdf_path = os.path.join(folder_path, pdf_file)
        try:
            output = call_openai_api(pdf_path).replace("\n", "")
            record = {"pdf": pdf_path, "output": output}
            # Record the row before writing the per-PDF file, so a failed
            # write still leaves the result in the consolidated output.
            records.append(record)
            csv_filename = os.path.join(folder_path, pdf_file + ".csv")
            pd.DataFrame([record]).to_csv(csv_filename, index=False)
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            # sleep for 60 seconds to avoid hitting the OpenAI API rate limit
            time.sleep(60)

    # Explicit columns keep the header present even when nothing succeeded.
    consolidated_df = pd.DataFrame(records, columns=["pdf", "output"])
    consolidated_csv_filename = os.path.join(folder_path, "consolidated_output.csv")
    consolidated_df.to_csv(consolidated_csv_filename, index=False)


# Kick off the batch run over every PDF found in the local pdfs/ directory.
process_pdfs_in_folder(folder_path='pdfs/')


In [None]:
# Load the previously saved consolidated results.
# NOTE(review): this reads 'consolidated_output_full.csv' from the working
# directory, whereas process_pdfs_in_folder writes 'consolidated_output.csv'
# inside the PDF folder -- confirm the file exists before a fresh run.
consolidated_df = pd.read_csv('consolidated_output_full.csv')

# Walk the pdfs/ folder and collect every PDF whose "pdfs/<name>" path does
# not occur in the 'pdf' column; those still need to be processed.
not_processed_pdfs = []
for pdf in os.listdir('pdfs/'):
    if not pdf.endswith('.pdf'):
        continue
    if "pdfs/" + pdf in consolidated_df['pdf'].values:
        continue
    print(f"PDF {pdf} not processed.")
    not_processed_pdfs.append(pdf)


In [None]:
# Second pass: retry each PDF listed in not_processed_pdfs. Every success is
# appended to consolidated_df and also written to its own "<pdf>.csv" file;
# a failure is reported and followed by a 60 second pause.
for pdf in not_processed_pdfs:
    pdf_path = os.path.join('pdfs/', pdf)
    try:
        stream = call_openai_api(pdf_path).replace("\n", "")
        df_current = pd.DataFrame({"pdf": [pdf_path], "output": [stream]})
        consolidated_df = pd.concat(
            [consolidated_df, df_current],
            ignore_index=True,
        )
        csv_filename = os.path.join('pdfs/', f"{pdf}.csv")
        df_current.to_csv(csv_filename, index=False)
    except Exception as error:
        print(f"Error processing {pdf_path}: {error}")
        time.sleep(60)

In [None]:
# Persist the merged results (without the index column) so that a later
# session can reload them from 'consolidated_output_full.csv'.
consolidated_df.to_csv(path_or_buf='consolidated_output_full.csv', index=False)