In [53]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
import os.path
import zipfile
import json
import pandas as pd
import re
import openpyxl
from datetime import datetime
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Adobe PDF Services credentials read from the environment; os.getenv returns None when unset.
adobe_client_id = os.getenv("ADOBE_CLIENT_ID")
# NOTE: despite the name, adobeLoader passes this value as the client *secret*.
adobe_api_key = os.getenv("ADOBE_API_KEY")

def get_dict_xlsx(outputzipextract, xlsx_file):
    """
    Read one table workbook from the extracted Adobe output and return its rows.

    Args:
        outputzipextract: Folder the Adobe result zip was extracted into.
        xlsx_file: Path of the workbook, relative to ``outputzipextract``.

    Returns:
        A list with one dict per row of sheet ``Sheet1``, keyed by column label.
    """
    # Matches markers of the form ``_x`` + four hex digits + ``_`` (e.g. ``_x000D_``);
    # presumably escaped control characters left in the workbook text.
    escape_marker = r"_x([0-9a-fA-F]{4})_"

    # Read excel
    df = pd.read_excel(
        os.path.join(outputzipextract, xlsx_file),
        sheet_name='Sheet1',
        engine='openpyxl',
    )

    # Clean the header. Header cells that hold numbers or dates come back as
    # non-string labels, and re.sub raises TypeError on those, so they are kept as-is.
    df.columns = [
        re.sub(escape_marker, "", col) if isinstance(col, str) else col
        for col in df.columns
    ]
    # Clean the cell values (regex replacement only touches string cells).
    df = df.replace({escape_marker: ""}, regex=True)

    # Convert the frame to a list of per-row dicts
    return df.to_dict(orient='records')

def adobeLoader(input_pdf, output_zip_path):
    """
    Run the Adobe PDF Extract API on a local PDF and save the result zip.

    A timestamped progress line is printed before each stage. When a stage
    fails, the error is printed (including the exception text) and the function
    returns immediately instead of running later stages that depend on it.

    Args:
        input_pdf: Path of the local PDF sent to the service.
        output_zip_path: Destination of the result zip. An existing file at
            this path is deleted before the new result is saved.

    Returns:
        None.
    """
    def _log(message):
        # Same timestamp format for every progress line.
        print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {message}")

    # Initial setup, create credentials instance.
    credentials = Credentials.service_principal_credentials_builder() \
        .with_client_id(adobe_client_id) \
        .with_client_secret(adobe_api_key) \
        .build()

    try:
        _log("create an ExecutionContext")
        # Create an ExecutionContext using credentials and create a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()
    except Exception as error:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts the run.
        print(f'----Error: create an ExecutionContext: {error}')
        return

    try:
        _log("Set and execute operation input from a source file")
        # Set operation input from a source file.
        source = FileRef.create_from_local_file(input_pdf)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation:
        # text and tables as elements, plus renditions of tables and figures.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)
    except Exception as error:
        print(f'----Error: cannot set and execute operation input: {error}')
        return

    try:
        _log("Save result")

        # Save result to output path, replacing any previous result file.
        if os.path.exists(output_zip_path):
            os.remove(output_zip_path)
        result.save_as(output_zip_path)
    except Exception as error:
        print(f'----Error: cannot save result: {error}')

def extract_text_from_file_adobe(output_zip_path, output_zipextract_folder):
    """
    Extract per-page text and tables from an Adobe Extract result zip.

    The zip is extracted into ``output_zipextract_folder``, then
    ``structuredData.json`` is read and its ``elements`` are walked in order:

    * an element whose ``Path`` contains ``Table`` contributes the JSON dump of
      its xlsx workbook (if it lists one in ``filePaths``); other elements
      under a table path are skipped,
    * any other element with a ``Text`` key contributes that text.

    Each piece is tagged with the most recent ``Page`` value seen, and the
    pieces of one page are joined with newlines.

    Args:
        output_zip_path: Path of the zip written by the Adobe API.
        output_zipextract_folder: Folder to extract the zip into.

    Returns:
        A DataFrame with columns ``page_number`` and ``text``, one row per
        page. It is empty (with those columns) when the zip or the JSON file
        cannot be read or when no text was found; the error is printed.
    """
    def _log(message):
        print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {message}")

    empty_result = pd.DataFrame(columns=['page_number', 'text'])

    try:
        _log("unzip file")
        with zipfile.ZipFile(output_zip_path, 'r') as zip_ref:
            zip_ref.extractall(path=output_zipextract_folder)
    except (OSError, zipfile.BadZipFile) as error:
        # Stop here: whatever is in the folder would not belong to this zip.
        print(f'----Error: cannot unzip file: {error}')
        return empty_result

    try:
        _log("open json file")
        json_path = os.path.join(output_zipextract_folder, "structuredData.json")
        # Explicit encoding so the result does not depend on the platform default.
        with open(json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
    except (OSError, ValueError) as error:
        # ValueError covers both JSON syntax errors and decoding errors.
        print(f'----Error: cannot open json file: {error}')
        return empty_result

    _log("extract text")
    elements = data.get('elements', []) if isinstance(data, dict) else []

    # Collect plain records and build the frame once, instead of concatenating
    # a one-row frame per element.
    records = []
    page = ''
    for ele in elements:
        # Elements without a 'Page' key inherit the last page seen.
        if 'Page' in ele:
            page = ele['Page']

        if 'Table' in ele.get('Path', ''):
            # Use the xlsx entry itself; filePaths may also list other
            # renditions, so the workbook is not necessarily the first entry.
            xlsx_paths = [p for p in ele.get('filePaths', []) if 'xlsx' in p]
            if not xlsx_paths:
                continue
            try:
                table_rows = get_dict_xlsx(output_zipextract_folder, xlsx_paths[0])
                text = json.dumps(table_rows)
            except Exception as error:
                # One unreadable table should not discard the rest of the document.
                print('Error: ', error)
                continue
        elif 'Text' in ele:
            text = str(ele['Text'])
        else:
            continue

        records.append({'text': text, 'page_number': page})

    if not records:
        return empty_result

    dfs = pd.DataFrame(records, columns=['text', 'page_number'])

    # Groupby page
    return dfs.groupby('page_number')['text'].apply(lambda x: '\n'.join(x)).reset_index()

In [33]:
# Source PDF to process (path relative to the notebook's working directory)
input_pdf = './FY2023-NVIDIA-Corporate-Responsibility-Report-1.pdf'

# Adobe output zip file path
output_zip_path = '/tmp/sdk_result/tmp.zip'
# Folder the zip gets extracted into by extract_text_from_file_adobe
output_zipextract_folder = './tmp'

# Run adobe API: sends input_pdf to the service and saves the result zip
adobeLoader(input_pdf, output_zip_path)

2024-04-14 21:49:59 create an ExecutionContext
2024-04-14 21:49:59 Set and execute operation input from a source file
2024-04-14 21:51:42 Save result


In [54]:
# Extract text and table from adobe output zip file.
# df is what extract_text_from_file_adobe returns (its per-page text frame).
df = extract_text_from_file_adobe(output_zip_path, output_zipextract_folder)

2024-04-14 22:01:24 unzip file
2024-04-14 22:01:25 open json file
2024-04-14 22:01:25 extract text


In [55]:
# Display the extracted per-page frame (the notebook output follows this cell)
df

Unnamed: 0,page_number,text
0,0,NVIDIA \nCorporate Responsibility Report Fisca...
1,1,Table of Contents \n(<>)Message From Our CEO \...
2,2,Message From Our CEO \nThe dawn of AI has arri...
3,3,"[{""Message From "": ""Our CEO "", ""Introduction ""..."
4,4,Introduction \nAbout NVIDIA \nNVIDIA pioneered...
5,5,(<>)Message From (<>)Introduction (<>)Business...
6,6,"[{""Message From "": ""Our CEO "", ""Introduction ""..."
7,7,Corporate Responsibility Management \nWe condu...
8,8,"[{""Message From "": ""Our CEO "", ""Introduction ""..."
9,9,"[{""Message From "": ""Our CEO "", ""Introduction ""..."


In [None]:
# Get sub_pages with relevant keywords
emissions_keywords = ['scope 1', 'scope 2', 'scope 3', 'greenhouse gas emissions', 'ghg', 'direct emissions', 'indirect emissions']

# Filter text according to keywords: the keywords are OR-ed into a single
# regex and matched case-insensitively against each row's text
pattern = '|'.join(emissions_keywords)
sub_df = df[df['text'].str.contains(pattern, regex=True, case=False)]
# Row labels of df for the matching rows, captured before the index is reset
page_index = sub_df.index.tolist()
sub_df = sub_df.reset_index(drop=True)
# Disabled: `pages` is not defined anywhere in the visible cells
# sub_pages = [pages[i] for i in page_index]

In [None]:
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain import PromptTemplate
from langchain.llms import OpenAI
import openai
from langchain.docstore.document import Document
import tiktoken
import glob
import yaml
import warnings
warnings.filterwarnings('ignore')

# Point the openai module at an Azure OpenAI resource; the endpoint and key
# are read from the environment (os.getenv returns None when unset).
openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_key = os.getenv("AZURE_OPENAI_KEY")
openai.api_version = "2023-09-01-preview"
# Same key under a separate name; qa_emissions passes it to the LLM constructor.
openai_api_key = openai.api_key
# Passed as deployment_id when qa_emissions constructs the LLM.
model = 'genAI-DC'
def num_tokens_from_string(string: str) -> int:
    """Count the tokens that tiktoken's ``cl100k_base`` encoding yields for ``string``."""
    tokenizer = tiktoken.get_encoding('cl100k_base')
    return len(tokenizer.encode(string))

def format_output(text, input_type, output, df_output):
    """
    Parse the first ``{...}`` span of an LLM answer and append it as one row.

    Args:
        text: Value stored, converted to ``str``, in the ``text`` column of the
            new row (qa_emissions passes a page number here).
        input_type: Value stored in the ``input_type`` column of the new row.
        output: Raw answer returned by the LLM.
        df_output: Rows collected so far; this frame is not modified.

    Returns:
        A tuple ``(span, frame)``. ``span`` is the extracted ``{...}`` text, or
        ``None`` when ``output`` contains no ``{``. ``frame`` is ``df_output``
        with the parsed row appended, or ``df_output`` unchanged when nothing
        could be parsed.
    """
    # No structured answer at all: nothing to parse, nothing to append.
    if not isinstance(output, str) or '{' not in output:
        return None, df_output

    # Keep only what lies between the first '{' and the next brace, dropping any
    # explanation the model wrote around the object.
    output = '{' + output.split('{')[1].split('}')[0] + '}'

    try:
        parsed = yaml.safe_load(output)
    except yaml.YAMLError:
        # Malformed answer: skip it, but let the caller see what was extracted.
        return output, df_output

    if not isinstance(parsed, dict):
        return output, df_output

    try:
        row = pd.DataFrame(parsed, index=[0])
    except (ValueError, TypeError):
        # Values that do not fit a single row (e.g. lists of another length).
        return output, df_output

    row['text'] = str(text)
    row['input_type'] = input_type
    return output, pd.concat([df_output, row], axis=0, ignore_index=True)
    
# JSON template inserted into the prompt to tell the model which fields to return.
# It must itself be well-formed JSON, otherwise the model is shown a broken example.
# NOTE: the key "emssions_unit" is misspelled but deliberately left unchanged,
# since code consuming the model's answer would see this key name.
schema = """
{
    "company_name": "name of the parent company",
    "reporting_year": "reporting year",
    "scope_1": "Scope 1 GHG emissions of the parent company",
    "scope_2_market": "Scope 2 GHG emissions (market based) of the parent company",
    "scope_2_location": "Scope 2 GHG emissions (location based) of the parent company",
    "scope_1+2": "Scope 1+2(market based) GHG emissions of the parent company",
    "scope_3": "Scope 3 GHG emissions of the parent company",
    "emssions_unit": "Unit of absolute emissions (e.g., metric tons CO2e)",
    "infographic": "if the page contains infographic",
    "page_number": "page number contains GHG emissions"
      }
"""

def _context_and_page(item, input_type):
    """Return the (prompt context, page number) pair for one input item."""
    if input_type == 'table':
        return item['data'], item['page_number']
    # 'text': the item itself is the context and carries its page in metadata.
    return item, item.metadata['page_number']


def qa_emissions(input, input_type):
    """
    Ask the LLM for the parent company's GHG emissions in two passes.

    Step 1 queries the model once per input item that fits the token budget and
    collects the parsed answers. Step 2 keeps only the items whose answer had
    at least one non-null scope value, sends them together in one prompt and
    returns that single combined answer.

    Args:
        input: Sequence of items. For ``'table'`` each item is a mapping with
            ``'data'`` and ``'page_number'`` keys; for ``'text'`` each item is
            an object with ``metadata['page_number']``.
        input_type: Either ``'table'`` or ``'text'``.

    Returns:
        A DataFrame built by ``format_output``:
        * the single step-2 row when step 2 ran; its ``text`` column lists the
          page numbers that were combined, comma-separated,
        * otherwise the step-1 rows (no item had a scope value, or the combined
          prompt exceeds the token budget),
        * an empty frame when no answer could be parsed.

    Raises:
        ValueError: If ``input_type`` is not ``'table'`` or ``'text'``.
    """
    # Local import: LLMChain is used below but is not imported by the file's import cell.
    from langchain.chains import LLMChain

    if input_type not in ('table', 'text'):
        raise ValueError(f"input_type must be 'table' or 'text', got {input_type!r}")

    # -- Create system prompt template
    sys_tpl = "You are a sustainablity reporting expert. You will help me to extract information from a given context in the sustainability report. Do NOT make up answer. If you don't know the answer or the answer is in the table or infographic, leave it as null. If the report is in wrong year, return the reporting year and leave the GHG emissions as null. Do not give any explanation other than the schema."

    sys_msg_pt = SystemMessagePromptTemplate.from_template(sys_tpl)

    prompt_template = """
        The following context shows emssisons of the company: \n {context_str}\n

        Noted that Scope 1 emissions are direct greenhouse (GHG) emissions that occur from sources that are controlled or owned by an organization (e.g., emissions associated with fuel combustion in boilers, furnaces, vehicles). Scope 2 emissions are indirect GHG emissions associated with the purchase of electricity, steam, heat, or cooling.Scope 3 emissions are the result of activities from assets not owned or controlled by the reporting organization, but that the organization indirectly affects in its value chain. The GHG Protocol defines 15 categories of scope 3 emissions, though not every category will be relevant to all organizations.

        Give me the emissions of the parent company for FY2022 in JSON representation. Ignore the emssions of sub sector or subsidiaries, only return the emissions for company total of the parent company. If you don't know the answer or the answer is in table or infographic, leave it as null, do NOT try to make up an answer. The answer should be strictly follows this schema: {schema} \n

    """

    usr_pt = PromptTemplate(
        input_variables=["context_str", "schema"],
        template=prompt_template,
    )
    usr_msg_pt = HumanMessagePromptTemplate(prompt=usr_pt)

    # Combine (system, user) into a chat prompt template
    prompt = ChatPromptTemplate.from_messages([sys_msg_pt, usr_msg_pt])

    # Define llm
    llm = OpenAI(deployment_id=model, openai_api_key=openai_api_key, temperature=0)
    llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=False)

    token_limit = 8193-300  # Reserve for completion
    # Tokens every request spends on the prompt template and the schema,
    # independent of the item being asked about.
    overhead_tokens = (
        num_tokens_from_string(str(prompt))
        + num_tokens_from_string(str(schema))
    )

    # Step1: ask about each item separately
    df_output = pd.DataFrame()
    # answered[k] is the position in `input` that produced row k of df_output.
    # Items that are too long or whose answer cannot be parsed add no row, so
    # row numbers and input positions drift apart without this mapping.
    answered = []
    for position, item in enumerate(input):
        if num_tokens_from_string(str(item)) + overhead_tokens >= token_limit:
            continue

        context, page_number = _context_and_page(item, input_type)
        output = llm_chain.run({"context_str": context, "schema": schema})

        rows_before = len(df_output)
        _, df_output = format_output(page_number, input_type, output, df_output)
        if len(df_output) > rows_before:
            answered.append(position)

    if df_output.shape[0] == 0:
        return df_output

    # Filter pages according to step1 LLM answer: keep items with any scope value.
    # Only the columns the model actually returned are checked.
    check_cols = ['scope_1', 'scope_2_market', 'scope_2_location', 'scope_1+2', 'scope_3']
    present_cols = [col for col in check_cols if col in df_output.columns]
    if not present_cols:
        return df_output

    has_value = df_output[present_cols].notnull().any(axis=1)
    filter_page = [position for position, keep in zip(answered, has_value) if keep]
    if not filter_page:
        # Nothing to combine; asking the model about an empty context is pointless.
        return df_output

    sub_json = []
    page_numbers = []
    for position in filter_page:
        context, page_number = _context_and_page(input[position], input_type)
        sub_json.append(context)
        page_numbers.append(page_number)

    # Check number of tokens
    if num_tokens_from_string(str(sub_json)) + overhead_tokens >= token_limit:
        return df_output

    # Step2: Rerun LLM with filtered page
    output = llm_chain.run({"context_str": sub_json, "schema": schema})
    # Record every page that fed the combined answer (one page -> just that number).
    pages_label = ', '.join(str(number) for number in page_numbers)
    _, df_output = format_output(pages_label, input_type, output, pd.DataFrame())
    return df_output