### Import libraries

In [2]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType

import os
import zipfile
import json
import pandas as pd

In [3]:
# Base folder of the project: the input PDFs ('esg_report/'), the Adobe
# credentials file, the extracted zip archives and the CSV output folder are
# all addressed relative to this path (it must end with a slash because the
# other cells build paths by plain string concatenation).
esg_path = "./1-2. ESG report/"

In [10]:
# Collect the report PDFs to process.
# os.listdir() returns *every* directory entry in arbitrary order, so keep only
# regular files whose name ends in '.pdf' (skipping entries such as
# '.ipynb_checkpoints' or '.DS_Store' that would otherwise be sent to the
# extraction API) and sort case-insensitively so the processing order is
# reproducible across platforms.
input_pdf_dir = esg_path + 'esg_report/'
input_pdf_list = sorted(
    (
        entry
        for entry in os.listdir(input_pdf_dir)
        if entry.endswith('.pdf') and os.path.isfile(input_pdf_dir + entry)
    ),
    key=str.lower,
)

In [11]:
# Notebook display: echo the PDF file names that will be processed.
input_pdf_list

['GSretail_2023.pdf',
 'hyundaehomeshopping_2023.pdf',
 'LGlife_2023.pdf',
 'lottehi_2023.pdf',
 'sinsegye_2023.pdf']

In [6]:
# Base name used for each report's result archive: the PDF file name with the
# '.pdf' text removed. The list is index-aligned with input_pdf_list.
zip_file_name_list = [pdf_name.replace('.pdf', '') for pdf_name in input_pdf_list]

In [7]:
# Notebook display: echo the archive base names derived from the PDF names.
zip_file_name_list

['GSretail_2023',
 'hyundaehomeshopping_2023',
 'LGlife_2023',
 'lottehi_2023',
 'sinsegye_2023']

### PDF에서 table, text, figure 추출하기 (현재 코드는 text 요소만 추출)

In [8]:

# Run the Adobe PDF Services "Extract PDF" operation on every report and save
# each result as ExtractTextInfoFromPDF_<report>.zip under esg_path.

# The credentials and the execution context do not depend on the PDF being
# processed, so build them once instead of re-reading the credentials file
# on every iteration.
credentials = Credentials.service_account_credentials_builder() \
    .from_file(esg_path + "adobe-dc-pdf-services-sdk-extract-python-samples/pdfservices-api-credentials.json") \
    .build()
execution_context = ExecutionContext.create(credentials)

# input_pdf_list and zip_file_name_list are index-aligned, so walk them together.
for pdf_file_name, zip_file_name in zip(input_pdf_list, zip_file_name_list):

    # Target archive for this report; a leftover file from a previous run is
    # removed before the new result is saved.
    zip_file = esg_path + "ExtractTextInfoFromPDF_{}.zip".format(zip_file_name)
    if os.path.isfile(zip_file):
        os.remove(zip_file)

    input_pdf = esg_path + "esg_report/{}".format(pdf_file_name)

    # A new operation instance is created for every PDF.
    extract_pdf_operation = ExtractPDFOperation.create_new()

    # Set operation input from the local source file.
    source = FileRef.create_from_local_file(input_pdf)
    extract_pdf_operation.set_input(source)

    # Build ExtractPDF options (text elements only) and set them on the operation.
    extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
        .with_element_to_extract(ExtractElementType.TEXT) \
        .build()
    extract_pdf_operation.set_options(extract_pdf_options)

    # Execute the operation; any SDK/service exception propagates to the caller.
    result: FileRef = extract_pdf_operation.execute(execution_context)

    # Save the result archive to the target location.
    result.save_as(zip_file)

### esg 관련 문장 추출 및 csv로 저장

In [9]:
# Page ranges used to label the extracted text of each report.
# Every entry holds three ranges in the order [e pages, s pages, g pages];
# each range is half-open (the end page is excluded).
# NOTE: this name shadows the built-in `dict`; it is kept because the
# labeling cell looks the table up under this name.
dict = {
    'GSretail_2023': [
        range(7, 18),
        range(18, 34),
        range(2, 7),
    ],
    'hyundaehomeshopping_2023': [
        range(6, 16),
        range(16, 32),
        range(2, 6),
    ],
    'LGlife_2023': [
        range(5, 22),
        range(22, 53),
        range(2, 5),
    ],
    'lottehi_2023': [
        range(5, 14),
        range(14, 32),
        range(2, 5),
    ],
    'sinsegye_2023': [
        range(6, 18),
        range(18, 35),
        range(2, 6),
    ],
}

In [18]:
# For every report: read structuredData.json from its extraction archive,
# keep text elements longer than 50 characters, label each one 'e', 's' or 'g'
# according to the page it appears on, and write the result to
# output/ESG_text_<report>.csv.

# Labels in the same order as the three page ranges stored per report in `dict`.
esg_labels = ('e', 's', 'g')

# Create the output folder once, up front (no error if it already exists).
os.makedirs(esg_path + 'output/', exist_ok=True)

for pdf_file_name, zip_file_name in zip(input_pdf_list, zip_file_name_list):
    output_zip = esg_path + "ExtractTextInfoFromPDF_{}.zip".format(zip_file_name)

    # Context managers make sure the archive and its member are closed again.
    with zipfile.ZipFile(output_zip, 'r') as archive:
        with archive.open('structuredData.json') as jsonentry:
            data = json.loads(jsonentry.read())

    # Key into the page-range table; raises KeyError if the report has no entry.
    name = pdf_file_name.replace('.pdf', '')
    page_ranges = dict[name]

    # NOTE(review): the ranges are compared against the element's 'Page' value
    # exactly as stored in structuredData.json - confirm whether that value is
    # 0- or 1-based and that the ranges above use the same convention.
    labeled_rows = []
    for element in data["elements"]:
        # Skip non-text elements and short fragments (50 characters or fewer).
        if 'Text' not in element or len(element['Text']) <= 50:
            continue
        page = element['Page']
        # The first matching range wins; pages outside all ranges are dropped.
        for pages, label in zip(page_ranges, esg_labels):
            if page in pages:
                labeled_rows.append((element['Text'], label))
                break

    # Every kept text is longer than 50 characters and every label is
    # non-empty, so no further filtering of empty values is required.
    str_df = pd.DataFrame(labeled_rows, columns=['text', 'label'])

    # 'utf-8-sig' writes a BOM at the start of the file.
    str_df.to_csv(esg_path + 'output/' + "ESG_text_{}.csv".format(zip_file_name), index=False, encoding='utf-8-sig')