# Convert PDFs to Text

In [1]:
from pdfminer.high_level import extract_text
import os

In [3]:
def convert_pdf_folder_to_text(input_folder, output_folder):
    """Extract the text of every PDF directly inside a folder into .txt files.

    The output folder is created (including missing parents) when it does
    not exist yet. Only regular files whose name ends in ``.pdf``
    (case-insensitive) are processed; sub-directories are not descended into.
    Each result is written as UTF-8 to ``<original stem>.txt`` in the output
    folder, replacing any file already there under that name.

    A failure on one file is reported on stdout and does not stop the
    remaining files from being processed.

    Args:
        input_folder: Path of the folder that holds the PDF files.
        output_folder: Path of the folder that receives the text files.

    Returns:
        None. Progress and errors are printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        source_path = os.path.join(input_folder, entry)

        # Guard clause: skip anything that is not a regular *.pdf file.
        if not os.path.isfile(source_path) or not entry.lower().endswith('.pdf'):
            continue

        try:
            extracted = extract_text(source_path)

            # Same base name as the PDF, with the extension swapped for .txt.
            stem, _extension = os.path.splitext(entry)
            txt_name = f"{stem}.txt"
            destination = os.path.join(output_folder, txt_name)

            with open(destination, 'w', encoding='utf-8') as handle:
                handle.write(extracted)

            print(f"Successfully converted {entry} to {txt_name}")
        except Exception as err:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error while processing {entry}: {err}")

In [4]:
# Example usage: convert every PDF in `input_folder` and store the
# resulting .txt files in `output_folder`.
input_folder = 'data_glec_pdf'  # Folder containing the source PDF files
output_folder = 'data_glec'  # Folder that receives the generated text files
convert_pdf_folder_to_text(
    input_folder=input_folder,
    output_folder=output_folder,
)

Successfully converted Clean_Cargo_-_2022_Global_Ocean_Container_Greenhouse_Gas_Emission_Intensities_2023-06.pdf to Clean_Cargo_-_2022_Global_Ocean_Container_Greenhouse_Gas_Emission_Intensities_2023-06.txt
Successfully converted BOOSTLOG_CloudReport_Final.pdf to BOOSTLOG_CloudReport_Final.txt
Successfully converted SFCExchangeNetwork_PoC_Evaluation_Report_2023.pdf to SFCExchangeNetwork_PoC_Evaluation_Report_2023.txt
Successfully converted Breakbulk_GHG_Emissions_Accounting_and_Reporting_Guidance_Version1Final.pdf to Breakbulk_GHG_Emissions_Accounting_and_Reporting_Guidance_Version1Final.txt
Successfully converted ASU-FEE-001-2_v02_Assurance_Fees_for_VVBs.pdf to ASU-FEE-001-2_v02_Assurance_Fees_for_VVBs.txt
Successfully converted ASU-PRO-005-2_v02_Disputes.pdf to ASU-PRO-005-2_v02_Disputes.txt
Successfully converted Data_exchange_of_GHG__Logistics_Emissions_Guidance_-_Jan2023_.pdf to Data_exchange_of_GHG__Logistics_Emissions_Guidance_-_Jan2023_.txt
Successfully converted Exploring_Susta