<a href="https://colab.research.google.com/github/eleven111101/GenAI_JPN11/blob/main/pages_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python packages used by this notebook; %pip installs into the running kernel's environment.
%pip install opencv-python-headless pytesseract pdf2image pandas openpyxl
# Tesseract OCR engine driven by pytesseract; -y answers apt's confirmation prompt non-interactively.
!sudo apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
# Poppler command-line tools needed by pdf2image; -y answers apt's confirmation prompt non-interactively.
!sudo apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.4).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import pandas as pd
import cv2
import pytesseract
from pdf2image import convert_from_path
import os
import re

# Add Poppler binaries to the PATH.
# Guarded so that re-running the cell does not keep appending duplicate entries,
# and tolerant of PATH being unset.
if "/usr/bin" not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + "/usr/bin"

def extract_data_with_keyword(pdf_path, keyword):
    """Return the OCR text of every page of a PDF that mentions ``keyword``.

    Each page is rendered at 300 DPI and passed straight to Tesseract; no
    temporary image files are written, so nothing is left behind if OCR fails.

    Matching is case-insensitive and tolerant of layout differences: runs of
    whitespace (including line breaks) are collapsed to a single space and
    Unicode dash variants are treated as a plain hyphen, so a multi-word
    keyword still matches when the OCR output wraps it across lines.

    Args:
        pdf_path: Path to the PDF file to scan.
        keyword: Text to look for on each page.

    Returns:
        A list of ``{'Page': <1-based page number>, 'Text': <stripped OCR text>}``
        dicts, one per matching page, in page order.
    """
    def _normalise(text):
        # Map hyphen/dash look-alikes to '-' so a typed en dash matches OCR output.
        for dash in ("\u2010", "\u2011", "\u2012", "\u2013", "\u2014", "\u2212"):
            text = text.replace(dash, "-")
        # Lower-case and collapse every whitespace run (spaces, tabs, newlines).
        return " ".join(text.lower().split())

    needle = _normalise(keyword)
    extracted_data = []

    # Convert PDF pages to images using pdf2image (second argument is the DPI)
    pages = convert_from_path(pdf_path, 300)

    for page_number, page in enumerate(pages, start=1):
        # The page image is handed to pytesseract as-is.
        # --psm 6: treat the page as a single uniform block of text.
        img_text = pytesseract.image_to_string(page, config='--psm 6')

        if needle in _normalise(img_text):
            extracted_data.append({
                'Page': page_number,
                'Text': img_text.strip()
            })

    return extracted_data

def _excel_names_for_keyword(keyword):
    """Build the output file name and worksheet title for ``keyword``.

    Keywords that are already valid produce exactly ``"<keyword>_data.xlsx"``
    and ``"<Keyword> Data"``. Otherwise characters that are not allowed in file
    names are replaced with ``_``, and the sheet title is stripped of the
    characters Excel rejects (``[ ] : * ? / \\``) and shortened to fit Excel's
    31-character limit.
    """
    file_stem = re.sub(r'[\\/:*?"<>|]+', "_", keyword) or "keyword"
    suffix = " Data"
    title = re.sub(r"[\[\]:*?/\\]+", "_", keyword.capitalize())
    # Leave room for the suffix inside the 31-character sheet-title limit.
    title = title[:31 - len(suffix)]
    return f"{file_stem}_data.xlsx", f"{title}{suffix}"


try:
    # Ask the user for the input PDF file and keyword
    pdf_path = input("Enter the path to the PDF file: ")
    keyword = input("Enter the keyword to search for: ")

    # Check if the file exists
    if not os.path.isfile(pdf_path):
        print("Error: The specified file does not exist.")
    else:
        # Extract data related to the keyword from the PDF
        extracted_data = extract_data_with_keyword(pdf_path, keyword)

        if extracted_data:
            # Create a Pandas DataFrame from the extracted data
            df = pd.DataFrame(extracted_data)

            # Derive names that the filesystem and Excel will both accept
            output_file, sheet_name = _excel_names_for_keyword(keyword)
            with pd.ExcelWriter(output_file) as writer:
                # Write data to Excel
                df.to_excel(writer, sheet_name=sheet_name, index=False)

            print(f"Data related to '{keyword}' extracted and saved to {output_file}.")
        else:
            print(f"No data related to '{keyword}' found in the PDF.")

except Exception as e:
    # Report any failure (unreadable PDF, OCR error, write error) instead of a traceback
    print(f"An error occurred: {e}")


Enter the path to the PDF file: /content/ASSESSMENT PROTOCOL.pdf
Enter the keyword to search for: ASSESSMENT PROTOCOL – SAFETY ASSIST
No data related to 'ASSESSMENT PROTOCOL – SAFETY ASSIST' found in the PDF.


In [None]:
import pandas as pd
import cv2
import pytesseract
from pdf2image import convert_from_path
import os
import re

# Add Poppler binaries to the PATH.
# Guarded so that re-running the cell does not keep appending duplicate entries,
# and tolerant of PATH being unset.
if "/usr/bin" not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + "/usr/bin"

def _outline_paths(lines, stack):
    """Convert indented text lines into dotted outline paths.

    ``stack`` holds ``(indent, text)`` pairs for the currently open ancestors
    and is updated in place, so nesting carries over between calls.
    Blank lines are skipped.
    """
    paths = []
    for raw_line in lines:
        text = raw_line.strip()
        if not text:
            continue

        # Measure leading whitespace with tabs expanded so tabs and spaces compare.
        expanded = raw_line.expandtabs()
        indent = len(expanded) - len(expanded.lstrip())

        # Close every entry that is at the same depth or deeper than this line;
        # what remains on the stack are this line's ancestors.
        while stack and stack[-1][0] >= indent:
            stack.pop()
        stack.append((indent, text))

        paths.append('.'.join(entry for _, entry in stack))
    return paths


def extract_content_structure(pdf_path):
    """OCR a PDF and describe each text line by its position in the indent hierarchy.

    Every page is rendered at 300 DPI and passed directly to Tesseract (no
    temporary image files are needed). Each non-blank OCR line becomes one
    entry consisting of the line's ancestors and the line itself joined with
    ``'.'``; a line is nested under the closest preceding line that has less
    leading whitespace. Nesting state carries over from one page to the next.

    NOTE(review): the hierarchy relies on the OCR output retaining leading
    whitespace; confirm that the Tesseract configuration in use does so.

    Args:
        pdf_path: Path to the PDF file to analyse.

    Returns:
        A list of dotted path strings, one per non-blank OCR line, in reading order.
    """
    # Convert PDF pages to images using pdf2image (second argument is the DPI)
    pages = convert_from_path(pdf_path, 300)

    content_structure = []
    open_levels = []

    for page in pages:
        # --psm 6: treat the page as a single uniform block of text.
        img_text = pytesseract.image_to_string(page, config='--psm 6')

        # splitlines() (without a prior strip) keeps the first line's indentation.
        content_structure.extend(_outline_paths(img_text.splitlines(), open_levels))

    return content_structure

def extract_content_with_number(pdf_path, content_number):
    """Return the document outline from the first entry that begins with ``content_number``.

    The outline comes from ``extract_content_structure(pdf_path)``. The first
    entry whose text starts with ``content_number`` and every entry after it
    are returned; an empty list is returned when no entry matches.
    """
    outline = extract_content_structure(pdf_path)

    # Locate the first matching entry, then hand back it and everything that follows.
    for position, entry in enumerate(outline):
        if entry.startswith(content_number):
            return list(outline[position:])

    return []

try:
    # Collect the PDF location and the content number to look up
    pdf_path = input("Enter the path to the PDF file: ")
    content_number = input("Enter the content number to search for (e.g., 3): ")

    if os.path.isfile(pdf_path):
        # Pull out everything from the first entry that starts with the given number
        extracted_content = extract_content_with_number(pdf_path, content_number)

        if not extracted_content:
            print(f"No content starting with '{content_number}' found in the PDF.")
        else:
            print(f"Content starting with '{content_number}' extracted:")
            for content in extracted_content:
                print(content)
    else:
        # Nothing to do when the path does not point at an existing file
        print("Error: The specified file does not exist.")

except Exception as e:
    # Report any failure as a single message instead of a traceback
    print(f"An error occurred: {e}")


Enter the path to the PDF file: /content/ASSESSMENT PROTOCOL.pdf
Enter the content number to search for (e.g., 3): 3
No content starting with '3' found in the PDF.


In [None]:
import pandas as pd
import cv2
import pytesseract
from pdf2image import convert_from_path
import os

# Add Poppler binaries to the PATH.
# Guarded so that re-running the cell does not keep appending duplicate entries,
# and tolerant of PATH being unset.
if "/usr/bin" not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + "/usr/bin"

def extract_text_and_images(pdf_path, start_page, end_page):
    """OCR a range of PDF pages and return their text together with the page images.

    Each page in ``start_page``..``end_page`` is rendered at 300 DPI, written
    to a PNG inside a private temporary directory, read back with OpenCV,
    converted to grayscale, binarised with a fixed threshold of 150 and then
    passed to Tesseract. The temporary directory (and every PNG in it) is
    removed when the function finishes, including when an error is raised.

    Args:
        pdf_path: Path to the PDF file.
        start_page: First page to process; also used as the number of the
            first returned page.
        end_page: Last page to process.

    Returns:
        A list with one dict per successfully read page:
        ``'Page'`` (page number), ``'Text'`` (stripped OCR text) and
        ``'Image'`` (the encoded-PNG buffer returned by ``cv2.imencode``).
    """
    import tempfile  # local import: only this function needs a scratch directory

    extracted_data = []

    # Convert PDF pages to images using pdf2image (DPI 300, restricted to the page range)
    pages = convert_from_path(pdf_path, 300, first_page=start_page, last_page=end_page)

    # A private scratch directory avoids clashes with files in the working
    # directory and guarantees clean-up even if OCR fails part-way through.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for page_number, page in enumerate(pages, start=start_page):
            temp_image_path = os.path.join(scratch_dir, f"temp_page_{page_number}.png")
            page.save(temp_image_path, "PNG")

            # Read the saved image using OpenCV; imread returns None when it cannot decode the file
            img = cv2.imread(temp_image_path)
            if img is None:
                continue

            # Grayscale + fixed binary threshold as OCR preprocessing
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

            # --psm 6: treat the page as a single uniform block of text.
            img_text = pytesseract.image_to_string(thresh, config='--psm 6')

            extracted_data.append({
                'Page': page_number,
                'Text': img_text.strip(),
                # imencode returns (success flag, buffer); keep the buffer of the original colour image
                'Image': cv2.imencode('.png', img)[1]
            })

    return extracted_data

try:
    # Ask the user for the input PDF file
    pdf_path = input("Enter the path to the PDF file: ")

    # Check if the file exists
    if not os.path.isfile(pdf_path):
        print("Error: The specified file does not exist.")
    else:
        # Ask for the start and end page numbers
        start_page = int(input("Enter the start page number: "))
        end_page = int(input("Enter the end page number: "))

        # Reject ranges that cannot describe real pages. The start page is also
        # used to number the results, so a value below 1 would mislabel pages.
        if start_page < 1 or end_page < start_page:
            print("Error: Page numbers must satisfy 1 <= start page <= end page.")
        else:
            # Extract text and images from the specified page range
            extracted_data = extract_text_and_images(pdf_path, start_page, end_page)

            if not extracted_data:
                # Do not write an empty workbook and claim success
                print("No pages could be extracted from the specified page range.")
            else:
                # Create a Pandas DataFrame from the extracted data
                df = pd.DataFrame(extracted_data)

                # Create a Pandas ExcelWriter object
                with pd.ExcelWriter('extracted_data.xlsx') as writer:
                    # Write data to Excel
                    df.to_excel(writer, sheet_name='Extracted_Data', index=False)

                print("Data extracted and saved to extracted_data.xlsx.")

except Exception as e:
    # Covers non-numeric page input as well as PDF/OCR/Excel failures
    print(f"An error occurred: {e}")


Enter the path to the PDF file: /content/euro-ncap-assessment-protocol-sa-v903.pdf
Enter the start page number: 2
Enter the end page number: 4
Data extracted and saved to extracted_data.xlsx.
