<a href="https://colab.research.google.com/github/dharmendra7/Extract-Text-from-Scanned-PDF/blob/main/pdf_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --no-cache-dir -r  requirements.txt
!pip install boto3



In [None]:

# Since pdf2image requires Poppler, we need to install it as well
!apt-get install poppler-utils
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import os

import boto3

# Do not hard-code AWS keys in the notebook. With no explicit keys, boto3 resolves
# credentials through its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# environment variables, the shared credentials file, or an attached role).
# The region is read from the environment; None lets boto3 fall back to its own config.
client = boto3.client(
    'textract',
    region_name=os.environ.get('AWS_REGION') or os.environ.get('AWS_DEFAULT_REGION'),
)

In [None]:
# To read the PDF
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images
import pytesseract
# To remove the additional created files
import os

In [None]:

def text_extraction(element):
    """Return the text of a layout element together with the formats used in it.

    Args:
        element: An iterable layout element exposing ``get_text()``.

    Returns:
        A tuple ``(line_text, format_per_line)``. ``format_per_line`` is a
        de-duplicated list mixing the font names and font sizes of every
        ``LTChar`` found inside the element's ``LTTextContainer`` children.
    """
    # Text of the whole element, exactly as the element reports it
    line_text = element.get_text()

    # For every character, record its font name followed by its font size
    collected_formats = [
        attribute
        for text_line in element
        if isinstance(text_line, LTTextContainer)
        for character in text_line
        if isinstance(character, LTChar)
        for attribute in (character.fontname, character.size)
    ]

    # Collapse to the unique names and sizes
    return (line_text, list(set(collected_formats)))

In [None]:
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    """Extract one table from one page of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        page_num: Zero-based index of the page in ``pdf.pages``.
        table_num: Zero-based index into the page's ``extract_tables()`` result.

    Returns:
        The selected table as returned by ``extract_tables()``.

    Raises:
        IndexError: If ``page_num`` or ``table_num`` is out of range.
    """
    # Use a context manager so the file handle is released even when indexing fails;
    # this function is called once per table, so an unclosed handle would leak each time.
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        return table_page.extract_tables()[table_num]

# Convert a table into a pipe-delimited string
def table_converter(table):
    """Render a table (a sequence of rows of cells) as text.

    Each row becomes ``|cell|cell|...|``; rows are separated by a line break
    with no trailing line break. ``None`` cells are written as the literal
    text ``None`` and line breaks inside a cell are replaced by a space.
    """
    def _clean(cell):
        # Missing cells are spelled out so the column count stays visible
        if cell is None:
            return 'None'
        # Wrapped text is flattened onto a single line
        if '\n' in cell:
            return cell.replace('\n', ' ')
        return cell

    rendered_rows = ['|' + '|'.join(_clean(cell) for cell in row) + '|' for row in table]
    return '\n'.join(rendered_rows)

# Check whether an element lies entirely within any of the page's tables
def is_element_inside_any_table(element, page, tables):
    """Return True when the element's box is contained in some table's box.

    ``element.bbox`` is ``(x0, y0, x1, y1)``; its vertical values are flipped
    against ``page.bbox[3]`` before being compared with each ``table.bbox``.
    """
    left, lower_y, right, upper_y = element.bbox
    # Flip the vertical axis: the element measures y from the bottom of the page,
    # the table boxes are compared in top-down coordinates
    top_edge = page.bbox[3] - upper_y
    bottom_edge = page.bbox[3] - lower_y
    return any(
        tx0 <= left <= right <= tx1 and ty0 <= top_edge <= bottom_edge <= ty1
        for tx0, ty0, tx1, ty1 in (table.bbox for table in tables)
    )

# Locate the table that contains a given element
def find_table_for_element(element, page, tables):
    """Return the index of the first table whose box contains the element.

    Returns ``None`` when no table contains it. The element's vertical
    coordinates are flipped against ``page.bbox[3]`` before the comparison.
    """
    x_min, lower_y, x_max, upper_y = element.bbox
    # Flip the vertical axis: the element measures y from the bottom of the page
    top_edge = page.bbox[3] - upper_y
    bottom_edge = page.bbox[3] - lower_y
    containing = (
        index
        for index, (tx0, ty0, tx1, ty1) in enumerate(table.bbox for table in tables)
        if tx0 <= x_min <= x_max <= tx1 and ty0 <= top_edge <= bottom_edge <= ty1
    )
    # First match wins; None signals "not inside any table"
    return next(containing, None)

In [None]:
# Crop an image element out of a PDF page
def crop_image(element, pageObj):
    """Restrict the page's media box to the element and save it as a one-page PDF.

    The page object is modified in place: its ``mediabox.lower_left`` is set to
    ``(element.x0, element.y1)`` and its ``mediabox.upper_right`` to
    ``(element.x1, element.y0)``. The page is then written to
    ``cropped_image.pdf`` in the working directory, overwriting any previous file.
    """
    # Corner coordinates of the element
    x_start, y_start, x_end, y_end = element.x0, element.y0, element.x1, element.y1
    # Shrink the visible area of the page to the element's box
    pageObj.mediabox.lower_left = (x_start, y_end)
    pageObj.mediabox.upper_right = (x_end, y_start)
    # Write the cropped page out as its own PDF
    writer = PyPDF2.PdfWriter()
    writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as out_file:
        writer.write(out_file)

# Render the first page of a PDF to a PNG file
def convert_to_images(input_file,):
    """Convert ``input_file`` to images and save the first one as ``PDF_image.png``.

    Only the first rendered page is kept; the output file is written to the
    working directory, overwriting any previous file of the same name.
    """
    first_page_image = convert_from_path(input_file)[0]
    first_page_image.save('PDF_image.png', 'PNG')

# Read the text contained in an image file
def image_to_text(image_path):
    """Send an image file to the module-level Textract ``client`` and return its text.

    Every block of type ``LINE`` in the response is printed and then appended
    to the result, each one preceded by a single space.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The detected lines joined into one string (empty when no line is found).
    """
    # Load the raw image bytes to send as the document payload
    with open(image_path, 'rb') as image_file:
        payload = bytearray(image_file.read())

    response = client.detect_document_text(Document={'Bytes': payload})

    pieces = []
    for block in response["Blocks"]:
        # Only LINE blocks contribute to the text
        if block["BlockType"] != "LINE":
            continue
        print (block["Text"])
        pieces.append(" " + block["Text"])
    return "".join(pieces)

In [None]:
# Path of the PDF to process, relative to the notebook's working directory
pdf_path = 'image-in-between-page.pdf'

# Open the PDF in binary mode. The handle is kept open on purpose: the reader
# below is built on it, and it is closed explicitly in a later cell (pdfFileObj.close()).
pdfFileObj = open(pdf_path, 'rb')
# PyPDF2 reader over the open handle; its pages are the ones passed to crop_image
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

In [None]:
# Results per page, keyed 'Page_<n>'. Each value is the list
# [page_text, line_format, text_from_images, text_from_tables, page_content].
text_per_page = {}
# Becomes True once an image was cropped, so the temporary files can be removed afterwards
image_flag = False

# Open the PDF once for table detection and close it when the loop is done,
# instead of opening a new, never-closed handle for every page.
with pdfplumber.open(pdf_path) as pdf:
    # Walk the layout of each page
    for pagenum, page in enumerate(extract_pages(pdf_path)):

        # Per-page accumulators
        pageObj = pdfReaded.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []

        # Tables detected on this page
        tables = pdf.pages[pagenum].find_tables()
        # Index of the next table to emit, or -1 when the page has no tables
        table_in_page = 0 if len(tables) != 0 else -1

        # Convert every table of the page to its string form up front
        for table_num in range(len(tables)):
            table = extract_table(pdf_path, pagenum, table_num)
            text_from_tables.append(table_converter(table))

        # Order the elements from the top of the page to the bottom
        # (larger y1 first; the sort is stable for elements with the same y1)
        page_elements = sorted(page, key=lambda element: element.y1, reverse=True)

        for element in page_elements:

            # Elements inside a table are represented by the table string, which is
            # emitted once, when the first element of the expected table is reached
            if table_in_page != -1 and is_element_inside_any_table(element, page, tables):
                table_found = find_table_for_element(element, page, tables)
                if table_found is not None and table_found == table_in_page:
                    page_content.append(text_from_tables[table_in_page])
                    page_text.append('table')
                    line_format.append('table')
                    table_in_page += 1
                # The content of this element was already extracted with the table
                continue

            # Text element: keep its text and its formats
            if isinstance(element, LTTextContainer):
                (line_text, format_per_line) = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)

            # Figure element: crop it, render it to PNG and read the text in it
            if isinstance(element, LTFigure):
                crop_image(element, pageObj)
                convert_to_images('cropped_image.pdf')
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Placeholders keep page_text and line_format aligned with page_content
                page_text.append('image')
                line_format.append('image')
                image_flag = True

        # Store the page results. page_content is included as the fifth item so that
        # text_per_page['Page_<n>'][4] is the full page content in reading order.
        dctkey = 'Page_' + str(pagenum)
        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]

page_elements = [(768.4368999999999, <LTTextBoxHorizontal(0) 56.693,688.437,539.413,768.437 'DrylabNews\n'>), (699.4734, <LTTextBoxHorizontal(1) 336.849,686.473,538.583,699.473 'for investors & friends · May 2017\n'>), (397.69550000000004, <LTTextBoxHorizontal(2) 56.693,234.495,289.001,397.696 "Welcome to our first newsletter of 2017! It's\nbeen a while since the last one, and a lot has\nhappened. We promise to keep them coming\nevery two months hereafter, and permit\nourselves to make this one rather long. The\nbig news is the beginnings of our launch in\nthe American market, but there are also\ninteresting updates on sales, development,\nmentors and (of course) the investment\nround that closed in January.\n">), (221.29549999999998, <LTTextBoxHorizontal(3) 56.693,192.153,282.125,221.295 'New capital: The investment round was\nsuccessful. We raised 2.13 MNOK to match\n'>), (397.69550000000004, <LTTextBoxHorizontal(4) 303.638,284.896,532.478,397.696 'the 2.05 MNOK loan from Innovation\

In [None]:
# Close the pdf file object
pdfFileObj.close()

In [None]:
# Delete the temporary files created while processing images.
# The existence check keeps the cell safe to re-run after the files are already gone.
if image_flag:
    for temp_file in ('cropped_image.pdf', 'PDF_image.png'):
        if os.path.exists(temp_file):
            os.remove(temp_file)

In [None]:
print(text_per_page)

{'Page_0': [['DrylabNews\n', 'for investors & friends · May 2017\n', 'image', "Welcome to our first newsletter of 2017! It's\nbeen a while since the last one, and a lot has\nhappened. We promise to keep them coming\nevery two months hereafter, and permit\nourselves to make this one rather long. The\nbig news is the beginnings of our launch in\nthe American market, but there are also\ninteresting updates on sales, development,\nmentors and (of course) the investment\nround that closed in January.\n", 'the 2.05 MNOK loan from Innovation\nNorway. Including the development\nagreement with Filmlance International, the\ntotal new capital is 5 MNOK, partly tied to\nthe successful completion of milestones. All\nformalities associated with this process are\nnow finalized.\n', 'New owners: We would especially like to\nwarmly welcome our new owners to the\nDrylab family: Unni Jacobsen, Torstein Jahr,\nSuzanne Bolstad, Eivind Bergene, Turid Brun,\nVigdis Trondsen, Lea Blindheim, Kristine\n', 'New 

In [None]:
# Display the content of the page
result = ''.join(text_per_page['Page_0'][4])
print(result)

DrylabNews
for investors & friends · May 2017
Welcome to our first newsletter of 2017! It's
been a while since the last one, and a lot has
happened. We promise to keep them coming
every two months hereafter, and permit
ourselves to make this one rather long. The
big news is the beginnings of our launch in
the American market, but there are also
interesting updates on sales, development,
mentors and (of course) the investment
round that closed in January.
the 2.05 MNOK loan from Innovation
Norway. Including the development
agreement with Filmlance International, the
total new capital is 5 MNOK, partly tied to
the successful completion of milestones. All
formalities associated with this process are
now finalized.
New owners: We would especially like to
warmly welcome our new owners to the
Drylab family: Unni Jacobsen, Torstein Jahr,
Suzanne Bolstad, Eivind Bergene, Turid Brun,
Vigdis Trondsen, Lea Blindheim, Kristine
New capital: The investment round was
successful. We raised 2.13 MNOK t

### Using PaddlePaddle OCR

In [None]:
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import os

def extract_text_from_pdf(pdf_path):
    """Extract text, tables and OCR text from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A dict keyed by zero-based page number. Each value is a dict with:
        ``'text'`` (the page text, or ``"No text found."`` when there is none),
        ``'tables'`` (one string per table, cells joined by ``' | '`` and rows
        by line breaks) and ``'image_text'`` (a one-item list holding the OCR
        result of the rendered page).
    """
    # Results for each page
    text_per_page = {}

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        print(f"Total number of pages: {total_pages}")

        for page_number, page in enumerate(pdf.pages):
            # Plain text of the page
            page_text = page.extract_text() or "No text found."

            # Tables: a cell may be None, which str.join cannot handle, so a
            # missing cell is written as an empty string
            page_tables = []
            for table in page.extract_tables():
                cleaned_table = [
                    ' | '.join('' if cell is None else str(cell) for cell in row)
                    for row in table
                ]
                page_tables.append('\n'.join(cleaned_table))

            # OCR: render the page to a temporary PNG and run Tesseract on it
            temp_image_path = f'temp_image_{page_number}.png'
            page.to_image().save(temp_image_path)
            try:
                # Close the image handle before the file is deleted
                with Image.open(temp_image_path) as page_image:
                    image_text = [pytesseract.image_to_string(page_image)]
            finally:
                # Remove the temporary file even when OCR fails
                os.remove(temp_image_path)

            # Store the results of this page
            text_per_page[page_number] = {
                'text': page_text,
                'tables': page_tables,
                'image_text': image_text
            }

    return text_per_page

# Example usage: run the extraction on one PDF and print the results page by page.
# NOTE(review): the path is an absolute Colab path; adjust it when running elsewhere.
pdf_path = '/content/removed_removed.pdf'
extracted_content = extract_text_from_pdf(pdf_path)
# Keys are zero-based page numbers, so 1 is added for display
for page, content in extracted_content.items():
    print(f"Page {page + 1}:")
    print("Text:", content['text'])
    print("Tables:", content['tables'])
    print("Image Text:", content['image_text'])
    print("\n")


In [None]:
!pip install paddlepaddle-gpu

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.1-cp310-cp310-manylinux1_x86_64.whl (758.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.9/758.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from paddlepaddle-gpu)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting httpcore==1.* (from httpx->paddlepaddle-gpu)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->paddlepaddle-gpu)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m9.8 MB/s[0m

In [None]:
!pip install "paddleocr>=2.0.1"

Collecting paddleocr>=2.0.1
  Downloading paddleocr-2.8.0-py3-none-any.whl (407 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.3/407.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyclipper (from paddleocr>=2.0.1)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lmdb (from paddleocr>=2.0.1)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting rapidfuzz (from paddleocr>=2.0.1)
  Downloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0

In [None]:
!paddleocr --image_dir /content/ss.png --use_angle_cls true --lang en

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar
100% 4.00M/4.00M [00:08<00:00, 483kiB/s] 
download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar
100% 10.2M/10.2M [00:10<00:00, 956kiB/s] 
download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar
100% 2.19M/2.19M [00:06<00:00, 313kiB/s]
[2024/07/16 10:30:12] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir='/content/ss.png', page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_si

In [None]:
from paddleocr import PaddleOCR,draw_ocr
from PIL import Image

# Paddleocr supports Chinese, English, French, German, Korean and Japanese.
# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
# to switch the language model in order.
ocr = PaddleOCR(use_angle_cls=True,use_gpu=False, lang='en') # need to run only once to download and load model into memory
img_path = '/content/ss.png'
result = ocr.ocr(img_path, cls=True)

# Print every recognized line. A per-image entry can be None when nothing was
# recognized (the later cells of this notebook guard against the same case),
# so such entries are skipped instead of raising TypeError.
for res in result or []:
    if res is None:
        continue
    for line in res:
        print(line)


# draw result
# Lines of the first (only) image; empty when nothing was recognized
ocr_lines = (result[0] if result else None) or []
image = Image.open(img_path).convert('RGB')
# Each line is [box, (text, score)]
boxes = [line[0] for line in ocr_lines]
txts = [line[1][0] for line in ocr_lines]
scores = [line[1][1] for line in ocr_lines]
im_show = draw_ocr(image, boxes, txts, scores, font_path='/content/sample_data/Roboto-Black.ttf')
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')

[2024/07/16 10:40:22] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

In [None]:
import os

# Check and list fonts available in the 'dejavu' directory
font_dir = '/usr/share/fonts/truetype/dejavu'
available_fonts = os.listdir(font_dir)
print("Available fonts:", available_fonts)

# Choose a font (making sure it exists)
font_path = os.path.join(font_dir, 'DejaVuSans.ttf')  # Adjust filename if necessary

# Continue with your OCR process
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image

# Initialize OCR model
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Path to your image
img_path = '/content/105032.png'

# Perform OCR
result = ocr.ocr(img_path, cls=True)

# Lines of the first (only) image; empty when nothing was recognized, so the
# comprehensions below do not fail on a None entry
ocr_lines = (result[0] if result else None) or []

# Process results for drawing; each line is [box, (text, score)]
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in ocr_lines]
txts = [line[1][0] for line in ocr_lines]
scores = [line[1][1] for line in ocr_lines]

# Draw OCR results with the chosen font. The font must be passed explicitly,
# otherwise draw_ocr uses its own default font path and the choice above is ignored.
im_show = draw_ocr(image, boxes, txts, scores, font_path=font_path)
im_show = Image.fromarray(im_show)
im_show.save('/content/result.jpg')

# Display the result
display(im_show)


In [None]:
!pip install PyMuPDF PyPDF2 pytesseract pillow

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytesseract, PyPDF2, PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6 PyPDF2-3.0.1 pytesseract-0.3.10


In [None]:
import fitz  # PyMuPDF
from paddleocr import PaddleOCR,draw_ocr
from PIL import Image
import io

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def is_image_only_page(page):
    """Tell whether a page has no text but at least one image.

    Args:
        page: A page object exposing ``get_text()`` and ``get_images(full=True)``.

    Returns:
        False when the page text contains anything other than whitespace;
        otherwise True if the page lists at least one image, else False.
    """
    # Any non-whitespace text means the page is not image-only
    page_has_text = page.get_text().strip() != ""
    if page_has_text:
        return False

    # No text: the page counts as a scanned image when it holds an image
    return bool(page.get_images(full=True))

def extract_text_from_image(image):
    """Run the module-level PaddleOCR instance on an image and return its text.

    Args:
        image: An image object exposing ``save(fileobj, format=...)``.

    Returns:
        The recognized lines joined by line breaks, or an empty string when
        nothing was recognized.
    """
    # Encode the image as PNG in an in-memory buffer (no file is written)
    with io.BytesIO() as output:
        image.save(output, format="PNG")
        img_bytes = output.getvalue()

    # Perform OCR using PaddleOCR
    result = ocr.ocr(img_bytes, cls=True)

    # Nothing came back at all: `None in result` would raise TypeError here
    if not result:
        return ""

    # Skip only the entries that are None instead of discarding every line,
    # then take the text part of each [box, (text, score)] line
    recognized_lines = [
        line[1][0]
        for res in result
        if res is not None
        for line in res
    ]
    return "\n".join(recognized_lines).strip()

def analyze_pdf(pdf_path):
    """Classify every page of a PDF and collect its text.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of ``(page_num, page_type, content)`` tuples, one per page, with
        a zero-based ``page_num``. ``page_type`` is ``"Scanned Image"`` (content
        is the OCR text of the first image of the page), ``"Plain Text"``
        (content is the page text) or ``"Unknown (No text or images detected)"``
        (content is empty).
    """
    pages_info = []

    # The context manager closes the document even when a page fails
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            print(f"{page_num = }")
            page = doc.load_page(page_num)

            if is_image_only_page(page):
                # is_image_only_page guarantees the list is not empty here;
                # only the first image of the page is read
                xref = page.get_images(full=True)[0][0]
                base_image = doc.extract_image(xref)
                with Image.open(io.BytesIO(base_image["image"])) as image:
                    # Extract text from the image using PaddleOCR
                    extracted_text = extract_text_from_image(image)
                pages_info.append((page_num, "Scanned Image", extracted_text))
                continue

            text = page.get_text()
            if text.strip():
                pages_info.append((page_num, "Plain Text", text))
            else:
                # Neither text nor images. This case used to be unreachable, so
                # empty pages were reported as "Plain Text".
                pages_info.append((page_num, "Unknown (No text or images detected)", ""))

    return pages_info

# Example usage: analyze one PDF and print the type and content of every page.
# NOTE(review): the path is an absolute Colab path; adjust it when running elsewhere.
pdf_path = "/content/History_of_Asia.pdf"
pages_info = analyze_pdf(pdf_path)

# page_num is zero-based, so 1 is added for display
for page_num, page_type, content in pages_info:
    print(f"Page {page_num + 1}: {page_type}")
    print("Content:")
    print(content)
    print("-" * 50)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
especially under the leadership of Huang Taiji, who changed the Jurchen
name into Manchurians, adopted numerous Chinese-style mannerisms of
ruling and anointed himself as Emperor of the Qing dynasty in 1636 [which.
ruled what we today would call northeastern China].
o Huang died in 1643 and passed power on to his five year old son [with
his brother Dorgon acting as regent] just as Li Zicheng and his armies
were gaining strength to Beijing's south.
 After the Ming Emperor hanged himself upon learning of Li Zicheng's army
entering Beijing, a Ming general, Wu Sangui, held the last stronghold to
Beijing's northeast, preventing the Manchurians from sweeping down upon
the capital as well
o Li Zicheng made the mistake of executing Wu Sangui's father, who
 happened to be in Beijing, which encouraged Wu to join forces with.
the Manchurians, who, in 1644, swept down into Beijing and forced Li
Zicheng from the city
 Thus, the Manchu

In [None]:
import fitz  # PyMuPDF
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import io

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def is_image_only_page(page):
    """Report whether a page carries images but no text.

    The page text is checked first; the image list is only queried when the
    text is empty or whitespace. Returns a bool in every case.
    """
    text_is_blank = not page.get_text().strip()
    # Short-circuit: pages with text never reach the image lookup
    return text_is_blank and bool(page.get_images(full=True))

def extract_text_from_image(image):
    """OCR an image with the module-level PaddleOCR instance.

    Returns the recognized lines separated by line breaks (surrounding
    whitespace stripped), or the message ``"No text found or OCR failed"``
    when the OCR call returns nothing usable.
    """
    # Encode the image as PNG in an in-memory buffer
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()

    # Perform OCR using PaddleOCR
    result = ocr.ocr(png_bytes, cls=True)
    if not (result and isinstance(result, list)):
        return "No text found or OCR failed"

    # Keep the text of every line, skipping empty entries at both levels
    fragments = [
        line[1][0] + "\n"
        for page_result in result
        if page_result is not None
        for line in page_result
        if line is not None
    ]
    return "".join(fragments).strip()

def analyze_pdf(pdf_path):
    """Classify every page of a PDF and collect its text.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of ``(page_num, page_type, content)`` tuples, one per page, with
        a zero-based ``page_num``. ``page_type`` is ``"Scanned Image"`` (content
        is the OCR text of the first image of the page), ``"Plain Text"``
        (content is the page text) or ``"Unknown (No text or images detected)"``
        (content is empty).
    """
    pages_info = []

    # The context manager closes the document even when a page fails
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            if is_image_only_page(page):
                # is_image_only_page guarantees the list is not empty here;
                # only the first image of the page is read
                xref = page.get_images(full=True)[0][0]
                base_image = doc.extract_image(xref)
                with Image.open(io.BytesIO(base_image["image"])) as image:
                    # Extract text from the image using PaddleOCR
                    extracted_text = extract_text_from_image(image)
                pages_info.append((page_num, "Scanned Image", extracted_text))
                continue

            text = page.get_text()
            if text.strip():
                pages_info.append((page_num, "Plain Text", text))
            else:
                # Neither text nor images. This case used to be unreachable, so
                # empty pages were reported as "Plain Text".
                pages_info.append((page_num, "Unknown (No text or images detected)", ""))

    return pages_info

def write_output_to_file(output_path, pages_info):
    """Write the per-page results to a UTF-8 text file.

    Args:
        output_path: Path of the file to create or overwrite.
        pages_info: Iterable of ``(page_num, page_type, content)`` tuples with
            a zero-based ``page_num``.

    For each page the file receives a ``Page <n>: <type>`` header (one-based),
    a ``Content:`` line, the content, and a rule of 50 dashes.
    """
    rule = "-" * 50
    with open(output_path, 'w', encoding='utf-8') as report:
        for record in pages_info:
            page_num, page_type, content = record
            header = f"Page {page_num + 1}: {page_type}\n"
            report.write(header)
            report.write("Content:\n")
            report.write(content + "\n")
            report.write(rule + "\n")

# Example usage: analyze one PDF and write the per-page results to a text file.
# NOTE(review): the input is an absolute Colab path, and the output name
# ("drylab.txt") does not match the input document — confirm both before running.
pdf_path = "/content/History_of_Asia.pdf"
output_path = "drylab.txt"
pages_info = analyze_pdf(pdf_path)
write_output_to_file(output_path, pages_info)


ModuleNotFoundError: No module named 'fitz'