In [23]:
from fitz import FileDataError
import os
from io import BytesIO
from PIL import Image

import fitz

# Directory containing the source PDF files
directory = '/Users/standard/PDF'

# Cover images are written below 'covers/'; create it up front so that
# image.save() cannot fail on a missing directory.
os.makedirs('covers', exist_ok=True)

# Render the first page of every PDF that does not have a cover image yet
for filename in os.listdir(directory):
    out_filename = f'covers/{filename}.png'
    # Skip non-PDF entries and PDFs whose cover was rendered by an earlier run
    if not filename.endswith('.pdf') or os.path.exists(out_filename):
        continue
    try:
        # The context manager closes the document even when rendering fails,
        # so file handles are not leaked while walking a large directory.
        with fitz.open(os.path.join(directory, filename)) as pdf_document:
            # The first page is used as the cover; indexing raises IndexError
            # when the document has no page 0.
            page = pdf_document[0]
            try:
                # Rasterise the page and hand the raw samples to Pillow
                pix = page.get_pixmap()
                image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
                image.save(out_filename)
            except RuntimeError:
                # Rendering or saving failed for this file; keep going
                print(f"ERROR {filename}")
    except FileDataError as err:
        # fitz could not read the file as a document; report it instead of
        # skipping silently so that missing covers can be explained.
        print(f"SKIPPED {filename}: unreadable PDF ({err})")
    except IndexError:
        print(f"SKIPPED {filename}: document has no pages")

print('Done!')


Done!


In [4]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.21.1-cp39-cp39-macosx_11_0_arm64.whl (12.5 MB)
     |████████████████████████████████| 12.5 MB 10.7 MB/s            
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.21.1
You should consider upgrading via the '/Users/standard/SRC/BooksML/venv/bin/python -m pip install --upgrade pip' command.[0m


In [27]:
import os
import zipfile
from io import BytesIO
from PIL import Image

# Directory containing the source EPUB files
directory = '/Users/standard/EPUB'

# Cover images are written below 'covers/'; make sure it exists before saving.
os.makedirs('covers', exist_ok=True)

# Extract the embedded cover image from every EPUB in the directory
for filename in os.listdir(directory):
    epub_path = os.path.join(directory, filename)
    # isfile() needs the full path: a bare filename would be resolved against
    # the current working directory and practically never match. The check
    # also filters out directories whose name ends in '.epub'.
    if not (filename.endswith('.epub') and os.path.isfile(epub_path)):
        continue

    try:
        # An EPUB is a zip archive; look for the cover among its members
        with zipfile.ZipFile(epub_path, 'r') as epub:
            # First archive member whose name ends in 'cover.jpg', if any
            cover_image_file = next(
                (info for info in epub.infolist() if info.filename.endswith('cover.jpg')),
                None,
            )
            if cover_image_file is None:
                # No recognisable cover in this book
                continue
            # Read the cover image from the archive as raw bytes
            cover_image_bytes = epub.read(cover_image_file)
    except zipfile.BadZipFile:
        # Report corrupt archives instead of skipping them silently
        print(f"{filename} is not a valid zip archive")
        continue

    try:
        # Decode the bytes and re-encode the image as PNG
        cover_image = Image.open(BytesIO(cover_image_bytes))
        cover_image.save(f'covers/{filename}.png')
    except OSError:
        print(f"{filename} is in wrong colorspace")


print('Done!')



Done!


In [28]:
import os
from PIL import Image
import mutagen

# BytesIO is used below but is not part of this cell's import block; importing
# it here keeps the cell runnable on a fresh kernel.
from io import BytesIO

# Directory containing the source M4B audiobook files
directory = '/Users/standard/M4B'

# Cover images are written below 'covers/'; make sure it exists before saving.
os.makedirs('covers', exist_ok=True)

# Extract the embedded cover art from every M4B file in the directory
for filename in os.listdir(directory):
    if not filename.endswith('.m4b'):
        continue

    try:
        m4b_file = mutagen.File(os.path.join(directory, filename))
    except mutagen.MutagenError as err:
        # The file could not be opened or parsed; report it and keep going
        print(f"SKIPPED {filename}: {err}")
        continue

    # mutagen.File returns None when it cannot identify the file type, which
    # would make the membership test below raise TypeError.
    if m4b_file is None:
        print(f"SKIPPED {filename}: unrecognised audio format")
        continue

    # Only files carrying a non-empty cover art tag can yield an image
    if 'covr' not in m4b_file or not m4b_file['covr']:
        continue

    # The first entry of the tag is used as the cover
    cover_art = m4b_file['covr'][0]
    try:
        # Decode the embedded image bytes and re-encode them as PNG
        cover_image = Image.open(BytesIO(cover_art))
        cover_image.save(f'covers/{filename}.png')
    except OSError as err:
        # Pillow could not decode or write the image
        print(f"SKIPPED {filename}: unusable cover art ({err})")

print('Done!')


Done!


In [29]:
import os
import pytesseract
import pandas as pd

# Directory containing the cover images (PNG files)
directory = 'covers/'

# Run Tesseract OCR on every PNG cover and store the result next to the image
for filename in os.listdir(directory):
    if not filename.endswith('.png'):
        continue

    # Perform OCR on the PNG file; the result is a pandas DataFrame
    image_path = os.path.join(directory, filename)
    ocr_data = pytesseract.image_to_data(image_path, output_type='data.frame')

    # Keep the recognised text, its layout level, the confidence and the
    # bounding-box height (usable as a rough proxy for font size)
    text_data = ocr_data[['text', 'level', 'conf', 'height']]

    # Remove rows without recognised text. astype(bool) alone is not enough:
    # NaN is truthy, and so is a whitespace-only string, so both would survive.
    has_text = text_data['text'].notna() & text_data['text'].astype(str).str.strip().ne('')
    text_data = text_data[has_text]

    # Save the filtered OCR data as CSV alongside the image
    text_data.to_csv(os.path.join(directory, f'{filename}.csv'), index=False)

print('Done!')


Done!


In [30]:
!python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple


Looking in indexes: https://mirror.baidu.com/pypi/simple
Collecting paddlepaddle
  Downloading https://mirror.baidu.com/pypi/packages/0e/b4/236768a3d175537831a2841638bae2239bc82bee545320d1c04342ce3cc5/paddlepaddle-2.4.1-cp39-cp39-macosx_11_0_arm64.whl (46.9 MB)
     |████████████████████████████████| 46.9 MB 15.1 MB/s            
[?25hCollecting opt-einsum==3.3.0
  Downloading https://mirror.baidu.com/pypi/packages/bc/19/404708a7e54ad2798907210462fd950c3442ea51acc8790f3da48d2bee8b/opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     |████████████████████████████████| 65 kB 13.0 MB/s            
Collecting astor
  Downloading https://mirror.baidu.com/pypi/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting paddle-bfloat==0.1.7
  Downloading https://mirror.baidu.com/pypi/packages/d8/6f/17751e00a956e7bc76ee4a830b3edc12c8c0fae7bfec10d80318d601f1d9/paddle_bfloat-0.1.7-cp39-cp39-macosx_11_0_arm64.whl (41 kB)
  

In [31]:
!pip install paddleocr


Collecting paddleocr>=2.0.1
  Downloading paddleocr-2.6.1.2-py3-none-any.whl (440 kB)
     |████████████████████████████████| 440 kB 2.2 MB/s            
[?25hCollecting opencv-python
  Using cached opencv_python-4.6.0.66-cp37-abi3-macosx_11_0_arm64.whl (30.0 MB)
Collecting cython
  Using cached Cython-0.29.32-py2.py3-none-any.whl (986 kB)
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
     |████████████████████████████████| 5.6 MB 11.2 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting premailer
  Using cached premailer-3.10.0-py2.py3-none-any.whl (19 kB)
Collecting scikit-image
  Downloading scikit_image-0.19.3-cp39-cp39-macosx_12_0_arm64.whl (12.5 MB)
     |████████████████████████████████| 12.5 MB 10.2 MB/s            
[?25hCollecting opencv-contrib-python
  Using cached opencv_contrib_python-4.6.0.66-cp37-abi3-macosx_11_0_arm64.whl (38.9 MB)
Collecting pdf2docx
  Downloading pdf2docx-0.5.6-py3-none-a

In [33]:
import os
import csv
from paddleocr import PaddleOCR

# Set the directory path where the image files are stored
dir_path = '/path/to/directory'

# Create the OCR engine once, outside the loop, so it is not rebuilt per image.
# NOTE(review): lang='en' assumes English covers - adjust if that is wrong.
ocr = PaddleOCR(lang='en')

# Iterate through all the files in the directory
for file in os.listdir(dir_path):
    # Only PNG and JPG files are processed
    if not file.endswith(('.png', '.jpg')):
        continue

    # Perform OCR directly on the image path
    image_path = os.path.join(dir_path, file)
    res = ocr.ocr(image_path)

    # NOTE(review): this assumes the paddleocr 2.6.1.x result layout - one
    # entry per input image, each a list of [box, (text, score)] items, or
    # None/empty when nothing was detected. Confirm against the installed
    # version before relying on it.
    lines = (res[0] if res else None) or []

    # Extract the bounding boxes, recognised text and confidence scores
    boxes = [line[0] for line in lines]
    txts = [line[1][0] for line in lines]
    scores = [line[1][1] for line in lines]


ModuleNotFoundError: No module named 'paddleocr'