<a href="https://colab.research.google.com/github/dariru3/py-pdf_punctuation_check/blob/main/pdf_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Checker
## Instructions

**How to run the code**:
1.   Hover the mouse over the "Show code" text to reveal the play button (if necessary).
2.   Click the play button for each section in order.
3.   After clicking the play button in Step 2, upload the PDF file using the "Choose Files" button.

Note

- Depending on the size of the file, processing can take 15 minutes or longer. Do not close the web browser tab or window until both output files--the highlighted PDF and the summary CSV file--have finished downloading.

In [1]:
#@title Step 1: initialize the script{display-mode: "form"}
%%capture
# Install the third-party packages this notebook needs; %%capture hides pip's output.
# pymupdf supplies the `fitz` module imported in Step 2.
# NOTE(review): `frontend` is never imported in this notebook -- presumably it is a
# workaround so that `import fitz` succeeds; confirm it is still required.
!pip install pymupdf
!pip install frontend

In [2]:
#@title Optional: set the highlighter name{display-mode: "form"}
import ipywidgets as widgets
from IPython.display import display

# Text box for the author name attached to every highlight comment.
# Step 2 reads `widget.value`, so the global name `widget` must be kept.
widget = widgets.Text(value='PDF Checker', description='Commenter:', disabled=False)
display(widget)

Text(value='PDF Checker', description='Commenter:')

In [5]:
#@title Step 2: upload PDF file{display-mode: "form"}
from google.colab import files
import os
import fitz, sys
import csv
import unicodedata, re

def highlight_punctuation_errors(input_file:str, pages:list=None):
    """Scan a PDF for punctuation problems, highlight them, and download the results.

    For every scanned page the text is extracted, checked with
    check_punctuation_errors, and each flagged string is located on the page and
    covered with a highlight annotation. Afterwards the summary CSV and the
    annotated PDF are written and handed to the Colab download helper.

    Args:
        input_file: Path of the PDF to check.
        pages: Optional list of 1-based page numbers to scan; every page is
            scanned when it is None or empty.
    """
    comment_name = "PDF Checker"
    try:
        # `widget` only exists if the optional "set the highlighter name" cell was run.
        if widget.value:
            comment_name = widget.value
    except (NameError, AttributeError):
        # Optional cell not run (or `widget` is not a text widget): keep the default.
        # Only these errors are ignored so real failures and interrupts still surface.
        pass
    # create matches list for output summary
    error_summary = []
    # open pdf
    pdfIn = fitz.open(input_file)
    # Iterate throughout pdf pages
    for pg, page in enumerate(pdfIn):
        pageID = pg+1
        # "\r" rewrites the same console line so the progress message updates in place.
        sys.stdout.write(f"\rScanning page {pageID}...")
        sys.stdout.flush()
        # If required to look in specific pages
        if pages and pageID not in pages:
            continue

        # Get all the text in the page
        text = page.get_text("text")
        target_chars = check_punctuation_errors(text, error_summary)

        page_highlights = {}  # Initialize a dictionary to store match rectangles for each character
        get_positions(target_chars, text, page, page_highlights)
        add_highlight_annot(page_highlights, page, comment_name)
    sys.stdout.write("Done!")

    export_summary(error_summary)
    # save_output_file also closes the document.
    output_file = save_output_file(input_file, pdfIn)

    if output_file:
        files.download(output_file)
        files.download('error_summary.csv')

def check_full_width_chars(text, summary):
    """Collect characters whose East Asian width marks them as full-width.

    Each character of `text` that is not on the exclusion list is flagged when its
    `unicodedata.east_asian_width` is W, F or A, or when it lies in U+FF01-U+FF5E.
    Every flagged occurrence is also counted in `summary` via update_summary.

    Returns:
        A set of (character, description) tuples.
    """
    width_labels = {
        'W': 'Full-width: Wide',
        'F': 'Full-width: Full-width',
        'A': 'Full-width: Ambiguous'
    }
    fullwidth_forms = re.compile("[\uFF01-\uFF5E]+")

    # Quote marks and the em dash are never reported by this check.
    ignored = frozenset((
        '\u0022',  # Half-width double quote mark (")
        '\u0027',  # Half-width single quote mark/apostrophe (')
        '\u2018',  # Left single quotation mark
        '\u2019',  # Right single quotation mark
        '\u201C',  # Left double quotation mark
        '\u201D',  # Right double quotation mark
        '\u2014',  # Em dash
    ))

    flagged = set()
    for symbol in text:
        if symbol in ignored:
            continue
        width = unicodedata.east_asian_width(symbol)
        if width not in width_labels and not fullwidth_forms.search(symbol):
            continue
        label = width_labels.get(width, 'Unknown status')
        flagged.add((symbol, label))
        update_summary(summary, symbol, label)

    return flagged

def check_punctuation_patterns(text, summary):
    """Find punctuation problems in `text` with a single alternation regex.

    Each regex alternative is a named group; the name of the group that matched
    selects the human-readable description. Every match is also counted in
    `summary` via update_summary.

    Returns:
        A set of (matched_text, description) tuples.
    """
    # Maps regex group names to descriptions. 'double_space' is kept although
    # its pattern was removed from the regex because of false positives.
    descriptions = {
        'double_space': 'Double space after punctuation',
        'straight_quotes': 'Straight quotes',
        'space_around_punct': 'Space before and after punctuation',
        'space_before_closing_quote': 'Space before closing quotation mark followed by a character',
        'repeated_punct': 'Same punctuation is used twice in a row',
        'no_closing_parenthesis': 'Missing closing parenthesis'
    }

    detector = re.compile(
        r"(?P<straight_quotes>['\"])|"  # Straight quotes
        r"(?P<space_around_punct>\s[.,;:?!'\[\]{}()“”‘’%$¥—-]\s)|"  # Whitespace on both sides of punctuation
        r"(?P<space_before_closing_quote>\s[’”](?=[a-zA-Z0-9]))|"  # Whitespace before a closing quote that precedes a letter/digit
        r"(?P<repeated_punct>(?:(?P<punct>[.,;:?!'\[\]{}()“”‘’&%$¥—-]))(?P=punct))|"  # The same punctuation mark twice in a row
        r"(?P<no_closing_parenthesis>\([^)]*$)"  # An opening parenthesis that is never closed
    )

    findings = set()
    for hit in detector.finditer(text):
        snippet = hit.group()
        label = descriptions.get(hit.lastgroup, 'Unknown error')
        findings.add((snippet, label))
        update_summary(summary, snippet, label)

    return findings

def check_punctuation_errors(text, summary):
    """Run both checks on `text` and return their combined findings.

    The full-width check runs first, then the pattern check; both update `summary`.

    Returns:
        A list of [matched_text, description] pairs, one per distinct finding.
    """
    width_issues = check_full_width_chars(text, summary)
    pattern_issues = check_punctuation_patterns(text, summary)
    return [[flagged, reason] for flagged, reason in width_issues | pattern_issues]

def update_summary(summary:list, char, description):
    """Count one more occurrence of `char` in `summary` (modified in place).

    An existing entry with the same 'char' has its 'count' incremented and keeps
    its original description; otherwise a new entry with count 1 is appended.
    """
    for record in summary:
        if record['char'] == char:
            record['count'] += 1
            return
    summary.append({'char': char, 'count': 1, 'description': description})

def get_positions(target_chars, text, page, page_highlights):
    """Locate every flagged string on the page and record its rectangles.

    Args:
        target_chars: Iterable of (flagged_text, description) pairs.
        text: Text extracted from `page`; used to skip strings that are absent.
        page: Page object exposing `search_for(needle)`.
        page_highlights: Dict updated in place through handle_matches.
    """
    for char, description in target_chars:
        # Nothing to highlight for an empty needle or one not present in the text.
        if not char or char not in text:
            continue
        # search_for is called once per flagged string. The previous version
        # repeated the identical search for each occurrence in `text` and relied
        # on handle_matches to discard the duplicates, giving the same highlights
        # at a much higher cost.
        matches = page.search_for(char)
        if matches:
            handle_matches(matches, char, description, page_highlights)

def handle_matches(matches, char, description, page_highlights):
    """Store the rectangles in `matches` under `char`, skipping near-duplicates.

    The first rectangle for a `char` creates its entry; later ones are appended
    only if no stored rectangle matches them under rects_are_equal(threshold=1).
    """
    for rect in matches:
        if char in page_highlights:
            known_rects = page_highlights[char]["matches"]
            already_stored = any(
                rects_are_equal(rect, seen, threshold=1) for seen in known_rects
            )
            if not already_stored:
                known_rects.append(rect)
        else:
            page_highlights[char] = {"matches": [rect], "description": description}

def rects_are_equal(rect1, rect2, threshold=1e-6):
    """Return True when the first four coordinates differ by less than `threshold`."""
    # All four differences are computed before any comparison.
    d0, d1, d2, d3 = [abs(rect1[i] - rect2[i]) for i in range(4)]
    return d0 < threshold and d1 < threshold and d2 < threshold and d3 < threshold

def add_highlight_annot(page_highlights:dict, page, comment_name):
    """Add one highlight annotation to `page` for every recorded rectangle.

    Each annotation gets `comment_name` as its title and a content string that
    names the flagged text and its description.
    """
    for char, details in page_highlights.items():
        rectangles = details["matches"]
        description = details["description"]
        for rect in rectangles:
            highlight = page.add_highlight_annot(rect)
            metadata = highlight.info
            metadata["title"] = comment_name
            metadata["content"] = f"Error found: {char} ({description})"
            highlight.set_info(metadata)
            highlight.update()

def export_summary(error_summary:list):
    """Write the summary entries to "error_summary.csv" in the working directory.

    Each entry's 'char', 'count' and 'description' values become the
    Character, Count and Description columns; a header row is always written.
    """
    columns = ['Character', 'Count', 'Description']
    entry_keys = ('char', 'count', 'description')
    with open("error_summary.csv", mode='w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(
            {column: entry[key] for column, key in zip(columns, entry_keys)}
            for entry in error_summary
        )

def save_output_file(input_file, pdfIn):
    """Save the annotated document next to the input file and close it.

    Args:
        input_file: Path of the source PDF; only its final extension is replaced.
        pdfIn: Open document object exposing `save()` and `close()`.

    Returns:
        The output path: `input_file` without its extension, followed by
        " punctuation_errors.pdf".
    """
    # splitext removes only the last extension, so dots elsewhere in the path
    # (e.g. "report v1.2.pdf" or "/my.dir/file.pdf") no longer truncate the name.
    output_file = os.path.splitext(input_file)[0] + " punctuation_errors.pdf"
    try:
        pdfIn.save(output_file, garbage=3, deflate=True)
    finally:
        # Release the document even when saving fails.
        pdfIn.close()
    return output_file

def upload_files():
    """Prompt for an upload, report each file's type, then run the checker.

    check_uploaded_files is called afterwards regardless of what was uploaded.
    """
    uploaded = files.upload()
    success_suffix = "successfully uploaded!"

    for name in uploaded:
        if not name.endswith(".pdf"):
            print("Invalid file type uploaded!")
            continue
        print("PDF", success_suffix)

    check_uploaded_files()

def check_uploaded_files(directory="/content/"):
    """Pick the PDF to check from `directory` and run the punctuation checker on it.

    Files ending in " punctuation_errors.pdf" are outputs of an earlier run and
    are skipped. Of the remaining PDFs the most recently modified one is used,
    so the choice no longer depends on the arbitrary order of os.listdir.

    Args:
        directory: Folder to search; defaults to the Colab working directory.
    """
    output_suffix = " punctuation_errors.pdf"  # suffix added by save_output_file
    candidates = [
        name for name in os.listdir(directory)
        if name.endswith(".pdf") and not name.endswith(output_suffix)
    ]
    pdf_file = max(
        candidates,
        key=lambda name: os.path.getmtime(os.path.join(directory, name)),
        default=None,
    )

    if pdf_file:
        try:
            highlight_punctuation_errors(input_file=os.path.join(directory, pdf_file))
        except Exception as e:
            # Best effort: report the failure in the cell output instead of a traceback.
            print("Error in highlight_punctuation_errors!")
            print(e)
    else:
        print("Something else is wrong with the file uploader (T_T)")

# Start the workflow when this cell runs: prompt for an upload, then check the PDF.
upload_files()

Saving KOSE2023e_C2-07_56-67_0515 V.pdf to KOSE2023e_C2-07_56-67_0515 V.pdf
PDF successfully uploaded!
Scanning page 10...Done!

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
#@title RESET: click the play button below to remove all files, then upload a new file in Step 2{display-mode: "form"}
!rm -f summary.txt
!rm -f *.csv
!rm -f *.pdf

directory = "/content/"
created_filetypes = (".csv", ".pdf")
counter = 0
for filename in os.listdir(directory):
  if filename.endswith(created_filetypes):
    counter += 1
print("Files remaining: ", counter)
if counter > 0:
  print("Error removing files. Click reset button again.")
else:
  print('All files removed!\nReady to upload new files.')

Files remaining:  0
All files removed!
Ready to upload new files.
