<a href="https://colab.research.google.com/github/dariru3/py-pdf_full-width/blob/main/pdf_full_width_highlighter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Full-Width Characters Highlighter
## Instructions

**How to run the code**:
1.   Hover the mouse over the "Show code" text to reveal the play button (if necessary).
2.   Click the play button for each section in order.
3.   After clicking the play button in Step 2, upload the PDF file using the "Choose Files" button.

Note

- Depending on the size of the file, the process can take 15 minutes or longer to complete. Do not close the web browser tab or window until both output files (the highlighted PDF and the summary CSV file) have finished downloading.

In [None]:
#@title Step 1: initialize the script{display-mode: "form"}
%%capture
# Install PyMuPDF; Step 2 imports it under the module name `fitz`.
!pip install pymupdf
# NOTE(review): nothing in the visible code imports `frontend` — confirm this install is still needed.
!pip install frontend

In [None]:
#@title Optional: set the highlighter name{display-mode: "form"}
import ipywidgets as widgets
from IPython.display import display

# Text box whose value highlight_full_width reads as the annotation author name.
widget = widgets.Text(
    description='Commenter:',
    value='Full-Width Highlighter',
    disabled=False,
)
display(widget)

In [None]:
#@title Step 2: upload PDF file{display-mode: "form"}
from google.colab import files
import os
import fitz, sys
import csv
import unicodedata, re

def highlight_full_width(input_file:str, pages:list=None):
    """Highlight flagged characters in a PDF, then save and download the results.

    Every selected page is scanned with ``check_full_width``; each flagged
    character located on the page receives a highlight annotation. After
    the last page, the per-character summary is written to
    ``full-width_summary.csv``, the annotated document is saved next to the
    input file, and both files are handed to ``files.download``.

    Args:
        input_file: Path of the PDF to open.
        pages: Optional collection of 1-based page numbers to process.
            When empty or ``None`` every page is processed.
    """
    comment_name = "Full-Width Highlighter"
    try:
        if widget.value:
            comment_name = widget.value
    except (NameError, AttributeError):
        # The optional "set the highlighter name" cell was not run (or
        # `widget` is not a text widget): keep the default author name.
        # Only these errors are ignored so that real failures and
        # KeyboardInterrupt are no longer silently swallowed.
        pass
    # create matches list for output summary
    full_width_summary = []
    # open pdf
    pdfIn = fitz.open(input_file)
    # Iterate throughout pdf pages
    for pg, page in enumerate(pdfIn):
        pageID = pg+1
        # "\r" rewrites the same console line so progress stays on one row
        sys.stdout.write(f"\rScanning page {pageID}...")
        sys.stdout.flush()
        # If required to look in specific pages
        if pages and pageID not in pages:
            continue

        # Get all the text in the page
        text = page.get_text("text")
        full_width_chars = check_full_width(text, full_width_summary)

        page_highlights = {}  # Maps each flagged character to its match rectangles on this page
        get_positions(full_width_chars, text, page, page_highlights)
        add_highlight_annot(page_highlights, page, comment_name)
    sys.stdout.write("Done!")

    export_summary(full_width_summary)
    # save_output_file also closes the document
    output_file = save_output_file(input_file, pdfIn)

    if output_file:
        files.download(output_file)
        files.download('full-width_summary.csv')

def check_full_width(text, full_width_summary):
    """Collect the distinct characters in ``text`` that should be flagged.

    A character is flagged when its East Asian Width is Wide ('W'),
    Fullwidth ('F') or Ambiguous ('A') and it lies outside the range
    U+FF01-U+FF5E. Every flagged occurrence is also tallied in
    ``full_width_summary`` through ``update_summary``.

    NOTE(review): characters in U+FF01-U+FF5E are skipped even though
    their width is 'F' — confirm this exclusion is intended.

    Returns:
        The set of flagged characters found in ``text``.
    """
    wide_statuses = ['W', 'F', 'A']
    excluded_range = re.compile("[\uFF01-\uFF5E]+")
    flagged = set()
    for char in text:
        status = unicodedata.east_asian_width(char)
        if status not in wide_statuses:
            continue
        if excluded_range.search(char) is not None:
            continue
        flagged.add(char)
        update_summary(full_width_summary, char, status)
    return flagged

def update_summary(full_width_summary:list, char, status):
    """Record one more occurrence of ``char`` in the running summary.

    ``full_width_summary`` is a list of dicts with the keys ``'char'``,
    ``'count'`` and ``'type'``. If an entry for ``char`` already exists its
    count is incremented; otherwise a new entry with count 1 and the given
    ``status`` is appended. The list is modified in place.
    """
    for record in full_width_summary:
        if record['char'] == char:
            record['count'] += 1
            return
    # First time this character is seen
    full_width_summary.append({'char': char, 'count': 1, 'type': status})

def get_positions(full_width_chars, text, page, page_highlights):
    """Locate every flagged character on ``page`` and record its rectangles.

    Args:
        full_width_chars: Iterable of characters to look for.
        text: Extracted text of the page; a character is only searched
            for when it occurs in this text.
        page: Page object providing ``search_for``.
        page_highlights: Dict updated in place (via ``handle_matches``)
            mapping each character to its list of match rectangles.
    """
    for char in full_width_chars:
        if char not in text:
            continue
        # One search per character is enough: search_for is given only the
        # character itself, so repeating it for each occurrence in `text`
        # would return the same hits, which handle_matches then discards
        # as duplicates.
        matches = page.search_for(char)
        if matches:
            handle_matches(matches, char, page_highlights)

def handle_matches(matches, char, page_highlights):
    """Add the rectangles in ``matches`` to ``page_highlights[char]``.

    A rectangle is skipped when an equal one (per ``rects_are_equal``) is
    already stored for ``char``. ``page_highlights`` is modified in place;
    no key is created when ``matches`` is empty.
    """
    for candidate in matches:
        known = page_highlights.setdefault(char, [])
        already_stored = any(rects_are_equal(candidate, seen) for seen in known)
        if not already_stored:
            known.append(candidate)

def rects_are_equal(rect1, rect2):
    """Return True when the first four coordinates of both rectangles match.

    Coordinates are compared index by index and count as equal when they
    differ by less than 1e-6.
    """
    tolerance = 1e-6
    gaps = [abs(rect1[idx] - rect2[idx]) for idx in (0, 1, 2, 3)]
    for gap in gaps:
        if not gap < tolerance:
            return False
    return True

def add_highlight_annot(page_highlights:dict, page, comment_name):
    """Create one highlight annotation on ``page`` per stored rectangle.

    Args:
        page_highlights: Dict mapping a character to its list of rectangles.
        page: Page object providing ``add_highlight_annot``.
        comment_name: Text stored as the annotation's ``title``.
    """
    targets = (
        (char, rect)
        for char, match_rects in page_highlights.items()
        for rect in match_rects
    )
    for char, rect in targets:
        annot = page.add_highlight_annot(rect)
        details = annot.info
        details["title"] = comment_name
        details["content"] = f"Replace {char} with half-width version"
        annot.set_info(details)
        annot.update()

def export_summary(full_width_summary:list):
    """Write the character summary to ``full-width_summary.csv``.

    The file is created in the current working directory (overwriting any
    existing one) as UTF-8 with the header ``Character, Count, Type`` and
    one row per entry of ``full_width_summary``, taken from the entry's
    ``'char'``, ``'count'`` and ``'type'`` keys.
    """
    fieldnames = ['Character', 'Count', 'Type']
    with open("full-width_summary.csv", mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        # Generator keeps rows flowing to the file one at a time
        writer.writerows(
            dict(zip(fieldnames, (entry['char'], entry['count'], entry['type'])))
            for entry in full_width_summary
        )

def save_output_file(input_file, pdfIn):
    """Save the annotated document next to the input file and close it.

    The output path is the input path with its final extension replaced
    by ``" full-width_highlight.pdf"``.

    Args:
        input_file: Path of the original PDF.
        pdfIn: Open document object providing ``save`` and ``close``.

    Returns:
        The path the document was saved to.
    """
    # splitext strips only the final extension; splitting on the first "."
    # truncated paths with dots in a directory or file name
    # (e.g. "report.v2.pdf" became "report").
    base_name = os.path.splitext(input_file)[0]
    output_file = base_name + " full-width_highlight.pdf"
    try:
        pdfIn.save(output_file, garbage=3, deflate=True)
    finally:
        # Release the document even when saving fails
        pdfIn.close()
    return output_file

def upload_files():
    """Prompt for an upload, report on each file, then start processing.

    Every name yielded by the ``files.upload()`` result is checked for a
    ``.pdf`` suffix and a message is printed for it. Afterwards
    ``check_uploaded_files`` is called regardless of what was uploaded.
    """
    message_end = "successfully uploaded!"

    for uploaded_name in files.upload():
        if not uploaded_name.endswith(".pdf"):
            print("Invalid file type uploaded!")
            continue
        print("PDF", message_end)

    check_uploaded_files()

def check_uploaded_files():
    """Find a PDF in ``/content/`` and run the highlighter on it.

    When several ``.pdf`` files are present, the one listed last by
    ``os.listdir`` is used. Any exception raised by
    ``highlight_full_width`` is caught and printed; if no PDF is found a
    message is printed instead.
    """
    directory = "/content/"

    pdf_names = [name for name in os.listdir(directory) if name.endswith(".pdf")]
    if not pdf_names:
        print("Something else is wrong with the file uploader (T_T)")
        return

    pdf_file = pdf_names[-1]
    try:
        highlight_full_width(input_file="/content/"+pdf_file)
    except Exception as e:
        print("Error in highlight_full_width!")
        print(e)

# Running this cell immediately opens the upload prompt and processes the file.
upload_files()

In [None]:
#@title RESET: click the play button below to remove all files, then upload a new file in Step 2{display-mode: "form"}
# Delete files from the working directory; -f keeps rm quiet when nothing matches.
# NOTE(review): the visible code never writes summary.txt — presumably left over from an older version; confirm.
!rm -f summary.txt
!rm -f *.csv
!rm -f *.pdf

# Count the .csv/.pdf files still present so the user can tell whether the cleanup worked.
directory = "/content/"
created_filetypes = (".csv", ".pdf")
counter = sum(
    1 for leftover in os.listdir(directory) if leftover.endswith(created_filetypes)
)
print("Files remaining: ", counter)
if counter == 0:
    print('All files removed!\nReady to upload new files.')
else:
    print("Error removing files. Click reset button again.")