<a href="https://colab.research.google.com/github/dariru3/py-pdf_highlight_comment/blob/main/pdf_highlighter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Keywords Highlighter
## Instructions

**How to run the code**:
1.   Hover the mouse over the "Show code" text to reveal the play button (if necessary).
2.   Click the play button for each section in order.
3.   After clicking the play button in Step 2, upload the PDF layout data and the CSV wordlist using the "Choose Files" button.

Notes

- Download the CSV template [here](https://drive.google.com/file/d/1PH4qmrRmNpnErzLm_0JSKDbz0ArO_K_7/view?usp=sharing)

- General manual [here](https://docs.google.com/document/d/19EKw2OBcDwfV1X-yiUz8ltDu1TSzUpXBo3YYd3HcTFE/edit#)

- Depending on the size of the files, the process can take 15 minutes or longer to complete. Do not close the web browser tab or window until both output files--the highlighted PDF and the summary text file--have finished downloading.

In [None]:
#@title Step 1: initialize the script{display-mode: "form"}
%%capture
# Install the packages used by Step 2; PyMuPDF provides the `fitz` module
# that comment_pdf imports. %%capture hides the pip output.
!pip install pymupdf
!pip install frontend

In [None]:
#@title Optional: set the highlighter name{display-mode: "form"}
import ipywidgets as widgets
from IPython.display import display

# Text box for the annotation author name. comment_pdf reads `widget.value`
# and falls back to its own default when the box is empty.
widget = widgets.Text(value='LCI-QA', description='Commenter:', disabled=False)
display(widget)

In [None]:
#@title Step 2: upload files.{display-mode: "form"}
from google.colab import files
import os

def comment_pdf(input_file:str, list_filename_csv:str, pages:list=None):
    """Highlight every keyword from a CSV word list in a PDF and download the result.

    Args:
        input_file: Path of the PDF to scan.
        list_filename_csv: Path of the CSV word list (see ``read_csv``).
        pages: Optional list of 1-based page numbers to scan. When omitted
            (or empty) every page is scanned.

    Side effects:
        Writes the annotated PDF and ``summary.txt``, then triggers a browser
        download of both through ``files.download``.
    """
    import fitz
    import sys

    comment_name = "LCI-QA"
    try:
        if widget.value:
            comment_name = widget.value
    except (NameError, AttributeError):
        # The optional "Commenter" cell was not run (or `widget` is not a
        # text widget); keep the default author name.
        pass

    search_list = read_csv(list_filename_csv)
    # create matches dictionary for output summary
    matches_record = create_matches_record(search_list)

    # open pdf
    pdfIn = fitz.open(input_file)
    try:
        # Iterate throughout pdf pages
        for pg, page in enumerate(pdfIn):
            pageID = pg + 1

            # If required to look in specific pages, skip the others before
            # reporting progress so the message only names scanned pages.
            if pages and pageID not in pages:
                continue

            # UX: "\r" rewrites the same console line for each page.
            sys.stdout.write(f"\rScanning page {pageID}...")
            sys.stdout.flush()

            # Use the search_for function to find text
            for word, comment, color in search_list:
                matched_values = page.search_for(word)
                if matched_values:
                    update_matches_record(matches_record, word, matched_values)
                    highlight_text(matched_values, page, color, comment_name, comment)
    except Exception:
        # Do not leave the document open when scanning fails; the caller
        # still receives the original error.
        pdfIn.close()
        raise
    # UX
    sys.stdout.write("Done!")

    # Save to output file (create_output_file also closes the document)
    output_file = create_output_file(input_file, pdfIn)
    create_summary(input_file, output_file, comment_name, matches_record)

    if output_file:
        files.download(output_file)
        files.download('summary.txt')

def read_csv(list_filename_csv):
    """Load the keyword list from a CSV file.

    The first row is treated as a header and skipped. Each remaining row is
    expected to hold ``word, comment, color`` in its first three columns.

    Args:
        list_filename_csv: Path of the CSV file.

    Returns:
        A list of ``[word, comment, color]`` lists. Rows that are blank or
        have an empty first column are skipped; missing comment/color columns
        are returned as empty strings; extra columns are ignored.
    """
    import csv

    search_list = []
    # newline='' is what the csv module expects; utf-8-sig also accepts files
    # saved with a UTF-8 byte-order mark.
    with open(list_filename_csv, 'r', newline='', encoding='utf-8-sig') as csv_data:
        csv_reader = csv.reader(csv_data)
        next(csv_reader, None)  # skip the header row; tolerate an empty file
        for row in csv_reader:
            if not row or not row[0].strip():
                continue  # blank line or no keyword to search for
            word, comment, color = (row + ["", ""])[:3]
            search_list.append([word, comment, color])
    return search_list

def create_matches_record(search_list):
    """Return a dict mapping the first field of every search entry to a zero count."""
    keywords = (entry[0] for entry in search_list)
    return dict.fromkeys(keywords, 0)

def update_matches_record(matches_record, word, match_values):
    """Add the number of new matches for ``word`` to its running total, in place."""
    found = len(match_values)
    matches_record[word] = matches_record[word] + found

def highlight_text(matched_values, page, color, comment_name, comment):
    """Add a highlight annotation with a comment for every matched area.

    Args:
        matched_values: Iterable of match areas, as returned by
            ``page.search_for``.
        page: The page object the annotations are added to.
        color: Name of a colour from the table below. Matching ignores case
            and surrounding whitespace; an empty or unknown name leaves the
            annotation's colour unchanged.
        comment_name: Text stored as the annotation's ``title``.
        comment: Text stored as the annotation's ``content``.
    """
    colors = {
        'blue': [0, 0, 1],
        'light blue': [.22, .9, 1],
        'green': [.42, .85, .16],
        'light green': [.77, .98, .45],
        'yellow': [1, .82, 0],
        'light yellow': [.99, .96, .52],
        'orange': [1, .44, .01],
        'light orange': [1, .75, .62],
        'red': [.90, .13, .22],
        'light red': [1, .50, .62],
        'pink': [.64, .19, .53],
        'light pink': [.98, .53, 1]
    }

    # Resolve the colour once, using the same normalised key for the
    # membership test and the lookup (the table keys are all lower-case).
    stroke = colors.get(color.strip().lower()) if color else None

    for item in matched_values:
        # Highlight found text
        annot = page.add_highlight_annot(item)
        if stroke is not None:
            annot.set_colors(stroke=stroke)
        # Add comment to the found match
        info = annot.info
        info["title"] = comment_name
        info["content"] = comment
        annot.set_info(info)
        annot.update(opacity=0.4)

def create_output_file(input_file, pdfIn):
    """Save the annotated document next to the input and close it.

    Args:
        input_file: Path of the original PDF; only its final extension is
            replaced, so dots elsewhere in the path are kept.
        pdfIn: The open document object to save.

    Returns:
        The output path: the input path without its extension, followed by
        ``" comments.pdf"``.
    """
    import os

    stem, _extension = os.path.splitext(input_file)
    output_file = stem + " comments.pdf"
    try:
        pdfIn.save(output_file, garbage=3, deflate=True)
    finally:
        # Release the document even when saving fails.
        pdfIn.close()
    return output_file

def create_summary(input_file, output_file, comment_name, matches_record):
    """Write ``summary.txt`` describing the run.

    The file lists the input path, output path, comment title and one
    ``word: count`` line per entry of ``matches_record``.

    Args:
        input_file: Path of the scanned PDF.
        output_file: Path of the annotated PDF.
        comment_name: Author name stored on the annotations.
        matches_record: Mapping of keyword to number of matches.
    """
    match_lines = "\n".join(f"{word}: {count}" for word, count in matches_record.items())
    summary = {
        "Input File": input_file,
        "Output File": output_file,
        "Comment Title": comment_name,
        "Matching Instances": "\n" + match_lines,
    }
    # Export Process Summary. Keywords and file names may be non-ASCII, so the
    # encoding is fixed instead of depending on the platform default.
    with open('summary.txt', 'w', encoding='utf-8') as summary_txt:
        summary_txt.write("\n".join(f"{label}: {value}" for label, value in summary.items()))

def upload_files():
    """Prompt for file uploads, report each file's type, then check the uploads.

    Prints one line per uploaded file depending on its extension, and then
    hands over to ``check_uploaded_files``.
    """
    message_success = "successfully uploaded!"
    received = files.upload()

    for uploaded_name in received:
        if uploaded_name.endswith(".pdf"):
            label = "PDF"
        elif uploaded_name.endswith(".csv"):
            label = "CSV"
        else:
            print("Invalid file type uploaded!")
            continue
        print(label, message_success)

    check_uploaded_files()

def check_uploaded_files():
    """Find the uploaded PDF and CSV in the working directory and process them.

    When both a PDF and a CSV are present, ``comment_pdf`` is run on them and
    any error it raises is printed. When only one of the two is present, the
    user is asked for the missing one through ``upload_files``.
    """
    directory = "/content/"
    # Suffix that create_output_file gives to annotated PDFs; such files are
    # results of an earlier run and must not be scanned again as input.
    generated_suffix = " comments.pdf"

    pdf_file = None
    keyword_list_file = None

    for filename in os.listdir(directory):
        if filename.endswith(generated_suffix):
            continue
        if filename.endswith(".pdf"):
            pdf_file = filename
        elif filename.endswith(".csv"):
            keyword_list_file = filename

    if pdf_file and keyword_list_file:
        try:
            comment_pdf(
                input_file=os.path.join(directory, pdf_file),
                list_filename_csv=os.path.join(directory, keyword_list_file),
            )
        except Exception as e:
            # Top-level boundary of the notebook: report instead of crashing.
            print("Error in comment_pdf!")
            print(e)
    elif pdf_file or keyword_list_file:
        missing_file = "CSV file" if not keyword_list_file else "PDF file"
        print(f"Please upload {missing_file}.")
        upload_files()
    else:
        print("Something else is wrong with the file uploader (T_T)")

# Start the workflow when this cell runs: prompt for uploads, then process them.
upload_files()

Saving scan_list_MitsuiTradingJPEN.csv to scan_list_MitsuiTradingJPEN.csv
Saving 34_人権とサプライチェーンへの取り組み.pdf to 34_人権とサプライチェーンへの取り組み.pdf
CSV successfully uploaded!
PDF successfully uploaded!
Scanning page 3...Done!

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title RESET: click the play button below to remove all files, then upload a new pair of files in Step 2{display-mode: "form"}
!rm -f summary.txt
!rm -f *.csv
!rm -f *.pdf

directory = "/content/"
created_filetypes = (".txt", ".csv", ".pdf")
counter = 0
for filename in os.listdir(directory):
  if filename.endswith(created_filetypes):
    counter += 1
print("Files remaining: ", counter)
if counter > 0:
  print("Error removing files. Click reset button again.")
else:
  print('All files removed!\nReady to upload new files.')

Files remaining:  0
All files removed!
Ready to upload new files.
