<a href="https://colab.research.google.com/github/dariru3/py-comment_pdf/blob/main/pdf_highlighter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instructions

How to run the code:
1.   Hover the mouse over the "Show code" text to reveal the play button.
2.   Press each button in succession. Scroll down if necessary to see the next button.
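3.   After pressing the 3rd button, click the "Choose Files" button that appears and select both the target PDF and the CSV file containing the list of keywords. The CSV needs a header row followed by rows of three columns: search word, comment, and color (red, green, blue, or blank).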



In [None]:
#@title Press the play button to get the necessary internal files {display-mode: "form"}
# Install the pymupdf package, needed for the `fitz` import used by comment_pdf.
!pip install pymupdf
# NOTE(review): `frontend` is not imported anywhere in the visible notebook code —
# confirm it is still required before keeping this install.
!pip install frontend

In [14]:
#@title Press the play button to initialize the script {display-mode: "form"}
def comment_pdf(input_file:str, list_filename_csv:str, pages:list=None):
    """Highlight every search term from a CSV in a PDF and download the results.

    Each data row of the CSV supplies a search word, a comment and a color.
    Every match found in the PDF gets a highlight annotation carrying that
    comment. The annotated copy is saved next to the input as
    "<input name without extension> comments.pdf", a 'summary.txt' with the
    match counts is written, and both files are sent to the browser through
    google.colab's ``files.download``.

    Args:
        input_file: Path of the PDF to annotate.
        list_filename_csv: Path of the CSV file holding the search settings.
        pages: Optional collection of 1-based page numbers to restrict the
            search to. When empty or None, every page is searched.
    """
    import os

    import fitz
    from google.colab import files

    comment_title = "Python Highlighter"
    search_list = read_csv(list_filename_csv)
    # create matches dictionary for output summary
    matches_record = {search[0]: 0 for search in search_list}
    # open pdf
    pdfIn = fitz.open(input_file)
    try:
        # Iterate throughout pdf pages, numbering them from 1 like `pages` does
        for pageID, page in enumerate(pdfIn, start=1):
            # If required to look in specific pages
            if pages and pageID not in pages:
                continue
            for word, comment, color in search_list:
                # NOTE(review): hit_max presumably caps the hits returned per
                # term and page — confirm the installed PyMuPDF version still
                # accepts this keyword.
                matched_values = page.search_for(word,hit_max=20)
                if matched_values:
                    # Update matches_record
                    matches_record[word] += len(matched_values)
                    highlight_text(matched_values, page, color, comment_title, comment)

        # Strip only the final extension: splitting on the first "." would
        # truncate paths such as "/content/my.report.pdf" to "/content/my".
        output_file = os.path.splitext(input_file)[0] + " comments.pdf"
        pdfIn.save(output_file,garbage=3,deflate=True)
    finally:
        # Close the document even when searching or saving fails
        pdfIn.close()

    create_summary(input_file, output_file, comment_title, matches_record)

    files.download(output_file)
    files.download('summary.txt')

def read_csv(list_filename_csv):
    """Load the search settings from a CSV file, skipping its header row.

    Args:
        list_filename_csv: Path of the CSV file to read.

    Returns:
        A list with one list of column strings per non-blank data row. The
        list is empty when the file is empty or holds only a header.
    """
    import csv
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields containing line breaks are read back unchanged.
    with open(list_filename_csv, 'r', newline='') as csv_data:
        csv_reader = csv.reader(csv_data)
        # Skip the header row; the default avoids StopIteration on an empty file
        next(csv_reader, None)
        # Blank lines are parsed as [] and would break callers that index or
        # unpack each row, so they are dropped here.
        search_list = [row for row in csv_reader if row]
    return search_list

def highlight_text(matched_values, page, color, comment_title, comment):
    """Add a commented highlight annotation to `page` for every match.

    Args:
        matched_values: Iterable of match locations; each one is passed to
            ``page.add_highlight_annot``.
        page: Page object the annotations are added to.
        color: Name of the highlight color ("red", "green" or "blue"). Case
            and surrounding whitespace are ignored. An empty, None or
            unrecognized value leaves the annotation's default color.
        comment_title: Text stored under the annotation's "title" info key.
        comment: Text stored under the annotation's "content" info key.
    """
    colors = {
        "red": [0.7, 0.35, 0.5],
        "green": [0.35, 0.7, 0.5],
        "blue": [0.35, 0.5, 0.7]
    }
    # Resolve the color once, tolerating values such as "Red" or " blue ";
    # an unknown name falls back to the default instead of raising KeyError
    # and aborting the whole run.
    stroke = colors.get(str(color).strip().lower()) if color else None
    for item in matched_values:
        # Highlight found text
        annot = page.add_highlight_annot(item)
        if stroke:
            annot.set_colors(stroke=stroke)
        # Add comment to the found match
        info = annot.info
        info["title"] = comment_title
        info["content"] = comment
        annot.set_info(info)
        annot.update()

def create_summary(input_file, output_file, comment_title, matches_record):
    """Write a plain-text report of the run to 'summary.txt'.

    The report lists the input file, the output file and the comment title
    on one line each, followed by a "Matching Instances" section with one
    "<word>: <count>" line per entry of `matches_record`.

    Args:
        input_file: Value reported as the input file.
        output_file: Value reported as the output file.
        comment_title: Value reported as the comment title.
        matches_record: Mapping of search word to number of matches.
    """
    as_line = "{}: {}".format
    per_word_counts = [as_line(term, hits) for term, hits in matches_record.items()]
    report = {
        "Input File": input_file,
        "Output File": output_file,
        "Comment Title": comment_title,
        # The leading newline puts the counts on their own lines below the label
        "Matching Instances": "\n" + "\n".join(per_word_counts),
    }
    # Export Process Summary
    with open('summary.txt', 'w') as summary_txt:
        report_lines = [as_line(label, value) for label, value in report.items()]
        summary_txt.write("\n".join(report_lines))
    



In [29]:
#@title Press the play button then select two files to upload {display-mode: "form"}
from google.colab import files

# Ask for the files, then pick the PDF to annotate and the CSV keyword list
# from the uploaded file names.
uploaded = files.upload()
# Reset on every run so a name left over from an earlier upload is never reused
upload_file = None
keyword_list_file = None
for filename in uploaded:
  # Compare case-insensitively so names such as "REPORT.PDF" are recognized
  lowered_name = filename.lower()
  if lowered_name.endswith(".pdf"):
    upload_file = filename
  elif lowered_name.endswith(".csv"):
    keyword_list_file = filename

# Require one file of each kind: counting the uploads alone would accept
# e.g. two PDFs and then fail on an undefined CSV name.
if upload_file is None or keyword_list_file is None:
  print ("Files not uploaded")
else:
  comment_pdf(input_file="/content/"+upload_file, list_filename_csv="/content/"+keyword_list_file)


Files not uploaded


# How to select multiple files in upload dialog

Win: ctrl + left click

Mac: cmd + left click