# This script:
1. imports files from a Google Drive folder;
2. extracts text data and file name; and
3. stores data for later use

The script's text extraction process first reads each PDF file with pdfplumber. If pdfplumber returns fewer than 200 words for a file, the file is also scanned with an OCR engine (easyocr, which uses the GPU when one is available). pytesseract, which does not use the GPU, is installed and imported as an alternative, but the code in this notebook does not currently call it.

The script uses a JSON metadata file (at the path set in `metadata_file`) to track progress, so if your Colab session shuts down mid-execution, you can resume text extraction from where you left off.

The script outputs a CSV file (at the path set in `output_csv`) that stores the data in 3 columns: filename, text (extracted with pdfplumber), and ocr_text (extracted with easyocr; empty when OCR was not needed).

In [None]:
!pip install pdfplumber pytesseract opencv-python-headless easyocr
!sudo apt install tesseract-ocr
!pip install pdf2image
!apt-get install -y poppler-utils

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.man

In [None]:
# PDF parsing imports
import pdfplumber
import easyocr # leverages GPU, so it's faster
import pytesseract # does not leverage GPU (?), so it's slow

# general imports
import pandas as pd
import numpy as np
import io
import re
import tempfile
import os

from pdf2image import convert_from_path
from PIL import Image

from tqdm import tqdm  # For progress bar
import json #for saving metadata
import torch

### Functions to import and process file

In [None]:
# function to extract text from pdf using pdfplumber
def extract_text_with_pdfplumber(pdf_path):
  """Return the embedded text of every page of a PDF, read with pdfplumber.

  Page texts are joined with a newline so the last word of one page is not
  fused with the first word of the next (which would also skew word counts).
  Pages without any text are left out, so a PDF with no text layer yields ''.

  Args:
    pdf_path: path of the PDF file to open.

  Returns:
    The text of all pages as one string; '' when nothing could be extracted
    or when any error occurs while opening/reading the file (the error is
    printed, not raised).
  """
  try:
    with pdfplumber.open(pdf_path) as pdf:
      # extract_text() may return None, hence the `or ''`
      page_texts = [page.extract_text() or '' for page in pdf.pages]
    return '\n'.join(page_text for page_text in page_texts if page_text)
  except Exception as e:
    # best effort: one unreadable file must not abort a whole batch
    print(f'pdfplumber error {e}')
    return ''

# function to extract text from pdf using OCR (easyocr)
def extract_text_with_ocr(pdf_path, dpi=75):
  """Rasterize a PDF and return the text that easyocr recognizes on its pages.

  Relies on the module-level `reader` (the easyocr reader created once in the
  setup cell) and on pdf2image's `convert_from_path`.

  Args:
    pdf_path: path of the PDF file to scan.
    dpi: resolution used when rendering pages to images. Defaults to 75, the
      value this notebook has always used; raise it if recognition is poor.

  Returns:
    The recognized text of all pages joined with single spaces. Pages with no
    detections contribute nothing, so a blank scan returns '' (and not a
    whitespace-only string that callers would mistake for extracted text).
    '' is also returned when any error occurs (the error is printed).
  """
  try:
    images = convert_from_path(pdf_path, dpi=dpi)
    page_texts = []
    for image in images:
      # the image is converted to a numpy array for easyocr ingestion
      detections = reader.readtext(np.array(image))
      # detection[1] is the recognized string of each detection
      page_text = ' '.join(detection[1] for detection in detections)
      if page_text.strip():
        page_texts.append(page_text)
    return ' '.join(page_texts)
  except Exception as e:
    # best effort: one unreadable file must not abort a whole batch
    print(f'EasyOCR error: {e}')
    return ''

# function to load metadata (will help track progress on big project)
def load_metadata(metadata_file):
  """Load the progress-tracking metadata, falling back to a fresh record.

  Args:
    metadata_file: path of the JSON file written by save_metadata.

  Returns:
    A dict with at least the keys 'processed_files' (list of filenames) and
    'last_processed_index' (int, -1 when nothing has been processed).
    A fresh record is returned when the file does not exist, cannot be parsed
    (e.g. it was truncated by an interrupted write) or does not hold a JSON
    object; keys missing from an otherwise valid file are filled in.
  """
  defaults = {'processed_files': [], 'last_processed_index': -1}
  try:
    with open(metadata_file, 'r') as f:
      metadata = json.load(f)
  except FileNotFoundError:
    return defaults
  except json.JSONDecodeError as e:
    # a half-written file must not block resuming the run
    print(f'Could not parse {metadata_file} ({e}); starting with empty metadata')
    return defaults

  if not isinstance(metadata, dict):
    print(f'Unexpected content in {metadata_file}; starting with empty metadata')
    return defaults

  for key, value in defaults.items():
    metadata.setdefault(key, value)
  return metadata

# function to save metadata (will help track progress on big project)
def save_metadata(metadata, metadata_file):
  """Write the progress-tracking metadata to disk as JSON.

  The data is first written to a temporary file next to the target and then
  moved over it with os.replace, so an interrupted or failed write leaves the
  previously saved metadata intact instead of a truncated file.

  Args:
    metadata: JSON-serializable dict to store.
    metadata_file: path of the JSON file to (over)write.

  Raises:
    TypeError: if `metadata` is not JSON-serializable (raised by json.dump).
    OSError: if the file cannot be written or replaced.
  """
  tmp_file = f'{metadata_file}.tmp'
  try:
    with open(tmp_file, 'w') as f:
      json.dump(metadata, f)
    os.replace(tmp_file, metadata_file)
  finally:
    # only still present when the write or the replace failed
    if os.path.exists(tmp_file):
      os.remove(tmp_file)

### Prep for main execution: mount Google Drive, establish file paths, check for GPU availability, and initialize OCR reader:

In [None]:
# mount Google drive so we can access and save files with extracted text later on
# (the PDF folder, output CSV and metadata paths used by this notebook all live under /content/drive)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# paths
# Everything for one split is derived from base_folder + split_id. In particular the
# metadata file is per split: a single shared metadata.json would carry the
# 'last_processed_index' of one folder over to a run on another folder.
base_folder = '/content/drive/My Drive/polsky file split'
split_id = 3 # which folder_N / export_N pair this run works on
pdf_folder = f'{base_folder}/folder_{split_id}' # this is the source of our raw PDF files
output_csv = f'{base_folder}/export_{split_id}.csv' # this is where we'll keep our extracted text
metadata_file = f'{base_folder}/metadata_{split_id}.json' # for saving progress

# check for GPU availability
use_gpu = torch.cuda.is_available()
device = 'cuda' if use_gpu else 'cpu'
if use_gpu:
    print('GPU is available. Using GPU for OCR.')
else:
    print('GPU is not available. Using CPU for OCR.')

# initialize EasyOCR reader (only once; extract_text_with_ocr uses this module-level object)
reader = easyocr.Reader(['en'], gpu=use_gpu)



GPU is not available. Using CPU for OCR.
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

### Main Execution: work through the entire dataset in batches

In [None]:
# batch processing
# Works through every PDF in pdf_folder in batches; after each batch the full CSV and the
# metadata file are rewritten, so an interrupted session loses at most one batch of work.
batch_size = 20 # number of files processed between two saves
min_words_before_ocr = 200 # pdfplumber output with fewer words than this triggers OCR

metadata = load_metadata(metadata_file)

# os.listdir() returns names in arbitrary order; sorting makes the batches (and the index
# stored in the metadata) refer to the same files on every run
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
total_files = len(pdf_files)

all_data = []
if os.path.exists(output_csv): # if the file exists, load its data
  all_data = pd.read_csv(output_csv).to_dict(orient='records')

# Resume by filename rather than by list position: a file is done only if it already has a
# row in the CSV. This cannot skip unprocessed files when the listing order or the folder
# content changes between runs. A set keeps each membership check O(1).
done_filenames = {d['filename'] for d in all_data}
pending_files = [f for f in pdf_files if f not in done_filenames]
print(f'{len(pending_files)} of {total_files} PDF files left to process')

for i in tqdm(range(0, len(pending_files), batch_size), desc='Processing Batches'):
  batch_end = min(i + batch_size, len(pending_files))
  batch_files = pending_files[i:batch_end]
  batch_data = []

  for filename in tqdm(batch_files, desc=f'Processing Files {i+1}-{batch_end}', leave=False):
    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_with_pdfplumber(pdf_path)
    ocr_text = ''

    # empty text has 0 words, so this also covers PDFs with no text layer at all
    if len(text.split()) < min_words_before_ocr:
      ocr_text = extract_text_with_ocr(pdf_path)
      if not ocr_text.strip(): # whitespace-only output counts as "nothing extracted"
        print(f'Could not extract text from {filename}')
      else:
        print(f'OCR used for {filename}')

    batch_data.append({'filename': filename, 'text': text, 'ocr_text': ocr_text})

  # persist the results first, then the progress record
  all_data.extend(batch_data)
  df = pd.DataFrame(all_data)
  df.to_csv(output_csv, index=False)

  recorded_files = metadata.setdefault('processed_files', [])
  already_recorded = set(recorded_files)
  recorded_files.extend(name for name in batch_files if name not in already_recorded)
  # informational only: position (in the sorted listing) of the last file handled
  metadata['last_processed_index'] = pdf_files.index(batch_files[-1])
  save_metadata(metadata, metadata_file)

print('\nfinished processing all files')

Processing Batches:   0%|          | 0/54 [00:00<?, ?it/s]
Processing Files 1-20:   0%|          | 0/20 [00:00<?, ?it/s][A
Processing Files 1-20:   5%|▌         | 1/20 [01:59<37:54, 119.73s/it][A

OCR used for ARCD.P0745US.P1 - 11_8_21 Efiled Provisional Application, ADS, EFS Ack Receipt.PDF-!!-6374506.PDF



Processing Files 1-20:  10%|█         | 2/20 [04:25<40:33, 135.17s/it][A

OCR used for ARCD.P0744US.P1 - 11_09_21  Efiled Provisional Application, ADS, EFS Ack Receipt.PDF-!!-6388967.PDF



Processing Files 1-20:  15%|█▌        | 3/20 [07:28<44:27, 156.92s/it][A

OCR used for ARCD.P0724US.P1 - 11_10_21  Efiled Provisional Application, ADS, EFS Ack Receipt.PDF-!!-6390039.PDF



Processing Files 1-20:  20%|██        | 4/20 [07:41<26:41, 100.12s/it][A
Processing Files 1-20:  25%|██▌       | 5/20 [15:48<59:53, 239.56s/it][A

OCR used for ARCD.P0752US.P1 - 2021-11-16 - Filed US Provisional Application.PDF-!!-6409967.PDF



Processing Files 1-20:  30%|███       | 6/20 [15:49<36:59, 158.56s/it][A
Processing Files 1-20:  35%|███▌      | 7/20 [15:51<23:13, 107.18s/it][A
Processing Files 1-20:  40%|████      | 8/20 [16:23<16:41, 83.43s/it] [A

OCR used for ARCD.P0716US.P1 - 2021-11-22 - Filed Request to Correct Applicant.PDF-!!-6429332.PDF



Processing Files 1-20:  45%|████▌     | 9/20 [16:57<12:26, 67.84s/it][A
Processing Files 1-20:  50%|█████     | 10/20 [17:34<09:44, 58.41s/it][A

OCR used for ARCD.P0715US.P1 - 2021-12-02 - Filed Request for Correction of Inventors and Applicants.PDF-!!-6456573.PDF



Processing Files 1-20:  55%|█████▌    | 11/20 [28:26<36:00, 240.08s/it][A

OCR used for ARCD.P0715WO - 2021-12-03 - Filed PCT Application.PDF-!!-6461712.PDF



Processing Files 1-20:  60%|██████    | 12/20 [28:52<23:19, 174.98s/it][A
Processing Files 1-20:  65%|██████▌   | 13/20 [28:55<14:20, 122.92s/it][A

OCR used for ARCD.P0712WO - 2021-11-30 - Official Filing Receipt.PDF-!!-6483593.PDF



Processing Files 1-20:  70%|███████   | 14/20 [29:08<08:57, 89.60s/it] [A
Processing Files 1-20:  75%|███████▌  | 15/20 [29:14<05:22, 64.51s/it][A

OCR used for ARCD.P0715US.P1_12_07_2021_Updated Filing Receipt.pdf-!!-6554561.pdf



Processing Files 1-20:  80%|████████  | 16/20 [29:16<03:02, 45.69s/it][A

OCR used for ARCD.P0715US.P1_12_07_2021_Acceptance of Request to correct inventorship....pdf-!!-6554568.pdf



Processing Files 1-20:  85%|████████▌ | 17/20 [29:22<01:41, 33.68s/it][A

OCR used for ARCD.P0753US.P1_11_10_2021_Filing Receipt.pdf-!!-6554616.pdf



Processing Files 1-20:  90%|█████████ | 18/20 [32:04<02:24, 72.20s/it][A
Processing Files 1-20:  95%|█████████▌| 19/20 [32:07<00:51, 51.43s/it][A

OCR used for ARCD.P0713WO - 2021-12-31 - Official Filing Receipt.PDF-!!-6571824.PDF



Processing Files 1-20: 100%|██████████| 20/20 [32:10<00:00, 37.00s/it][A
                                                                      [A

OCR used for ARCD.P0720WO - 2021-12-29 - Official Filling Receipt.PDF-!!-6594656.PDF


Processing Batches:   2%|▏         | 1/54 [32:11<28:25:45, 1931.05s/it]
Processing Files 21-40:   0%|          | 0/20 [00:00<?, ?it/s][A
Processing Files 21-40:   5%|▌         | 1/20 [00:13<04:24, 13.90s/it][A

OCR used for ARCD.P0720WO - 2021-12-29 - Invitation to Correct Defects and Figs 5E-H on Record.PDF-!!-6594814.PDF



Processing Files 21-40:  10%|█         | 2/20 [00:20<02:48,  9.36s/it][A

OCR used for ARCD.P0745US.P1_11_15_2021_Filing Receipt.pdf-!!-6602949.pdf



Processing Files 21-40:  15%|█▌        | 3/20 [00:27<02:22,  8.40s/it][A

OCR used for ARCD.P0724US.P1_11_18_2021_Filing Receipt.pdf-!!-6602963.pdf



Processing Files 21-40:  20%|██        | 4/20 [00:33<01:58,  7.39s/it][A

OCR used for ARCD.P0744US.P1_11_18_2021_Filing Receipt.pdf-!!-6602970.pdf



Processing Files 21-40:  25%|██▌       | 5/20 [00:40<01:47,  7.19s/it][A

OCR used for ARCD.P0740US.P1_12_20_2021_Filing Receipt.pdf-!!-6603253.pdf



Processing Files 21-40:  30%|███       | 6/20 [00:42<01:20,  5.74s/it][A

OCR used for ARCD.P0715WO - 2021-12-28 - Filing Receipt.PDF-!!-6603285.PDF



Processing Files 21-40:  35%|███▌      | 7/20 [00:46<01:06,  5.09s/it][A

OCR used for ARCD.P0725EP - 2021-10-05 - Unpaid Annuity Notification.PDF-!!-6683804.PDF



Processing Files 21-40:  40%|████      | 8/20 [03:21<10:33, 52.83s/it][A

OCR used for ARCD.P0756US.P1 - 01_06_22 Efiled Provisional Application, ADS, Seq Listing, EFS Ack Receipt.PDF-!!-6691268.PDF



Processing Files 21-40:  45%|████▌     | 9/20 [03:25<06:52, 37.50s/it][A
Processing Files 21-40:  50%|█████     | 10/20 [03:29<04:31, 27.11s/it][A

OCR used for ARCD.P0713WO - 2021-12-31 - Invite to Furnish Sequence Listing.PDF-!!-6731425.PDF



Processing Files 21-40:  55%|█████▌    | 11/20 [03:30<02:53, 19.24s/it][A
Processing Files 21-40:  60%|██████    | 12/20 [03:38<02:05, 15.64s/it][A

OCR used for ARCD.P0754US.P1_12_01_2021_Filing Receipt.pdf-!!-6760993.pdf



Processing Files 21-40:  65%|██████▌   | 13/20 [03:44<01:28, 12.70s/it][A

OCR used for ARCD.P0752US.P1_11_23_2021_Filing Receipt.pdf-!!-6761196.pdf



Processing Files 21-40:  70%|███████   | 14/20 [03:51<01:05, 11.00s/it][A

OCR used for ARCD.P0756US.P1_01_19_2022_Filing Receipt.pdf-!!-6762498.pdf



Processing Files 21-40:  75%|███████▌  | 15/20 [03:57<00:47,  9.51s/it][A

OCR used for ARCD.P0759US.P1_12_30_2021_Filing Receipt.pdf-!!-6763210.pdf



Processing Files 21-40:  80%|████████  | 16/20 [03:58<00:27,  6.94s/it][A
Processing Files 21-40:  85%|████████▌ | 17/20 [04:35<00:48, 16.09s/it][A
Processing Files 21-40:  90%|█████████ | 18/20 [05:00<00:37, 18.64s/it][A
Processing Files 21-40:  95%|█████████▌| 19/20 [05:06<00:14, 14.95s/it][A
Processing Files 21-40: 100%|██████████| 20/20 [05:13<00:00, 12.57s/it][A
                                                                       [A

OCR used for ARCD.P0755US.P1_11_18_2021_Filing Receipt.pdf-!!-6829258.pdf


Processing Batches:   4%|▎         | 2/54 [37:24<14:09:04, 979.70s/it] 
Processing Files 41-60:   0%|          | 0/20 [00:00<?, ?it/s][A
Processing Files 41-60:   5%|▌         | 1/20 [00:27<08:40, 27.39s/it][A

OCR used for ARCD.P0755US.P1 - 11_10_21 Efiled Provisonal Application, ADS, EFS Ack Receipt.PDF-!!-6829469.PDF



Processing Files 41-60:  10%|█         | 2/20 [05:27<56:19, 187.74s/it][A

OCR used for ARCD.P0763US.P1 - 02_18_22 Efiled Provisional Application, ADS, Seq Listing, EFS Ack Receipt.PDF-!!-6829660.PDF



Processing Files 41-60:  15%|█▌        | 3/20 [05:31<29:28, 104.02s/it][A

OCR used for ARCD.P0722US.P1 & ARCD.P0722US.P2 - 2022-02-22 - Filed Assignment - RAMAN to UChicago.PDF-!!-6840826.PDF



Processing Files 41-60:  20%|██        | 4/20 [05:38<17:30, 65.68s/it] [A

OCR used for ARCD.P0722US.P1 & ARCD.P0722US.P2 - 2022-02-22 - Filed Assignment - ZAYDMAN to Washington University.PDF-!!-6840834.PDF



Processing Files 41-60:  25%|██▌       | 5/20 [05:48<11:22, 45.51s/it][A

OCR used for ARCD.P0761US.P1 Figures V2.pdf-!!-6842545.pdf



Processing Files 41-60:  30%|███       | 6/20 [05:53<07:26, 31.92s/it][A

OCR used for ARCD.P0743US.P1 - 2022-02-23 - Filed Assignment.PDF-!!-6845939.PDF



Processing Files 41-60:  35%|███▌      | 7/20 [06:00<05:05, 23.49s/it][A

OCR used for ARCD.P0748US.P1_01_12_2022_Filing Receipt.pdf-!!-6846147.pdf



Processing Files 41-60:  40%|████      | 8/20 [09:17<15:46, 78.83s/it][A

OCR used for ARCD.P0731US.P2 - 02_23_22 Efiled Provisional Application, ADS, EFS Ack Receipt.PDF-!!-6846688.PDF



Processing Files 41-60:  45%|████▌     | 9/20 [09:18<10:00, 54.59s/it][A
Processing Files 41-60:  50%|█████     | 10/20 [09:20<06:21, 38.16s/it][A
Processing Files 41-60:  55%|█████▌    | 11/20 [09:21<04:01, 26.88s/it][A
Processing Files 41-60:  60%|██████    | 12/20 [09:33<02:58, 22.30s/it][A

OCR used for ARCD.P0715WO - 2022-02-14 - Invitation to Pay Additional Search Fees.PDF-!!-6878099.PDF



Processing Files 41-60:  65%|██████▌   | 13/20 [11:37<06:13, 53.30s/it][A

OCR used for ARCD.P0758US.P1 - 02_25_22  Efiled Provisional Application, ADS, EFS Ack Receipt.PDF-!!-6878201.PDF



Processing Files 41-60:  70%|███████   | 14/20 [11:45<03:56, 39.45s/it][A

OCR used for ARCD.P0748US.P1 (2022.02.18) Efiled Express Abandonment.PDF-!!-6885570.PDF



Processing Files 41-60:  75%|███████▌  | 15/20 [13:50<05:26, 65.23s/it][A

OCR used for ARCD.P0758US.P1 - 02_25_22  Efiled Provisional Application, ADS, EFS Ack Receipt.pdf-!!-6890108.pdf



Processing Files 41-60:  80%|████████  | 16/20 [13:57<03:10, 47.69s/it][A

OCR used for ARCD.P0731US.P2_03_04_2022_Filing Receipt.pdf-!!-6896886.pdf



Processing Files 41-60:  85%|████████▌ | 17/20 [14:27<02:07, 42.43s/it][A

OCR used for ARCD.P0720WO - 2022-02-23 - International Search Report and 1st Written Opinion.PDF-!!-6899334.PDF



Processing Files 41-60:  90%|█████████ | 18/20 [14:32<01:02, 31.22s/it][A

OCR used for ARCD.P0724US.P1 - 2022-03-07 - Filed Assignment.PDF-!!-6900093.PDF



Processing Files 41-60:  95%|█████████▌| 19/20 [14:38<00:23, 23.59s/it][A

OCR used for ARCD.P0736US.P1 - 2022-03-07 - Filed Assignment.PDF-!!-6900207.PDF



Processing Files 41-60: 100%|██████████| 20/20 [14:39<00:00, 16.90s/it][A
Processing Batches:   6%|▌         | 3/54 [52:04<13:14:00, 934.13s/it]
Processing Files 61-80:   0%|          | 0/20 [00:00<?, ?it/s][A
Processing Files 61-80:   5%|▌         | 1/20 [00:01<00:22,  1.19s/it][A
Processing Files 61-80:  10%|█         | 2/20 [00:05<00:53,  2.97s/it][A
Processing Files 61-80:  15%|█▌        | 3/20 [02:29<19:08, 67.54s/it][A

OCR used for ARCD.P0722WO - 2022-03-16 - Filed PCT Application.pdf-!!-6942260.pdf



Processing Files 61-80:  20%|██        | 4/20 [03:24<16:38, 62.43s/it][A

OCR used for ARCD.P0765US.P1 - EFILED ADS_ Provisional Application_ Fee & Acknowledge....pdf-!!-6946351.pdf



Processing Files 61-80:  25%|██▌       | 5/20 [03:30<10:29, 42.00s/it][A

OCR used for ARCD.P0754US.P1 - 2022-03-19 - Filed Assignment.pdf-!!-6953730.pdf



Processing Files 61-80:  30%|███       | 6/20 [03:42<07:26, 31.89s/it][A

OCR used for ARCD.P0716WO Figures (B&W).pdf-!!-6974353.pdf



Processing Files 61-80:  35%|███▌      | 7/20 [03:43<04:44, 21.88s/it][A
Processing Files 61-80:  40%|████      | 8/20 [03:58<03:53, 19.49s/it][A

OCR used for ARCD.P0716WO [22-T-044] PCT Figures - FINAL.pdf-!!-6992716.pdf



Processing Files 61-80:  45%|████▌     | 9/20 [04:07<03:00, 16.43s/it][A
Processing Files 61-80:  50%|█████     | 10/20 [04:15<02:16, 13.61s/it][A

OCR used for ARCD.P0758US.P1_03_04_2022_Filing Receipt.pdf-!!-7005344.pdf



Processing Files 61-80:  55%|█████▌    | 11/20 [04:21<01:41, 11.29s/it][A

OCR used for ARCD.P0765US.P1_03_25_2022_Filing Receipt.pdf-!!-7005433.pdf



Processing Files 61-80:  60%|██████    | 12/20 [04:28<01:20, 10.02s/it][A

OCR used for ARCD.P0763US.P1_02_25_2022_Filing Receipt.pdf-!!-7005753.pdf



Processing Files 61-80:  65%|██████▌   | 13/20 [04:29<00:52,  7.48s/it][A

OCR used for ARCD.P0748US.P1_03_08_2022_Notice of Abandonment.pdf-!!-7005793.pdf


### Inspect text extraction results

In [None]:
# reload the saved extraction results from disk; with read_csv's default settings,
# cells that were written as empty strings come back as NaN
df = pd.read_csv(output_csv)

In [None]:
# Select the rows whose 'text' cell is missing (nothing stored from pdfplumber) so we can
# check that their 'ocr_text' column was filled in by OCR instead
empty_text_rows = df.loc[df['text'].isna()]
empty_text_rows

In [None]:
# make a copy of 'df' and sort it by the length of the 'text' column (low to high)
# Missing values (NaN) are treated as '' so that files without any extracted text sort
# first and count as 0 words; str(NaN).split() would count the string 'nan' as one word.

df_copy = df.copy()
df_sorted = df_copy.sort_values(by='text', key=lambda col: col.fillna('').astype(str).str.len())
# word counts: split on whitespace, then take the length of each resulting list
df_sorted['text_length'] = df_sorted['text'].fillna('').astype(str).str.split().str.len()
df_sorted['ocr_text_length'] = df_sorted['ocr_text'].fillna('').astype(str).str.split().str.len()

In [None]:
# show the first 20 rows of the length-sorted frame for a manual quality check
df_sorted.head(20)