# This script:
1. imports files from a Google Drive folder;
2. extracts text data and file name; and
3. stores data for later use

The script's text extraction process scans each PDF file with pdfplumber. If pdfplumber doesn't work (i.e., the extracted text is under 200 words long), we also scan the file with an OCR engine. The cells below use easyocr, which runs on the GPU when one is available; pytesseract, which doesn't use the GPU, is installed and imported as an alternative but is not called in the cells below.

The script uses a metadata.json file to track progress, so if your Colab shuts down mid-execution, you can resume text extraction from where you left off.

The script outputs a csv file (the path set in `output_csv`; `export_3.csv` in the configuration below) that stores the data in 3 columns: filename, text (extracted with pdfplumber), and ocr_text (extracted with easyocr).

In [None]:
!pip install pdfplumber pytesseract opencv-python-headless easyocr
!sudo apt install tesseract-ocr
!pip install pdf2image
!apt-get install -y poppler-utils

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.man

In [None]:
# PDF parsing imports
import pdfplumber
import easyocr # leverages GPU, so it's faster
import pytesseract # does not leverage GPU (?), so it's slow

# general imports
import pandas as pd
import numpy as np
import io
import re
import tempfile
import os

from pdf2image import convert_from_path
from PIL import Image

from tqdm import tqdm  # For progress bar
import json #for saving metadata
import torch

### Functions to import and process file

In [None]:
# function to extract text from pdf using pdfplumber
def extract_text_with_pdfplumber(pdf_path):
  """Extract the embedded text of a PDF with pdfplumber.

  Args:
    pdf_path: path of the PDF file to open.

  Returns:
    The text of every page that yielded text, joined with newlines.
    Returns '' when no page yields text or when pdfplumber raises.
  """
  try:
    with pdfplumber.open(pdf_path) as pdf:
      # extract_text() may return None for a page; normalize that to ''
      page_texts = [page.extract_text() or '' for page in pdf.pages]
  except Exception as e:
    # best-effort: report the failure and hand back an empty result
    print(f'pdfplumber error {e}')
    return ''
  # join with a newline so the last word of one page does not fuse with the
  # first word of the next (which would also distort word counts)
  return '\n'.join(page_text for page_text in page_texts if page_text)

# function to extract text from pdf using OCR (easyocr)
def extract_text_with_ocr(pdf_path):
  """Rasterize a PDF and read each page image with the module-level EasyOCR `reader`.

  The detections of each page are joined with single spaces, and every page
  contributes one trailing space. Returns '' if any step raises.
  """
  try:
    page_images = convert_from_path(pdf_path, dpi=100)  # adjust DPI if needed
    page_chunks = []
    for page_image in page_images:
      # the reader is fed a numpy array; item [1] of each detection is used as its text
      detections = reader.readtext(np.array(page_image))
      page_chunks.append(' '.join(found[1] for found in detections) + ' ')
    return ''.join(page_chunks)
  except Exception as e:
    print(f'EasyOCR error: {e}')
    return ''

# function to load metadata (will help track progress on big project)
def load_metadata(metadata_file):
  """Load the progress-tracking metadata written by a previous run.

  Args:
    metadata_file: path of the JSON metadata file.

  Returns:
    A dict containing at least 'processed_files' (list) and
    'last_processed_index' (int). Fresh values ([] and -1) are returned when
    the file is missing, is not valid JSON, or does not hold a JSON object;
    keys absent from the file are filled in with those fresh values.
  """
  fresh = {'processed_files': [], 'last_processed_index': -1}
  try:
    with open(metadata_file, 'r') as f:
      loaded = json.load(f)
  except FileNotFoundError:
    return fresh
  except json.JSONDecodeError as e:
    # an interrupted write can leave a truncated file; restart tracking instead of crashing
    print(f'metadata file {metadata_file} is not valid JSON ({e}); starting fresh')
    return fresh
  if not isinstance(loaded, dict):
    return fresh
  # values from the file win; defaults only cover keys the file lacks
  return {**fresh, **loaded}

# function to save metadata (will help track progress on big project)
def save_metadata(metadata, metadata_file):
  """Write the progress-tracking metadata to disk as JSON.

  The data is first written to '<metadata_file>.tmp' and then moved over the
  target with os.replace, so an interruption in the middle of the write leaves
  the previous metadata file intact instead of truncated.

  Args:
    metadata: JSON-serializable object to store.
    metadata_file: path of the JSON metadata file.

  Raises:
    Whatever json.dump or the file operations raise; the temporary file is
    removed before the error propagates.
  """
  tmp_file = f'{metadata_file}.tmp'
  try:
    with open(tmp_file, 'w') as f:
      json.dump(metadata, f)
    os.replace(tmp_file, metadata_file)
  except Exception:
    # don't leave a half-written temp file behind
    if os.path.exists(tmp_file):
      os.remove(tmp_file)
    raise

### Prep for main execution: mount Google Drive, establish file paths, check for GPU availability, and initialize OCR reader:

In [None]:
# mount Google drive so we can access and save files with extracted text later on
from google.colab import drive
drive.mount('/content/drive')

# folder that holds the raw PDF files
pdf_folder = '/content/drive/My Drive/polsky file split/folder_3'
if os.path.exists(pdf_folder):
  print("total file count")
  print(len(os.listdir(pdf_folder)))
else:
  # report a wrong path here rather than letting a later os.listdir(pdf_folder) fail
  print(f'pdf_folder not found: {pdf_folder}')

Mounted at /content/drive
total file count
1067


In [None]:
# output locations on Drive (pdf_folder, the source of the raw PDFs, is assigned in the Drive-mount cell)
output_csv = '/content/drive/My Drive/polsky file split/export_3.csv' # extracted text is stored here
metadata_file = '/content/drive/My Drive/polsky file split/metadata.json' # progress tracking
# NOTE(review): metadata.json is not folder-specific while export_3.csv is — confirm that
# runs over different folders are not meant to share the same progress file

# choose the OCR device from what torch can see, and report the choice
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print('GPU is available. Using GPU for OCR.')
else:
    print('GPU is not available. Using CPU for OCR.')

# build the EasyOCR reader a single time; the OCR function reuses it for every file
reader = easyocr.Reader(['en'], gpu=(device == 'cuda'))



GPU is not available. Using CPU for OCR.
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

### Main Execution: work through the entire dataset in batches

In [None]:
# batch processing
# Resume is driven by the filenames already present in the output CSV rather than by a
# positional index: os.listdir returns entries in arbitrary order, so an index saved in
# one session does not reliably point at the same file in the next one.
batch_size = 10
min_words = 200  # below this many pdfplumber words we also run OCR

metadata = load_metadata(metadata_file)
metadata.setdefault('processed_files', [])

# sort so the processing order is stable across sessions
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
total_files = len(pdf_files)

all_data = []
if os.path.exists(output_csv): # if the file exists, load its data
  try:
    all_data = pd.read_csv(output_csv).to_dict(orient='records')
  except pd.errors.EmptyDataError:
    # an empty CSV carries no rows; treat it like a missing file
    all_data = []

# set lookup instead of rebuilding a list of filenames for every file
done_files = {row['filename'] for row in all_data}
pending_files = [f for f in pdf_files if f not in done_files]
print(f'{total_files - len(pending_files)} of {total_files} files already extracted; {len(pending_files)} remaining')

for i in tqdm(range(0, len(pending_files), batch_size), desc='Processing Batches'):
  batch_end = min(i + batch_size, len(pending_files))
  batch_files = pending_files[i:batch_end]
  batch_data = []

  for filename in tqdm(batch_files, desc=f'Processing Files {i+1}-{batch_end}', leave=False):
    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_with_pdfplumber(pdf_path)
    ocr_text = ''

    if len(text.split()) < min_words: # covers empty text too (0 words); try OCR
      ocr_text = extract_text_with_ocr(pdf_path)
      if not ocr_text:
        print(f'Could not extract text from {filename}')
      else:
        print(f'OCR used for {filename}')

    batch_data.append({'filename': filename, 'text': text, 'ocr_text': ocr_text})

  # persist after every batch so an interrupted session loses at most one batch
  all_data.extend(batch_data)
  df = pd.DataFrame(all_data)
  df.to_csv(output_csv, index=False)

  metadata['processed_files'].extend(row['filename'] for row in batch_data)
  # position, within the sorted pdf_files list, of the last file handled (informational)
  metadata['last_processed_index'] = pdf_files.index(batch_files[-1])
  save_metadata(metadata, metadata_file)

print('\nfinished processing all files')

NameError: name 'load_metadata' is not defined

### Inspect text extraction results

In [None]:
# reload the extraction results from the CSV written by the batch loop
df = pd.read_csv(filepath_or_buffer=output_csv)

In [None]:
# Find rows where 'text' is empty - make sure we're capturing data with OCR in those cases
empty_text_rows = df.loc[df['text'].isna()]
empty_text_rows

In [None]:
# make a copy of 'df' and sort it by the character length of the 'text' column (low to high)
# missing text (NaN) is treated as '' so those rows sort first with length 0; without this
# they would sort last, and str(NaN) == 'nan' would be counted as one word

df_copy = df.copy()
df_sorted = df_copy.sort_values(by='text', key=lambda col: col.fillna('').astype(str).str.len())
# word counts (whitespace-separated tokens) for both extraction methods
df_sorted['text_length'] = df_sorted['text'].fillna('').astype(str).str.split().str.len()
df_sorted['ocr_text_length'] = df_sorted['ocr_text'].fillna('').astype(str).str.split().str.len()

In [None]:
# show the 20 rows at the top of the sorted frame
df_sorted.iloc[:20]