In [None]:
import re
from pathlib import Path
import pandas as pd
import pytesseract
from pdf2image import convert_from_path

In [None]:
# --- Paths -------------------------------------------------------------------
# NOTE(review): these are absolute, machine/user-specific Windows paths; the
# notebook will not run elsewhere without editing them.
# Root folder of the project; the Excel export is written here.
PROJECT_PATH = r'C:\python\development\extraction_rr_material_properties'
# Folder that is globbed for the input '*.pdf' files.
PDF_PATH = rf'{PROJECT_PATH}\data'
# Poppler 'bin' directory, passed to pdf2image's convert_from_path(poppler_path=...).
POPPLER_PATH = r'C:\Users\M67743\AppData\Local\poppler\poppler-25.12.0\Library\bin'
# Full path of the Tesseract executable used by pytesseract.
TESSERACT_PATH = r'C:\Users\M67743\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
 
# Point pytesseract at the Tesseract binary configured above.
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

# --- OCR configs -------------------------------------------------------------
# Both strings are handed to pytesseract.image_to_string(config=...).
# Empty string = no extra Tesseract options; used for the keyword scan and for
# spool-table pages.
standard_config = r''  # For tables
# Explicit engine/page-segmentation options; used for all non-table pages.
custom_config = r'--oem 3 --psm 6'  # For structured text

In [None]:
def find_relevant_pages(file, dpi_value=150):
    """
    First pass: render every page of the PDF at a low DPI, OCR it, and record
    which pages mention the sections we care about.

    Args:
        file: Path of the PDF to scan.
        dpi_value: Rendering resolution for this scan (low = faster, rougher OCR).

    Returns:
        dict with keys 'lot_info', 'tg_data' and 'spool_table'; each value is a
        list of 1-based page numbers whose lowercased OCR text contains the
        section's keywords.
    """
    print('  Quick scan to find relevant pages...')

    rendered_pages = convert_from_path(file, dpi=dpi_value, poppler_path=POPPLER_PATH)

    # Section name -> phrases that must ALL appear in the lowercased page text.
    section_keywords = {
        'lot_info': ('lot no:', 'part number'),
        'tg_data': ('tg by dma',),
        'spool_table': ('lot averages report for lot',),
    }
    pages = {section: [] for section in section_keywords}

    for page_num, image in enumerate(rendered_pages, start=1):
        # The OCR result is only used for keyword spotting, so default options suffice.
        page_text = pytesseract.image_to_string(image, config=standard_config).lower()
        for section, phrases in section_keywords.items():
            if all(phrase in page_text for phrase in phrases):
                pages[section].append(page_num)

    print(f'    Lot info pages: {pages["lot_info"]}')
    print(f'    Tg data pages: {pages["tg_data"]}')
    print(f'    Spool table pages: {pages["spool_table"]}')

    return pages

In [None]:
def process_file(file, hi_dpi=300, low_dpi=150):
    """
    Two-pass OCR of one PDF.

    1. A fast low-DPI scan (find_relevant_pages) locates the pages of interest.
    2. Only those pages are rendered at high DPI and OCR'd again.

    Args:
        file: Path of the PDF to process.
        hi_dpi: Rendering resolution for the second, high-quality pass.
        low_dpi: Rendering resolution for the first, page-finding pass.

    Returns:
        list[str]: the OCR text of the selected pages split on newlines. Each
        page is preceded by a '--- PAGE <n> ---' marker line. When no page was
        selected the result is [''].
    """
    print(f'Processing {file}...')

    # PASS 1: Find relevant pages (fast, low DPI)
    relevant_pages = find_relevant_pages(file, low_dpi)

    # PASS 2: High-quality OCR only on relevant pages
    print('  High-quality OCR on selected pages...')

    # Union of all page numbers any section needs, in document order.
    # NOTE: only pages that matched a keyword are included; a spool table that
    # continues onto a page without the heading is not picked up here.
    spool_pages = set(relevant_pages['spool_table'])
    all_pages_needed = sorted(
        set(relevant_pages['lot_info']) | set(relevant_pages['tg_data']) | spool_pages
    )
    print(f'    Processing pages: {all_pages_needed}')

    page_chunks = []
    for page_num in all_pages_needed:
        # Render just this page at high DPI instead of the whole document;
        # rendering is the expensive step and most pages are never used.
        rendered = convert_from_path(
            file,
            dpi=hi_dpi,
            first_page=page_num,
            last_page=page_num,
            poppler_path=POPPLER_PATH,
        )
        if not rendered:
            # Page could not be rendered (e.g. number out of range) - skip it.
            continue

        # Spool-table pages use the default options, everything else PSM 6.
        config = standard_config if page_num in spool_pages else custom_config

        text = pytesseract.image_to_string(rendered[0], config=config)
        page_chunks.append(f'\n--- PAGE {page_num} ---\n{text}\n')

    return ''.join(page_chunks).split('\n')

In [None]:
# ===== EXTRACT LOT INFORMATION =====
def extract_lot_info(lines):
    """
    Pull lot number, part number and manufacture date out of OCR text lines.

    The lot number and the date are only read from a line that carries both
    the 'Lot No:' and the 'Date of Manufacture:' label; the part number is read
    from any line with a 'Part Number:' label. If a field matches on several
    lines, the last match is kept.

    Args:
        lines: Iterable of text lines.

    Returns:
        tuple (lot_no, part_number, date_of_manufacture); each element is the
        matched string or None when nothing matched.
    """
    lot_pattern = re.compile(r'Lot No:\s*(92M\d+)')
    date_pattern = re.compile(r'Date of Manufacture:\s*(\d+\s+\w+\s+\d{4})')
    part_pattern = re.compile(r'Part Number:\s*([A-Z0-9-]+)')

    lot_no = part_number = date_of_manufacture = None

    for text in lines:
        has_lot_and_date = 'Lot No:' in text and 'Date of Manufacture:' in text
        if has_lot_and_date:
            lot_hit = lot_pattern.search(text)
            if lot_hit is not None:
                lot_no = lot_hit.group(1)

            date_hit = date_pattern.search(text)
            if date_hit is not None:
                date_of_manufacture = date_hit.group(1)

        if 'Part Number:' in text:
            part_hit = part_pattern.search(text)
            if part_hit is not None:
                part_number = part_hit.group(1)

    print(f'    lot_no = {lot_no}, part_number = {part_number}, date_of_manufacture = {date_of_manufacture}')

    return lot_no, part_number, date_of_manufacture

In [None]:
# ===== EXTRACT TG DATA (up to 3 values for the lot) =====
def extract_tg_data(lines):
    """
    Collect the Tg-by-DMA temperatures from OCR text lines.

    A line counts as a Tg line when it contains the label
    'Tg by DMA in <deg>C (' where <deg> is at most two arbitrary non-space
    characters. This accepts a correctly decoded degree sign, the mis-decoded
    two-character form, a missing sign, or a single OCR substitute. From each
    such line the first number of the form ddd.d... is taken.

    Args:
        lines: Iterable of text lines.

    Returns:
        tuple (tg_1, tg_2, tg_3): the first three values found, as floats, in
        order of appearance; missing positions are None.
    """
    # Tolerant label match: the degree sign is the part OCR/encoding most
    # often gets wrong, so it is not matched literally.
    label_pattern = re.compile(r'Tg by DMA in\s*\S{0,2}\s*C\s*\(')
    value_pattern = re.compile(r'(\d{3}\.\d+)')

    tg_values = []
    for line in lines:
        if not label_pattern.search(line):
            continue
        temp_match = value_pattern.search(line)
        if temp_match:
            tg_values.append(float(temp_match.group(1)))

    # Pad with None so that exactly three values can be unpacked.
    tg_1, tg_2, tg_3 = (tg_values + [None] * 3)[:3]

    print(f'    tg_1 = {tg_1}, tg_2 = {tg_2}, tg_3 = {tg_3}')

    return tg_1, tg_2, tg_3

In [None]:
# ===== EXTRACT SPOOL TABLE AND CREATE DATAFRAME =====
def extract_spool_table(lines, lot_no, part_number, date_of_manufacture, tg_1, tg_2, tg_3):
    """
    Parse the spool rows that follow the 'Lot Averages Report for Lot' heading
    and return them as a DataFrame, one row per spool.

    A line is accepted as a spool row when, after stripping, it
      * is non-empty and contains neither 'Page' nor 'Averages:',
      * splits into at least 6 whitespace-separated fields,
      * has a first field of exactly 13 characters (after replacing '$' with
        '9', which compensates for an OCR misread), and
      * has integer values in fields 3 and 5 (indices 2 and 4).

    Args:
        lines: List of OCR text lines.
        lot_no, part_number, date_of_manufacture: Lot-level values copied onto
            every row.
        tg_1, tg_2, tg_3: Lot-level Tg values copied onto every row.

    Returns:
        pandas.DataFrame with columns part_no, lot_no, manufacture_date
        (converted with pd.to_datetime), spool_no, rc, paw, tg_1, tg_2, tg_3.
        An empty DataFrame is returned when the heading is missing or no row
        qualifies.
    """
    # Index of the first heading line, or None when it is absent. Using None
    # as the sentinel means a heading on the very first line is still valid.
    start_idx = next(
        (i for i, line in enumerate(lines) if 'Lot Averages Report for Lot' in line),
        None,
    )

    if start_idx is None:
        print('  "Lot Averages Report for Lot" phrase not found...')
        return pd.DataFrame()

    table_data = []

    for raw_line in lines[start_idx + 1:]:
        line = raw_line.strip()

        if not line or 'Page' in line or 'Averages:' in line:
            continue

        row = line.split()

        if len(row) < 6:
            continue

        # OCR tends to read a leading '9' as '$'.
        spool = row[0].replace('$', '9')
        if len(spool) != 13:
            continue

        try:
            rc = int(row[2])
            paw = int(row[4])
        except ValueError:
            # Not a data row (non-numeric fields) - skip it.
            continue

        table_data.append({
            'part_no': part_number,
            'lot_no': lot_no,
            'manufacture_date': date_of_manufacture,
            'spool_no': spool,
            'rc': rc,
            'paw': paw,
            'tg_1': tg_1,
            'tg_2': tg_2,
            'tg_3': tg_3
        })

    if not table_data:
        print('  No spool data extracted')
        return pd.DataFrame()

    df = pd.DataFrame(table_data)
    df['manufacture_date'] = pd.to_datetime(df['manufacture_date'])

    print(f'  Extracted {len(df)} spool records')

    return df

In [None]:
# Process every PDF in PDF_PATH and stack the per-file spool tables.
# Sorted so the row order of the result is the same on every run.
files = sorted(Path(PDF_PATH).glob('*.pdf'))

# Per-file frames are collected and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
frames = []

for file in files:
    print(f'\n{"="*70}')
    # 275 DPI for the high-quality pass, 150 DPI for the page-finding scan.
    lines = process_file(file, 275, 150)
    lot_no, part_number, date_of_manufacture = extract_lot_info(lines)
    tg_1, tg_2, tg_3 = extract_tg_data(lines)
    df = extract_spool_table(lines, lot_no, part_number, date_of_manufacture, tg_1, tg_2, tg_3)
    if not df.empty:
        frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(f'\n{"="*70}')
print(f'FINAL RESULTS: {len(df_all)} total records extracted')
print(f'{"="*70}')

In [None]:
# Lift pandas' row/column display limits (global, for the rest of the session)
# so that the complete result table is rendered as this cell's output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df_all

In [None]:
# Write the combined table to an Excel workbook in the project folder.
# Raw f-string: in a non-raw literal '\M' is an invalid escape sequence
# (deprecated, SyntaxWarning on recent Python versions). The resulting path
# string is unchanged.
output = rf'{PROJECT_PATH}\MSRR4040_VAFR.xlsx'
df_all.to_excel(output, sheet_name='rr_material_certs', index=False)