In [7]:
# Block 1: Imports
from pathlib import Path
import pdfplumber
import pandas as pd
import re

In [8]:
# Block 2: Functions
def extract_metadata(pdf):
    """Extract report metadata from the first page of a PDF.

    Each regular expression below is searched in the text of the first page;
    only the fields whose pattern matches are included in the result.

    Args:
        pdf: An object exposing ``pages``, a sequence of page objects with an
            ``extract_text()`` method (e.g. an opened ``pdfplumber`` PDF).

    Returns:
        dict: Maps a subset of ``'turbine_number'``, ``'service_order'``,
        ``'start_date'`` and ``'reason_for_call_out'`` to the matched string.
        Empty when the document has no pages or the first page has no text.
    """
    metadata = {}

    # A document without pages has nothing to extract from.
    if not pdf.pages:
        return metadata

    # extract_text() may return None for a page without a text layer;
    # normalise to an empty string so re.search() does not raise TypeError.
    text = pdf.pages[0].extract_text() or ''

    patterns = {
        'turbine_number': r'Turbine No\./Id:[\s\n]*(\d+)',
        'service_order': r'Service order[\s\n]*(\d+)',
        'start_date': r'Date:[\s\n]*(\d{2}\.\d{2}\.\d{4})',
        'reason_for_call_out': r'Check ICPE Electrical.*'
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            # Patterns with a capture group yield that group; patterns without
            # one (the call-out reason) yield the whole matched text.
            metadata[key] = match.group(1) if match.re.groups else match.group(0)

    return metadata

def extract_inspection_table(pdf):
    """Extract numbered inspection items from every page of a PDF.

    A line counts as an inspection item when it starts with a number of the
    form ``<digits>.<digits>`` followed by whitespace and a description,
    optionally ending in one of the statuses ``OK``, ``NOT OK``, ``NOK`` or
    ``N/A``. When the same item number appears more than once, only the first
    occurrence is kept.

    Args:
        pdf: An object exposing ``pages``, an iterable of page objects with an
            ``extract_text()`` method (e.g. an opened ``pdfplumber`` PDF).

    Returns:
        list[dict]: One dict per item, in order of first appearance, with the
        keys ``'item_number'``, ``'description'``, ``'status'`` (empty string
        when the line carries no status) and ``'comment'`` (always empty).
    """
    # Compiled once; the pattern itself is loop-invariant.
    item_pattern = re.compile(r'^(\d+\.\d+)\s+(.+?)(?:\s+(OK|NOT OK|NOK|N/A))?$')

    inspection_items = []
    items_seen = set()

    for page in pdf.pages:
        # extract_text() may return None for a page without a text layer.
        text = page.extract_text() or ''

        for line in text.split('\n'):
            # Trailing whitespace (including a stray '\r') would otherwise stop
            # the '$'-anchored status group from matching, and the status would
            # end up inside the description.
            item_match = item_pattern.match(line.rstrip())
            if not item_match:
                continue

            # A missing status group is reported as an empty string.
            item_number, description, status = item_match.groups(default='')

            if item_number in items_seen:
                continue
            items_seen.add(item_number)

            inspection_items.append({
                'item_number': item_number,
                'description': description,
                'status': status,
                'comment': ''
            })

    return inspection_items

def calculate_compliance_ratio(inspection_items):
    """Summarise how many rated inspection items are marked ``'OK'``.

    Items whose ``'status'`` is empty are ignored entirely; every other
    status counts towards the total.

    Args:
        inspection_items: Sequence of dicts, each with a ``'status'`` key.

    Returns:
        tuple: ``(ok_items, total_items, ratio)`` where ``ratio`` is the
        percentage of rated items whose status equals ``'OK'``. Returns
        ``(0, 0, 0)`` when there are no items or none of them is rated.
    """
    if not inspection_items:
        return 0, 0, 0

    rated_count = 0
    ok_count = 0
    for entry in inspection_items:
        entry_status = entry['status']
        if not entry_status:
            # Unrated items do not take part in the statistics.
            continue
        rated_count += 1
        if entry_status == 'OK':
            ok_count += 1

    if rated_count > 0:
        percentage = (ok_count / rated_count) * 100
    else:
        percentage = 0

    return ok_count, rated_count, percentage

In [9]:
# Block 3: Process PDF and Display Results
# Location of the report to parse
folder_path = "data"
file_name = 'vestas_report_example.pdf'
file_path = Path(folder_path, file_name)

# Parse the report and render each result while the PDF is open
with pdfplumber.open(file_path) as pdf:
    # Pull metadata, inspection items and compliance figures from the document
    metadata = extract_metadata(pdf)
    inspection_items = extract_inspection_table(pdf)
    ok_items, total_items, ratio = calculate_compliance_ratio(inspection_items)

    # Metadata table: one row per extracted field
    print("=== Metadata ===")
    metadata_df = pd.DataFrame.from_dict(metadata, orient='index', columns=['Value'])
    display(metadata_df)

    # Compliance summary, one figure per line
    print(
        "\n=== Compliance Stats ===",
        f"OK items: {ok_items}",
        f"Total items: {total_items}",
        f"Compliance ratio: {ratio:.2f}%",
        sep="\n",
    )

    # Inspection items table: one row per item
    print("\n=== Inspection Items ===")
    inspection_df = pd.DataFrame(inspection_items)
    display(inspection_df)

=== Metadata ===


Unnamed: 0,Value
reason_for_call_out,Check ICPE Electrical V136



=== Compliance Stats ===
OK items: 13
Total items: 13
Compliance ratio: 100.00%

=== Inspection Items ===


Unnamed: 0,item_number,description,status,comment
0,0.01,0. DMS: 0093-0083 V03,,
1,1.01,Two persons present.,OK,
2,1.02,Trained in and aware of the contents of the Ve...,,
3,1.03,Be sure to know the general recommendations re...,OK,
4,1.04,Locking of rotor/blades.,OK,
5,1.05,Hearing protection.,OK,
6,1.06,Hydraulic tools.,OK,
7,2.01,Check that wind turbine is clean. Check that n...,OK,
8,2.02,L'intérieur de l'aérogénérateur est maintenu p...,OK,
9,2.03,Remarks: (as a comment),,
