In [49]:
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
import io
import re

# Set the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\e2023898\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'  # Update this path based on your installation

In [15]:
# IMPORT PDF REPORT

pdf_path = r"V:\TEST CHILLER\PDF\00378721_TALF8NHHBX1T1U1R00_NUOVI_COMPRESSORI_20241108_133955.pdf"
pdf_document = fitz.open(pdf_path)
page = pdf_document.load_page(0)  # Load the first page
image_list = page.get_images(full=True)
if not image_list:
    raise ValueError("No images found in the PDF.")
    
xref = image_list[0][0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
image = Image.open(io.BytesIO(image_bytes))
#image.show()

# 1 -EXTRACTS THE LOGO OF THE COMPANY
def replace_top_right_corner_with_blank(image, blank_width=230, blank_height=120):
    draw = ImageDraw.Draw(image)
    width, height = image.size
    draw.rectangle([(width - blank_width, 0), (width, blank_height)], fill="white")
    return image

image = replace_top_right_corner_with_blank(image)
#image.show()

# 2 - REMOVE TEXTS FROM REPORT
def replace_bottom_left_corner_with_blank(image):
    draw = ImageDraw.Draw(image)
    width, height = image.size
    draw.rectangle([(0, 250), (150, 290)], fill="white")
    return image

image = replace_bottom_left_corner_with_blank(image)

# SPLIT DOC IN TWO PARTS, UPPER TEST INFO, BOTTOM TEST DATA
def extract_and_split_pdf_image(image, top_height_ratio=0.31):

    width, height = image.size
    top_height = int(height * top_height_ratio)
    
    image_top = image.crop((0, 0, width, top_height))
    image_bottom = image.crop((0, top_height, width, height))
    
    return image_top, image_bottom

# Example usage

image_top, image_bottom = extract_and_split_pdf_image(image)

# Display the output images
image_top.show()
image_bottom.show()



In [16]:
#EXTRACT DATA TOP PART OF THE REPORT

# IMPROVE IMAGE QUALITY
def enhance_image(image):
    # Increase size
    new_size = (image.width * 2, image.height * 2)
    enlarged_image = image.resize(new_size, Image.Resampling.LANCZOS)
    
    # Increase contrast
    enhancer = ImageEnhance.Contrast(enlarged_image)
    enhanced_image = enhancer.enhance(2)  # Increase contrast by a factor of 2
    
    return enhanced_image

enhanced_image_top = enhance_image(image_top)
#enhanced_image_top.show()

# DEFINE SEARCH BOXES

def draw_rectangles(image, areas):
    draw = ImageDraw.Draw(image)
    for area in areas:
        draw.rectangle(area, outline="red", width=2)
    return image

# Define the areas (left, upper, right, lower) for each of the 20 regions
areas = [
    (190, 60, 260, 120), # TEXT STAZIONE -1
    (190, 120, 440, 200), # TEXT OPERATOR -2 
    (190, 200, 440, 250), #TEXT DATA-ORA - 3 
    (665, 200, 900, 250), #ESITO COLLAUDO - 4
    (190, 250, 440, 300), #TEXT CODICE - 5 
    (665, 250, 900, 300), #TEXT LINEA - 6 
    (190, 300, 440, 350), # TEXT MATRICOLA - 7
    (665, 300, 900, 350), # TEXT MATRICOLA - 8 
    (190, 350, 440, 400), # TEXT ODL - 9
    (665, 350, 900, 400), # GAS - 10
    (110, 400, 440, 450), #TEXT REVIS - 11
    (500, 390, 900, 450), # TEXT CIRCUITO NUMBER - 12
    (190, 450, 480, 490), #TEXT ALIMENTAZIONE - 13
    (470, 490, 1000, 550), #TEXT REGOLAZIONE VALVOLA - 14
    (1210, 490, 1400, 530), #TEXT ORA - 15
    (1040, 530, 1400, 560), #TEXT DURATION - 16
    (1150, 400, 1400, 450), # TEXT RESP. TEST - 17
    
]

# Example usage
#image_with_rectangles = draw_rectangles(enhanced_image_top, areas)
#image_with_rectangles.show()

def preprocess_image(image):
    # Convert to grayscale
    image = image.convert('L')
    # Increase contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)
    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())
    return image

def extract_text_with_retries(image, config, retries=3):
    for _ in range(retries):
        ocr_result = pytesseract.image_to_string(image, config=config)
        if ocr_result.strip():
            return ocr_result.strip()
    return 'No text found'


def extract_text_from_areas(image, areas):
    custom_config = r'--oem 3 --psm 6'
    fallback_config = r'--oem 3 --psm 3'
    extracted_texts = {}
    
    for i, area in enumerate(areas):
        cropped_image = enhanced_image_top.crop(area)
        preprocessed_image = preprocess_image(cropped_image)
        ocr_result = extract_text_with_retries(preprocessed_image, custom_config)
        if ocr_result == 'No text found':
            ocr_result = extract_text_with_retries(preprocessed_image, fallback_config)
        extracted_texts[f'area_{i+1}'] = ocr_result
    
    return extracted_texts


extracted_texts = extract_text_from_areas(enhanced_image_top, areas)

# Print the extracted text
#for area, text in extracted_texts.items():
#   print(f"{area}: {text}")

test_reports = pd.DataFrame([extracted_texts])
test_reports.rename(columns={
    'area_1': 'STATION', 
    'area_2': 'OPERATOR',
    'area_3': 'ISSUED',
    'area_4': 'STATUS',
    'area_5': 'PART.NUMBER',
    'area_6': 'LINE',
    'area_7': 'SERIAL',
    'area_8': 'WORK.FLUID',
    'area_9': 'ORDER',
    'area_10': 'GAS',
    'area_11': 'REV',
    'area_12': 'CIRCUITS',
    'area_13': 'ELECTRICAL',
    'area_14': 'CONFIG',
    'area_15': 'TIME',
    'area_16': 'DURATION',
    'area_17': 'RESPONSIBLE',},
    inplace=True)

test_reports.head()

Unnamed: 0,STATION,OPERATOR,ISSUED,STATUS,PART.NUMBER,LINE,SERIAL,WORK.FLUID,ORDER,GAS,REV,CIRCUITS,ELECTRICAL,CONFIG,TIME,DURATION,RESPONSIBLE
0,R110,1 RESTANI MATTEO,08/11/2024 14:00:01 -,PASSED,TALF8NHHBX1T1U1,ALTA PORTATA,378721,ACQUA,616540,R410a_PAtt,Revis.: 00,Nr. Circuiti: 1,400V-460V 3PH SOHz-60Hz,"chiuso VT 2 giri tot 7,2 kg VT chiusa 10 giri",10:30:51,Durata regime: Oh 4m 25s,Ing. Paolo Russo


In [None]:
area_4 = areas[11]
cropped_image_area_8 = enhanced_image_top.crop(area_4)

# Show the cropped image of area_8
cropped_image_area_8.show()

In [None]:



enhanced_image_bottom = enhance_image(image_bottom)

areas2 = [
    (30, 50, 1360, 90), # LINE 1
    (30, 90, 1360, 125), # LINE 2
    (30, 125, 1360, 160), # LINE 3
    (30, 160, 1360, 195), # LINE 4
    (30, 195, 1360, 230), # LINE 5
    (30, 230, 1360, 270), # LINE 6
    (30, 270, 1360, 305), # LINE 7
    (30, 305, 1360, 340), # LINE 8
    (30, 340, 1360, 375), # LINE 9
    (30, 375, 1360, 410), # LINE 10
    (30, 415, 1360, 450), # LINE 11
    (30, 450, 1360, 485), # LINE 12
    (30, 485, 1360, 520), # LINE 13
    (30, 520, 1360, 555), # LINE 14
    (30, 555, 1360, 590), # LINE 15
    (30, 592, 1360, 627), # LINE 16
    (30, 629, 1360, 664), # LINE 17
    (30, 665, 1360, 700), # LINE 18
    (30, 700, 1360, 735), # LINE 19
    #(30, 735, 1360, 770), # LINE 20
    #(30, 770, 1360, 810), # LINE 21
    #(30, 810, 1360, 845), # LINE 22
    #(30, 845, 1360, 880), # LINE 23
    (30, 975, 1360, 1010), # LINE 24
    (30, 1010, 1360, 1045), # LINE 25
    (30, 1045, 1360, 1080), # LINE 26
    (30, 1080, 1360, 1117), # LINE 27
    (30, 1117, 1360, (1117+45)), # LINE 27 
]

# Example usage
image_bottom_with_rectangles = draw_rectangles(enhanced_image_bottom, areas2)
image_bottom_with_rectangles.show()

In [55]:
def extract_text_from_areas2(image, areas):
    custom_config = r'--oem 3 --psm 6'
    fallback_config = r'--oem 3 --psm 3'
    extracted_texts = {}
    
    for i, area in enumerate(areas):
        cropped_image = enhanced_image_bottom.crop(area)
        preprocessed_image = preprocess_image(cropped_image)
        ocr_result = extract_text_with_retries(preprocessed_image, custom_config)
        if ocr_result == 'No text found':
            ocr_result = extract_text_with_retries(preprocessed_image, fallback_config)
        extracted_texts[f'area_{i+1}'] = ocr_result
    
    return extracted_texts


extracted_texts2 = extract_text_from_areas2(enhanced_image_bottom, areas2)
print(extracted_texts2)
test_reports2 = pd.DataFrame([extracted_texts2])



test_reports2.head()

{'area_1': 'P7 [bar] Pressione Condensazione 1 28,531', 'area_2': 'TC_P7[C] Temperatura condensazione 1 calcolata (da P7)', 'area_3': 'TC7[C) Temperatura Liquida 1', 'area_4': 'SOTT_1[C) Sottoraffreddamento 1 (TC7-TC_P7)', 'area_5': 'Pressione Evaporazione 1 19,300 sis', 'area_6': 'TC_P9[C] Temperatura evaporazione 1 calcolata (da P9)', 'area_7': 'Tc9[C} Temperatura Aspirazione 1', 'area_8': 'SURR_1[C) Surriscaldamento 1 (TC9-TC_P9)', 'area_9': 'VCM_5 [VJ Tensione concatenata media - Presa 5 398,472', 'area_10': 'ITOT_5 [A] Corrente totale - Presa 5', 'area_11': 'PACTT_5 [kW] Potenza Aitiva Totale - Presa 5', 'area_12': 'ey se', 'area_13': 'es ne ee', 'area_14': 'ey en', 'area_15': 'ey Dn', 'area_16': 'ey en', 'area_17': 'es ne ee', 'area_18': 'es ne ee', 'area_19': 'ey en', 'area_20': 'ey Dn', 'area_21': 'ey', 'area_22': 'ey se', 'area_23': 'es es ee', 'area_24': 'TAC_M[C] Temperatura aria cabina media |32,031', 'area_25': 'T3(C] Temper. acqua IN B 119,362 sf', 'area_26': 'T4[C] Tempe

Unnamed: 0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,area_10,...,area_19,area_20,area_21,area_22,area_23,area_24,area_25,area_26,area_27,area_28
0,"P7 [bar] Pressione Condensazione 1 28,531",TC_P7[C] Temperatura condensazione 1 calcolata...,TC7[C) Temperatura Liquida 1,SOTT_1[C) Sottoraffreddamento 1 (TC7-TC_P7),"Pressione Evaporazione 1 19,300 sis",TC_P9[C] Temperatura evaporazione 1 calcolata ...,Tc9[C} Temperatura Aspirazione 1,SURR_1[C) Surriscaldamento 1 (TC9-TC_P9),VCM_5 [VJ Tensione concatenata media - Presa 5...,ITOT_5 [A] Corrente totale - Presa 5,...,ey en,ey Dn,ey,ey se,es es ee,"TAC_M[C] Temperatura aria cabina media |32,031","T3(C] Temper. acqua IN B 119,362 sf","T4[C] Temper. acqua OUT B [15,201 |",[MP3_4[lmin] _| Portata totale evaporatore B 2...,EVAP2_POWER [kW] | Resa lato evaporatore 2 (Ac...


In [57]:

def organize_extracted_text_to_dataframe(extracted_texts):
    data = {}
    
    for area, text in extracted_texts2.items():
        # Split the text into title and value based on the presence of a comma
        parts = re.split(r'(?<=\D),(?=\d)', text)
        if len(parts) == 2:
            title = parts[0].strip()
            value_str = parts[1].strip()
            # Remove any non-numeric characters except for the comma
            value_str = re.sub(r'[^0-9,]', '', value_str)
            # Convert the cleaned string to a float
            value = float(value_str.replace(',', '.'))
            data[title] = value
    
    # Convert the data dictionary to a DataFrame
    df = pd.DataFrame([data])
    
    return df

# Example usage


df2 = organize_extracted_text_to_dataframe(extracted_texts2)
print(df2)

Empty DataFrame
Columns: []
Index: [0]


In [63]:
import pandas as pd
df3 = pd.read_csv(pdf_path, header=None, delim_whitespace=True)
df3.head()

  df3 = pd.read_csv(pdf_path, header=None, delim_whitespace=True)


EmptyDataError: No columns to parse from file