In [1]:
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import pandas as pd
import cv2
import numpy as np

## Prétraitement de l'image

In [2]:
# Load the source scan as a single-channel grayscale image.
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded, so fail here with a clear message instead of letting
# the resize call below blow up with an unrelated-looking error.
image = cv2.imread('test2.png', cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError(
        "Could not read 'test2.png': file is missing or is not a decodable image"
    )

# Increase the resolution by resizing the image 2x (bicubic), if needed.
image = cv2.resize(image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

# Binarisation. With THRESH_OTSU the threshold is computed from the image
# histogram and the `thresh` argument is ignored, so pass 0 rather than a
# value that looks meaningful but has no effect.
_, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Denoising.
# NOTE(review): a kernel size of 1 leaves the image unchanged (OpenCV
# documents ksize as odd and greater than 1). It is kept as-is so the OCR
# output recorded below stays valid; use 3 to actually denoise, then
# re-check the extracted text.
image = cv2.medianBlur(image, 1)

# Save the preprocessed image so it can be inspected; the returned status
# (True on success) is displayed as this cell's output.
cv2.imwrite('preprocessed_image.png', image)



True

## OCR basique avec Tesseract

In [3]:
# Use Tesseract to extract the text from the preprocessed image.
# Opening the image in a `with` block closes the underlying file handle as
# soon as OCR is done instead of leaving it open until garbage collection.
with Image.open('preprocessed_image.png') as preprocessed:
    text = pytesseract.image_to_string(preprocessed)

# Display the raw OCR string as this cell's output.
text

'Capacité vitale Pleth. (L]\nVol. Residuel Pleth. (t]\nCap. Pulm. Totale Pleth. (t]\nCRF pléthysmo. (L]\nVR % CPT pleth. [%]\n\nConductances spécifiques [1/(KPA*S)]\n\nCV Forcée Expi {t]\nVEMS [L)\nVEMS % CV MAX [%]\n\nDEMM 25/75 [L/s]\n\nThéo.\n\n2.76\n1.79\n4.70\n\n2.62\n38.34\n\n1.04\n2.67\n\n2.26\n78.3\n2.97\n\nLIN\n\n2.07\n1.21\n3.72\n\n1.80\n28.75\n\n1.04\n1.96\n\n1.63\n67.6\n1.57\n\nBasal\n\n2.41\n3.83\n6.24\n\n5.01\n61.36\n\n0.35\n2.18\n\n1.12\n46.4\n0.48\n\n% Théo. .\n\n87\n214\n133\n\n191\n160\n\n34\n82\n\n50\n59\n16\n'

## Traitement du texte

In [4]:
# Break the raw OCR output into a list of lines (blank lines included);
# the per-column sections are carved out of this list afterwards.
sections = text.split(sep='\n')

In [6]:
def extract_section(sections, start_keyword=None, end_keyword=None):
    """Return the non-blank lines found between two marker lines.

    Args:
        sections: List of text lines to search (markers are matched exactly
            against whole lines).
        start_keyword: Marker line that precedes the section; the section
            starts on the line after it. If falsy, the section starts at the
            beginning of ``sections``.
        end_keyword: Marker line that terminates the section (exclusive). It
            is searched for *after* the start marker. If falsy, the section
            runs to the end of ``sections``.

    Returns:
        The lines strictly between the two markers, in order, with empty and
        whitespace-only lines removed. Kept lines are returned unmodified.

    Raises:
        ValueError: If a requested marker is not present (for the end marker:
            not present at or after the start of the section).
    """
    def locate(keyword, search_from=0):
        # list.index raises a terse "x is not in list"; re-raise with enough
        # context to tell which marker was not recognised.
        try:
            return sections.index(keyword, search_from)
        except ValueError:
            raise ValueError(
                f"marker {keyword!r} not found in the lines "
                f"(searching from index {search_from})"
            ) from None

    start_idx = locate(start_keyword) + 1 if start_keyword else 0

    # Search for the end marker only past the start marker, so an earlier
    # occurrence of the same line cannot produce an empty or inverted slice.
    end_idx = locate(end_keyword, start_idx) if end_keyword else len(sections)

    # strip() also discards whitespace-only lines (e.g. a trailing form feed),
    # which would otherwise be counted as a value.
    return [item for item in sections[start_idx:end_idx] if item.strip()]

# Carve the OCR lines into the parameter names and the four value columns.
# Each (start, end) pair below is the marker line opening a section and the
# marker line closing it; None means "from the beginning" / "to the end".
(
    parameters_section,
    theo_values,
    lin_values,
    basal_values,
    perc_theo_values,
) = [
    extract_section(sections, start_marker, end_marker)
    for start_marker, end_marker in [
        (None, 'Théo.'),
        ('Théo.', 'LIN'),
        ('LIN', 'Basal'),
        ('Basal', '% Théo. .'),
        ('% Théo. .', None),
    ]
]


In [8]:
# Assemble the result table: one row per parameter, one column per extracted
# section. All five lists must have the same length for this to succeed.
df_final = pd.DataFrame(dict(zip(
    ['Parameters', 'Théo', 'LIN', 'Basal', '% Théo'],
    [parameters_section, theo_values, lin_values, basal_values, perc_theo_values],
)))

In [9]:
# Display the assembled table as this cell's output.
df_final

Unnamed: 0,Parameters,Théo,LIN,Basal,% Théo
0,Capacité vitale Pleth. (L],2.76,2.07,2.41,87
1,Vol. Residuel Pleth. (t],1.79,1.21,3.83,214
2,Cap. Pulm. Totale Pleth. (t],4.7,3.72,6.24,133
3,CRF pléthysmo. (L],2.62,1.8,5.01,191
4,VR % CPT pleth. [%],38.34,28.75,61.36,160
5,Conductances spécifiques [1/(KPA*S)],1.04,1.04,0.35,34
6,CV Forcée Expi {t],2.67,1.96,2.18,82
7,VEMS [L),2.26,1.63,1.12,50
8,VEMS % CV MAX [%],78.3,67.6,46.4,59
9,DEMM 25/75 [L/s],2.97,1.57,0.48,16
