In [1]:
import io
import fitz
import numpy as np
from PIL import Image, ImageOps, ImageFilter, ImageEnhance
import pytesseract
import pandas as pd

In [2]:
# Render scale applied to both axes when rasterising a PDF page; a higher
# value gives the OCR step more pixels per glyph.
zoom = 4
# Scaling matrix passed to page.get_pixmap() in read().
mat = fitz.Matrix(zoom, zoom)
# Tesseract options: --oem 3 selects the default engine mode, --psm 12 is the
# "sparse text with OSD" page segmentation mode, and the whitelist restricts
# recognition to digits, comma and percent sign.
config = r'--oem 3 --psm 12 -c tessedit_char_whitelist=0123456789,%'

In [3]:
def resize(image):
    '''Trim the empty border of a page image and scale it to a fixed width.

    The page is cropped to the bounding box of its non-white content, then
    rescaled to a width of 2121 px with the height following the aspect
    ratio of the cropped area, so pixel coordinates are comparable between
    pages.

    Parameters:
        image: PIL image of a rendered page.

    Returns:
        numpy array holding the cropped and rescaled page.
    '''
    target_width = 2121

    # getbbox() bounds the non-zero pixels, so invert first: pure white
    # borders become zero and fall outside the box.
    content_box = ImageOps.invert(image.convert("RGB")).getbbox()
    trimmed = image.crop(content_box)

    height_per_width = trimmed.height / trimmed.width
    target_size = (target_width, int(target_width * height_per_width))

    return np.array(trimmed.resize(target_size, Image.NEAREST))

def prepare(image):
    '''Pre-process an image crop before handing it to the OCR engine.

    Steps, in order: convert to 8-bit greyscale, apply the SMOOTH_MORE
    filter, invert the tones, and raise brightness by a factor of 1.2.

    Parameters:
        image: PIL image to process.

    Returns:
        The processed PIL image.
    '''
    smoothed = image.convert('L').filter(ImageFilter.SMOOTH_MORE)
    negative = ImageOps.invert(smoothed)
    return ImageEnhance.Brightness(negative).enhance(1.2)

In [4]:
# Module-level accumulator: read() appends one 2-tuple of strings per
# recognised row, and the list is later turned into a DataFrame.
values = []

def read(file, pageNumber):
    '''OCR the numeric column of one PDF page and collect its value pairs.

    The page is rendered with the module-level matrix `mat`, normalised with
    resize(), cropped to a fixed pixel window, cleaned with prepare() and
    passed to Tesseract using the module-level `config`. Recognised tokens
    are paired two by two and appended to the module-level `values` list.

    Parameters:
        file: path of the PDF document to open.
        pageNumber: zero-based index of the page to read.

    Returns:
        List of 2-tuples of strings extracted from this page (the same
        tuples that were appended to `values`).
    '''
    global values

    pdf = fitz.open(file)
    try:
        page = pdf.load_page(pageNumber)
        # Encode the rendered page to image bytes while the document is open.
        page_bytes = page.get_pixmap(alpha=False, matrix=mat).tobytes()
    finally:
        # Release the document even if loading or rendering fails.
        pdf.close()

    image = Image.open(io.BytesIO(page_bytes))
    resized = resize(image)

    # Fixed pixel window on the width-normalised page: rows from 730 down,
    # columns 1500-1900.
    # NOTE(review): the row upper bound uses the un-resized page height
    # doubled; numpy clamps it, so it appears to mean "to the bottom" - confirm.
    numbers_column = resized[730:image.height*2, 1500:1900]
    numbers_column_image = Image.fromarray(numbers_column)
    processed_image = prepare(numbers_column_image)
    results = pytesseract.image_to_string(processed_image, config=config).split('\n')

    # Strip whitespace (this includes the form feed '\x0c' Tesseract emits at
    # the end of its output) and drop blank lines. Unlike list.remove(), this
    # does not raise when the form feed is missing or glued to a token.
    stripped = (line.strip() for line in results)
    out = [line for line in stripped if line]

    # Normalise decimal comma to a dot and drop the percent sign.
    out = [value.replace(',', '.').replace('%', '') for value in out]
    # Discard the last two tokens of the page - presumably a trailing
    # summary/footer row; verify against the report layout.
    out = out[:-2]

    # Consecutive tokens form one row; zip() on a shared iterator pairs them
    # and silently drops a trailing unpaired token.
    it = iter(out)
    data = list(zip(it, it))
    values.extend(data)
    return data

In [5]:
# read() appends to the module-level `values` list, so empty it first:
# otherwise re-running this cell would add the same pages a second time
# and double the row count.
values.clear()

# Process page indices 0 through 8 of the report.
for i in range(0, 9):
    read('./data/report.pdf', i)

# One row per recognised pair; both columns hold the OCR text as strings.
df = pd.DataFrame(values, columns=['prima_dose', 'seconda_dose'])

print(len(values), 'comuni')

df

390 comuni


Unnamed: 0,prima_dose,seconda_dose
0,92.67,90.81
1,89.84,87.92
2,92.29,91.78
3,90.80,90.00
4,93.60,93.31
...,...,...
385,84.07,81.38
386,92.44,90.86
387,85.96,83.40
388,87.83,85.39
