In [10]:
import time

import pandas as pd
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar

In [3]:
def get_pages(pdf):
    """Return every page yielded by ``extract_pages(pdf)`` as a list.

    ``pdf`` is forwarded unchanged to pdfminer's ``extract_pages``; the
    notebook calls this with a file path string. All pages are consumed
    eagerly before the list is returned.
    """
    return list(extract_pages(pdf))

In [13]:
def get_figures(page):
    """Pair each direct child of ``page`` with the page's id.

    Returns a list of ``(page.pageid, child)`` tuples, one per item obtained
    by iterating ``page``, in iteration order. Despite the name, no filtering
    by element type happens here: every child is included.
    """
    tagged = []
    for layout_item in page:
        # pageid is read per item, so an empty page never touches the attribute.
        tagged.append((page.pageid, layout_item))
    return tagged

In [5]:
def get_chars(figure):
    """Build one row per character found directly inside a layout element.

    Parameters
    ----------
    figure : sequence
        A ``(page_number, layout_element)`` pair, as produced by
        ``get_figures``.

    Returns
    -------
    list of tuple
        ``(page_number, text, x0, x1, width, y0, y1, height)`` for every
        direct child of the layout element that is an ``LTChar``. Children of
        any other type are skipped, and only one level is inspected:
        characters nested inside a child container are not visited.

    Notes
    -----
    A layout element that cannot be iterated has no children to inspect, so
    it contributes an empty list. Previously such an element raised
    ``TypeError`` and aborted the extraction of the whole document.
    """
    page_number = figure[0]
    container = figure[1]

    # Only the iter() call is guarded, so a TypeError raised while reading an
    # individual child's attributes still propagates.
    try:
        children = iter(container)
    except TypeError:
        return []

    return [(page_number,
             element.get_text(),
             element.x0,
             element.x1,
             element.width,
             element.y0,
             element.y1,
             element.height) for element in children if isinstance(element, LTChar)]

In [6]:
def get_chars_df(chars_list):
    """Wrap a list of character rows in a labelled DataFrame.

    ``chars_list`` holds 8-field rows in the order emitted by ``get_chars``;
    the resulting frame names those fields ``page_number``, ``chars``, ``x0``,
    ``x1``, ``width``, ``y0``, ``y1`` and ``height``.
    """
    column_names = [
        'page_number',
        'chars',
        'x0',
        'x1',
        'width',
        'y0',
        'y1',
        'height',
    ]
    return pd.DataFrame(chars_list, columns=column_names)

In [7]:
def get_pdf_chars_in_df(pdf):
    """Run the full extraction pipeline for one PDF.

    Loads every page with ``get_pages``, flattens the per-page output of
    ``get_figures`` into one list, flattens the per-element output of
    ``get_chars`` into one list of rows, and returns those rows as a
    DataFrame via ``get_chars_df``.
    """
    # Phase 1: every (page id, layout element) pair across the document.
    tagged_elements = [
        tagged
        for page in get_pages(pdf)
        for tagged in get_figures(page)
    ]

    # Phase 2: every character row, in page order then element order.
    char_rows = [
        row
        for tagged in tagged_elements
        for row in get_chars(tagged)
    ]

    return get_chars_df(char_rows)

In [11]:
# Time the character extraction for a single leaflet with wall-clock stamps.
char_extract_start = time.time()
chars_df = get_pdf_chars_in_df('leaflets/pdfs/RO43-1000.pdf')
char_extract_end = time.time()

# Elapsed time in seconds, printed with no separator before the unit.
duration = char_extract_end - char_extract_start
print(f'Duration: {duration}s')

Duration: 471.1339626312256s


In [15]:
# Spot-check ten randomly chosen rows (unseeded, so the rows differ per run).
chars_df.sample(n=10)

Unnamed: 0,page_number,chars,x0,x1,width,y0,y1,height
6253,7,,99.50987,101.32987,1.82,11.33849,18.33849,7.0
20687,25,I,78.87447,82.08447,3.21,91.11219,101.11219,10.0
12173,13,e,270.38557,277.62657,7.241,140.72078,153.72078,13.0
28971,35,r,211.94696,215.10696,3.16,298.50928,306.50928,8.0
12649,13,U,395.19527,402.32527,7.13,222.36148,232.36148,10.0
29759,36,R,264.69008,280.52408,15.834,528.25201,554.25201,26.0
25933,31,9,518.22655,534.88555,16.659,571.17682,598.17682,27.0
24508,29,F,416.69146,422.60146,5.91,580.28973,590.28973,10.0
31016,37,ț,60.82327,65.30327,4.48,308.48898,318.48898,10.0
24762,29,,233.73057,235.55057,1.82,23.20839,30.20839,7.0


In [16]:
# Write the extracted characters to CSV without the DataFrame index column.
# NOTE(review): 'utf-8 sig' is a non-canonical spelling; presumably it
# resolves to the 'utf-8-sig' codec (UTF-8 with BOM) - confirm.
chars_df.to_csv(
    'leaflets/pdfs_chars_output/RO43-1000_chars.csv',
    index=False,
    encoding='utf-8 sig',
)

In [151]:
# Disabled Bokeh label plot, kept as a bare string literal so none of it runs.
# NOTE(review): the snippet reads from `data` and a 'Char' column, neither of
# which is defined in the visible cells (the frame built above is `chars_df`
# with a 'chars' column) - it would need updating before being re-enabled.
'''
from bokeh.models import ColumnDataSource, Label, LabelSet, Range1d
from bokeh.plotting import figure, output_file, show

output_file("label.html", title="label.py example")

source = ColumnDataSource(data=dict(x0=list(data['x0'].values),
                                    y0=list(data['y0'].values),
                                    chars=list(data['Char'].values)))

p = figure(title='PDF',
           x_range=Range1d(240, 375),
          plot_width=1500,
          plot_height=10000)

p.scatter(x='x0', y='y0', size=2, source=source, color='white')
p.xaxis[0].axis_label = 'Column'
p.yaxis[0].axis_label = 'Row'

labels = LabelSet(x='x0', y='y0', text='chars', level='glyph',
              x_offset=15, y_offset=15, source=source, render_mode='canvas')

p.add_layout(labels)

show(p)'''