In [2]:
import fitz
from operator import itemgetter


In [3]:
def get_fonts_size(doc, granularity=False):
    styles = {}
    font_counts = {}
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        first = True
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if first == True:
                            previous_s = s
                            first = False
                            if granularity:
                                identifier = "{0}_{1}_{2}_{3}".format(
                                    round(s['size']), s['flags'], s['font'], s['color'])
                                styles[identifier] = {'size': round(s['size']), 'flags': s['flags'], 'font': s['font'],
                                                      'color': s['color']}
                            else:
                                identifier = "{0}".format(round(s['size']))
                                styles[identifier] = {
                                    'size': round(s['size']), 'font': s['font']}
                            font_counts[identifier] = font_counts.get(
                                identifier, 0) + 1  # count the fonts usage
                        else:
                            if round(s['size']) != round(previous_s['size']):
                                if granularity:
                                    identifier = "{0}_{1}_{2}_{3}".format(
                                        round(s['size']), s['flags'], s['font'], s['color'])
                                    styles[identifier] = {'size': round(s['size']), 'flags': s['flags'], 'font': s['font'],
                                                          'color': s['color']}
                                else:
                                    identifier = "{0}".format(round(s['size']))
                                    styles[identifier] = {
                                        'size': round(s['size']), 'font': s['font']}
                                font_counts[identifier] = font_counts.get(
                                    identifier, 0) + 1  # count the fonts usage
                            previous_s = s

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

def font_tags(font_counts, styles):
    p_style = styles[font_counts[0][0]
                     ]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    header_para = []
    previous_s = {}  # previous span

    for page in doc:
        first = True  # boolean operator for first header
        blocks = page.get_text("dict")["blocks"]
        one_text = []  # list with headers and paragraphs
        page_num = list(str(page).split(' '))
        page_num = page_num[1]
        block_string = ""  # text found in block
        
        for b in blocks:  # iterate through the text blocks

            if b['type'] == 0:  # this block contains text
                # REMEMBER: multiple fonts and sizes are possible IN one block

                # block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[round(
                                    s['size'])] + s['text']
                            else:
                                if round(s['size']) == round(previous_s['size']):

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[round(s['size'])
                                                                ] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[round(s['size'])
                                                                ] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    one_text.append(block_string)
                                    block_string = size_tag[round(s['size'])
                                                            ] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

        one_text.append(block_string)
        one_page = {page_num: one_text}
        header_para.append(one_page)
        # print(header_para)

    return header_para
 


In [4]:
doc = fitz.open('temp_dataset/os.pdf')
font_counts, styles = get_fonts_size(doc)
print("font_counts")
print(font_counts)
print("styles")
print(styles)
size_tag = font_tags(font_counts, styles)
print("size_tag")
print(size_tag)
header_para = headers_para(doc, size_tag)
print("header_para")
print(header_para)


font_counts
[('32', 45), ('12', 45), ('20', 39), ('24', 36), ('18', 31), ('16', 6), ('22', 6), ('17', 4), ('14', 4), ('28', 3), ('40', 1), ('26', 1), ('19', 1)]
styles
{'40': {'size': 40, 'font': 'MalgunGothic'}, '32': {'size': 32, 'font': 'MalgunGothic'}, '24': {'size': 24, 'font': 'MalgunGothic'}, '12': {'size': 12, 'font': 'MalgunGothicBold'}, '20': {'size': 20, 'font': 'MalgunGothic'}, '18': {'size': 18, 'font': 'Wingdings-Regular'}, '17': {'size': 17, 'font': 'MalgunGothic'}, '14': {'size': 14, 'font': 'MalgunGothic'}, '16': {'size': 16, 'font': 'MalgunGothic'}, '28': {'size': 28, 'font': 'MalgunGothic'}, '22': {'size': 22, 'font': 'MalgunGothic'}, '26': {'size': 26, 'font': 'MalgunGothic'}, '19': {'size': 19, 'font': 'MalgunGothic'}}
size_tag
{40.0: '<h1>', 32.0: '<p>', 28.0: '<s1>', 26.0: '<s2>', 24.0: '<s3>', 22.0: '<s4>', 20.0: '<s5>', 19.0: '<s6>', 18.0: '<s7>', 17.0: '<s8>', 16.0: '<s9>', 14.0: '<s10>', 12.0: '<s11>'}
header_para
[{'0': ['<h1>4| 장| 쓰레드| (thread)|']}, {'1': [

문장 단위로 글씨 크기 잘라내기

In [12]:
def get_fonts_size(doc, granularity=False):
    styles = {}
    font_counts = {}
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        first = True
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if first == True:
                            previous_s = s
                            first = False
                            if granularity:
                                identifier = "{0}_{1}_{2}_{3}".format(
                                    round(s['size']), s['flags'], s['font'], s['color'])
                                styles[identifier] = {'size': round(s['size']), 'flags': s['flags'], 'font': s['font'],
                                                      'color': s['color']}
                            else:
                                identifier = "{0}".format(round(s['size']))
                                styles[identifier] = {
                                    'size': round(s['size']), 'font': s['font']}
                            font_counts[identifier] = font_counts.get(
                                identifier, 0) + 1  # count the fonts usage
                        else:
                            if round(s['size']) != round(previous_s['size']):
                                if granularity:
                                    identifier = "{0}_{1}_{2}_{3}".format(
                                        round(s['size']), s['flags'], s['font'], s['color'])
                                    styles[identifier] = {'size': round(s['size']), 'flags': s['flags'], 'font': s['font'],
                                                          'color': s['color']}
                                else:
                                    identifier = "{0}".format(round(s['size']))
                                    styles[identifier] = {
                                        'size': round(s['size']), 'font': s['font']}
                                font_counts[identifier] = font_counts.get(
                                    identifier, 0) + 1  # count the fonts usage
                            previous_s = s

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


In [13]:
doc = fitz.open('temp_dataset/os.pdf')
font_counts, styles = get_fonts_size(doc)
# print("font_counts")
# print(font_counts)
# print("styles")
# print(styles)


In [24]:
def sentences(doc):
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        # sentences = [page.get_text('sentences').split('\n')]
        # sentences = sentences.split('\n')
        
        # print(sentences)
        print(blocks)


In [25]:
doc = fitz.open('temp_dataset/os.pdf')
font_counts, styles = sentences(doc)


KeyError: 'pages'