In [1]:
from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum


class DataLine:
    """One text line of a page-layout dump, built from ':'-separated string fields.

    Numeric fields are accepted as strings (or anything ``int()`` accepts) and
    stored as ``int``.  A field that is empty (``''``) or missing (``None``)
    is stored as ``0`` instead of raising.  Font names and ``text`` are stored
    unchanged.  ``label`` is initialised to ``'?'``.

    Raises:
        ValueError: if a non-empty numeric field cannot be parsed by ``int()``.
    """

    @staticmethod
    def _to_int(value):
        """Convert a raw field to ``int``; ``None`` and ``''`` map to ``0``."""
        if value is None or value == '':
            return 0
        return int(value)

    def __init__(self,
                 page_number: str = None,  # page number
                 line_number: str = None,  # line number, running through the document
                 first_symbol: str = None,  # index of the first symbol, running through the document
                 int_from_prev: str = None,  # vertical-coordinate delta to the line above
                 int_from_next: str = None,  # vertical-coordinate delta to the line below
                 h_left: str = None,  # horizontal coordinate of the line start
                 h_right: str = None,  # horizontal coordinate of the line end + 1
                 v: str = None,  # vertical coordinate
                 font0_size: str = None,  # font-height0
                 font0_name: str = '',  # font-name0
                 font1_size: str = None,  # font-height1
                 font1_name: str = '',  # font-name1
                 text: str = None,  # TEXT
                 ):
        to_int = self._to_int

        self.page_number = to_int(page_number)
        self.line_number = to_int(line_number)
        self.first_symbol = to_int(first_symbol)

        self.int_from_prev = to_int(int_from_prev)
        self.int_from_next = to_int(int_from_next)

        self.h_left = to_int(h_left)
        self.h_right = to_int(h_right)
        self.v = to_int(v)

        self.font0_size = to_int(font0_size)
        self.font0_name = font0_name
        self.font1_size = to_int(font1_size)
        self.font1_name = font1_name

        self.text = text

        # Placeholder label until the line is classified.
        self.label = '?'

class State(Enum):
    """Running state that ``process_line`` (inside ``main``) returns alongside each label."""
    # Returned for lines that end a paragraph and for header/footnote/other lines.
    NEUTRAL = 0
    # Returned together with the PAR_STARTS label and with PAR_CONTINUES for fully justified lines.
    PARAGRAPH = 1
    # Never returned by the code visible in this file; presumably reserved for
    # drop-cap ("bukvitsa") handling - TODO confirm.
    BUKVITSA = 2

class Label(Enum):
    """Line labels; ``main`` writes each value into the output records and uses it as a colour key."""
    PAR_STARTS = 'A'     # first line of a paragraph
    PAR_CONTINUES = 'B'  # continuation (or last) line of a paragraph
    HEADER = 'C'         # header line
    BUKVITSA = 'D'       # never assigned in this file; presumably a drop cap - TODO confirm
    LIST = 'E'           # never assigned by reachable code in this file
    FOOTNOTE = 'F'       # footnote line
    OTHER = '?'          # anything not recognised; also the initial DataLine.label

In [21]:
import io
import os
from reportlab.pdfgen import canvas
from PyPDF2 import PdfFileWriter, PdfFileReader
import sys


def _parse_record(raw_line, line_no):
    """Parse one ':'-separated record of the ``:orig.txt`` dump into a ``DataLine``.

    The first 12 fields are fixed columns.  Everything after them is the line
    text, which may itself contain ':' and is therefore re-joined.  A record
    with exactly 12 fields has no text column and gets an empty text.

    Raises:
        ValueError: if the record has fewer than 12 fields.
    """
    fields = raw_line.replace('\n', '').split(':')
    if len(fields) < 12:
        raise ValueError(
            f'line {line_no}: must have at least 12 elements, got {len(fields)}')
    return DataLine(*fields[:12], ':'.join(fields[12:]))


def _nearest_key(mapping, value, eps):
    """Return the largest key of ``mapping`` within ``eps`` of ``value``, or ``None``.

    The lookup is read-only: it never inserts into ``mapping``, even when
    ``mapping`` is a ``defaultdict``.
    """
    for candidate in range(value + eps, value - eps - 1, -1):
        if candidate in mapping:
            return candidate
    return None


def _cluster_margins(coords, eps, top_n=15):
    """Cluster the most frequent coordinates.

    The ``top_n`` most frequent distinct values are visited from most to least
    frequent.  A value within ``eps`` of an existing cluster key is counted
    towards that cluster; otherwise it opens a new cluster.

    Returns:
        tuple: ``(keys, weights)`` where ``keys`` lists the cluster keys in
        order of creation (so ``keys[0]`` is the single most frequent
        coordinate) and ``weights`` maps each key to its accumulated count.
    """
    unique_vals, inverse = np.unique(np.asarray(coords), return_inverse=True)
    counts = np.bincount(inverse)
    by_frequency = np.argsort(counts)[::-1][:top_n]

    keys = []
    weights = defaultdict(int)
    for value, count in zip(unique_vals[by_frequency], counts[by_frequency]):
        anchor = _nearest_key(weights, value, eps)
        if anchor is None:
            anchor = value
            keys.append(value)
        weights[anchor] += count
    return keys, weights


def main(journal_handler):
    """Label every text line of one paper and write the labelled dump and an annotated PDF.

    Reads ``articles/<journal_handler>/<journal_handler>:orig.txt`` (one
    ':'-separated record per line), assigns each line a ``Label`` value from
    its horizontal alignment and fonts, and writes
    ``articles/<journal_handler>/<journal_handler>:l.txt`` with the label
    inserted before the text column.  It then tries to draw one coloured
    rectangle per line on top of ``...:orig.pdf`` and save the result as
    ``...:l.pdf``.  The PDF step is best-effort: a failure is reported on
    stderr and does not raise.

    All paths are relative to the current working directory.

    Args:
        journal_handler: paper identifier, used both as directory name and as
            file-name prefix.

    Raises:
        OSError: if the input text file cannot be read or the output text
            file cannot be written.
        ValueError: if a record has fewer than 12 fields, the input has no
            records, or no usable regular font size could be determined.
    """
    EPS = 4          # tolerance when comparing horizontal coordinates
    FONT_TOL = 0.03  # tolerance on |log(font size / regular font size)|

    # --- read the records -------------------------------------------------
    input_file_name = f'articles/{journal_handler}/{journal_handler}:orig'
    with open(input_file_name + '.txt', 'r', encoding='utf8') as read_file:
        data = [_parse_record(raw_line, line_no)
                for line_no, raw_line in enumerate(read_file, start=1)]
    if not data:
        raise ValueError(f'{input_file_name}.txt contains no records')

    # --- text-block borders -------------------------------------------------
    # Only the single most frequent left/right coordinate is used as border.
    left_keys, _ = _cluster_margins([x.h_left for x in data], EPS)
    right_keys, _ = _cluster_margins([x.h_right for x in data], EPS)
    LEFT_BORDERS = np.sort(left_keys[:1])
    RIGHT_BORDERS = np.sort(right_keys[:1])

    # --- regular (body) font ------------------------------------------------
    # The (name, size) pair seen most often on lines that start at the left border.
    font_names = defaultdict(int)
    REGULAR_FONT_NAME = None
    REGULAR_FONT_SIZE = None
    max_font_count = 0
    for x in data:
        if min(abs(x.h_left - LEFT_BORDERS)) <= EPS:
            fonts = ((x.font0_name, x.font0_size), (x.font1_name, x.font1_size))
            # Count both fonts first, then compare, so a line whose two fonts
            # are identical contributes 2 before the comparison.
            for font in fonts:
                font_names[font] += 1
            for font in fonts:
                if font_names[font] > max_font_count:
                    max_font_count = font_names[font]
                    REGULAR_FONT_NAME, REGULAR_FONT_SIZE = font

    if not REGULAR_FONT_SIZE:
        # Sizes are compared as ratios to the regular size below.
        raise ValueError(
            f'{input_file_name}.txt: could not determine a non-zero regular font size')

    # Font names seen on lines labelled HEADER; filled in by process_line.
    header_fonts = set()

    def process_line(line, cur_state):
        """Return ``(Label, State)`` for one line from its alignment and fonts."""
        ind_block = 0  # only one text column is supported

        # A zero font size yields log(0) = -inf; the warning is silenced, the value is kept.
        with np.errstate(divide='ignore'):
            log0 = np.log(line.font0_size / REGULAR_FONT_SIZE)
            log1 = np.log(line.font1_size / REGULAR_FONT_SIZE)

        is_regular = (
            (abs(log0) <= FONT_TOL and line.font0_name == REGULAR_FONT_NAME)
            or (abs(log1) <= FONT_TOL and line.font1_name == REGULAR_FONT_NAME)
        )
        is_smaller = (log0 <= -FONT_TOL) or (log1 <= -FONT_TOL)

        is_aligned_left = abs(LEFT_BORDERS[ind_block] - line.h_left) <= EPS
        is_aligned_right = abs(RIGHT_BORDERS[ind_block] - line.h_right) <= EPS

        if is_aligned_left and is_aligned_right:
            # Full-width line: paragraph continuation when set in the regular font.
            if is_regular:
                return Label["PAR_CONTINUES"], State["PARAGRAPH"]
            # NOTE(review): every other full-width line is labelled FOOTNOTE,
            # whatever its font size; the original had an unreachable
            # header/footnote/other split after this return - confirm intent.
            return Label["FOOTNOTE"], cur_state

        if is_aligned_left:
            # Starts at the left border but ends early: last line of a paragraph.
            if is_regular:
                return Label["PAR_CONTINUES"], State["NEUTRAL"]
            if (log0 >= 0.00) or (log1 >= 0.00):
                header_fonts.update((line.font0_name, line.font1_name))
                return Label["HEADER"], State["NEUTRAL"]
            if is_smaller:
                return Label["FOOTNOTE"], State["NEUTRAL"]
            return Label["OTHER"], State["NEUTRAL"]

        if is_aligned_right:
            # Indented start reaching the right border: paragraph start
            # (or a shift caused by a formula).
            if is_regular:
                return Label["PAR_STARTS"], State["PARAGRAPH"]
            if is_smaller:
                return Label["FOOTNOTE"], State["NEUTRAL"]
            return Label["OTHER"], State["NEUTRAL"]

        # Touches neither border (e.g. a centred formula).
        return Label["OTHER"], cur_state

    # --- pass 1: label each line, collect paragraph-start indents -----------
    cur_state = State["NEUTRAL"]
    PAR_INDENTS = defaultdict(int)
    for line in data:
        label, cur_state = process_line(line, cur_state)
        line.label = label.value
        if label == Label["PAR_STARTS"]:
            anchor = _nearest_key(PAR_INDENTS, line.h_left, EPS)
            PAR_INDENTS[line.h_left if anchor is None else anchor] += 1

    # --- pass 2: OTHER lines at a repeated paragraph indent become PAR_STARTS
    for line in data:
        if line.label == Label["OTHER"].value:
            # Read-only lookup: indexing the defaultdict here would insert
            # zero-count keys that shadow real indents for later lines.
            anchor = _nearest_key(PAR_INDENTS, line.h_left, EPS)
            if anchor is not None and PAR_INDENTS[anchor] > 1:
                line.label = Label["PAR_STARTS"].value

    # --- pass 3: OTHER lines set entirely in header fonts become HEADER -----
    for line in data:
        if line.label == Label["OTHER"].value:
            font0_is_header = (line.font0_name in header_fonts
                               and line.font0_size >= REGULAR_FONT_SIZE)
            font1_is_header = (line.font1_name in header_fonts
                               and line.font1_size >= REGULAR_FONT_SIZE)
            if font0_is_header and font1_is_header:
                line.label = Label["HEADER"].value

    # --- write the labelled dump ------------------------------------------
    output_file_name_txt = f'articles/{journal_handler}/{journal_handler}:l.txt'
    with open(output_file_name_txt, 'w', encoding='utf8') as output:
        for line in data:
            line_export = [line.page_number, line.line_number, line.first_symbol, line.int_from_prev,
                           line.int_from_next, line.h_left, line.h_right, line.v,
                           line.font0_size, line.font0_name, line.font1_size, line.font1_name,
                           line.label, line.text]
            output.write(':'.join(str(y) for y in line_export) + '\n')

    # --- annotated PDF (best-effort) --------------------------------------
    output_file_name_pdf = f'articles/{journal_handler}/{journal_handler}:l.pdf'

    color_dict = {'A': (1.0, 0.0, 0.0),
                  'B': (0.0, 1.0, 1.0),
                  'C': (0.0, 0.0, 1.0),
                  'D': (1.0, 0.0, 1.0),
                  'E': (1.0, 1.0, 0.0),
                  'F': (0.0, 1.0, 0.0),
                  '?': (0.5, 0.5, 0.5),
                  }

    def new_overlay():
        """Start an in-memory overlay page; returns ``(buffer, canvas)``."""
        buffer = io.BytesIO()
        return buffer, canvas.Canvas(buffer)

    try:
        # PyPDF2 reads page data lazily, so the source stream must stay open
        # until the output has been written.
        with open(input_file_name + '.pdf', 'rb') as pdf_stream:
            input_file = PdfFileReader(pdf_stream)
            output_file = PdfFileWriter()

            def emit_page(page, buffer, overlay):
                """Merge the finished overlay onto ``page`` and append it to the output."""
                overlay.save()
                buffer.seek(0)
                page.mergePage(PdfFileReader(buffer).getPage(0))
                output_file.addPage(page)

            # Page 0 is always emitted first; a new page starts whenever the
            # page number of the current line differs from the previous one.
            cur_page = 0
            input_page = input_file.getPage(cur_page)
            buffer, overlay = new_overlay()
            for line in data:
                if line.page_number - 1 != cur_page:
                    emit_page(input_page, buffer, overlay)
                    cur_page = line.page_number - 1
                    input_page = input_file.getPage(cur_page)
                    buffer, overlay = new_overlay()
                overlay.setStrokeColorRGB(*color_dict[line.label])
                overlay.rect(line.h_left, line.v, line.h_right - line.h_left, 8, stroke=1, fill=0)
            emit_page(input_page, buffer, overlay)

            with open(output_file_name_pdf, 'wb') as o:
                output_file.write(o)
    except Exception as error:
        # The text dump above is already written; report the PDF failure without raising.
        print(f'{journal_handler}: annotated PDF not written '
              f'({type(error).__name__}: {error})', file=sys.stderr)

In [23]:
# Label a single paper; main() reads articles/<id>/<id>:orig.txt and writes
# <id>:l.txt (and, when possible, <id>:l.pdf) into the same directory.
main('repec:nbr:nberwo:17248')

In [24]:
# Make sure the root directory that holds one sub-directory per paper exists.
# Unlike a bare `except: pass`, this only tolerates "already exists as a
# directory"; permission problems or a plain file named 'articles' surface here.
os.makedirs('articles', exist_ok=True)

In [25]:
# Label every paper found under articles/<dir>/*orig.txt.
# A failure in one paper is reported on stderr and does not stop the batch.
for journal in os.listdir('articles'):
    print(journal)
    journal_dir = f'articles/{journal}'
    if not os.path.isdir(journal_dir):
        # Stray files next to the paper directories cannot be listed.
        continue
    for paper in os.listdir(journal_dir):
        if paper.endswith('orig.txt'):
            print(paper)
            # Strip the 9-character ':orig.txt' suffix to get the paper identifier.
            paper_name = paper[:-9]
            try:
                main(paper_name)
            except Exception as error:
                print(f'{paper_name}: failed ({type(error).__name__}: {error})',
                      file=sys.stderr)

repec:nbr:nberwo:25618
repec:nbr:nberwo:25618:orig.txt
repec:nbr:nberwo:15293
repec:nbr:nberwo:15293:orig.txt
repec:nbr:nberwo:19944
repec:nbr:nberwo:19944:orig.txt
repec:nbr:nberwo:25167
repec:nbr:nberwo:25167:orig.txt
repec:nbr:nberwo:20021
repec:nbr:nberwo:20021:orig.txt
repec:nbr:nberwo:23754
repec:nbr:nberwo:23754:orig.txt




repec:nbr:nberwo:13912
repec:nbr:nberwo:13912:orig.txt
repec:nbr:nberwo:21024
repec:nbr:nberwo:21024:orig.txt




repec:nbr:nberwo:10914
repec:nbr:nberwo:10914:orig.txt




repec:nbr:nberwo:21532
repec:nbr:nberwo:21532:orig.txt
repec:nbr:nberwo:21941
repec:nbr:nberwo:21941:orig.txt
repec:nbr:nberwo:18084
repec:nbr:nberwo:18084:orig.txt
repec:nbr:nberwo:23786
repec:nbr:nberwo:23786:orig.txt
repec:nbr:nberwo:21390
repec:nbr:nberwo:21390:orig.txt
repec:nbr:nberwo:15578
repec:nbr:nberwo:15578:orig.txt
repec:nbr:nberwo:15806
repec:nbr:nberwo:15806:orig.txt
repec:nbr:nberwo:15762
repec:nbr:nberwo:15762:orig.txt
repec:nbr:nberwo:15491
repec:nbr:nberwo:15491:orig.txt
repec:nbr:nberwo:21780
repec:nbr:nberwo:21780:orig.txt
repec:nbr:nberwo:15672
repec:nbr:nberwo:15672:orig.txt
repec:nbr:nberwo:12604
repec:nbr:nberwo:12604:orig.txt
repec:nbr:nberwo:21242
repec:nbr:nberwo:21242:orig.txt
repec:nbr:nberwo:23379
repec:nbr:nberwo:23379:orig.txt
repec:nbr:nberwo:22507
repec:nbr:nberwo:22507:orig.txt
repec:nbr:nberwo:17070
repec:nbr:nberwo:17070:orig.txt
repec:nbr:nberwo:17611
repec:nbr:nberwo:17611:orig.txt
repec:nbr:nberwo:18766
repec:nbr:nberwo:18766:orig.txt




repec:nbr:nberwo:18143
repec:nbr:nberwo:18143:orig.txt
repec:nbr:nberwo:18626
repec:nbr:nberwo:18626:orig.txt
repec:nbr:nberwo:23731
repec:nbr:nberwo:23731:orig.txt
repec:nbr:nberwo:24689
repec:nbr:nberwo:24689:orig.txt
repec:nbr:nberwo:20857
repec:nbr:nberwo:20857:orig.txt
repec:nbr:nberwo:19144
repec:nbr:nberwo:18216
repec:nbr:nberwo:18216:orig.txt
repec:nbr:nberwo:15923
repec:nbr:nberwo:15923:orig.txt
repec:nbr:nberwo:17148
repec:nbr:nberwo:17148:orig.txt
repec:nbr:nberwo:23347
repec:nbr:nberwo:23347:orig.txt
repec:nbr:nberwo:13788
repec:nbr:nberwo:13788:orig.txt
repec:nbr:nberwo:19727
repec:nbr:nberwo:11161
repec:nbr:nberwo:11161:orig.txt
repec:nbr:nberwo:16943
repec:nbr:nberwo:16943:orig.txt
repec:nbr:nberwo:11629
repec:nbr:nberwo:11629:orig.txt
repec:nbr:nberwo:11017
repec:nbr:nberwo:11017:orig.txt
repec:nbr:nberwo:17255
repec:nbr:nberwo:17255:orig.txt
repec:nbr:nberwo:11200
repec:nbr:nberwo:11200:orig.txt
repec:nbr:nberwo:16057
repec:nbr:nberwo:16057:orig.txt
repec:nbr:nberwo:12



repec:nbr:nberwo:13309
repec:nbr:nberwo:13309:orig.txt
repec:nbr:nberwo:25518
repec:nbr:nberwo:25518:orig.txt
repec:nbr:nberwo:15228
repec:nbr:nberwo:15228:orig.txt
repec:nbr:nberwo:19416
repec:nbr:nberwo:19416:orig.txt
repec:nbr:nberwo:23283
repec:nbr:nberwo:23283:orig.txt
repec:nbr:nberwo:22688
repec:nbr:nberwo:22688:orig.txt
repec:nbr:nberwo:15863
repec:nbr:nberwo:15863:orig.txt
repec:nbr:nberwo:24123
repec:nbr:nberwo:24123:orig.txt
repec:nbr:nberwo:11730
repec:nbr:nberwo:11730:orig.txt
repec:nbr:nberwo:14055
repec:nbr:nberwo:14055:orig.txt
repec:nbr:nberwo:19810
repec:nbr:nberwo:19810:orig.txt
repec:nbr:nberwo:18095
repec:nbr:nberwo:18095:orig.txt
repec:nbr:nberwo:19519
repec:nbr:nberwo:19519:orig.txt
repec:nbr:nberwo:21294
repec:nbr:nberwo:21294:orig.txt
repec:nbr:nberwo:20437
repec:nbr:nberwo:20437:orig.txt
repec:nbr:nberwo:24943
repec:nbr:nberwo:24943:orig.txt
repec:nbr:nberwo:17100
repec:nbr:nberwo:17100:orig.txt
repec:nbr:nberwo:19979
repec:nbr:nberwo:19979:orig.txt
repec:nbr:



repec:nbr:nberwo:21922
repec:nbr:nberwo:21922:orig.txt
repec:nbr:nberwo:11156
repec:nbr:nberwo:11156:orig.txt
repec:nbr:nberwo:17080
repec:nbr:nberwo:17080:orig.txt
repec:nbr:nberwo:19311
repec:nbr:nberwo:19311:orig.txt
repec:nbr:nberwo:12884
repec:nbr:nberwo:12884:orig.txt
repec:nbr:nberwo:17946
repec:nbr:nberwo:17946:orig.txt
repec:nbr:nberwo:24890
repec:nbr:nberwo:24890:orig.txt
repec:nbr:nberwo:18808
repec:nbr:nberwo:18808:orig.txt
repec:nbr:nberwo:16462
repec:nbr:nberwo:16462:orig.txt
repec:nbr:nberwo:15035
repec:nbr:nberwo:15035:orig.txt
repec:nbr:nberwo:13314
repec:nbr:nberwo:13314:orig.txt
repec:nbr:nberwo:19034
repec:nbr:nberwo:19034:orig.txt
repec:nbr:nberwo:10298
repec:nbr:nberwo:10298:orig.txt
repec:nbr:nberwo:15716
repec:nbr:nberwo:15716:orig.txt
repec:nbr:nberwo:18118
repec:nbr:nberwo:18118:orig.txt
repec:nbr:nberwo:14109
repec:nbr:nberwo:14109:orig.txt
repec:nbr:nberwo:18906
repec:nbr:nberwo:18906:orig.txt
repec:nbr:nberwo:23302
repec:nbr:nberwo:23302:orig.txt
repec:nbr:



repec:nbr:nberwo:22522
repec:nbr:nberwo:22522:orig.txt
repec:nbr:nberwo:19081
repec:nbr:nberwo:19081:orig.txt
repec:nbr:nberwo:16108
repec:nbr:nberwo:16108:orig.txt
repec:nbr:nberwo:14584
repec:nbr:nberwo:14584:orig.txt
repec:nbr:nberwo:15370
repec:nbr:nberwo:15370:orig.txt
repec:nbr:nberwo:10343
repec:nbr:nberwo:10343:orig.txt
repec:nbr:nberwo:24308
repec:nbr:nberwo:24308:orig.txt
repec:nbr:nberwo:16596
repec:nbr:nberwo:16596:orig.txt
repec:nbr:nberwo:11168
repec:nbr:nberwo:11168:orig.txt
repec:nbr:nberwo:11673
repec:nbr:nberwo:11673:orig.txt
repec:nbr:nberwo:25083
repec:nbr:nberwo:25083:orig.txt
repec:nbr:nberwo:17118
repec:nbr:nberwo:17118:orig.txt
repec:nbr:nberwo:16085
repec:nbr:nberwo:16085:orig.txt
repec:nbr:nberwo:21159
repec:nbr:nberwo:21159:orig.txt
repec:nbr:nberwo:23014
repec:nbr:nberwo:23014:orig.txt
repec:nbr:nberwo:24014
repec:nbr:nberwo:24014:orig.txt
repec:nbr:nberwo:22375
repec:nbr:nberwo:22375:orig.txt
repec:nbr:nberwo:16530
repec:nbr:nberwo:16530:orig.txt
repec:nbr: