In [46]:
from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum


class DataLine:
    """One text line of a paper, built from the fields of a colon-separated record.

    All numeric fields arrive as strings and are stored as ``int``; a field
    that is ``None``, empty or blank is stored as ``0``.  Font names and the
    text are stored unchanged.  ``label`` starts as ``'?'`` (the value of
    ``Label.OTHER``) and is overwritten once the line has been classified.

    Raises:
        ValueError: if a non-blank numeric field is not a valid integer.
    """

    def __init__(self,
                 page_number: str = None,  # page number
                 line_number: str = None,  # line number, running through the whole document
                 first_symbol: str = None,  # index of the first symbol, running through the whole document
                 int_from_prev: str = None,  # delta v-coord up (presumably distance to the previous line)
                 int_from_next: str = None,  # delta v-coord down (presumably distance to the next line)
                 h_left: str = None,  # h-coord of the start
                 h_right: str = None,  # h-coord of the end + 1
                 v: str = None,  # v-coord
                 font0_size: str = None,  # font-height0
                 font0_name: str = '',  # font-name0
                 font1_size: str = None,  # font-height1
                 font1_name: str = '',  # font-name1
                 text: str = None,  # TEXT
                 ):
        to_int = self._to_int

        self.page_number = to_int(page_number)
        self.line_number = to_int(line_number)
        self.first_symbol = to_int(first_symbol)

        self.int_from_prev = to_int(int_from_prev)
        self.int_from_next = to_int(int_from_next)

        self.h_left = to_int(h_left)
        self.h_right = to_int(h_right)
        self.v = to_int(v)

        self.font0_size = to_int(font0_size)
        self.font0_name = font0_name
        self.font1_size = to_int(font1_size)
        self.font1_name = font1_name

        self.text = text

        self.label = '?'

    @staticmethod
    def _to_int(value):
        """Convert a raw field to ``int``, treating ``None`` and blank strings as 0."""
        if value is None:
            # Omitted argument: the declared default is None, which has no len().
            return 0
        stripped = str(value).strip()
        return int(stripped) if stripped else 0

class State(Enum):
    """Coarse state returned together with a Label by the per-line classifier used in main.

    NOTE(review): main receives this value for every line, but no visible code
    reads it to influence a later decision.
    """
    NEUTRAL = 0  # returned with the HEADER, FOOTNOTE and OTHER labels
    PARAGRAPH = 1  # returned with the PAR_STARTS, PAR_CONTINUES and LIST labels
    BUKVITSA = 2  # returned with the BUKVITSA label

class Label(Enum):
    """Line labels assigned by the per-line classifier used in main.

    The one-character value is what main stores in ``DataLine.label``, writes
    into the labelled txt file and uses to choose the outline colour in the
    labelled PDF.
    """
    PAR_STARTS = 'A'  # regular-font line whose left edge is NOT on the main left border
    PAR_CONTINUES = 'B'  # regular-font line whose left edge is on the main left border
    HEADER = 'C'  # non-regular line where either font is at least as large as the regular font
    BUKVITSA = 'D'  # left-aligned line with a font size close to the computed "bukvitsa" size (presumably a drop cap)
    LIST = 'E'  # indented line whose first token contains no uppercase Latin letter
    FOOTNOTE = 'F'  # non-regular line where either font is noticeably smaller than the regular font
    OTHER = '?'  # fallback when none of the rules above applies

In [64]:
import io
import os
from reportlab.pdfgen import canvas
from PyPDF2 import PdfFileWriter, PdfFileReader
import sys


def _parse_data_line(raw_line):
    """Build a DataLine from one colon-separated record of the input txt file.

    The first 12 fields are passed positionally to DataLine.  Everything after
    the 12th separator is the line text; it is re-joined with ':' because the
    text itself may contain colons.  A record with exactly 12 fields gets an
    empty text.

    Raises:
        ValueError: if the record has fewer than 12 fields.
    """
    fields = raw_line.replace('\n', '').split(':')
    if len(fields) < 12:
        raise ValueError('must have at least 12 elements')
    return DataLine(*fields[:12], ':'.join(fields[12:]))


def _most_common_left_margin(data):
    """Return the ``h_left`` value shared by the largest number of lines."""
    margin_lefts = np.array([x.h_left for x in data])
    unique_margins, positions = np.unique(margin_lefts, return_inverse=True)
    counts = np.bincount(positions)
    # Descending by frequency; only the top entry is used as the text border.
    most_frequent = np.argsort(counts)[::-1][0]
    return int(unique_margins[most_frequent])


def _detect_regular_font(data, left_border, eps):
    """Return ``(name, size)`` of the font seen most often on left-aligned lines.

    Only lines whose left edge lies within ``eps`` of ``left_border`` vote.
    Both font slots of such a line vote.  A slot with a non-positive size is an
    empty slot and is skipped: it cannot be the body font, and electing it
    would make every later size ratio divide by zero.

    Returns ``(None, None)`` when no slot qualified.
    """
    votes = defaultdict(int)
    best_votes = 0
    regular_name = None
    regular_size = None
    for x in data:
        if abs(x.h_left - left_border) > eps:
            continue
        for name, size in ((x.font0_name, x.font0_size), (x.font1_name, x.font1_size)):
            if size <= 0:
                continue
            votes[(name, size)] += 1
            if votes[(name, size)] > best_votes:
                best_votes = votes[(name, size)]
                regular_name, regular_size = name, size
    return regular_name, regular_size


def _bukvitsa_size(data, eps):
    """Return the font size a line must be close to in order to be labelled BUKVITSA.

    If the largest ``font0_size`` in the document is carried by exactly one
    character of text in total, that size is returned.  Otherwise a size
    ``3 * eps`` above the maximum is returned.
    """
    max_font_size = max(0, max(x.font0_size for x in data))
    max_font_chars = sum(len(x.text) for x in data if x.font0_size == max_font_size)
    if max_font_chars == 1:
        return max_font_size
    return max_font_size + 3 * eps


def _classify_line(line, left_border, regular_name, regular_size, bukvitsa_size, eps):
    """Return the ``(Label, State)`` pair for one line.

    A font slot is "regular" when its name equals ``regular_name`` and the
    absolute log ratio of its size to ``regular_size`` is at most 0.03.
    """
    # A size of 0 (empty slot) yields log(0) = -inf; silence numpy's warning,
    # the comparisons below still behave as before.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_ratio0 = np.log(line.font0_size / regular_size)
        log_ratio1 = np.log(line.font1_size / regular_size)

    is_regular = (
        (abs(log_ratio0) <= 0.03 and line.font0_name == regular_name)
        or (abs(log_ratio1) <= 0.03 and line.font1_name == regular_name)
    )
    is_aligned_left = abs(left_border - line.h_left) <= eps

    if is_regular:
        # Flush-left regular text continues a paragraph, indented text starts one.
        if is_aligned_left:
            return Label.PAR_CONTINUES, State.PARAGRAPH
        return Label.PAR_STARTS, State.PARAGRAPH

    if is_aligned_left and (abs(line.font0_size - bukvitsa_size) <= eps
                            or abs(line.font1_size - bukvitsa_size) <= eps):
        return Label.BUKVITSA, State.BUKVITSA

    if log_ratio0 >= 0.00 or log_ratio1 >= 0.00:
        return Label.HEADER, State.NEUTRAL
    # NOTE(review): an empty font slot (size 0) has ratio -inf and therefore
    # satisfies this test; kept as in the original heuristic.
    if log_ratio0 <= -0.03 or log_ratio1 <= -0.03:
        return Label.FOOTNOTE, State.NEUTRAL

    if is_aligned_left:
        return Label.OTHER, State.NEUTRAL

    # Indented, slightly-smaller, non-regular line: a list item unless its
    # first token contains an uppercase Latin letter.
    tokens = line.text.split()
    if tokens and not any(ch in 'QWERTYUIOPASDFGHJKLZXCVBNM' for ch in tokens[0]):
        return Label.LIST, State.PARAGRAPH
    return Label.OTHER, State.NEUTRAL


def _matching_indent(known_indents, h_left, eps):
    """Return the largest key of ``known_indents`` within ``eps`` of ``h_left``.

    Falls back to ``h_left`` itself when no known indent is that close.
    """
    return max(
        (pos for pos in range(h_left - eps, h_left + eps + 1) if pos in known_indents),
        default=h_left,
    )


def _write_labelled_pdf(data, input_pdf_path, output_pdf_path):
    """Copy the input PDF with a coloured outline drawn around every labelled line.

    The lines of ``data`` are consumed in order.  Each run of consecutive lines
    on the same page is drawn on an in-memory overlay, which is merged onto
    that page before the page is appended to the output.

    NOTE(review): as before, a page on which no data line falls is not copied
    to the output, except page 0, which is always emitted first.
    """
    color_dict = {'A': (1.0, 0.0, 0.0),
                  'B': (0.0, 1.0, 1.0),
                  'C': (0.0, 0.0, 1.0),
                  'D': (1.0, 0.0, 1.0),
                  'E': (1.0, 1.0, 0.0),
                  'F': (0.0, 1.0, 0.0),
                  '?': (0.5, 0.5, 0.5),
                  }

    def start_overlay():
        # One private buffer per page: the merged page keeps references into
        # its overlay until the output is written, so overlays must not share
        # (and overwrite) a single scratch file.
        buffer = io.BytesIO()
        return buffer, canvas.Canvas(buffer)

    def finish_page(page, buffer, overlay):
        overlay.save()
        buffer.seek(0)
        page.mergePage(PdfFileReader(buffer).getPage(0))
        output_file.addPage(page)

    # The source PDF has to stay open until the output has been written.
    with open(input_pdf_path, 'rb') as pdf_in:
        input_file = PdfFileReader(pdf_in)
        output_file = PdfFileWriter()

        cur_page = 0
        input_page = input_file.getPage(cur_page)
        buffer, overlay = start_overlay()

        for line in data:
            page_index = line.page_number - 1
            if page_index != cur_page:
                # The line belongs to another page: flush the current one first.
                finish_page(input_page, buffer, overlay)
                cur_page = page_index
                input_page = input_file.getPage(cur_page)
                buffer, overlay = start_overlay()
            overlay.setStrokeColorRGB(*color_dict[line.label])
            overlay.rect(line.h_left, line.v, line.h_right - line.h_left, 8, stroke=1, fill=0)

        finish_page(input_page, buffer, overlay)

        with open(output_pdf_path, 'wb') as pdf_out:
            output_file.write(pdf_out)


def main(journal_handler):
    """Label every line of one paper and write the labelled txt and PDF files.

    Reads ``articles/<handler>/<handler>:orig.txt`` (one colon-separated record
    per line, blank lines ignored), classifies each line, and writes
    ``articles/<handler>/<handler>:l.txt`` with the label inserted before the
    text field.  It then tries to write ``...:l.pdf``, a copy of
    ``...:orig.pdf`` with a coloured outline around each line.  The PDF step is
    best effort: a failure is reported on stderr and does not raise.

    Args:
        journal_handler: name of both the paper directory and the file prefix.

    Raises:
        ValueError: if a record has fewer than 12 fields, the input holds no
            records, or no regular font can be determined.
        OSError: if the input txt cannot be read or the output txt written.
    """
    EPS = 4

    input_file_name = f'articles/{journal_handler}/{journal_handler}:orig'
    output_file_name_txt = f'articles/{journal_handler}/{journal_handler}:l.txt'
    output_file_name_pdf = f'articles/{journal_handler}/{journal_handler}:l.pdf'

    # Read the lines from the txt file.
    with open(input_file_name + '.txt', 'r', encoding='utf8') as read_file:
        data = [_parse_data_line(raw) for raw in read_file if raw.strip()]
    if not data:
        raise ValueError(f'no data lines found in {input_file_name}.txt')

    # Paragraph border: the most frequent left margin.
    left_border = _most_common_left_margin(data)

    # Regular (body) font, taken from full, left-aligned lines only.
    regular_name, regular_size = _detect_regular_font(data, left_border, EPS)
    if regular_size is None:
        raise ValueError(f'could not determine the regular font of {input_file_name}.txt')

    bukvitsa_size = _bukvitsa_size(data, EPS)

    # First pass: label every line and count the indents at which paragraphs start.
    par_indents = defaultdict(int)
    for line in data:
        label, _state = _classify_line(line, left_border, regular_name, regular_size,
                                       bukvitsa_size, EPS)
        line.label = label.value
        if label is Label.PAR_STARTS:
            par_indents[_matching_indent(par_indents, line.h_left, EPS)] += 1

    # Second pass: an unclassified line sitting at an indent where more than
    # one paragraph starts is itself treated as a paragraph start.
    for line in data:
        if line.label == Label.OTHER.value:
            indent = _matching_indent(par_indents, line.h_left, EPS)
            # .get() on purpose: indexing the defaultdict would insert a
            # zero-count key that later lines could wrongly match.
            if par_indents.get(indent, 0) > 1:
                line.label = Label.PAR_STARTS.value

    with open(output_file_name_txt, 'w', encoding='utf8') as output:
        for line in data:
            line_export = [line.page_number, line.line_number, line.first_symbol, line.int_from_prev,
                           line.int_from_next, line.h_left, line.h_right, line.v,
                           line.font0_size, line.font0_name, line.font1_size, line.font1_name,
                           line.label, line.text]
            output.write(':'.join(str(y) for y in line_export) + '\n')

    try:
        _write_labelled_pdf(data, input_file_name + '.pdf', output_file_name_pdf)
    except Exception as exc:
        # Best effort: the txt output is already complete, so only report.
        print(f'could not write {output_file_name_pdf}: {exc!r}', file=sys.stderr)

In [66]:
# Label a single paper: main() reads articles/<handle>/<handle>:orig.txt for this handle.
main('repec:nbr:nberwo:19616')

In [67]:
# Make sure the input directory exists; an already existing directory is fine,
# any other failure (e.g. missing permissions) is no longer silently ignored.
os.makedirs('articles', exist_ok=True)

In [68]:
# Run the labelling pipeline for every "<paper>:orig.txt" found in a
# sub-directory of articles/.  One failing paper must not stop the batch, so
# errors are reported on stderr and the loop moves on.
ORIG_SUFFIX = ':orig.txt'

for journal in os.listdir('articles'):
    print(journal)
    journal_dir = os.path.join('articles', journal)
    if not os.path.isdir(journal_dir):
        # Stray files next to the paper directories cannot be listed.
        continue
    for paper in os.listdir(journal_dir):
        if not paper.endswith(ORIG_SUFFIX):
            continue
        print(paper)
        # main() expects the name without the ":orig.txt" suffix.
        paper_name = paper[:-len(ORIG_SUFFIX)]
        try:
            main(paper_name)
        except Exception as exc:
            print(f'failed to process {paper_name}: {exc!r}', file=sys.stderr)

repec:pra:mprapa:60253
repec:pra:mprapa:47959
repec:pra:mprapa:47959:orig.txt
repec:pra:mprapa:58584
repec:pra:mprapa:58584:orig.txt
repec:nbr:nberwo:13479
repec:nbr:nberwo:13479:orig.txt
repec:nbr:nberwo:22430
repec:nbr:nberwo:22430:orig.txt
repec:nbr:nberwo:16185
repec:nbr:nberwo:16185:orig.txt
repec:pra:mprapa:18531
repec:pra:mprapa:18531:orig.txt
repec:pra:mprapa:50796
repec:pra:mprapa:50796:orig.txt
repec:nbr:nberwo:2829
repec:nbr:nberwo:2829:orig.txt
repec:pra:mprapa:52752
repec:pra:mprapa:52752:orig.txt
repec:nbr:nberwo:21830
repec:nbr:nberwo:21830:orig.txt




repec:nbr:nberwo:15264
repec:pra:mprapa:3658
repec:pra:mprapa:3658:orig.txt
repec:nbr:nberwo:10419
repec:nbr:nberwo:10419:orig.txt
repec:pra:mprapa:32697
repec:pra:mprapa:32697:orig.txt
repec:nbr:nberwo:12749
repec:nbr:nberwo:12749:orig.txt
repec:pra:mprapa:63697
repec:pra:mprapa:63697:orig.txt
repec:nbr:nberwo:23823
repec:nbr:nberwo:23823:orig.txt
repec:pra:mprapa:14815
repec:pra:mprapa:14815:orig.txt
repec:pra:mprapa:7146
repec:pra:mprapa:7146:orig.txt
repec:pra:mprapa:48801
repec:pra:mprapa:48801:orig.txt
repec:pra:mprapa:85364
repec:pra:mprapa:85364:orig.txt
repec:nbr:nberwo:19451
repec:nbr:nberwo:19451:orig.txt
repec:nbr:nberwo:19363
repec:nbr:nberwo:19363:orig.txt




repec:pra:mprapa:8696
repec:pra:mprapa:8696:orig.txt
repec:nbr:nberwo:7427
repec:nbr:nberwo:7427:orig.txt
repec:nbr:nberwo:21745
repec:nbr:nberwo:21745:orig.txt
repec:pra:mprapa:39184
repec:pra:mprapa:39184:orig.txt
repec:nbr:nberwo:24041
repec:nbr:nberwo:24041:orig.txt
repec:nbr:nberwo:14882
repec:nbr:nberwo:14882:orig.txt
repec:pra:mprapa:25427
repec:pra:mprapa:25427:orig.txt
repec:nbr:nberwo:3130
repec:nbr:nberwo:3130:orig.txt
repec:nbr:nberwo:25030
repec:nbr:nberwo:25030:orig.txt
repec:pra:mprapa:6311
repec:pra:mprapa:6311:orig.txt
repec:pra:mprapa:30187
repec:pra:mprapa:30187:orig.txt
repec:nbr:nberwo:11668
repec:nbr:nberwo:11668:orig.txt
repec:nbr:nberwo:25285
repec:nbr:nberwo:25285:orig.txt
repec:nbr:nberwo:7469
repec:nbr:nberwo:7469:orig.txt
repec:pra:mprapa:22921
repec:pra:mprapa:22921:orig.txt
repec:nbr:nberwo:23478
repec:nbr:nberwo:23478:orig.txt
repec:nbr:nberwo:10948
repec:nbr:nberwo:10948:orig.txt
repec:nbr:nberwo:20116
repec:nbr:nberwo:20116:orig.txt
repec:nbr:nberwo:105



repec:pra:mprapa:23980
repec:pra:mprapa:23980:orig.txt
repec:pra:mprapa:91785
repec:pra:mprapa:91785:orig.txt
repec:nbr:nberwo:0972
repec:nbr:nberwo:0972:orig.txt
repec:nbr:nberwo:10198
repec:nbr:nberwo:10198:orig.txt
repec:nbr:nberwo:17995
repec:nbr:nberwo:17995:orig.txt
repec:nbr:nberwo:20547
repec:nbr:nberwo:20547:orig.txt
repec:nbr:nberwo:11983
repec:nbr:nberwo:11983:orig.txt
repec:nbr:nberwo:0881
repec:nbr:nberwo:0881:orig.txt
repec:pra:mprapa:8632
repec:pra:mprapa:8632:orig.txt
repec:pra:mprapa:2163
repec:pra:mprapa:2163:orig.txt
repec:nbr:nberwo:22872
repec:nbr:nberwo:22872:orig.txt
repec:nbr:nberwo:14885
repec:nbr:nberwo:14885:orig.txt
repec:pra:mprapa:4438
repec:pra:mprapa:4438:orig.txt
repec:pra:mprapa:5387
repec:pra:mprapa:5387:orig.txt
repec:nbr:nberwo:23003
repec:nbr:nberwo:23003:orig.txt
repec:pra:mprapa:91394
repec:pra:mprapa:91394:orig.txt
repec:nbr:nberwo:23984
repec:nbr:nberwo:23984:orig.txt
repec:nbr:nberwo:17907
repec:nbr:nberwo:17907:orig.txt
repec:pra:mprapa:88402



repec:pra:mprapa:21977
repec:pra:mprapa:21977:orig.txt
repec:pra:mprapa:30066
repec:pra:mprapa:30066:orig.txt
repec:pra:mprapa:28762
repec:pra:mprapa:28762:orig.txt
repec:nbr:nberwo:1756
repec:nbr:nberwo:1756:orig.txt
repec:nbr:nberwo:9100
repec:nbr:nberwo:9100:orig.txt
repec:pra:mprapa:41027
repec:pra:mprapa:41027:orig.txt
repec:nbr:nberwo:20431
repec:nbr:nberwo:20431:orig.txt
repec:pra:mprapa:58177
repec:pra:mprapa:58177:orig.txt
repec:nbr:nberwo:13606
repec:nbr:nberwo:13606:orig.txt
repec:pra:mprapa:79641
repec:pra:mprapa:79641:orig.txt
repec:nbr:nberwo:19286
repec:nbr:nberwo:19286:orig.txt
repec:nbr:nberwo:24869
repec:nbr:nberwo:24869:orig.txt




repec:nbr:nberwo:3480
repec:nbr:nberwo:3480:orig.txt
repec:nbr:nberwo:3834
repec:nbr:nberwo:3834:orig.txt
repec:pra:mprapa:76094
repec:pra:mprapa:76094:orig.txt
repec:pra:mprapa:351
repec:pra:mprapa:351:orig.txt
repec:pra:mprapa:1385
repec:pra:mprapa:1385:orig.txt
repec:pra:mprapa:48969
repec:pra:mprapa:48969:orig.txt
repec:nbr:nberwo:1173
repec:nbr:nberwo:1173:orig.txt
repec:nbr:nberwo:20083
repec:nbr:nberwo:20083:orig.txt
repec:pra:mprapa:44013
repec:pra:mprapa:44013:orig.txt
repec:pra:mprapa:78619
repec:pra:mprapa:78619:orig.txt
repec:nbr:nberwo:24054
repec:nbr:nberwo:24054:orig.txt
repec:pra:mprapa:37012
repec:pra:mprapa:37012:orig.txt
repec:pra:mprapa:2489
repec:pra:mprapa:2489:orig.txt
repec:nbr:nberwo:25529
repec:nbr:nberwo:25529:orig.txt
repec:nbr:nberwo:9062
repec:nbr:nberwo:9062:orig.txt
repec:pra:mprapa:3564
repec:pra:mprapa:3564:orig.txt
repec:nbr:nberwo:11031
repec:nbr:nberwo:11031:orig.txt
repec:pra:mprapa:51441
repec:pra:mprapa:51441:orig.txt
repec:nbr:nberwo:17465
repec

repec:pra:mprapa:5176
repec:pra:mprapa:5176:orig.txt
repec:nbr:nberwo:20072
repec:nbr:nberwo:20072:orig.txt
repec:pra:mprapa:400
repec:pra:mprapa:400:orig.txt
repec:pra:mprapa:67202
repec:pra:mprapa:67202:orig.txt
repec:pra:mprapa:35393
repec:pra:mprapa:35393:orig.txt
repec:nbr:nberwo:23707
repec:nbr:nberwo:23707:orig.txt
repec:nbr:nberwo:24610
repec:nbr:nberwo:24610:orig.txt
repec:nbr:nberwo:14863
repec:nbr:nberwo:14863:orig.txt
repec:pra:mprapa:52469
repec:pra:mprapa:52469:orig.txt
repec:nbr:nberwo:20635
repec:nbr:nberwo:20635:orig.txt
repec:pra:mprapa:4477
repec:pra:mprapa:4477:orig.txt
repec:nbr:nberwo:13970
repec:nbr:nberwo:13970:orig.txt
repec:nbr:nberwo:8607
repec:nbr:nberwo:8607:orig.txt
repec:nbr:nberwo:2827
repec:nbr:nberwo:2827:orig.txt
repec:pra:mprapa:35917
repec:pra:mprapa:35917:orig.txt
repec:nbr:nberwo:19772
repec:nbr:nberwo:19772:orig.txt
repec:pra:mprapa:28004
repec:pra:mprapa:28004:orig.txt
repec:nbr:nberwo:17147
repec:nbr:nberwo:17147:orig.txt
repec:nbr:nberwo:24077



repec:pra:mprapa:2158
repec:pra:mprapa:2158:orig.txt
repec:pra:mprapa:74875
repec:pra:mprapa:74875:orig.txt
repec:nbr:nberwo:18399
repec:nbr:nberwo:18399:orig.txt
repec:nbr:nberwo:23249
repec:nbr:nberwo:23249:orig.txt
repec:nbr:nberwo:16740
repec:nbr:nberwo:16740:orig.txt
repec:nbr:nberwo:17622
repec:nbr:nberwo:17622:orig.txt




repec:nbr:nberwo:13589
repec:nbr:nberwo:13589:orig.txt
repec:nbr:nberwo:23725
repec:nbr:nberwo:23725:orig.txt
repec:pra:mprapa:87641
repec:pra:mprapa:87641:orig.txt
repec:nbr:nberwo:17908
repec:nbr:nberwo:17908:orig.txt
repec:pra:mprapa:40667
repec:pra:mprapa:40667:orig.txt
repec:nbr:nberwo:4833
repec:nbr:nberwo:4833:orig.txt
repec:nbr:nberwo:13784
repec:nbr:nberwo:13784:orig.txt
repec:nbr:nberwo:11886
repec:nbr:nberwo:11886:orig.txt
repec:pra:mprapa:18191
repec:pra:mprapa:18191:orig.txt
repec:nbr:nberwo:8517
repec:nbr:nberwo:8517:orig.txt
repec:nbr:nberwo:2119
repec:nbr:nberwo:2119:orig.txt
repec:nbr:nberwo:7205
repec:nbr:nberwo:7205:orig.txt
repec:pra:mprapa:90647
repec:pra:mprapa:90647:orig.txt
repec:pra:mprapa:61412
repec:pra:mprapa:61412:orig.txt
repec:nbr:nberwo:2511
repec:nbr:nberwo:2511:orig.txt
repec:pra:mprapa:12323
repec:nbr:nberwo:12367
repec:nbr:nberwo:12367:orig.txt
repec:nbr:nberwo:17286
repec:nbr:nberwo:17286:orig.txt
repec:pra:mprapa:23640
repec:pra:mprapa:23640:orig.t