In [10]:
import os

from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum


class State(Enum):
    """Integer-valued states; not referenced by any code visible in this notebook.

    NOTE(review): the member meanings below are inferred from their names
    only -- confirm against the code that actually consumes this enum.
    """
    NEUTRAL = 0
    PARAGRAPH = 1  # presumably "inside a paragraph" -- TODO confirm
    BUKVITSA = 2  # presumably a drop cap (transliterated Russian) -- TODO confirm

class Label(Enum):
    """Single-character labels attached to each line of a labelled text dump.

    collect() compares the label field of every record against these values;
    only PAR_STARTS and HEADER are used by the code visible in this notebook.
    """
    PAR_STARTS = 'A'  # each such line increments collect()'s paragraph counter
    PAR_CONTINUES = 'B'
    HEADER = 'C'  # consecutive HEADER lines are merged into one header by collect()
    BUKVITSA = 'D'  # presumably a drop cap (transliterated Russian) -- TODO confirm
    LIST = 'E'
    FOOTNOTE = 'F'
    OTHER = '?'

In [42]:
# Build one summary line per paper directory and write them all to results.txt.
#
# NOTE: collect() must already be defined in the kernel when this cell runs.
# Only file-level problems are tolerated below, so a missing collect() now
# raises NameError instead of silently producing an empty results.txt.
results = []
# sorted() makes the output order reproducible; os.listdir() order is arbitrary.
for paper in sorted(os.listdir()):
    if not paper.startswith('repec:'):
        continue
    text_l = os.path.join(paper, paper + ':l.txt')
    try:
        paragraph_counter, header_counter, header_stack = collect(text_l)
    except (OSError, ValueError, IndexError) as exc:
        # Best effort: a missing/unreadable file (OSError), undecodable text
        # (UnicodeDecodeError is a ValueError) or a malformed record
        # (IndexError) skips this paper only -- but visibly, not silently.
        print(f'skipping {paper}: {exc!r}')
        continue

    # Record layout (fields joined with ';'):
    #   <paper>;<paragraph count>;<header count>;<line>[[<header text>]];...
    # e.g. repec:nbr:nberwo:14176;118;28;125[[I. INTRODUCTION]];185[[II. ...]]
    prefix_part = ';'.join([paper, str(paragraph_counter), str(header_counter)])
    main_part = ';'.join(f'{number}[[{header}]]' for number, header in header_stack)
    results.append(';'.join([prefix_part, main_part]) + '\n')

with open('results.txt', 'w') as f:
    f.writelines(results)

In [37]:
def collect(path):
    """Count paragraph starts and gather merged headers from a labelled text file.

    Every non-blank line of the file is a colon-separated record made of 13
    metadata fields followed by the text of the line (which may itself
    contain colons), for example::

        3:44:2184:25:26:267:347:634:122:g_d0_f6:122:g_d0_f6:C:I. INTRODUCTION

    Field order, using the abbreviations from the original notes::

        0 pn | 1 ln | 2 fs | 3 ip | 4 in | 5 hl | 6 hr | 7 v
        8 f0 | 9 s0 | 10 f1 | 11 s1 | 12 LABEL | 13.. tx

    Only field 1 (line number), field 12 (label, a ``Label`` value) and the
    text are used here.

    Parameters
    ----------
    path : str or path-like
        Location of the labelled text file.

    Returns
    -------
    tuple
        ``(paragraph_counter, header_counter, header_stack)`` where
        ``paragraph_counter`` is the number of ``Label.PAR_STARTS`` lines,
        ``header_stack`` is a list of ``(line_number, header_text)`` tuples and
        ``header_counter == len(header_stack)``.  A run of consecutive
        ``Label.HEADER`` lines becomes a single entry: its texts are joined
        with ``'@@'`` and it carries the line number (kept as a string) of
        the first line of the run.

    Raises
    ------
    OSError
        If the file cannot be opened.
    IndexError
        If a non-blank line has fewer than 13 colon-separated fields.
    """
    par_starts = Label.PAR_STARTS.value
    header = Label.HEADER.value

    paragraph_counter = 0
    header_stack = []
    cur_number = None
    cur_parts = []  # texts of the header run in progress; empty means "no run"

    with open(path) as f:
        for line in f:
            # Strip the newline *before* splitting so that a record without a
            # text field still yields a clean label (not e.g. 'C\n').
            record = line.rstrip('\n')
            if not record.strip():
                # Blank lines carry no label; ignore them rather than abort.
                continue

            # maxsplit keeps any colons inside the text field intact.
            fields = record.split(':', 13)
            line_number = fields[1]
            label = fields[12]
            text = fields[13] if len(fields) > 13 else ''

            if label == par_starts:
                paragraph_counter += 1

            if label == header:
                if not cur_parts:
                    cur_number = line_number
                cur_parts.append(text)
            elif cur_parts:
                # First non-header line after a run closes that run.
                header_stack.append((cur_number, '@@'.join(cur_parts)))
                cur_parts = []

    # A header run that reaches the end of the file still has to be flushed.
    if cur_parts:
        header_stack.append((cur_number, '@@'.join(cur_parts)))

    return paragraph_counter, len(header_stack), header_stack