In [1]:
from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum

In [2]:
# One record of the structure file. The loader converts the first six fields
# to int; font_type and raw_text stay strings.
# NOTE(review): int_from_prev / int_from_next presumably hold the spacing to
# the neighbouring lines — confirm against the file producer.
DataLine = namedtuple(
    'DataLine',
    'page_number line_global_number int_from_prev int_from_next '
    'margin_left font_size font_type raw_text',
)

In [3]:
# Path of the input structure file.
# NOTE(review): despite the .json suffix the loader reads it as plain text with
# colon-separated fields, one record per line — confirm the intended format.
structure_file_name = 'y:2018:i:3:p:58-66.json'

In [4]:
# Parse the structure file into `data`, a list of DataLine records (one per
# input line).
#
# A record has 8 colon-separated fields. Only the first 7 colons are
# separators: the last field (raw_text) may itself contain ':', so the split
# is capped rather than splitting everywhere and re-joining the tail.
N_FIELDS = 8
N_INT_FIELDS = 6  # page_number .. font_size; an empty value is read as 0

data = []

with open(structure_file_name, "r", encoding="utf8") as read_file:
    for line_no, line in enumerate(read_file, start=1):
        fields = line.rstrip('\n').split(':', N_FIELDS - 1)

        # Validate before touching the fields, and with a real exception:
        # an `assert` disappears under `python -O`, and indexing a short line
        # first would fail with an uninformative IndexError.
        if len(fields) < N_FIELDS:
            raise ValueError(
                f'{structure_file_name}, line {line_no}: must have at least '
                f'{N_FIELDS} elements, got {len(fields)}'
            )

        for i in range(N_INT_FIELDS):
            fields[i] = int(fields[i]) if fields[i] else 0

        data.append(DataLine(*fields))

In [5]:
# Number of parsed records (one per line of the structure file).
len(data)

1178

**Notation**:

- A --- paragraph starts
- B --- paragraph continues
- C --- header starts
- ? --- other

# Нахождение отступа основного текста

Основной текст (в одноколоночных публикациях) должен находиться левее всего.

In [8]:
# Left margin of every parsed record, in file order.
indents = np.array([record.margin_left for record in data])

In [14]:
# Print every second value of the sorted margins (halves the printout) to
# eyeball where the margins cluster.
print(np.sort(indents)[::2])

[  0  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55
  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55
  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55  55
  55  55  55  55  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58  58
  58  58  58  58  58  58  63  63  63  63  63  63  66  66  66  66  66  68
  69  72  72  72  72  72  72  75  75  75  75  75  75  75  75  75  85  90
 108 134 134 134 134 134 134 134 134 134 134 134 134 134 135 151 165 220
 220 220 220 220 220 220 220 220 220 220 220 220 220 220 220 220 220 220
 220 220 220 220 220 220 220 220 220 220 220 220 22

Здесь можно для каждого журнала найти [и захардкодить] "важные" значения: отступы для основного текста, для начал абзацев, ...

Мы попытаемся определять эти границы при помощи некоторой статистики.

In [25]:
# Frequency table of left margins.
margin_lefts = np.array([record.margin_left for record in data])

# unique_margin_lefts: distinct margin values, ascending.
# pos_margin_lefts: for every record, the index of its margin in that array.
unique_margin_lefts, pos_margin_lefts = np.unique(
    margin_lefts,
    return_inverse=True,
)

# counts_margin_lefts[i] == number of records whose margin is unique_margin_lefts[i].
counts_margin_lefts = np.bincount(pos_margin_lefts)

# Despite its name this holds *indices* into unique_margin_lefts (and
# counts_margin_lefts), ordered from the most to the least frequent margin.
sorted_margin_lefts = np.flip(np.argsort(counts_margin_lefts))

In [26]:
# Distinct left-margin values, in ascending order.
unique_margin_lefts

array([  0,  55,  58,  63,  64,  66,  68,  69,  72,  75,  83,  85,  90,
       106, 108, 114, 134, 135, 142, 151, 153, 165, 174, 220, 223, 229,
       231, 234, 237, 243, 246, 266, 385, 388, 391, 394, 396, 397, 400,
       402, 434, 447])

In [33]:
# Record counts of the 15 most frequent margins, most frequent first.
counts_margin_lefts[sorted_margin_lefts[:15]]

array([220, 205, 195, 125, 122, 114,  26,  18,  17,  16,  13,  12,  12,
        12,  12])

In [34]:
# The 15 most frequent margin values themselves, most frequent first.
unique_margin_lefts[sorted_margin_lefts[:15]]

array([ 58, 223, 388, 385, 220,  55, 134,  75, 243, 397, 231, 394,  63,
       229,  72])

In [79]:
# How many of the most frequent left margins are treated as body-text margins.
# For this document the six most frequent margins each occur on 100+ lines
# while the seventh occurs on only 26, so the cut-off is placed after six.
# NOTE(review): this is document-specific — re-check it for other inputs.
N_MAIN_MARGINS = 6

# The N_MAIN_MARGINS most frequent left margins, most frequent first.
MIN_MARGINS = unique_margin_lefts[sorted_margin_lefts[:N_MAIN_MARGINS]]

In [80]:
# All distinct left margins ordered from most to least frequent.
# NOTE(review): the name `lens` is misleading — these are margin values.
lens = unique_margin_lefts[sorted_margin_lefts]

In [81]:
# The 15 most frequent margin values.
lens[:15]

array([ 58, 223, 388, 385, 220,  55, 134,  75, 243, 397, 231, 394,  63,
       229,  72])

# Определение шрифта основного текста

Это должен быть самый частый шрифт для строк с основным текстом.

In [99]:
# Detect the body-text font.
#
# Only records whose left margin lies within 2 of one of MIN_MARGINS are
# counted. REGULAR_FONT_TYPE is the font type that occurs most often among
# them (on a tie, the one that reached the top count first), and
# REGULAR_FONT_SIZE is the most common size used with that font type.
font_types = defaultdict(int)  # font_type -> number of matching records
font_sizes_by_type = defaultdict(lambda: defaultdict(int))  # font_type -> size -> count

REGULAR_FONT_TYPE = None
REGULAR_FONT_SIZE = None
max_font_type = 0  # highest per-type count seen so far

for x in data:
    if min(abs(x.margin_left - MIN_MARGINS)) <= 2:  # allowed tolerance
        font_types[x.font_type] += 1
        font_sizes_by_type[x.font_type][x.font_size] += 1
        if font_types[x.font_type] > max_font_type:
            max_font_type = font_types[x.font_type]
            REGULAR_FONT_TYPE = x.font_type

# Fail here with a clear message rather than later, when arithmetic on a
# None font size would raise an unrelated-looking TypeError.
if REGULAR_FONT_TYPE is None:
    raise ValueError(
        'no line lies within 2 of any of MIN_MARGINS; '
        'cannot determine the regular font'
    )

# Take the size by majority vote within the winning font type instead of from
# whichever single record happened to raise the running maximum last.
size_counts = font_sizes_by_type[REGULAR_FONT_TYPE]
REGULAR_FONT_SIZE = max(size_counts, key=size_counts.get)

In [100]:
# Detected body-text font as (font type, font size).
REGULAR_FONT_TYPE, REGULAR_FONT_SIZE

('g_d0_f6', 81)

# Процессинг документа (первичный)

In [225]:
class State(Enum):
    """State threaded from one line to the next while labelling lines."""

    NEUTRAL = 0    # initial state; also the state right after a header
    PARAGRAPH = 1  # a paragraph start has been seen


# State before any line has been processed.
cur_state = State.NEUTRAL

In [267]:
# Left margins treated as "first line of a paragraph": every body-text margin
# shifted right by a fixed offset of 14.
PAR_MARGINS = MIN_MARGINS + 14  # hardcoded
# Number of section headers ('C') seen so far; process_line increments it and
# labels every non-header line '?' while it is still 0.
C_count = 0

In [268]:
#PAR_MARGINS = -np.ones(6)  # могли бы захардкодить, если бы знали заранее

def process_line(line, cur_state):
    """Label one line and advance the paragraph state.

    Labels: 'A' paragraph starts, 'B' paragraph continues, 'C' section
    header, '?' anything else.

    Reads the module-level MIN_MARGINS, PAR_MARGINS, REGULAR_FONT_TYPE and
    REGULAR_FONT_SIZE, and increments the module-level C_count for every
    header it recognises.

    Args:
        line: record exposing margin_left, font_size and font_type.
        cur_state: State in effect before this line.

    Returns:
        (label, new_state) tuple.
    """
    global C_count

    # Distance from this line's left margin to the nearest body-text margin.
    body_gap = np.abs(line.margin_left - MIN_MARGINS).min()
    size_gap = abs(line.font_size - REGULAR_FONT_SIZE)
    # Distance to the nearest paragraph-start margin.
    indent_gap = np.abs(line.margin_left - PAR_MARGINS).min()

    # Section header: a different font of roughly body size, placed well away
    # from every body-text margin. A header always ends the current paragraph.
    if line.font_type != REGULAR_FONT_TYPE and size_gap < 4 and body_gap >= 17:
        C_count += 1
        return 'C', State.NEUTRAL

    # Unusual font size, or no header seen yet: label as "other" and leave
    # the state untouched.
    if size_gap >= 4 or C_count == 0:
        return '?', cur_state

    starts_paragraph = indent_gap <= 2

    if cur_state == State.NEUTRAL:
        if starts_paragraph:
            label, new_state = 'A', State.PARAGRAPH
        else:
            label, new_state = '?', State.NEUTRAL
    elif cur_state == State.PARAGRAPH:
        new_state = State.PARAGRAPH
        if starts_paragraph:
            label = 'A'
        elif body_gap <= 2:
            label = 'B'
        else:
            # List item or something else.
            # TODO: find features for lists
            label = '?'

    return label, new_state

In [271]:
def write_labeled(output_file_name=None):
    """Label every record in `data` and write the result as colon-separated text.

    Each output line consists of the record's first seven fields, the label
    returned by process_line, and the record's last field (raw_text), joined
    with ':'.

    Args:
        output_file_name: destination path. Defaults to structure_file_name
            with '_l' inserted before its extension.
    """
    global C_count

    if output_file_name is None:
        import os  # local import keeps this cell self-contained

        # splitext instead of slicing off a fixed 5 characters, so the default
        # name is also right for extensions other than '.json'.
        stem, ext = os.path.splitext(structure_file_name)
        output_file_name = f'{stem}_l{ext}'

    # process_line labels every non-header line '?' until it has seen the first
    # header, and tracks that in the module-level C_count. Reset it so a
    # repeated call produces the same output as the first one.
    C_count = 0
    cur_state = State["NEUTRAL"]

    with open(output_file_name, 'w', encoding='utf8') as output:
        for line in data:
            label, cur_state = process_line(line, cur_state)
            line_export = [*line[:7], label, line[-1]]
            output.write(':'.join(str(y) for y in line_export) + '\n')

In [272]:
# Write the labelled lines to the default output path derived from
# structure_file_name.
write_labeled()