In [1]:
from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum

In [2]:
# One record per line of the structure file. The loading cell converts the
# first six fields to int and keeps the last two as strings.
# Field meanings below are inferred from the names -- TODO confirm.
DataLine = namedtuple(
    'DataLine',
    (
        'page_number',         # presumably the page the line is on
        'line_global_number',  # presumably the line index within the document
        'int_from_prev',       # presumably the interval to the previous line
        'int_from_next',       # presumably the interval to the next line
        'margin_left',         # left margin; used to detect body text
        'font_size',
        'font_type',
        'raw_text',            # text of the line (may contain ':')
    ),
)

In [3]:
# Input file with one record per text line. Despite the .json extension,
# the loading cell parses it as plain colon-separated lines.
structure_file_name = 'y:2015:i:1:p:1-17.json'

In [4]:
# Parse the structure file into a list of DataLine records.
#
# Each input line holds 8 colon-separated fields; the last one (raw_text) may
# itself contain ':' characters, so the split is limited to 7 separators.
data = []

with open(structure_file_name, "r", encoding="utf8") as read_file:
    for line_number, line in enumerate(read_file, start=1):
        fields = line.rstrip('\n').split(':', 7)

        # Validate before touching individual fields, so a short line gives a
        # clear error instead of an IndexError or a silently duplicated record.
        if len(fields) < 8:
            raise ValueError(
                f'{structure_file_name}, line {line_number}: '
                f'must have at least 8 elements, got {len(fields)}'
            )

        # The first six fields are numeric; an empty field is treated as 0.
        numeric = [int(value) if value else 0 for value in fields[:6]]
        data.append(DataLine(*numeric, fields[6], fields[7]))

In [5]:
# Number of parsed lines (quick sanity check of the loader).
len(data)

678

**Notation**:

- A --- paragraph starts
- B --- paragraph continues
- C --- header starts
- ? --- other

# Нахождение отступа основного текста

Основной текст (в одноколоночных публикациях) должен находиться левее всего.

In [6]:
# Left margin of every parsed line, in document order.
indents = np.array([record.margin_left for record in data])

In [7]:
# Every 5th value of the sorted margins: a compact view of their distribution.
np.sort(indents)[::5]

array([  0,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
        62,  62,  62,  62,  62,  62,  62,  72,  72,  72,  72,  83,  83,
        83,  83,  83,  83,  83,  83,  83,  90,  90,  90,  90,  90,  90,
        90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,
        90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  93, 156,
       251, 265, 283, 317, 317, 351])

Здесь можно для каждого журнала найти [и захардкодить] "важные" значения: отступы для основного текста, для начал абзацев, ...

Мы попытаемся определять эти границы при помощи некоторой статистики.

In [9]:
# Sorted distinct left margins found in the document.
lens = np.unique(indents)
# Smallest margin other than 0; a leading 0 is skipped.
MIN_MARGIN = lens[lens != 0][0]

In [10]:
# Show the detected minimal (main text) left margin.
MIN_MARGIN

62

# Определение шрифта основного текста

Это должен быть самый частый шрифт для строк с основным текстом.

In [11]:
# Count font types over the lines that sit at the main-text margin and
# remember the most frequent one (the first to reach the top count wins ties).
font_types = defaultdict(int)

REGULAR_FONT_TYPE = None
REGULAR_FONT_SIZE = None
max_font_type = 0  # highest per-font line count seen so far

for x in data:
    # Only lines within +/-2 of the main-text margin (allowed tolerance).
    if abs(x.margin_left - MIN_MARGIN) > 2:
        continue

    occurrences = font_types[x.font_type] + 1
    font_types[x.font_type] = occurrences

    if occurrences > max_font_type:
        max_font_type = occurrences
        # The size is taken from the line that set the new maximum.
        REGULAR_FONT_TYPE, REGULAR_FONT_SIZE = x.font_type, x.font_size

In [12]:
# Show the detected main-text font type and size.
REGULAR_FONT_TYPE, REGULAR_FONT_SIZE

('g_d0_f1', 12)

# Процессинг документа (первичный)

In [13]:
# Left margin of a paragraph's first line. It starts unknown and is set by
# process_line at the first detected paragraph start.
PAR_MARGIN = None  # could be hard-coded if it were known in advance

In [14]:
class State(Enum):
    """States of the line-labelling state machine."""

    NEUTRAL = 0    # not inside a paragraph
    PARAGRAPH = 1  # inside a paragraph


# Labelling starts outside of any paragraph.
cur_state = State.NEUTRAL

In [18]:
def process_line(line, cur_state):
    """Label a single line and advance the paragraph state machine.

    Labels: 'A' paragraph starts, 'B' paragraph continues, 'C' header,
    '?' anything else.

    Args:
        line: a DataLine; its margin_left, font_size and font_type are read.
        cur_state: the State in effect before this line.

    Returns:
        A (label, new_state) tuple.

    Side effects:
        While the global PAR_MARGIN is still None, the first line accepted as
        a paragraph start sets PAR_MARGIN to its left margin.
    """
    global PAR_MARGIN
    margin = line.margin_left

    if cur_state == State.NEUTRAL:
        # A paragraph start is a line that is not at the main-text margin,
        # sits at the paragraph margin (or, while that is still unknown, at
        # one of the first six distinct margins) and uses the regular size.
        starts_paragraph = (
            margin != MIN_MARGIN
            and (margin == PAR_MARGIN
                 or (PAR_MARGIN is None and margin in lens[:6]))
            and line.font_size == REGULAR_FONT_SIZE
        )
        if starts_paragraph:
            if PAR_MARGIN is None:
                PAR_MARGIN = margin
            label, new_state = 'A', State.PARAGRAPH
        else:
            # something strange
            label, new_state = '?', State.NEUTRAL

    elif cur_state == State.PARAGRAPH:
        if margin == PAR_MARGIN:
            # next paragraph starts right away
            label, new_state = 'A', State.PARAGRAPH
        elif margin == MIN_MARGIN:
            # paragraph continues
            label, new_state = 'B', State.PARAGRAPH
        elif line.font_type != REGULAR_FONT_TYPE:
            # unusual margin and a non-regular font: treated as a header
            label, new_state = 'C', State.NEUTRAL
        else:
            # TODO: find features for lists
            label, new_state = '?', State.PARAGRAPH

    return label, new_state

In [21]:
def write_labeled(output_file_name=None):
    """Label every line of `data` and write the result to a file.

    Each output line holds the first six DataLine fields, the label from
    process_line (in place of font_type) and the raw text, joined with ':'.

    Args:
        output_file_name: destination path. When None, '_l' is inserted
            before the last five characters of structure_file_name (this
            assumes a 5-character extension such as '.json').

    Side effects:
        Resets the global PAR_MARGIN to None before labelling, so repeated
        calls do not depend on the margin detected in an earlier run.
    """
    # Without this declaration the reset below only creates an unused local,
    # and process_line keeps seeing the value left over from a previous run.
    global PAR_MARGIN

    if output_file_name is None:
        output_file_name = structure_file_name[:-5] + '_l' + structure_file_name[-5:]
    with open(output_file_name, 'w', encoding='utf8') as output:
        cur_state = State["NEUTRAL"]
        PAR_MARGIN = None  # could be hard-coded if it were known in advance

        for line in data:
            label, cur_state = process_line(line, cur_state)
            # First six fields, then the label, then the raw text.
            line_export = [*line[:6], label, line[-1]]
            output.write(':'.join(str(value) for value in line_export) + '\n')

In [22]:
# Label the document and write it to the default output file
# (the input file name with '_l' inserted before the extension).
write_labeled()