In [1]:
from collections import namedtuple, defaultdict
import numpy as np
from enum import Enum

In [2]:
# One parsed line of the structure file.  Field order mirrors the
# colon-separated columns of the input: the loader converts the first six
# fields to int and keeps font_type and raw_text as strings.
DataLine = namedtuple(
    'DataLine',
    'page_number line_global_number int_from_prev int_from_next '
    'margin_left font_size font_type raw_text',
)

In [3]:
# Path of the colon-separated structure file to parse; also the base for the
# default output name built in write_labeled().
structure_file_name = 'y:2018:i:3:p:289-295.json'

In [4]:
# Parse the structure file into a list of DataLine records.
#
# Every input line must hold at least eight colon-separated fields.  The first
# six are integers (an empty field counts as 0), the seventh is the font type
# and everything after the seventh colon is the raw text of the line.
data = []

with open(structure_file_name, "r", encoding="utf8") as read_file:
    for line_no, line in enumerate(read_file, start=1):
        # maxsplit=7 keeps any ':' inside the trailing raw text intact, so no
        # re-joining is needed afterwards.
        splitted = line.rstrip('\n').split(':', 7)

        # Validate before touching individual fields: a short line must give
        # a clear error instead of an IndexError, and the check must survive
        # `python -O`, which strips assert statements.
        if len(splitted) < 8:
            raise ValueError(
                f'{structure_file_name}, line {line_no}: '
                f'must have at least 8 elements, got {len(splitted)}'
            )

        numeric_fields = [int(field) if field else 0 for field in splitted[:6]]
        data.append(DataLine(*numeric_fields, *splitted[6:]))

In [5]:
# Number of parsed lines.
len(data)

740

**Notation**:

- A --- paragraph starts
- B --- paragraph continues
- C --- header starts
- D --- bukvitsa (drop cap)
- ? --- other

# Нахождение отступа основного текста

Основной текст (в одноколоночных публикациях) должен находиться левее всего.

In [6]:
# Left margin of every parsed line, in file order.
indents = np.array([x.margin_left for x in data])

In [7]:
# Print the margins in ascending order to eyeball which values repeat.
print(np.sort(indents))

[  0  31  33  33  33  34  34  34  34  34  34  34  34  34  34  34  34  34
  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34
  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34
  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34
  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34  34
  34  34  34  34  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42  42
  42  42  42  42  42  42  42  42  42  42  42  42  4

Здесь можно для каждого журнала найти [и захардкодить] "важные" значения: отступы для основного текста, для начал абзацев, ...

Мы попытаемся определять эти границы при помощи некоторой статистики.

In [34]:
# Page number of the final parsed line; the font-detection loop skips this
# page together with page 1.
last_page = data[-1].page_number

In [35]:
# Show the detected last page number.
last_page

7

In [8]:
# Frequency table of left margins.
#   unique_margin_lefts -- distinct margin values
#   pos_margin_lefts    -- for every line, the index of its margin in unique_margin_lefts
#   counts_margin_lefts -- number of lines per distinct margin
#   sorted_margin_lefts -- indices into unique_margin_lefts, most frequent first
margin_lefts = np.array([entry.margin_left for entry in data])

unique_margin_lefts, pos_margin_lefts, counts_margin_lefts = np.unique(
    margin_lefts, return_inverse=True, return_counts=True
)
sorted_margin_lefts = np.argsort(counts_margin_lefts)[::-1]

In [9]:
# Distinct left-margin values found in the document.
unique_margin_lefts

array([  0,  31,  33,  34,  42,  43,  48,  49,  50,  56,  58,  79, 217,
       225, 228, 232, 241, 263, 354, 362, 368, 377, 400, 409, 416, 424,
       427, 432])

In [14]:
# Line counts of the ten most frequent margins, most frequent first.
counts_margin_lefts[sorted_margin_lefts[:10]]

array([166, 100,  89,  79,  49,  41,  30,  29,  29,  26])

In [15]:
# The ten most frequent margin values, in the same order as the counts.
unique_margin_lefts[sorted_margin_lefts[:10]]

array([ 42, 362,  34, 354, 225, 424, 232, 416,  49, 409])

In [20]:
# All distinct margin values ordered from most to least frequent.
lens = unique_margin_lefts[sorted_margin_lefts]

In [21]:
# First ten entries of the frequency-ordered margins.
lens[:10]

array([ 42, 362,  34, 354, 225, 424, 232, 416,  49, 409])

In [22]:
# The four most frequent left margins, used below as the body-text margins.
# NOTE(review): the count 4 is hard-coded; presumably two columns times two
# page layouts for this journal -- confirm before reusing on other documents.
MIN_MARGINS = lens[:4]

In [57]:
# Show the selected body-text margins.
MIN_MARGINS

array([ 42, 362,  34, 354])

# Определение шрифта основного текста

Это должен быть самый частый шрифт для строк с основным текстом.

In [49]:
# Detect the body-text font and the drop cap in one pass over the inner pages.
#   REGULAR_FONT_TYPE  -- font type seen most often on lines whose left margin
#                         lies within 2 units of a body-text margin (MIN_MARGINS)
#   REGULAR_FONT_SIZE  -- most frequent font size among those lines of that type
#   MAX_FONT_SIZE      -- largest font size met on the inner pages
#   max_font_size_ind  -- position in `data` of the first line with that size
font_types = defaultdict(int)                          # font type -> number of body-margin lines
_size_counts = defaultdict(lambda: defaultdict(int))   # font type -> {font size -> count}

REGULAR_FONT_TYPE = None
REGULAR_FONT_SIZE = None
MAX_FONT_SIZE = 0
max_font_type = 0          # line count of the currently leading font type
max_font_size_ind = None

for ind, x in enumerate(data):
    # the first and the last page are left out of the statistics
    if x.page_number in (1, last_page):
        continue

    # drop cap
    if MAX_FONT_SIZE < x.font_size:
        MAX_FONT_SIZE = x.font_size
        # Store the list position, not line_global_number: later cells use
        # this value to index `data`, which is only equivalent when the
        # numbering is 0-based and has no gaps.
        max_font_size_ind = ind

    if min(abs(x.margin_left - MIN_MARGINS)) <= 2:  # allowed margin deviation
        font_types[x.font_type] += 1
        _size_counts[x.font_type][x.font_size] += 1
        if font_types[x.font_type] > max_font_type:
            max_font_type = font_types[x.font_type]
            REGULAR_FONT_TYPE = x.font_type

# Choose the size by frequency within the winning type, instead of taking the
# size of whichever line happened to raise the leading count last.
if REGULAR_FONT_TYPE is not None:
    _sizes = _size_counts[REGULAR_FONT_TYPE]
    REGULAR_FONT_SIZE = max(_sizes, key=_sizes.get)

In [45]:
# Show the detected body-text font type and size.
REGULAR_FONT_TYPE, REGULAR_FONT_SIZE

('g_d0_f7', 100)

In [50]:
# Show the largest font size and where it was found.
MAX_FONT_SIZE, max_font_size_ind

(4195, 97)

In [54]:
# Left margin of the entry that follows the largest-font line in `data`;
# process_line labels lines with exactly this margin as 'B'.
# NOTE(review): raises if max_font_size_ind is still None or points at the
# last entry of `data`.
MARGIN_FIRST = data[max_font_size_ind+1].margin_left

In [59]:
# Inspect the entry of `data` at max_font_size_ind.
data[max_font_size_ind]

DataLine(page_number=2, line_global_number=97, int_from_prev=0, int_from_next=0, margin_left=31, font_size=4195, font_type='g_d0_f7', raw_text='И')

In [55]:
# Show the margin of the line following the largest-font line.
MARGIN_FIRST

79

In [67]:
# Font type that process_line treats as a header (label 'C').
# The value is hard-coded here; the original author left it as a TODO.
C_FONT_TYPE = 'g_d0_f9' # TODO

In [72]:
# Spot check of a single parsed line.
data[206]

DataLine(page_number=3, line_global_number=206, int_from_prev=0, int_from_next=12, margin_left=42, font_size=100, font_type='g_d0_f7', raw_text='получен  ложноположительный  результат,  тогда  ')

# Процессинг документа (первичный)

In [68]:
class State(Enum):
    """Labelling state carried from one processed line to the next."""

    # Initial state: no paragraph, header or drop cap has been seen yet.
    NEUTRAL = 0
    # Entered once a paragraph start, header or drop cap has been labelled.
    PARAGRAPH = 1
    # Declared, but not assigned anywhere in the visible code.
    FIRST_PAR = 2


# Module-level starting state.
cur_state = State.NEUTRAL

In [82]:
# Expected left margins of paragraph-opening lines: every body-text margin
# shifted right by 14.
# NOTE(review): 14 looks like a first-line indent measured for this document
# -- confirm before reusing on other layouts.
PAR_MARGINS = MIN_MARGINS + 14

def process_line(line, cur_state, tolerance=2):
    """Label one parsed line and return the state for the next line.

    Parameters
    ----------
    line : DataLine
        Parsed line; ``margin_left``, ``font_size`` and ``font_type`` are read.
    cur_state : State
        State left behind by the previous line.
    tolerance : int, optional
        Largest accepted distance between ``line.margin_left`` and an expected
        margin (``MIN_MARGINS`` / ``PAR_MARGINS``).  Defaults to 2.

    Returns
    -------
    tuple
        ``(label, new_state)`` where ``label`` is 'A' (drop cap or paragraph
        start), 'B' (paragraph continues), 'C' (header) or '?' (other).

    Raises
    ------
    ValueError
        If a line reaches the paragraph rules while ``cur_state`` is neither
        ``State.NEUTRAL`` nor ``State.PARAGRAPH``.
    """
    # Lines in any other font are ignored and leave the state untouched.
    # C_FONT_TYPE is accepted here as well; otherwise the header rule below
    # could never fire for a header font that differs from the text fonts.
    # NOTE(review): PAR_FONT_TYPE is not defined in any visible cell; it must
    # exist before this function is called.
    if line.font_type not in (PAR_FONT_TYPE, REGULAR_FONT_TYPE, C_FONT_TYPE):
        return '?', cur_state

    # drop cap
    if line.font_size == MAX_FONT_SIZE:
        return 'A', State["PARAGRAPH"]

    # line placed at the margin recorded right after the drop cap
    if line.margin_left == MARGIN_FIRST:
        return 'B', State["PARAGRAPH"]

    # header
    if line.font_type == C_FONT_TYPE:
        return 'C', State["PARAGRAPH"]

    # distance to the nearest body-text margin / paragraph-start margin
    margin_diff = min(abs(line.margin_left - MIN_MARGINS))
    par_margin = min(abs(line.margin_left - PAR_MARGINS))

    if cur_state == State["NEUTRAL"]:
        if par_margin <= tolerance:
            return 'A', State["PARAGRAPH"]   # paragraph starts
        return '?', State["NEUTRAL"]         # something strange

    if cur_state == State["PARAGRAPH"]:
        if margin_diff <= tolerance:
            return 'B', State["PARAGRAPH"]   # paragraph continues
        if par_margin <= tolerance:
            return 'A', State["PARAGRAPH"]   # paragraph starts
        # list or other
        # TODO: find features for lists
        return '?', State["PARAGRAPH"]

    # Any other state used to fall through to an UnboundLocalError.
    raise ValueError(f'process_line cannot handle state {cur_state!r}')

In [83]:
def write_labeled(output_file_name=None):
    """Label every line of ``data`` and write the result to a text file.

    Each output row is the first seven fields of the parsed line, the label
    returned by ``process_line`` and the line's last field, joined with ':'.

    Parameters
    ----------
    output_file_name : str, optional
        Destination path.  When omitted, ``'_l'`` is inserted before the
        extension of ``structure_file_name`` (``name.json`` -> ``name_l.json``).
    """
    import os  # local import keeps this cell independent of the import cell

    if output_file_name is None:
        # splitext copes with extensions of any length, unlike slicing off
        # the last five characters.
        stem, extension = os.path.splitext(structure_file_name)
        output_file_name = f'{stem}_l{extension}'

    with open(output_file_name, 'w', encoding='utf8') as output:
        cur_state = State["NEUTRAL"]

        for line in data:
            label, cur_state = process_line(line, cur_state)
            fields = [*line[:7], label, line[-1]]
            output.write(':'.join(str(field) for field in fields) + '\n')

In [84]:
# Run the labelling pass and write the result next to the input file
# (default output name).
write_labeled()