In [None]:
import io
import os

import pymupdf
from lark import Lark, Tree, UnexpectedInput

from deep_statutes.pdf.token_stream import pdf_to_token_stream, PDFTokenConversionOptions

In [2]:
# Page-layout constants handed to the PDF token converter.
# They are document-specific and must be re-derived for each new PDF.
LeftMargin = 72.0
IndentSize = 36.0

In [3]:
# Location of the statute PDF to parse.  Set STATUTES_PDF_PATH in the
# environment to point at a different copy; the fallback is the original
# author's local file, so existing runs behave exactly as before.
path = os.environ.get(
    "STATUTES_PDF_PATH",
    "/Users/eric/Development/statutes_data/wy/raw_pdf/title03.pdf",
)

doc = pymupdf.open(path)

In [4]:
# Conversion settings for the token stream, built from LeftMargin / IndentSize.
options = PDFTokenConversionOptions(
    left_margin=LeftMargin,
    indent_size=IndentSize,
    infer_centered=False,
    # NOTE(review): the meaning of each slot is defined by
    # PDFTokenConversionOptions; only the second entry is finite — confirm.
    font_sizes=[float('inf'), 12, float('inf'), float('inf')],
)

# Flatten the token stream into one string, one token per line.  A newline is
# written *before* every token, so `text` starts with "\n" and does not end
# with one.
pdf_tokens_gen = pdf_to_token_stream(doc, options)
text_buffer = io.StringIO()
# (offset of the token's first character in `text`, pos yielded with the token)
text_idx_to_pos = []
for pos, pdf_token in pdf_tokens_gen:
    text_buffer.write("\n")
    # Taken after the newline, so the offset points at the token itself.
    text_idx_to_pos.append((text_buffer.tell(), pos))
    text_buffer.write(pdf_token)

text = text_buffer.getvalue()

# Dump the flattened text for manual inspection.  The encoding is pinned so
# non-ASCII characters in the statute text are written the same way on every
# platform instead of depending on the locale default.
with open('foo.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [5]:
# Lark grammar for the heading hierarchy (title > chapter > article > section)
# of the flattened token-stream text.
#   * Layout markers such as <<LINE>>, <<INDENT>>, <<SPAN_M>> and <<SPAN_M_B>>
#     are matched as terminals; bare newlines (EOL) are ignored.
#   * `_sep{x, sep}` is a template for "one or more x separated by sep".
#   * `_blank_line` is two consecutive LINE markers.
#   * `_heading_start` accepts the opening of any one heading kind.
# NOTE(review): BLOCK and PAGE are declared but no rule references them —
# confirm whether the token stream can emit them.
grammar = r'''
_start: LINE* title
_heading_start: title_start | chapter_start | article_start | section_start

title: title_start _blank_line chapter+

chapter: chapter_start _blank_line article+

article: article_start _blank_line many_sections 

_centeredish: INDENT*

title_start.2: LINE* _centeredish SPAN_M "TITLE " NAT " - " RAW_UPPER_TEXT _cont_upper_line?
chapter_start.2: LINE* _centeredish SPAN_M "CHAPTER " NAT " - " RAW_UPPER_TEXT _cont_upper_line?
article_start.2: LINE* _centeredish SPAN_M "ARTICLE " NAT " - " RAW_UPPER_TEXT _cont_upper_line?
_cont_upper_line: LINE _centeredish _span_m_upper

section_number: NAT "-" NAT "-" NAT
section_start: INDENT SPAN_M_B section_number "." RAW_TEXT (LINE _sep{_span_m_b, LINE})? 
section: section_start (_blank_line _sep{paragraph, _blank_line})?

many_sections: _sep{section, _blank_line}

_blank_line: LINE LINE

paragraph: INDENT* _sep{_multi_span_m, LINE}

_span_m: SPAN_M RAW_TEXT
_span_m_upper: SPAN_M RAW_UPPER_TEXT
_span_m_b: SPAN_M_B RAW_TEXT

_multi_span_m: _span_m+
_multi_span_m_b: _span_m_b+

NAT: /[1-9][0-9]*/

RAW_TEXT: /(?:(?!<<)[^\n])+/
RAW_UPPER_TEXT: /(?:(?!<<)[^\na-z])+/

_sep{x, sep}: x (sep x)*

//CENTER.2: /<<CENTER>>/
INDENT.2: /<<INDENT>>/
LINE.2: /<<LINE>>/
BLOCK.2: /<<BLOCK>>/
PAGE.2: /<<PAGE>>/

SPAN_M.2: /<<SPAN_M>>/
SPAN_M_B.2: /<<SPAN_M_B>>/

EOL: /\n/

%ignore EOL
'''


In [6]:
# Build the LALR parser with `_heading_start` as its entry rule, so a parse
# succeeds as soon as any single heading opening has been recognised.
lark = Lark(
    grammar,
    start='_heading_start',
    parser='lalr',
)

In [7]:
# Name of the end-of-input entry looked up in the interactive parser's choices.
END_ACTION = '$END'

def find_matches(text: str, start: str, lark: Lark) -> list[Tree]:
    """Collect a parse tree for every prefix of ``text`` that parses as ``start``.

    The text is fed to Lark's interactive LALR parser one token at a time.
    Before each token is consumed, if end-of-input is an allowed choice in the
    current state, a copy of the parser is fed EOF to obtain the tree for the
    input consumed so far.  Scanning stops at the first point where the text
    can no longer be lexed or parsed, or when the text is exhausted.

    Args:
        text: Input to scan; matching always begins at its first character.
        start: Name of the grammar rule to use as the start symbol.
        lark: A ``Lark`` instance configured with ``parser='lalr'``.

    Returns:
        The parse trees found, in order of increasing prefix length.  Empty if
        no prefix of ``text`` matches.

    Raises:
        ValueError: If ``lark`` was not built with the LALR parser.
    """
    if lark.options.parser != 'lalr':
        raise ValueError("Only lalr parser is supported")

    matched_parse_trees = []
    pi = lark.parse_interactive(text, start=start)
    lex_stream = pi.lexer_thread.lex(pi.parser_state)
    while True:
        # If the current state has an action on end-of-input, feed EOF to a
        # *copy* of the parser to get the tree for the input consumed so far,
        # leaving the live parser free to continue.
        if END_ACTION in pi.choices():
            try:
                matched_parse_trees.append(pi.copy().feed_eof())
            except UnexpectedInput:
                # The $END action may be a reduction that ends in a state
                # rejecting $END; such a prefix is simply not a match.
                pass
        try:
            token = next(lex_stream)
            # Kept inside the try: a token the parser rejects ends the scan
            # just like a lexing failure, instead of aborting the caller.
            pi.feed_token(token)
        except (StopIteration, UnexpectedInput):
            break

    return matched_parse_trees

In [9]:
# Treat every character offset of `text` as a candidate heading start and
# record each (offset, parse tree) pair that find_matches reports for it.
idx_parses = []

for idx in range(len(text)):
    valid_parses = find_matches(text[idx:], '_heading_start', lark)
    idx_parses.extend((idx, tree) for tree in valid_parses)

In [13]:
# Spot-check the matches: print every 177th (offset, parse tree) pair.
SAMPLE_STRIDE = 177
for idx, parse in idx_parses[::SAMPLE_STRIDE]:
    print(idx, parse)


19 Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'title_start'), [Token('LINE', '<<LINE>>'), Token('INDENT', '<<INDENT>>'), Token('INDENT', '<<INDENT>>'), Token('INDENT', '<<INDENT>>'), Token('INDENT', '<<INDENT>>'), Token('SPAN_M', '<<SPAN_M>>'), Token('NAT', '3'), Token('RAW_UPPER_TEXT', 'GUARDIAN AND WARD ')])])
54564 Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'chapter_start'), [Token('SPAN_M', '<<SPAN_M>>'), Token('NAT', '3'), Token('RAW_UPPER_TEXT', 'CONSERVATORSHIPS ')])])
96686 Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'article_start'), [Token('INDENT', '<<INDENT>>'), Token('INDENT', '<<INDENT>>'), Token('INDENT', '<<INDENT>>'), Token('SPAN_M', '<<SPAN_M>>'), Token('NAT', '7'), Token('RAW_UPPER_TEXT', 'CLAIMS ')])])
123932 Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'section_start'), [Token('INDENT', '<<INDENT>>'), Token('SPAN_M_B', '<<SPAN_M_B>>'), Tree(Token('RULE', 'section_number'), [Token('NAT', '3'), Token('NAT', '5'