In [None]:
import re

from collections import Counter

def normalized(s):
    """Return a canonical form of a document number for loose comparison.

    The value is converted to ``str`` and split on ``-`` and ``_``.  When
    that yields exactly two pieces, the result is the first piece followed
    by the second piece with its leading zeros removed.  Any other value is
    returned unchanged (as a string).
    """
    as_text = str(s)
    pieces = re.split(r'[-_]', as_text)
    if len(pieces) != 2:
        return as_text
    head, tail = pieces
    return head + tail.lstrip('0')


def search_adjacent_blocks(blocks, date_time_block_index, doc_num_pattern, mode):
  """Collect candidate document numbers near the date/time block.

  Starting at ``date_time_block_index`` the walk moves by ``-mode`` per step
  (``mode=1`` walks backwards, ``mode=-1`` walks forwards) and stops as soon
  as it leaves the list or moves more than one block away from the start.

  Args:
    blocks: sequence of regex match objects, one per text block.
    date_time_block_index: index of the block holding the date/time.
    doc_num_pattern: regex used to find candidate numbers in a block.
    mode: step subtracted from the current index after each block.

  Returns:
    A list (in arbitrary order, without duplicates) of the candidates, with
    all whitespace removed, excluding every number reported by
    ``find_doc_num_matches`` for the visited blocks.
  """
  # The walk is confined to [start - 1, start + 1] clipped to valid indices.
  lowest = max(0, date_time_block_index - 1)
  highest = min(len(blocks) - 1, date_time_block_index + 1)

  candidates = []
  rejected = []
  position = date_time_block_index

  while lowest <= position <= highest:
    block_text = blocks[position].group()
    hits = re.finditer(doc_num_pattern, block_text)
    rejected.extend(find_doc_num_matches(block_text))
    candidates.extend(hit.group() for hit in hits)
    position -= mode

  # Compare numbers with every whitespace character stripped out.
  candidates = [re.sub(r"\s+", "", value) for value in candidates]
  rejected = [re.sub(r"\s+", "", value) for value in rejected]

  return list(set(candidates) - set(rejected))



import re

def find_doc_num_matches(text):
    """Return numbers in ``text`` that are labelled by a non-document-number key.

    A candidate is a newline followed by a two-part key (letters, ``.`` or
    ``#``), a ``-``, ``:`` or ``:-`` separator, and a number.  When the key
    does not look like a document-number label (for example "doc no" or
    "instrument #"), the document-number-shaped value inside the candidate
    is collected.

    Args:
        text: the text to scan.

    Returns:
        A list of the matched number strings exactly as they appear in
        ``text`` (surrounding whitespace captured by the pattern included).
    """
    doc_num_pattern = r"(?:(?<!\d)\d{6,15}\s+|(?<!\d)\d{4}\s{0,4}-\s{0,4}\d{4,15}\s+|\s+\d{4}\s{0,4}_\s{0,4}\d{4,15}\s+)"
    direct_doc_num_pattern = r"(?:(?<!\d)\d{4,15}\s+|(?<!\d)\d{4}\s{0,4}-\s{0,4}\d{4,15}\s+|\s+\d{4}\s{0,4}_\s{0,4}\d{4,15}\s+)"
    double_key_pattern = r"(?:\n[a-zA-Z.#]+\s*[a-zA-Z.#]+)"
    key_val_sep = r"(?:-|:|:-)"
    prefix = r"(?:doc|doc\.|document|instrument|instr\.|instr|inst\.|inst|recording|AFN|CFN|sequence|seq|seq\.)"
    suffix = r"(?:number|num(?:\.)?|id|no(?:\.)?|#)"
    explicit_doc_num_key_pattern = rf"{prefix}\s*{suffix}"
    doc_num_generic_pattern = rf"(?:{double_key_pattern}\s*{key_val_sep}\s*{direct_doc_num_pattern})"

    generic_re = re.compile(doc_num_generic_pattern, flags=re.IGNORECASE)
    key_re = re.compile(double_key_pattern, flags=re.IGNORECASE)
    explicit_key_re = re.compile(explicit_doc_num_key_pattern, flags=re.IGNORECASE)
    number_re = re.compile(doc_num_pattern, flags=re.IGNORECASE)

    rejected = []
    for candidate in generic_re.finditer(text):
        snippet = candidate.group()
        # Every candidate starts with the two-part key, so this search succeeds.
        label = key_re.search(snippet)
        if explicit_key_re.search(label.group()) is not None:
            # The key really is a document-number label; keep the number valid.
            continue
        number = number_re.search(snippet)
        if number:
            rejected.append(number.group())

    return rejected



def get_doc_num(text, date_time_span):
  """Extract the most plausible document number(s) from ``text``.

  Two sources are combined:

  * numbers found in the block containing the date/time and in its
    immediate neighbours (blocks are delimited by ``<laysep@@##$$>``);
  * numbers that directly follow an explicit label such as "Doc No",
    "Instrument #" or "Recording Number" anywhere in ``text``.

  Selection rules for the explicit numbers:

  * if one explicit number occurs more than once, it alone is returned;
  * otherwise, if an explicit number equals (after ``normalized``) one of
    the block numbers, that explicit number alone is returned;
  * otherwise the explicit numbers are appended to the block numbers.

  Args:
    text: full document text.
    date_time_span: ``(start, end)`` offsets of the date/time in ``text``,
      or ``None`` when no date/time was found.

  Returns:
    ``None`` when ``date_time_span`` is ``None``; otherwise a list of
    number strings with all whitespace removed (possibly empty).
  """
  if date_time_span is None:
    return None

  doc_num_pattern = r"(?:(?<!\d)\d{6,15}\s+|(?<!\d)\d{4}\s{0,4}-\s{0,4}\d{4,15}\s+|\s+\d{4}\s{0,4}_\s{0,4}\d{4,15}\s+)"
  direct_doc_num_pattern = r"(?:(?<!\d)\d{4,15}\s+|(?<!\d)\d{4}\s{0,4}-\s{0,4}\d{4,15}\s+|\s+\d{4}\s{0,4}_\s{0,4}\d{4,15}\s+)"
  block_boundary_pattern = r"(?s)(?:.*?<laysep@@##\$\$>|.+)"

  prefix = r"(?:doc(?:\.)?|document|instrument|instr(?:\.)?|inst(?:\.)?|recording|AFN|CFN|sequence|seq(?:\.)?)"
  suffix = r"(?:number|num(?:\.)?|id(?:\.)?|no(?:\.)?|#)"
  key_val_sep = r"(?:-|:|:-|\s*)"
  # The quantifier braces must be doubled inside an f-string; a single
  # "{0,8}" is evaluated as the tuple (0, 8) and injected as literal text.
  # The filler excludes digits so it can never swallow part of the number,
  # which keeps every previously matching label/number pair unchanged.
  explicit_doc_num_pattern = rf"{prefix}\s*{suffix}(?:[^\d\n]{{0,8}})?\s*{key_val_sep}\s*{direct_doc_num_pattern}"

  blocks = list(re.finditer(block_boundary_pattern, text, re.DOTALL))
  span_start, span_end = date_time_span[0], date_time_span[1]

  # Prefer the block that wholly contains the date/time; if the span
  # straddles a separator, fall back to the block containing its start.
  index = next(
      (i for i, b in enumerate(blocks) if b.start() <= span_start and span_end <= b.end()),
      None,
  )
  if index is None:
    index = next(
        (i for i, b in enumerate(blocks) if b.start() <= span_start < b.end()),
        None,
    )

  doc_num = []
  if index is not None:
    preceding_blocks_doc_num = search_adjacent_blocks(blocks, index, doc_num_pattern, 1)
    following_blocks_doc_num = search_adjacent_blocks(blocks, index, doc_num_pattern, -1)
    # Both searches visit the date/time block itself, so drop the repeats
    # while keeping first-seen order.
    doc_num = list(dict.fromkeys(preceding_blocks_doc_num + following_blocks_doc_num))

  cleaned_nums = []
  for found in re.finditer(explicit_doc_num_pattern, text, flags=re.IGNORECASE):
    # Drop the label, then remove whitespace so the value is comparable with
    # the whitespace-free numbers returned by search_adjacent_blocks.
    number = re.sub(r"\s+", "", re.sub(r'^\D+', '', found.group()))
    if number:
      cleaned_nums.append(number)

  counts = Counter(cleaned_nums)
  if not counts:
    return doc_num

  most_common_item, frequency = counts.most_common(1)[0]
  if frequency > 1:
    return [most_common_item]

  confirmed = next(
      (num for num in counts if any(normalized(num) == normalized(dn) for dn in doc_num)),
      None,
  )
  if confirmed:
    return [confirmed]

  doc_num.extend(counts)
  return doc_num

# Ad-hoc driver: locate the date/time in `text` and look up document numbers
# around it.  `get_doc_num` receives None when no date/time is found.
text = ""
date_time_match = find_date_time(text)
date_time_span = None if date_time_match is None else date_time_match.span()

doc_num = get_doc_num(text, date_time_span)




In [None]:
def filter_matches(matches, numeric_pattern):
  """Reduce several "book ... page ..." matches to one (book, page) pair.

  Each match is expected to contain exactly two numbers found by
  ``numeric_pattern``: the book number followed by the page number.
  Matches that do not yield exactly two numbers are ambiguous and ignored.

  Resolution rules:

  * different book numbers: the first usable match wins;
  * same book, same page: that pair is returned;
  * same book, different pages, one of them already a range (contains
    ``-``): the first such range is returned;
  * same book, different plain pages: the pages are merged into
    ``"<min>-<max>"``.

  Args:
    matches: iterable of matched text snippets.
    numeric_pattern: regex that matches a number or a number range.

  Returns:
    A ``(book, page)`` tuple of strings, or ``(None, None)`` when no
    usable match is available.
  """
  book_nums = []
  page_nums = []
  for m in matches:
    numbers = re.findall(numeric_pattern, m)
    if len(numbers) != 2:
      # Cannot tell which number is the book and which is the page.
      continue
    book_num, page_num = numbers
    book_nums.append(book_num)
    page_nums.append(page_num)

  if not book_nums:
    return (None, None)

  if len(set(book_nums)) != 1:
    return (book_nums[0], page_nums[0])

  book_num = book_nums[0]
  if len(set(page_nums)) == 1:
    return (book_num, page_nums[0])

  existing_range = next((s for s in page_nums if '-' in s), None)
  if existing_range is not None:
    return (book_num, existing_range)

  page_values = [int(s) for s in page_nums]
  return (book_num, f"{min(page_values)}-{max(page_values)}")

In [None]:
# Matches a run of digits, optionally followed by a hyphen (with optional
# surrounding whitespace) and a second run of digits, e.g. "12" or "12 - 15".
numeric_pattern = r"\d+(?:\s*-\s*\d+)?"

import re



def extract_document_info(filename, document_text):
  """Extract book, page, recording date and document numbers from a document.

  The first date immediately followed by a time is treated as the recording
  date/time.  Book/page references are only accepted when they lie inside
  the block containing that date/time or one of its direct neighbours
  (blocks are delimited by ``<laysep@@##$$>``).

  Args:
    filename: name of the source file (not used by this function).
    document_text: full text of the document.

  Returns:
    A tuple ``(book_num, page_num, recording_date, doc_num)``.  The first
    three are strings or ``None``; ``doc_num`` is whatever ``get_doc_num``
    returns for the detected date/time span.
  """
  # NOTE(review): the result is not used below; the call is kept in case
  # classify_deed_type has side effects - confirm and remove if not.
  header = classify_deed_type(document_text)

  month = r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
  date_pattern = rf"(?:(?:19|20)\d{{2}}[-/\.](?:0?[1-9]|1[0-2])[-/\.](?:0?[1-9]|[12]\d|3[01])\b|(?:0?[1-9]|1[0-2])[-/\.](?:0?[1-9]|[12]\d|3[01])[-/\.](?:19|20)\d{{2}}\b|(?:0?[1-9]|[12]\d|3[01])[-/\.](?:0?[1-9]|1[0-2])[-/\.](?:19|20)\d{{2}}\b|(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?(?:\s*day)?(?:\s*of)?\s+{month}(?:\s*[,\/\-]?\s*)?(?:\d{{4}})?\b|{month}(?:\s*[,\/\-]?\s*)?(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?(?:\s*[,\/\-]?\s*)(?:\d{{4}})?\b|(?:19|20)\d{{2}}(?:\s*[,\/\-]?\s*)?{month}(?:\s*[,\/\-]?\s*)?(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?\b|(?:0?[1-9]|[12]\d|3[01])\.(?:0?[1-9]|1[0-2])\.(?:19|20)\d{{2}})"
  time_pattern = r"(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?(?:\s*[APap][Mm])?|\b(?:[1-9]|1[0-2])(?:[:.][0-5]\d){1,2}\s*[APap][Mm]"

  date_time_separator = r"(?:\s*-\s*|\s+at\s+|\s*,\s*|\s*|.{0,20})"
  date_time_pattern = rf"(?:{date_pattern})(?:{date_time_separator})(?:{time_pattern})"

  book = r"(?:book|bk(?:\.)?|b(?:\.)?|volume|vol(?:\.)?|v(?:\.)?)"
  page = r"(?:page(?:s)?|pg(?:\.)?(?:s)?|p(?:\.)?(?:s)?)"
  numeric_pattern = r"\d+(?:\s*-\s*\d+)?"

  suffix = r"(?:no(?:\.)?|number|num(?:\.)?|\s*)?"

  key_value_separator = r"(?:-|:|:-|\s+)?"
  key_value_pattern = rf"(?:\s*{key_value_separator}\s*)"

  comma_separator = r"(?:\s*,\s*)"
  and_separator = r"(?:\s*and\s*)"

  # The quantifier braces must be doubled inside an f-string; a single
  # "{0,20}" is evaluated as the tuple (0, 20) and injected as literal text.
  # The free-form filler excludes digits so that every match still contains
  # exactly two numbers (book, then page) for filter_matches to unpack.
  word_separator = rf"(?:{comma_separator}|{and_separator}|\s+|[^\d\n]{{0,20}})"

  book_pattern = rf"(?:{book}{suffix}{key_value_pattern}{numeric_pattern})"
  page_pattern = rf"(?:{page}{suffix}{key_value_pattern}{numeric_pattern})"

  full_pattern = rf"{book_pattern}{word_separator}{page_pattern}"
  book_page_matches = re.finditer(full_pattern, document_text, re.IGNORECASE)

  date_time_match = re.search(date_time_pattern, document_text, re.IGNORECASE)
  date_time_span = date_time_match.span() if date_time_match is not None else None

  valid_book_page_matches = []

  if date_time_span is not None:
    block_boundary_pattern = r"(?s)(?:.*?<laysep@@##\$\$>|.+)"
    blocks = list(re.finditer(block_boundary_pattern, document_text, re.DOTALL))
    span_start, span_end = date_time_span

    # Prefer the block that wholly contains the date/time; if the span
    # straddles a separator, fall back to the block containing its start.
    index = next(
        (i for i, b in enumerate(blocks) if b.start() <= span_start and span_end <= b.end()),
        None,
    )
    if index is None:
      index = next(
          (i for i, b in enumerate(blocks) if b.start() <= span_start < b.end()),
          None,
      )

    if index is not None:
      # Window = previous block .. next block, clamped to the block list so
      # a single-block document or a first/last block does not overrun it.
      first_block = blocks[max(index - 1, 0)]
      last_block = blocks[min(index + 1, len(blocks) - 1)]
      window_start, window_end = first_block.start(), last_block.end()

      valid_book_page_matches = [
          m.group()
          for m in book_page_matches
          if m.start() >= window_start and m.end() <= window_end
      ]

  doc_num = get_doc_num(document_text, date_time_span)

  recording_date = None
  if date_time_match is not None:
    # The date/time match always begins with a date, so this search succeeds.
    recording_date = re.search(date_pattern, date_time_match.group(), re.IGNORECASE).group()

  book_num = None
  page_num = None

  if valid_book_page_matches:
    book_num, page_num = filter_matches(valid_book_page_matches, numeric_pattern)

  return (book_num, page_num, recording_date, doc_num)


