In [None]:
import re

def find_date_time(text):
    """Find the first date immediately followed by a time in *text*.

    A date in one of the supported layouts must be followed by an optional
    separator (a dash, the word "at", a comma, or just whitespace) and then a
    time such as ``10:30``, ``23:59:59`` or ``5:45 pm``. Matching is
    case-insensitive.

    Args:
        text: The string to search.

    Returns:
        The ``re.Match`` for the first date+time occurrence, or ``None`` when
        the text contains none.
    """
    # One alternative per supported date layout; the pieces concatenate into a
    # single alternation.
    date_pattern = (
        # Year first, numeric: 2024-01-15, 2024/1/5, 2024.01.15
        r"(?:\b(?:19|20)\d{2}[-/.](?:0?[1-9]|1[0-2])[-/.](?:0?[1-9]|[12]\d|3[01])\b)"
        # Month first, numeric: 01/15/2024
        r"|(?:\b(?:0?[1-9]|1[0-2])[-/.](?:0?[1-9]|[12]\d|3[01])[-/.](?:19|20)\d{2}\b)"
        # Day first, numeric: 15/01/2024
        r"|(?:\b(?:0?[1-9]|[12]\d|3[01])[-/.](?:0?[1-9]|1[0-2])[-/.](?:19|20)\d{2}\b)"
        # Day, month name, optional year: 3rd of March 2021, 15-Jan
        r"|(?:\b(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?(?:\s*of)?[\s-]+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[\s-]*(?:\d{4})?\b)"
        # Month name, day, optional year: March 3rd, 2021
        r"|(?:\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[\s-]+(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?(?:[\s,-]*\d{4})?\b)"
        # Year, month name, day: 2021 March 3rd
        r"|(?:\b\d{4}[\s,-]+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[\s,-]+(?:0?[1-9]|[12]\d|3[01])(?:st|nd|rd|th)?\b)"
        # Day first, dot separated: 15.01.2024
        r"|(?:\b(?:0?[1-9]|[12]\d|3[01])\.(?:0?[1-9]|1[0-2])\.(?:19|20)\d{2}\b)"
    )
    # 24-hour or 12-hour clock, optional seconds, optional AM/PM marker.
    time_pattern = r"(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?(?:\s*[APap][Mm])?|\b(?:[1-9]|1[0-2])(?:[:.][0-5]\d){1,2}\s*[APap][Mm]\b"

    # The trailing bare `\s*` lets the time follow the date with only
    # whitespace (or nothing) in between.
    date_time_separator = r"\s*-\s*|\s+at\s+|\s*,\s*|\s*"
    date_time_pattern = rf"(?:{date_pattern})(?:{date_time_separator})(?:{time_pattern})"

    return re.search(date_time_pattern, text, re.IGNORECASE)


In [None]:
def find_wordy_blocks(text):
    """Locate the prose-like ("wordy") blocks of *text*.

    The text is cut into blocks at every ``laysep@@##$$>`` marker and at every
    run of two or more newlines (whitespace between the newlines is allowed).
    A block is wordy when, after stripping surrounding whitespace, it contains
    at least five consecutive words of two or more letters, each followed by a
    whitespace character.

    Args:
        text: The full document text.

    Returns:
        A list of ``(start, end)`` index pairs into *text*, one per wordy
        block, in document order. Each pair spans the block with its
        surrounding whitespace stripped.
    """
    separator = re.compile(r"(?:laysep@@##\$\$>|(?:\n\s*){2,})")
    consecutive_words = re.compile(r"(?:[a-zA-Z]{2,}\s){5,}", re.IGNORECASE)

    # Block boundaries come straight from the separator matches, so every
    # span is exact and no substring search of the text is needed.
    boundaries = [(m.start(), m.end()) for m in separator.finditer(text)]
    # Sentinel so the final block (after the last separator) is handled too.
    boundaries.append((len(text), len(text)))

    wordy_blocks = []
    segment_start = 0
    for segment_end, next_segment_start in boundaries:
        segment = text[segment_start:segment_end]
        stripped = segment.strip()
        if stripped and consecutive_words.search(stripped):
            # Skip the leading whitespace that strip() removed.
            start = segment_start + len(segment) - len(segment.lstrip())
            wordy_blocks.append((start, start + len(stripped)))
        segment_start = next_segment_start

    return wordy_blocks

In [None]:
def filter_matches(matches, numeric_pattern):
    """Reduce several book/page match strings to one ``(book, page)`` pair.

    Each string in *matches* must contain exactly two tokens matching
    *numeric_pattern*: the book number first, then the page number (or page
    range such as ``"5-9"``).

    Resolution rules:
      * no matches at all -> ``(None, None)``;
      * the matches disagree on the book -> the first match's book and page;
      * same book, same page everywhere -> that book and page;
      * same book, differing pages, at least one page already a range ->
        the first such range;
      * same book, differing single pages -> ``"<min>-<max>"`` of the pages.

    Args:
        matches: Iterable of matched substrings.
        numeric_pattern: Regex that extracts the number tokens from a match.

    Returns:
        A ``(book_num, page_num)`` tuple of strings, or ``(None, None)`` when
        *matches* is empty.

    Raises:
        ValueError: If a match does not contain exactly two numeric tokens.
    """
    book_nums = []
    page_nums = []
    for m in matches:
        # Unpacking enforces "exactly two numbers per match".
        book_num, page_num = re.findall(numeric_pattern, m)
        book_nums.append(book_num)
        page_nums.append(page_num)

    # Previously an empty input crashed with IndexError on book_nums[0].
    if not book_nums:
        return (None, None)

    first_book, first_page = book_nums[0], page_nums[0]

    # Conflicting books, or full agreement on the page: the first match wins.
    if len(set(book_nums)) != 1 or len(set(page_nums)) == 1:
        return (first_book, first_page)

    # Same book, different pages: prefer a range that is already spelled out.
    ranged_pages = [p for p in page_nums if "-" in p]
    if ranged_pages:
        return (first_book, ranged_pages[0])

    # Only single pages remain, so build the covering range ourselves.
    pages = [int(p) for p in page_nums]
    return (first_book, f"{min(pages)}-{max(pages)}")

In [None]:

def extract_document_info(document_text):
    """Collect book/page references that are not buried inside prose.

    Searches *document_text* for a book reference directly followed by a page
    reference (for example ``"Book 12, Page 5"`` or ``"Bk: 12 Pg: 5-9"``) and
    drops every hit that lies entirely inside a block reported by
    ``find_wordy_blocks``.

    Args:
        document_text: The full document text.

    Returns:
        A list of the matched substrings that survived the filter, in
        document order. The list is empty when nothing matched.
    """
    book = r"(?i:book|bk|bk\.|b|volume|vol|v)"
    page = r"(?i:page|pg|pg\.|p)"
    numeric_pattern = r"\d+(?:\s*-\s*\d+)?"

    # NOTE(review): the keyword alternatives here cannot be preceded by
    # whitespace, so "Book No. 12" does not match while "Book 12" does.
    # Confirm whether that is intended.
    suffix = r"(?:\s*|no|no\.|number|num|\s*)?"

    key_value_separator = r"(?:-|:|:-|\s+)"
    key_value_pattern = rf"(\s*{key_value_separator}\s*)"

    comma_separator = r"\s*,\s*"
    and_separator = r"\s*and\s*"

    word_separator = rf"(?:{comma_separator}|{and_separator}|\s+)"

    book_pattern = rf"{book}{suffix}{key_value_pattern}{numeric_pattern}"
    page_pattern = rf"{page}{suffix}{key_value_pattern}{numeric_pattern}"

    full_pattern = rf"{book_pattern}{word_separator}{page_pattern}"

    wordy_blocks = find_wordy_blocks(document_text)

    valid_matches = []
    for match in re.finditer(full_pattern, document_text, re.IGNORECASE):
        # A reference fully contained in a prose block is treated as noise.
        inside_prose = any(
            block_start <= match.start() and match.end() <= block_end
            for block_start, block_end in wordy_blocks
        )
        if not inside_prose:
            valid_matches.append(match.group())

    # Return the filtered list. The finditer iterator is already exhausted
    # by the loop above, so returning it would hand callers nothing.
    return valid_matches