In [1]:
# -----------------------------------------> S
import re
from collections import defaultdict

# Leading arabic numbering: optional "(", 1-3 digits, optional ")" or ".",
# then at least one space (e.g. "1. ", "(2) ", "12) ").
re_numbering = re.compile(r'^ *[(]?[0-9]{1,3}[).]? +')
# A bare run of 1-3 digits, used to pull the number out of a numbering token.
re_number = re.compile(r'[0-9]{1,3}')
# Leading roman numbering built from i/v/x/l (either case), same punctuation
# rules as the arabic form (e.g. "iv. ", "(X) ").
re_roman_numbering = re.compile(r'^ *[(]?[iIvVxXlL]{1,8}[).]? +')
# A run of roman digits inside a numbering token.
# NOTE(review): this also accepts c/C, which re_roman_numbering does not.
re_roman_number = re.compile(r'[iIvVxXlLcC]+')
# Three or more consecutive spaces; has_table() treats such a run as a column gap.
re_space = re.compile(r'[ ]{3,}')
# A bullet marker at the start of a line: either a single character that is
# not a word character, whitespace or one of the listed punctuation marks, or
# the letter "o" followed by a space. Raw string so the regex escapes are not
# interpreted (and warned about) as Python string escapes.
re_bullet = re.compile(
    r'''^\s*([^\w\s%$.,^&()#@!?/\[\]{}|<>_\=~`'"“|:;]|o ){1}\s*'''
)

def rn_to_int(s):
    """Convert a roman numeral string to an integer.

    The conversion is case-insensitive and silently skips every character
    that is not a roman digit, so tokens such as ``"(iv) "`` can be passed
    as-is. A digit smaller than the digit that follows it is subtracted
    (``"iv"`` -> 4); every other digit is added. A string without any roman
    digit yields 0.

    The digit table covers the full set (i, v, x, l, c, d, m) so that values
    matched by ``re_roman_number``, which accepts ``c``/``C``, are converted
    correctly instead of having the ``c`` ignored.
    """
    values = {'m': 1000, 'd': 500, 'c': 100, 'l': 50, 'x': 10, 'v': 5, 'i': 1}
    digits = [values[ch] for ch in s.lower() if ch in values]
    total = 0
    # Pair each digit with its successor; the last digit is paired with 0 so
    # that it is always added.
    for current, following in zip(digits, digits[1:] + [0]):
        total += current if current >= following else -current
    return total


def _is_non_continuous_increasing(numbering_1, numbering_2, is_roman):
    """Tell whether ``numbering_2`` jumps past the successor of ``numbering_1``.

    The first number found in each numbering token is converted to an int
    (roman digits via ``rn_to_int`` when ``is_roman`` is true, plain ``int``
    otherwise). The result is True only when the second value is greater
    than the first value plus one.

    Raises IndexError when either token contains no number of the selected
    kind.
    """
    if is_roman:
        pattern, convert = re_roman_number, rn_to_int
    else:
        pattern, convert = re_number, int
    first_matches = pattern.findall(numbering_1)
    second_matches = pattern.findall(numbering_2)
    first_value = convert(first_matches[0])
    second_value = convert(second_matches[0])
    return second_value > first_value + 1


def has_table(text, additional_table_char=''):
    """Heuristically decide whether ``text`` looks like a table.

    ``text`` is either a string (split on newlines) or an iterable of line
    strings. Each line is cut into cells at runs of three or more spaces.

    True is returned as soon as one of these holds:

    * ``additional_table_char`` is non-empty and more than one line after the
      first contains it in some cell;
    * two consecutive lines have the same number (at least two) of cells,
      not counting the first cell, that start with a letter.

    Otherwise False is returned.
    """
    lines = text.split('\n') if isinstance(text, str) else text
    marker_lines = 0
    previous_columns = None
    for position, raw_line in enumerate(lines):
        cells = re_space.sub('\t', raw_line).split('\t')
        if (additional_table_char and position > 0
                and any(additional_table_char in cell for cell in cells)):
            marker_lines += 1
        if marker_lines > 1:
            return True
        # Count the cells after the first one whose first character is a letter.
        columns = sum(1 for cell in cells[1:] if cell and cell[0].isalpha())
        if columns < 2:
            # Not a table row: forget the previous row's column count.
            previous_columns = None
        elif previous_columns and columns == previous_columns:
            return True
        else:
            previous_columns = columns
    return False


def remove_new_line(text):
    """Collapse a multi-line string into a single line.

    Blank lines are dropped. The very first line is kept untouched when it is
    not blank; every other kept line is stripped and prefixed with one space.
    """
    pieces = []
    for position, row in enumerate(text.split('\n')):
        if not row.strip():
            continue
        pieces.append(row if position == 0 else ' ' + row.strip())
    return ''.join(pieces)

def get_bullet_parsed_text(content):
    """Split ``content`` into chunks at bullet lines.

    Blank lines are ignored. A new chunk starts at every line that begins
    with a bullet marker (``re_bullet``) and at every non-bullet line without
    leading whitespace; indented non-bullet lines are appended to the current
    chunk. A chunk closed by a bullet line or by the end of the text is
    collapsed to one line with ``remove_new_line`` unless
    ``has_table(chunk, ':')`` reports a table; a chunk closed by an
    unindented plain line is stored unchanged.

    Returns:
        ``(False, content)`` (content with surrounding newlines stripped) when
        no run of at least two successive bullet lines sharing the same
        marker exists, otherwise ``(True, chunks)`` with the non-blank chunks.
    """
    def _flatten(chunk):
        # Keep line breaks only for chunks that look like a table.
        return chunk if has_table(chunk, ':') else remove_new_line(chunk)

    content = content.strip('\n')
    chunks = []
    current = ''
    previous_bullet = None
    run_length = 0
    longest_run = 0
    for line in content.split('\n'):
        if not line.strip():
            continue
        found = re_bullet.findall(line)
        if not found:
            if line == line.lstrip():
                # Unindented plain line: close the chunk as-is and end the
                # current bullet run.
                chunks.append(current)
                current = line + '\n'
                previous_bullet = None
                longest_run = max(run_length, longest_run)
                run_length = 0
            else:
                # Indented plain line: continuation of the current chunk.
                current += line + '\n'
            continue
        bullet = found[0].lstrip()
        if previous_bullet and bullet == previous_bullet:
            run_length += 1
        else:
            longest_run = max(run_length, longest_run)
            run_length = 1
        previous_bullet = bullet
        chunks.append(_flatten(current))
        current = line + '\n'
    chunks.append(_flatten(current))
    longest_run = max(run_length, longest_run)
    if longest_run < 2:
        return False, content
    return True, [chunk for chunk in chunks if chunk.strip()]
# -----------------------------------------< S

In [2]:
# -----------------------------------------> S
class NumberingParser:
    """Split text into items at lines that begin with arabic or roman numbering.

    Both attributes are reset at the start of every ``get_parsed_text`` call.
    """

    def __init__(self):
        # (text, numbering) pairs collected while parsing; stored as
        # two-element lists or tuples.
        self.seperated_content = []
        # Number of extra splits performed by _sep_if_has_another_numbering.
        self.count = 0

    def get_parsed_text(self, content):
        """Split ``content`` into its numbered items.

        Lines are scanned for a leading numbering (``re_numbering`` or
        ``re_roman_numbering``). Lines without one, and lines whose numbering
        format does not match the first numbering seen, are appended to the
        item being built.

        Returns:
            ``(False, content)`` when ``_count_max_continuous_numbering``
            reports fewer than two, otherwise ``(True, items)`` where
            ``items`` is a list of strings. Unless the collected pieces look
            like a table (``has_table``), numbered items are further split at
            "a.", "b.", ... sub-items or collapsed to a single line.
        """
        self.seperated_content = []
        self.count = 0
        tmp = ['', None]  # [text accumulated so far, numbering that opened it]
        is_roman = None
        numbering_before = None
        for line in content.split('\n'):
            arabian_numbering = re_numbering.findall(line)
            roman_numbering = re_roman_numbering.findall(line)
            numbering = arabian_numbering if arabian_numbering else roman_numbering
            is_roman = bool(roman_numbering)
            # If it has no numbering, add it tmp and continue next line
            if not numbering:
                tmp[0] += line + '\n'
                continue
            numbering = numbering[0].lstrip()
            if not numbering_before:
                # First numbering seen: flush whatever text preceded it.
                numbering_before = numbering
                self.seperated_content.append(tmp)
            elif self._is_formt_mismatch(numbering_before, numbering):
                # Different numbering style: treat the line as plain text.
                tmp[0] += line + '\n'
                continue
            elif _is_non_continuous_increasing(numbering_before, numbering, is_roman):
                # The number jumped: look for the missing numbering inside the
                # pending text and split there if it is found.
                numbering_before = self._sep_if_has_another_numbering(tmp)
            else:
                # NOTE(review): numbering_before is not advanced on this path.
                self.seperated_content.append(tmp)
            tmp = [line + '\n', numbering]
        if tmp[0] and tmp[1]:
            self._sep_if_has_another_numbering(tmp)

        count = self._count_max_continuous_numbering()
        if count < 2:
            return False, content
        else:
            new_line_removed_contents = []
            is_table = has_table([text for text, __ in self.seperated_content])
            # print(self.seperated_content)
            for item_text, numbering in self.seperated_content:
                if not item_text.strip():
                    continue
                elif not numbering or is_table:
                    new_line_removed_contents.append(item_text)
                    continue
                item_text = self._seperate_subnumbering(item_text)
                if isinstance(item_text, str):
                    new_line_removed_contents.append(remove_new_line(item_text))
                else:
                    new_line_removed_contents.extend(item_text)
            return True, new_line_removed_contents

    def _seperate_subnumbering(self, text):
        """Split ``text`` at lines starting with "a.", "b.", "c.", ... in order.

        Returns ``text`` unchanged (a string) when it does not contain both
        "a." and "b."; otherwise returns a list of strings in which the lines
        of each piece are joined with single spaces.
        """
        if not ('a.' in text and 'b.' in text):
            return text
        subnumbering = 'abcdefghijklmnopqrstuvwxyz'
        seperated_text = []
        idx = 0
        tmp = []
        for line in text.split('\n'):
            # The bound check keeps a 26th sub-item ("z.") from pushing idx
            # past the alphabet and raising IndexError on the next line.
            if (idx < len(subnumbering)
                    and line.lstrip().startswith(f'{subnumbering[idx]}.')):
                seperated_text.append(' '.join(tmp))
                tmp = [line]
                idx += 1
            else:
                # Keep the indentation of a piece's first line only.
                line = line.lstrip() if tmp else line
                tmp.append(line)
        if tmp:
            seperated_text.append(' '.join(tmp))
        return seperated_text

    def _is_formt_mismatch(self, numbering_1, numbering_2):
        """Return True when two numbering tokens use different formats.

        The formats differ when both tokens are arabic (or both roman) but
        the text left after removing the number is different, or when one
        token contains an arabic number and the other a roman one.
        """
        if (re_number.findall(numbering_1) and re_number.findall(numbering_2)
                and re_number.sub('', numbering_1).strip() != re_number.sub('', numbering_2).strip()):
            return True
        elif (re_roman_number.findall(numbering_1) and re_roman_number.findall(numbering_2)
              and re_roman_number.sub('', numbering_1).strip() != re_roman_number.sub('', numbering_2).strip()):
            return True
        elif re_number.findall(numbering_1) and re_roman_number.findall(numbering_2):
            return True
        elif re_roman_number.findall(numbering_1) and re_number.findall(numbering_2):
            return True
        else:
            return False

    def _count_max_continuous_numbering(self):
        """Measure the longest run of consecutive numberings collected so far.

        ``self.seperated_content`` is scanned twice, once reading arabic
        numbers and once reading roman numbers. A run grows while the number
        increases by exactly one and the surrounding format stays the same.

        Returns 1 when any number value was seen more than once (the tally is
        shared by both passes); otherwise the longest run length recorded.
        """
        max_count = 0
        numbering_dic = defaultdict(int)
        for num_format, str_to_int in [(re_number, int), (re_roman_number, rn_to_int)]:
            pre_format = None
            pre_number = None
            count = 0
            for __, numbering in self.seperated_content:
                if not numbering:
                    continue
                numbering_format = num_format.sub('', numbering)
                number = num_format.findall(numbering)
                if not number:
                    continue
                number = str_to_int(number[0])
                numbering_dic[number] += 1
                if not pre_format:
                    pre_format = numbering_format
                    count = 1
                    pre_number = number
                elif (numbering_format == pre_format
                      and number == pre_number + 1):
                    count += 1
                    pre_number += 1
                    max_count = max(max_count, count)
                else:
                    # NOTE(review): the numbering that breaks a run is not
                    # used as the start of the next run - confirm intent.
                    max_count = max(max_count, count)
                    count = 0
                    pre_format = None
                    pre_number = None
        for count in numbering_dic.values():
            if count > 1:
                return 1
        return max_count

    def _sep_if_has_another_numbering(self, text_and_numbering):
        """Split a pending item where the next arabic numbering hides inside it.

        Starting from the item's own numbering, the number is incremented and
        the resulting token is searched for in the text of the last piece. If
        it is found at an index greater than 2, the piece is cut there and the
        search continues with the following number. All resulting pieces are
        appended to ``self.seperated_content``.

        Returns the numbering token of the last piece.
        """
        sep_text = [text_and_numbering]
        numbering = text_and_numbering[1]
        while True:
            number = re_number.findall(numbering)
            if not number:
                # Roman numberings contain no digits, so they are never split.
                break
            next_number = int(number[0]) + 1
            new_numbering = re_number.sub(str(next_number), numbering)
            next_numbering_idx = sep_text[-1][0].find(new_numbering)
            if next_numbering_idx > 2:
                tmp, old_numbering = sep_text.pop()
                sep_text.append((tmp[:next_numbering_idx], old_numbering))
                sep_text.append((tmp[next_numbering_idx:], new_numbering))
                self.count += 1
                numbering = new_numbering
            else:
                break
        self.seperated_content.extend(sep_text)
        return numbering
# -----------------------------------------< S

In [11]:
# Sample heading line read by the scratch cells below (note the leading space).
text = """ 1.12 Air Conditioning and Refrigeration Plant Piping"""

In [12]:
# Show the current value of `text` exactly as stored.
print(text)

 1.12 Air Conditioning and Refrigeration Plant Piping


In [16]:
# Print whether the first whitespace-separated token of `text` consists only
# of digits. Blank text prints False instead of raising IndexError.
tokens = text.split()
print(bool(tokens) and tokens[0].isdigit())

False


In [3]:
# Put every line of `text` on one line: each line loses its leading
# whitespace and is prefixed with a single space.
new_text = ''.join(' ' + line.lstrip() for line in text.splitlines())

print(new_text)

 (3) In principle, inspection and test are to be carried out in accordance with the Builder's working schedule. For smooth construction of the vessel, the inspection parties shall not refuse to inspect the parts designated by the Builder even if some minor works remain which can be completed and subjected to inspection at later stage.


In [4]:
chars = set('o,')
if any((c in chars) for c in text.lstrip()[0]):
    print('Found')
else:
    print('Not Found')

Not Found


In [15]:
# Print the last character of the first line of `text`, ignoring surrounding
# whitespace. Empty text or an empty first line prints an empty string
# instead of raising IndexError.
lines = text.splitlines()
first_line = lines[0].strip() if lines else ''
print(first_line[-1:])

e
