Adapted from [SPEC5G Preprocessing/CellularPreprocessor](https://github.com/Imtiazkarimik23/SPEC5G)

To use: set `output_dir` to the directory where the final processed file should be written, and place the input file (`LTE_NAS.txt` by default) in the same directory as this notebook.

In [11]:
import os
import re

In [12]:
class Preprocessor:
    """First-pass cleaner for a raw specification text dump.

    Reads the input file line by line, drops lines that look like code,
    markup, tables or numeric noise, strips citations, figure references and
    section numbers, and writes the surviving text to ``output_path`` with a
    line break after every sentence-ending full stop.
    """

    def __init__(self, output_path):
        """Remember where to write and set up the noise markers.

        output_path: path of the text file that processAll() (re)creates.
        """
        self.output_path = output_path
        self.output_file = None  # file handle, only open while processAll() runs
        self.line_count = 0      # number of sentences written by the last run
        # A line containing any of these substrings is treated as code/markup
        # and skipped entirely.
        self.code_ident = ["((", "))", "{{", "}}",  "[[",  "]]",  "::", ":="]
        self.html_tags = ["<img", "<input type", "<p>", "<br>"]

    def processAll(self, input_path):
        """Process the file at ``input_path`` and write the result to ``self.output_path``.

        input_path: path to a single raw text file (it is opened directly by
        processIt()). The output file is overwritten on every call, so
        ``line_count`` restarts from zero.
        """
        self.line_count = 0
        # The context manager closes the output even if processIt() raises.
        with open(self.output_path, 'w') as out:
            self.output_file = out
            self.processIt(input_path, [1])

    def processIt(self, file, task:list):
        """Clean one input file and append the result to the open output file.

        file: path to a UTF-8 text file.
        task: accepted for compatibility with existing callers; currently ignored.

        Requires ``self.output_file`` to be an open, writable text file.
        """
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if any(x in line for x in self.code_ident):
                    continue
                if any(x in line for x in self.html_tags):
                    continue
                if '\\' in line: #noisy data
                    continue

                # str.istitle() is true only when every cased word in the line
                # is title-cased; such a line starts a new output line, anything
                # else is joined to the previous text with a space. The
                # separator is written even if the line is dropped further down.
                if line.istitle():
                    self.output_file.write('\n')
                else:
                    self.output_file.write(' ')

                line = self._clean_line(line)
                if self._is_noise(line):
                    continue
                # Removal of section-heading lines (lines starting with a digit)
                # is currently disabled.

                if line != '' and line != '.':
                    # Split into sentences, keeping the trailing full stop(s)
                    # with each sentence.
                    line_set = re.findall(r'[^.]+[.]*', line)
                    for i, sentence in enumerate(line_set):
                        self.line_count += 1
                        if i != len(line_set)-1:
                            sentence = sentence + '\n'
                        # The last sentence gets no newline so the next input
                        # line can continue it.
                        self.output_file.write(sentence)

            #end of document
            self.output_file.write('\n')

    def _clean_line(self, line):
        """Apply the regex clean-up chain to one line and return the result."""
        line = re.sub(r'\[\d*?\]', '', line) #Citation removal
        line = re.sub(r'(as shown below|[Ss]ee figure below):*\-*', '', line) #remove certain strings
        line = re.sub(r'\(see [fF]igure .*?\d\)', '', line) #remove '(see Figure #)'
        line = re.sub(r'Figure .*?[\dA-Za-z]:', '', line) #Figure captions
        line = re.sub(r'(\d[A-Za-z]*\.)*\d[\.\-]\d?', '', line) #5.3.4.1 or 4.5.2-1 type number removal
        line = re.sub(r'\[pic\]', '', line) #picture alternative bad text removal
        line = re.sub(r'\\x[0-9a-fA-F]{2}', '', line) #removing '\xc#' type patterns

        line = re.sub(r'(\n)+', '', line) #remove extra newlines
        line = re.sub(r'(\t)+', ' ', line) #remove tabs
        line = re.sub(r'^[·\-]', '', line) #starting interpunct/hyphen removal (starting dots are kept)
        line = re.sub(r'\|', '', line) #pipe generated from table
        line = re.sub(r'\.{2,}', '.', line) #multiple dots, keep one
        line = re.sub(r' +', ' ', line) #multiple whitespace, keep one

        line = re.sub(r'([,;:])(\w)', r'\1 \2', line) #insert whitespace after comma, semicolon, colon
        # No positional flag here: the 4th positional argument of re.sub is
        # `count`, so passing re.UNICODE there would cap the replacements at 32.
        line = re.sub(r'\ue000', '', line)
        line = re.sub(r'\xa0|&nbsp;', '', line, flags=re.IGNORECASE) #remove nonbreaking spaces

        # Plain '(' / ')' as replacement: an escaped '\(' in a replacement
        # string is emitted verbatim, backslash included.
        line = re.sub(r'(\( )', '(', line) #whitespace after opening paren.
        line = re.sub(r'( \))', ')', line) #whitespace before closing paren.
        line = re.sub(r'(\(\))|(\[\])|(\{\})', '', line) #remove empty paren, curly-braces, brackets
        line = re.sub(r'^(TS \d\d\d?)|^(TR \d\d\d?)', 'Specification document', line)
        line = re.sub(r'(TS \d\d\d?)|(TR \d\d\d?)', 'specification document', line)
        return line

    def _is_noise(self, line):
        """Return True when a cleaned line should be dropped from the output."""
        if len(line.split()) < 4: #too short to be a sentence
            return True
        if sum(c.isdigit() for c in line)/len(line) >= 0.2: #20% or more of the characters are digits
            return True
        if re.search(r'(C\(\d\d\d\))', line):
            return True
        if re.search(r'% %', line):
            return True
        if re.search(r'-?\d,? -?\d,? -?\d,? -?\d', line): #patterns like -1, 1, 1 1 etc.
            return True
        # Sequences of single letters such as "a, b c".
        # NOTE(review): the pattern is unanchored, so it also matches any
        # one-letter word between two words (e.g. "is a test") - confirm that
        # dropping such lines is intended.
        if re.search(r'-?[A-Za-z],? -?[A-Za-z],? -?[A-Za-z]', line):
            return True
        return False
        
        

In [13]:
# Directory that receives the final processed file; adjust for your machine.
output_dir = "/home/haehnel"
# Raw specification text, expected next to this notebook.
input_path = 'LTE_NAS.txt'
# Intermediate file written by the first step and read by the second step.
step_1_output_path = 'step1processed.txt'

### Outputs

In [14]:
# Step 1: clean the raw input and write the intermediate file.
dataProcessor = Preprocessor(output_path=step_1_output_path)
dataProcessor.processAll(input_path=input_path)

# Second step


In [15]:
# Step 2: drop short fragments from the step-1 output, make sure longer lines
# end with a full stop, and write the result to output_dir.
final_output_path = os.path.join(output_dir, 'processed_LTE_NAS.txt')
token_count_vs_sent_count = {} #key: count of token in a sentence, value: number of sentences with that particular count
token_count = []   # token count of every completed output line
reduction_count = 0  # lines dropped, plus lines merged into the following line
with open(final_output_path, 'w') as out_file, open(step_1_output_path) as file:
    lines = file.readlines()
    key = 0  # tokens accumulated for the output line currently being built
    for line in lines:
        # Fewer than four words (this includes blank lines): not a sentence.
        if len(line.split()) < 4:
            reduction_count += 1
            continue
        line = re.sub(r' +', ' ', line)
        line = re.sub(r'^ +', '', line)
        if len(line.split()) > 4:
            # Terminate the sentence with exactly one full stop. Trailing
            # whitespace and separator punctuation are dropped first, so the
            # last real character of the line is never overwritten and a line
            # ending in ". " does not end up with "..".
            body = line.rstrip(' \t\r\n:;,-')
            if not body.endswith('.'):
                body += '.'
            line = body + '\n'
        if line.endswith(('e.g.\n', 'i.e.\n')):
            # The sentence splitter cut the line at an abbreviation: remove the
            # line break so the next line continues this one.
            line = line[:-2] + ' '
            reduction_count += 1
        key += len(line.split())
        if len(line) > 1 and line[-1] == '\n': #end of a proper line, record its token count
            token_count.append(key)
            key = 0

        out_file.write(line)
print("Number of line reduced: ", reduction_count)

Number of line reduced:  2535
