In [132]:
from tqdm import tqdm
import nltk
import re
import contractions
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Fetch the NLTK data packages used by the preprocessing cells below.
nltk.download('stopwords')
# nltk.word_tokenize relies on the Punkt sentence-tokenizer data; recent NLTK
# releases look it up as 'punkt_tab', older ones as 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daffa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daffa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [133]:
def text_preprocess(text, options=None):
    """Normalize ``text`` and return it as a single space-joined token string.

    The steps run in this fixed order: lowercase, expand contractions
    (``contractions.fix``), replace every character that is neither a word
    character nor whitespace with a space, tokenize with
    ``nltk.word_tokenize``, then the optional stages below.

    Args:
        text: Raw input string.
        options: Optional dict of boolean switches. A missing key counts as
            ``True``:
              - ``remove_stopwords``: drop NLTK English stopwords.
              - ``exclude_conditional_stopwords``: keep the words listed in
                ``protected_words`` even though they are stopwords (only
                consulted when ``remove_stopwords`` is on).
              - ``lemmatize``: apply ``WordNetLemmatizer`` to each token.
              - ``stem``: apply ``PorterStemmer`` to each token.

    Returns:
        The processed tokens joined by single spaces (``''`` when no token
        survives).
    """
    # None instead of a shared mutable {} default; an explicit {} behaves the same.
    options = options or {}

    # Stopwords that must survive stopword removal.
    # "until" was previously misspelled "untill", so it was never protected.
    protected_words = {
        "above", "after", "before", "between", "both", "but", "by", "during", "each", "for", "from",
        "further", "if", "in", "into", "more", "most", "not", "now", "off", "on", "once", "only", "or",
        "other", "out", "over", "so", "some", "such", "than", "that", "then", "there", "these", "under",
        "until", "when", "where", "which", "while", "will", "with", "because",
    }

    # Lowercase text
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Replace punctuation with spaces
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    if options.get("remove_stopwords", True):
        stop_words = set(stopwords.words('english'))

        # Keep the protected words in the text
        if options.get("exclude_conditional_stopwords", True):
            stop_words -= protected_words

        tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize text
    if options.get("lemmatize", True):
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stem text
    if options.get("stem", True):
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a text string
    return ' '.join(tokens)

In [134]:
def recount_tokens(preproccesing_text):
    """Build one token record per whitespace-separated word of the text.

    Each record is a dict with the keys ``text``, ``start``, ``end``, ``id``
    and ``ws``. Offsets are computed as if the words were separated by exactly
    one space: ``start`` is the running offset, ``end`` is ``start`` plus the
    word length, and the next word starts one position after ``end``.
    ``id`` is the word's position in the split list. ``ws`` is ``True`` for
    every record except the last one, which gets ``False``.

    Returns an empty list when the text contains no words.
    """
    pieces = preproccesing_text.split()
    final_id = len(pieces) - 1

    records = []
    offset = 0
    for idx, piece in enumerate(pieces):
        stop = offset + len(piece)
        records.append({
            'text': piece,
            'start': offset,
            'end': stop,
            'id': idx,
            # Only the final token is flagged as not followed by whitespace.
            'ws': idx != final_id,
        })
        # Skip the single separating space.
        offset = stop + 1

    return records

In [135]:
def get_token_index(text, string_index):
    """Return the index of the whitespace-separated token covering a character offset.

    Token boundaries are computed as if the words of ``text`` were separated
    by exactly one space. A token matches when
    ``token_start <= string_index <= token_end``; the end bound is inclusive,
    so an exclusive span end maps to the token that finishes there.

    Args:
        text: String whose tokens are obtained with ``text.split()``.
        string_index: Character offset to look up.

    Returns:
        The position of the matching token, or ``-1`` when no token covers
        the offset (including negative offsets and empty text).
    """
    token_start_index = 0
    # enumerate() yields the real position; tokens.index(token) returned the
    # first occurrence and was wrong for repeated words.
    for position, token in enumerate(text.split()):
        token_end_index = token_start_index + len(token)
        if token_start_index <= string_index <= token_end_index:
            return position
        token_start_index = token_end_index + 1
    return -1

In [136]:
def recount_spans(dataset, preprocess_text, options):
    """Re-locate a record's annotated spans inside its preprocessed text.

    For every span in ``dataset['spans']`` the original slice
    ``dataset['text'][start:end]`` is run through ``text_preprocess`` with the
    same ``options`` and searched for in ``preprocess_text``. The first
    occurrence found by ``str.find`` is used.

    Args:
        dataset: One annotation record; reads ``'text'`` and, when present,
            ``'spans'`` (each span provides ``'start'``, ``'end'``, ``'label'``).
        preprocess_text: The already preprocessed full text of the record.
        options: Options dict forwarded to ``text_preprocess``.

    Returns:
        A list with one dict per input span, in input order, holding
        ``start``/``end`` (character offsets in ``preprocess_text``),
        ``token_start``/``token_end`` (from ``get_token_index``) and the
        original ``label``. A span whose preprocessed text is empty or cannot
        be found gets ``-1`` in all four position fields, so callers can
        detect and filter it. Returns ``[]`` when the record has no
        ``'spans'`` key.
    """
    if 'spans' not in dataset:
        return []

    new_spans = []
    for span in dataset['spans']:
        start = span['start']
        end = span['end']
        text_pre = text_preprocess(dataset['text'][start:end], options)

        # str.find returns -1 on a miss and 0 for an empty needle; neither is
        # a real location, so both are treated as "not found".
        new_start = preprocess_text.find(text_pre) if text_pre else -1

        if new_start < 0:
            # Previously a miss yielded end = len(text_pre) - 1 and an
            # arbitrary token_end; mark the span as unlocatable instead.
            new_end = -1
            new_token_start = -1
            new_token_end = -1
        else:
            new_end = new_start + len(text_pre)
            new_token_start = get_token_index(preprocess_text, new_start)
            new_token_end = get_token_index(preprocess_text, new_end)

        new_spans.append({
            'start': new_start,
            'end': new_end,
            'token_start': new_token_start,
            'token_end': new_token_end,
            'label': span['label']
        })

    return new_spans

In [137]:
def dataset_preprocessing(input_datasets, options):
    """Preprocess every record of ``input_datasets`` in place and return it.

    For each record the ``'text'`` is replaced by its preprocessed form, and
    ``'spans'`` and ``'tokens'`` are replaced by values recomputed against
    that new text. Progress is reported through ``tqdm``.

    Note: the records are modified in place and the returned object is the
    same list that was passed in; no copy is made. Calling this twice on the
    same data therefore preprocesses already-preprocessed text.
    """
    for record in tqdm(input_datasets):
        cleaned_text = text_preprocess(record['text'], options=options)
        # Spans must be remapped before 'text' is overwritten, because
        # recount_spans slices the record's original text.
        remapped_spans = recount_spans(record, cleaned_text, options=options)
        record.update(
            text=cleaned_text,
            spans=remapped_spans,
            tokens=recount_tokens(cleaned_text),
        )

    return input_datasets

In [138]:
import json


# Load the annotation file: one JSON document per line, parsed into a list.
with open('./annotated/annotations.jsonl', 'r', encoding='utf-8') as f:
    dataset = [json.loads(line.strip()) for line in f]

In [139]:
# Switch on every preprocessing stage understood by text_preprocess.
options = dict(
    remove_stopwords=True,
    lemmatize=True,
    stem=True,
    exclude_conditional_stopwords=True,
)

# NOTE(review): this assignment rebinds the name `dataset_preprocessing` from
# the function to the list it returns, so running this cell a second time
# (without re-running the cell that defines the function) raises TypeError.
# The call also rewrites the records of `dataset` in place.
dataset_preprocessing = dataset_preprocessing(dataset, options)

100%|██████████| 316/316 [00:00<00:00, 470.38it/s]


In [140]:
# Show the preprocessed records; after the previous cell this name refers to
# the result list, not the function of the same name.
dataset_preprocessing

[{'text': 'within shunt group shall possibl for onli one member group transmit link assur signal time',
  'spans': [{'start': 0,
    'end': 18,
    'token_start': 0,
    'token_end': 2,
    'label': 'PRECONDITION'},
   {'start': 7, 'end': 18, 'token_start': 1, 'token_end': 2, 'label': 'ACTOR'},
   {'start': 19,
    'end': 90,
    'token_start': 3,
    'token_end': 14,
    'label': 'POSTCONDITION'}],
  'meta': {'pattern': '282'},
  '_input_hash': 456570573,
  '_task_hash': 1981750695,
  'tokens': [{'text': 'within', 'start': 0, 'end': 6, 'id': 0, 'ws': True},
   {'text': 'shunt', 'start': 7, 'end': 12, 'id': 1, 'ws': True},
   {'text': 'group', 'start': 13, 'end': 18, 'id': 2, 'ws': True},
   {'text': 'shall', 'start': 19, 'end': 24, 'id': 3, 'ws': True},
   {'text': 'possibl', 'start': 25, 'end': 32, 'id': 4, 'ws': True},
   {'text': 'for', 'start': 33, 'end': 36, 'id': 5, 'ws': True},
   {'text': 'onli', 'start': 37, 'end': 41, 'id': 6, 'ws': True},
   {'text': 'one', 'start': 42, 'en