In [1]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

language = ['english']
#language = ['en']

stop_words = stopwords.words(language)

[nltk_data] Downloading package stopwords to /home/adi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re

def create_positions_dict(text):
    
    tokens_position = {}
    
    if (text is None):
        return tokens_position
    
    #remove delimiters
    delimiters = ",.!?/&-:; "
    new_text = ' '.join(w for w in re.split("["+"\\".join(delimiters)+"]", text) if w)
    
    #get the tokens
    tokens = new_text.split()
    
    curr_position = 0
    
    while (curr_position < len(tokens)):
        if (tokens[curr_position] not in tokens_position):    
            tokens_position[tokens[curr_position]] = set()
        
        tokens_position[tokens[curr_position]].add(curr_position)
        
        curr_position = curr_position + 1
    
    return tokens_position

In [None]:
def test_consecutive_tokens(set1, set2):
    
    for elem1 in set1:
        if (elem1+1) in set2:
            return True
    
    return False

In [None]:
def test_highlighting(text):
    if ('<em>' in text) and ('</em>' in text):
        return True
    else:
        return False

In [None]:
def get_highlighted_text(text):
    
    text = text.replace('<em>', '')
    text = text.replace('</em>', '')
    
    return text

In [None]:
#Algorithm used in get_sequences(searched_text, original_es_highlight):

#Go through each word from original_es_highlight, one by one.

#At every step, we need to know which is the begining of the subsequence of tokens from original_es_highlight, 
#that are consecutive in searched_text, that is ending in the current token from original_es_highlight.

#In order to do so, we hold in a variable the index of the token that is at the begining of this subsequence.

#When we advance to a new token from original_es_highlight, we need first to check if the token can continue the already found sequence until the current moment
#meaning if the current token is marked (with <em> , </em>) and if it is successive in the searched_text.

#if the current token is marked and is consecutive to the previous token (we determine that by looking at the precedence of tokens in searched_text),
#then we just go on to the next token, the begining of the sequence remains the same.

#if not (the token is not highlighted because is not part of searched_text, or, if it is, is not successive to the precedent token,
#then it means that the current subsequence ends at the previous token (including the previous token).

#in the process we also verify each token from the sequences so that we know if a sequence contains only stop words or not.

In [None]:
def get_sequences(searched_text, original_es_highlight):
    
    #store a list of tuples (start_seq, end_seq, status) in highlighted_sequences
    #a tuple represent the start and the end of a highlighted sequence and if the sequence has only stop words ('r' - from remove) or not ('k' - keep)
    highlighted_sequences = []
    
    if (searched_text is None) or (original_es_highlight is None):
        return highlighted_sequences
    
    searched_tokens_positions = create_positions_dict(searched_text)
    
    highlighted_tokens = original_es_highlight.split()
    
    curr_position = 0
    
    start_seq = 0
    end_seq = 0
    only_stop_words_in_seq = True
    
    while (curr_position < len(highlighted_tokens)):
        
        curr_token = highlighted_tokens[curr_position]
            
        if (test_highlighting(curr_token)):
            dehighlighted_curr_token = get_highlighted_text(curr_token)
                
            if (curr_position == 0):
                if (dehighlighted_curr_token not in stop_words):
                    only_stop_words_in_seq = False
                curr_position = curr_position + 1
            else:
                prev_token = highlighted_tokens[curr_position-1]
                    
                if (test_highlighting(prev_token)):
                    
                    dehighlighted_prev_token = get_highlighted_text(prev_token)
                    
                    if (dehighlighted_curr_token in searched_tokens_positions) and (dehighlighted_prev_token in searched_tokens_positions):
                        
                        curr_token_set = searched_tokens_positions[dehighlighted_curr_token]
                        prev_token_set = searched_tokens_positions[dehighlighted_prev_token]
                
                        if (test_consecutive_tokens(prev_token_set, curr_token_set)):
                            curr_position = curr_position + 1
                            if (dehighlighted_curr_token not in stop_words):
                                only_stop_words_in_seq = False
                            
                        else:
                            end_seq = curr_position - 1
                            if (start_seq <= end_seq):
                                if(only_stop_words_in_seq):
                                    highlighted_sequences.append((start_seq, end_seq,'r'))
                                else:
                                    highlighted_sequences.append((start_seq, end_seq,'k'))
                            
                            start_seq = curr_position
                            only_stop_words_in_seq = True
                            curr_position = curr_position + 1
                    else:
                        end_seq = curr_position - 1
                        if (start_seq <= end_seq):
                            if(only_stop_words_in_seq):
                                highlighted_sequences.append((start_seq, end_seq,'r'))
                            else:
                                highlighted_sequences.append((start_seq, end_seq,'k'))

                        curr_position = curr_position + 1
                        start_seq = curr_position
                        only_stop_words_in_seq = True
                else:
                    start_seq = curr_position
                    only_stop_words_in_seq = True
                    curr_position = curr_position + 1
                    
        else:
            end_seq = curr_position - 1
            if (start_seq <= end_seq):
                if(only_stop_words_in_seq):
                    highlighted_sequences.append((start_seq, end_seq,'r'))
                else:
                    highlighted_sequences.append((start_seq, end_seq,'k'))
            
            curr_position = curr_position + 1
            start_seq = curr_position
            only_stop_words_in_seq = True
    
    return highlighted_sequences

In [None]:
def check_if_only_stop_words(tokens, start_seq, end_seq):
    #check if a given sequence contains only stop words
    
    cursor = start_seq
    while (cursor <= end_seq):
        curr_token = tokens[cursor]
        dehighlighted_curr_token = get_highlighted_text(curr_token)
        
        if (dehighlighted_curr_token not in stop_words):
            return False
        
        cursor = cursor + 1
    
    return True 

In [None]:
def get_removable_tags_occurrences(searched_text, original_es_highlight):

    sequences = get_sequences(searched_text, original_es_highlight)
    
    occurrences_of_removable_tags = []
    
    if (sequences):
        nb_of_tags = 0
        for seq in sequences:
            
            if (seq[2] == 'r'):
                cursor = 0
                while (cursor <= seq[1]-seq[0]):
                    occurrences_of_removable_tags.append(nb_of_tags + cursor + 1)                    
                    cursor = cursor + 1
            
            nb_of_tags = nb_of_tags + seq[1] - int(seq[0]) + 1
    
    return occurrences_of_removable_tags

In [None]:
def replace_nth_occurence(string, sub, replacement, n):
    where = [m.start() for m in re.finditer(sub, string)][n-1]
    before = string[:where]
    after = string[where:]
    after = after.replace(sub, replacement, 1)
    newString = before + after
    
    return newString

In [None]:
def get_processed_text(searched_text, original_es_highlight):
    processed_text = original_es_highlight
    
    removable_tags_occurrences = get_removable_tags_occurrences(searched_text, original_es_highlight)
    
    nb_of_replacements = 0
    
    if (removable_tags_occurrences):
        for occurence in removable_tags_occurrences:
            processed_text = replace_nth_occurence(processed_text, '<em>', '', occurence - nb_of_replacements)
            processed_text = replace_nth_occurence(processed_text, '</em>', '', occurence - nb_of_replacements)
            
            nb_of_replacements = nb_of_replacements + 1
    
    return processed_text

In [None]:
#test1
searched_text1 = "this is the gas emission from agriculture"
original_es_highlight1 = """... anything before ... <em>this</em> <em>is</em> <em>the</em> <em>gas</em> <em>emission</em> <em>from</em> <em>agriculture</em> 
                          ... anything between ... <em>this</em> <em>is</em> a statement about <em>gas</em> <em>emission</em> ... anything between ...
                          <em>this</em> <em>is</em> <em>from</em> Romania"""
processed_text1 = get_processed_text(searched_text1, original_es_highlight1)
print(processed_text1)

In [13]:
#test2
searched_text2 = "this is the gas emission from agriculture"
original_es_highlight2 = """... anything before ... <em>this</em> <em>is</em> <em>the</em> <em>gas</em> <em>emission</em> <em>from</em> <em>agriculture</em> 
                          ... anything between ... <em>this</em> <em>is</em> a statement about <em>gas</em> <em>emission</em> ... anything between ...
                          <em>this</em> <em>is</em> <em>the</em> <em>gas</em> after"""
processed_text2 = get_processed_text(searched_text2, original_es_highlight2)
print(processed_text2)

... anything before ... <em>this</em> <em>is</em> <em>the</em> <em>gas</em> <em>emission</em> <em>from</em> <em>agriculture</em> 
                          ... anything between ... this is a statement about <em>gas</em> <em>emission</em> ... anything between ...
                          <em>this</em> <em>is</em> <em>the</em> <em>gas</em> after


In [14]:
#test3
searched_text3 = "this is the gas emission from agriculture"
original_es_highlight3 = """<em>the</em> <em>gas</em> <em>emission</em>... anything between ... 
<em>this</em> <em>is</em> <em>the</em> <em>agriculture</em> 
                          ... anything after ..."""
processed_text3 = get_processed_text(searched_text3, original_es_highlight3)
print(processed_text3)

<em>the</em> <em>gas</em> emission... anything between ... 
this is the <em>agriculture</em> 
                          ... anything after ...
