## Improving Lexical Simplification Using State of the Art Lexical Complexity Prediction Models
#### Demo notebook
This notebook provides a demonstration of simplifying sentence with multi-word expressions. It is tested on a system with the following specifications:
<ol>
    <li>OS: Linux x86-64</li>
    <li>CPU: 3.30 Ghz x 8</li>
    <li>RAM: 40 GiB</li>
    <li>Hard Drive: 20 GiB</li>
    <li>GPU: NVIDIA Corporation GA104M (CUDA compute capability: 8.6)</li>
 </ol>

#### Run the following two cells to import packages and settings

In [1]:
 # imports
import numpy as np
import torch
from transformers import BertTokenizer
from tqdm import tqdm
import re
import codecs
import nltk

from CWIs.complex_labeller import Complexity_labeller
from plainifier.plainify import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
# settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 1234
np.random.seed(seed)
torch.manual_seed(seed)

two_gram_mwes_list = './CWIs/2_gram_mwe_50.txt'
three_gram_mwes_list = './CWIs/3_gram_mwe_25.txt'
four_gram_mwes_list = './CWIs/4_gram_mwe_8.txt'
pretrained_model_path = './CWIs/cwi_seq.model'
temp_path = './CWIs/temp_file.txt'

path = './plainifier/'
premodel = 'bert-large-uncased-whole-word-masking'
bert_dict = 'tersebert_pytorch_1_0.bin'
embedding = 'crawl-300d-2M-subword.vec'
unigram = 'unigrams-df.tsv'
tokenizer = BertTokenizer.from_pretrained(premodel)
Complexity_labeller_model = Complexity_labeller(pretrained_model_path, temp_path)

2022-04-04 14:39:52.903752: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-04 14:39:52.907072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 14:39:52.908017: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 14:39:52.908127: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [3]:
# loading bert model, word embeddings and unigrams. This process takes 7 minutes
model, similm, tokenfreq, embeddings, vocabulary2 = load_all(path, premodel, bert_dict, embedding, unigram, tokenizer)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading Embeddings


100%|██████████████████████████████| 2000000/2000000 [01:31<00:00, 21897.10it/s]


Loaded Embeddings
Loading Unigrams


100%|██████████████████████████████| 8394369/8394369 [06:05<00:00, 22970.60it/s]


Loaded Unigrams


#### Run the following cell to construct the sentence class

In [94]:
class ComplexSentence:
    # Sentence class
    def __init__(self, sentence, label_model, tokeniser, verbose=True, beam_width=3, alpha=(1/9, 6/9, 2/9)):
        self.sentence = sentence
        self.tokenised_sentence = self.generate_tokenised_sentence()
        
        self.label_model = label_model
        self.verbose = verbose
        self.beam_width = beam_width
        self.alpha = alpha

        if self.verbose:
            print(f'Untokenised sentence: {self.sentence}')
            print(f'Tokenised sentence: {self.tokenised_sentence}')
        
        self.label_complex_words()
    
    def generate_NER_filter(self, init=True):
        # Generate case sensitive tokens
        case_sensitive_tokens = nltk.tokenize.word_tokenize(self.sentence)
        pos_tags = nltk.pos_tag(case_sensitive_tokens)
        
        list_of_NERs = []
        for x in pos_tags:
            if x[1] == 'NNP':
                list_of_NERs.append(x[0].lower())
        
        # NER mask: np array, 0 if is_NER, 1 if not
        NER_mask = np.ones_like(self.tokenised_sentence, dtype=np.int64)
        for i in range(len(self.tokenised_sentence)):
            if self.tokenised_sentence[i] in list_of_NERs:
                NER_mask[i] = 0
        assert len(NER_mask) == len(self.tokenised_sentence)
        
        if self.verbose:
            if init and len(list_of_NERs) > 0:
                print('Found NERs:', list_of_NERs)
            elif init and len(list_of_NERs) == 0:
                print('No NER found.')
        return NER_mask
        
    def generate_tokenised_sentence(self):
        tokens = tokeniseUntokenise(self.sentence, tokenizer)['tokens']
        word_idx = tokeniseUntokenise(self.sentence, tokenizer)['words']
        tokenised_sentence_list = []
        for idx_list in word_idx:
            if len(idx_list)==1:
                tokenised_sentence_list.append(np.array(tokens)[idx_list[0]])
            else:
                word_untokenised = ''
                for idx_list_untokenised in idx_list:
                    word_untokenised += np.array(tokens)[idx_list_untokenised].replace('##', '')
                tokenised_sentence_list.append(word_untokenised)
        return tokenised_sentence_list
    
    def known_complexity(self):
        tokens = tokeniseUntokenise(self.sentence, tokenizer)['tokens']
        word_idx = tokeniseUntokenise(self.sentence, tokenizer)['words']
        known_index = []
        for idx_list in word_idx:
            if len(idx_list)==1 and not re.match(r'^[_\W]+$', tokens[idx_list[0]]):
                #If known label as True
                known_index.append(True)
            else:
                #If unknown label as False
                known_index.append(False)
        return known_index
    
    def label_complex_words(self, init=True):
        
        # applying complexity labeller to the sentence
        Complexity_labeller.convert_format_string(self.label_model, self.sentence)
        if init:
            self.bin_labels = Complexity_labeller.get_bin_labels(self.label_model)[0]
        self.probs = Complexity_labeller.get_prob_labels(self.label_model)
        
        # apply known complexity and NER mask
        self.bin_labels = np.multiply(self.bin_labels, self.known_complexity())
        self.bin_labels = np.multiply(self.bin_labels, self.generate_NER_filter(init=init))
        self.probs = np.multiply(self.probs, self.known_complexity())
        self.probs = np.multiply(self.probs, self.generate_NER_filter(init=False))
        
        self.is_complex = True if np.sum(self.bin_labels) >= 1 else False

        self.complexity_ranking = np.argsort(np.array(self.bin_labels) * np.array(self.probs))[::-1]
        self.most_complex_word = self.tokenised_sentence[self.complexity_ranking[0]]

        if self.verbose and init:
            print(f'Complex probs: {self.probs}')
            print(f'Binary complexity labels: {self.bin_labels}')

        if self.is_complex:
            print(f'\t Most complex word: {self.most_complex_word} \n')

        if not self.is_complex:
            print(f'\t Simplificaiton complete or no complex expression found.\n')
    
    def find_MWEs_w_most_complex_word(self, n_gram, filepath):
        # finds the n-gram mwe of the most complex word in the sentence, if any
        # returns: mwe positions or complex word positions
        
        complex_word_pos = self.complexity_ranking[0]

        if complex_word_pos - n_gram + 1 > 0:
            sliding_start = complex_word_pos - n_gram + 1
        else:
            sliding_start = 0
        
        if complex_word_pos + n_gram - 1 < len(self.complexity_ranking):
            sliding_end = complex_word_pos
        else:
            sliding_end = len(self.complexity_ranking) - n_gram

        with open(filepath, 'r') as f:
            mwes = set(f.read().split('\n')) # make set
            avg_mwe_complexity = 0
            for pos in range(sliding_start, sliding_end + 1):
                possible_mwe = ' '.join(self.tokenised_sentence[pos: pos + n_gram])
                
                if possible_mwe in mwes:
                    
                    if np.mean(self.probs[pos:pos+n_gram]) > avg_mwe_complexity:
                        avg_mwe_complexity = np.mean(self.probs[pos:pos+n_gram])
                        valid_mwes_idx = np.arange(pos, pos+n_gram, 1)
                        mwe_found = possible_mwe
                    else:
                        continue
                        
        if avg_mwe_complexity > 0:
            self.idx_to_plainify = valid_mwes_idx
        else:
            self.idx_to_plainify = [complex_word_pos]
        
    
    def find_all_ngram_mwes(self):
        # returns: self.idx_to_plainify the indices of the longest mwe found
        
        if not self.is_complex:
            raise ValueError('Sentence is not complex')
        
        # give priority to longer MWEs
        n_gram_files = {2: two_gram_mwes_list, 3: three_gram_mwes_list, 4:four_gram_mwes_list}
        
        for n in reversed(range(2,5)):
            self.find_MWEs_w_most_complex_word(n, n_gram_files[n])
            
            if len(self.idx_to_plainify) == n: # if such mwe is found
                break
    
    def one_step_plainify(self):
        idx_start = self.idx_to_plainify[0]
        idx_end = self.idx_to_plainify[-1]+1
        complex_word_name = " ".join(self.tokenised_sentence[idx_start:idx_end])
        print(f'Found complex word or expression: ### {complex_word_name} ###. Plainifying...')
        processed_sentence = tokeniseUntokenise(self.sentence, tokenizer)
        forward_result = getTokenReplacement(processed_sentence, idx_start, len(self.idx_to_plainify), 
                                  tokenizer, model, similm, tokenfreq, embeddings, vocabulary2,
                                  verbose=False, backwards=False, maxDepth=3, maxBreadth=16, alpha=self.alpha)
        backward_result = getTokenReplacement(processed_sentence, idx_start, len(self.idx_to_plainify),
                                  tokenizer, model, similm, tokenfreq, embeddings, vocabulary2, 
                                  verbose=False, backwards=True, maxDepth=3, maxBreadth=16, alpha=self.alpha)
        words, scores = aggregateResults((forward_result, backward_result))
        words = [w.replace('#', '') for w in words]
        print(f'Suggested top 5 subtitutions: {words[:5]}')
        
        return words[0].split(' ')
        
    
    def sub_in_sentence(self, substitution):
        # plugs a substitution in the sentence, then updates complexity scores
        substitution_len = len(substitution)
        
        idx_start = self.idx_to_plainify[0]
        idx_end = self.idx_to_plainify[-1]+1
        
        self.tokenised_sentence = self.tokenised_sentence[:idx_start] + substitution + self.tokenised_sentence[idx_end:]
        self.sentence = ' '.join(self.tokenised_sentence)
        self.bin_labels = list(self.bin_labels[:idx_start]) + [0] * substitution_len + list(self.bin_labels[idx_end:])
        self.label_complex_words(init=False)
        print(f'\t Sentence after substitution: {self.sentence}\n')
        
    def recursive_greedy_plainify(self, max_steps=float('inf'), test=False):
        n = 1
        sub_details_list = []
        while self.is_complex and n <= max_steps:
            self.find_all_ngram_mwes()
            sub = self.one_step_plainify()
            self.sub_in_sentence(sub)
            #append subtitution details
            sub_details = {"iteration":n,"sub_word":sub[0],"idx_start":self.idx_to_plainify[0],"idx_end":self.idx_to_plainify[-1]+1}
            sub_details_list.append(sub_details)
            n += 1
        print(f'Simplification complete.')
        
        if test:
            return self.sentence, sub_details_list
        else:
            return self.sentence
    
    def recursive_beam_search_plainfy(self, beam_width):
        pass

#### Example sentences
Run each cell below to see output after lexical simplification.

In [95]:
input_sentence = 'A machine learning dream team comprised of Teddy, Daniel, Freddy and Theo.'
sentence = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
sentence.recursive_greedy_plainify()

Untokenised sentence: A machine learning dream team comprised of Teddy, Daniel, Freddy and Theo.
Tokenised sentence: ['a', 'machine', 'learning', 'dream', 'team', 'comprised', 'of', 'teddy', ',', 'daniel', ',', 'freddy', 'and', 'theo', '.']
Found NERs: ['teddy', 'daniel', 'freddy', 'theo']
Complex probs: [1.52347216e-04 5.97340725e-02 3.08847755e-01 4.28031012e-02
 7.29917362e-03 8.85468304e-01 4.54008587e-05 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 6.67599888e-05 0.00000000e+00 0.00000000e+00]
Binary complexity labels: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
	 Most complex word: comprised 

Found complex word or expression: ### comprised of ###. Plainifying...
Suggested top 5 subtitutions: ['of', 'consisting of', 'composed of', 'that consists of', 'which consists of']
	 Simplificaiton complete or no complex expression found.

	 Sentence after substitution: a machine learning dream team of teddy , daniel , freddy and theo .

Simplification complete.


'a machine learning dream team of teddy , daniel , freddy and theo .'

In [96]:
input_sentence = "I took a sip of my drink and kept working."
sentence = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
sentence.recursive_greedy_plainify()

Untokenised sentence: I took a sip of my drink and kept working.
Tokenised sentence: ['i', 'took', 'a', 'sip', 'of', 'my', 'drink', 'and', 'kept', 'working', '.']
No NER found.
Complex probs: [8.84567271e-05 4.09524131e-04 1.02484584e-04 7.69472897e-01
 5.30700163e-05 3.52106465e-04 3.07241362e-02 7.18582814e-05
 3.67573537e-02 1.86073082e-03 0.00000000e+00]
Binary complexity labels: [0 0 0 1 0 0 0 0 0 0 0]
	 Most complex word: sip 

Found complex word or expression: ### took a sip of ###. Plainifying...
Suggested top 5 subtitutions: ['finished', 'took', 'took back', 'took out', 'finished off']
	 Simplificaiton complete or no complex expression found.

	 Sentence after substitution: i finished my drink and kept working .

Simplification complete.


'i finished my drink and kept working .'

In [97]:
input_sentence = "We will first introduce several fundamental concepts."
sentence = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
sentence.recursive_greedy_plainify()

Untokenised sentence: We will first introduce several fundamental concepts.
Tokenised sentence: ['we', 'will', 'first', 'introduce', 'several', 'fundamental', 'concepts', '.']
No NER found.
Complex probs: [1.09041139e-04 7.78184694e-05 1.84507377e-03 8.66100013e-01
 1.30162365e-03 9.16373432e-01 7.82740057e-01 0.00000000e+00]
Binary complexity labels: [0 0 0 1 0 1 1 0]
	 Most complex word: fundamental 

Found complex word or expression: ### fundamental ###. Plainifying...
Suggested top 5 subtitutions: ['fundamental', 'new', 'basic', 'important', 'key']
	 Most complex word: introduce 

	 Sentence after substitution: we will first introduce several fundamental concepts .

Found complex word or expression: ### introduce ###. Plainifying...
Suggested top 5 subtitutions: ['introduce', 'establish', 'define', 'discuss', 'present']
	 Most complex word: concepts 

	 Sentence after substitution: we will first introduce several fundamental concepts .

Found complex word or expression: ### concept

'we will first introduce several fundamental concepts .'

In [98]:
input_sentence = "Gable also earned an academy award nomination when he portrayed Fletcher Christian in 1935 ' s mutiny on the bounty ."
sentence = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
sentence.recursive_greedy_plainify()

Untokenised sentence: Gable also earned an academy award nomination when he portrayed Fletcher Christian in 1935 ' s mutiny on the bounty .
Tokenised sentence: ['gable', 'also', 'earned', 'an', 'academy', 'award', 'nomination', 'when', 'he', 'portrayed', 'fletcher', 'christian', 'in', '1935', "'", 's', 'mutiny', 'on', 'the', 'bounty', '.']
Found NERs: ['gable', 'fletcher', 'christian']
Complex probs: [0.00000000e+00 4.39180527e-04 7.13182911e-02 2.01667819e-04
 3.79925758e-01 1.26280099e-01 9.61359859e-01 1.41725759e-04
 1.29709108e-04 8.90345275e-01 0.00000000e+00 0.00000000e+00
 3.56000528e-05 5.31803817e-04 0.00000000e+00 3.95917363e-04
 7.23846614e-01 5.05726639e-05 9.24077394e-05 7.14692175e-01
 0.00000000e+00]
Binary complexity labels: [0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0]
	 Most complex word: nomination 

Found complex word or expression: ### nomination ###. Plainifying...
Suggested top 5 subtitutions: ['nomination', 'for acting', 'for sound', 'as well', 'award nomination'

"gable also earned an academy award nomination when he portrayed fletcher christian in 1935 ' s mutiny on the bounty ."

In [99]:
input_sentence = "Probability is the branch of mathematics concerning numerical descriptions of how likely an event is to occur, or how likely it is that a proposition is true."
sentence = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
sentence.recursive_greedy_plainify()

Untokenised sentence: Probability is the branch of mathematics concerning numerical descriptions of how likely an event is to occur, or how likely it is that a proposition is true.
Tokenised sentence: ['probability', 'is', 'the', 'branch', 'of', 'mathematics', 'concerning', 'numerical', 'descriptions', 'of', 'how', 'likely', 'an', 'event', 'is', 'to', 'occur', ',', 'or', 'how', 'likely', 'it', 'is', 'that', 'a', 'proposition', 'is', 'true', '.']
No NER found.
Complex probs: [8.30549002e-01 6.26499386e-05 1.10454857e-04 5.26121318e-01
 6.59589932e-05 8.48085880e-01 8.09274793e-01 6.57362401e-01
 9.07784224e-01 4.00023018e-05 1.80853662e-04 9.50911082e-03
 1.38362753e-04 1.05769299e-01 5.40117035e-05 5.99568411e-05
 2.35088870e-01 0.00000000e+00 8.21707072e-05 3.28921247e-04
 1.09109739e-02 1.23351288e-04 5.08484009e-05 8.06485332e-05
 1.08139444e-04 8.89525056e-01 5.58543907e-05 1.00519070e-02
 0.00000000e+00]
Binary complexity labels: [1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

'probability theory is the mathematics concerning the measures of how likely an event is to occur , or how likely it is that it is true .'

### Tests
Run any tests below.

In [102]:
# grid search alpha
alphas = []
for a1 in range(10):
    for a2 in range(10):
        alphas.append((a1/9, a2/9, (9-a1-a2)/9))

In [103]:
test_sentences = ["one side of the armed conflicts is composed mainly of the sudanese military and the Janjaweed , a sudanese militia group recruited mostly from the afro - arab Abbala tribes of the northern Rizeigat region in Sudan .",
"Jeddah is the principal gateway to Mecca , Islam ' s holiest city , which able - bodied muslims are required to visit at least once in their lifetime .",
"the great dark spot is thought to represent a hole in the methane cloud deck of Neptune .",
"his next work , saturday , follows an especially eventful day in the life of a successful neurosurgeon .",
"the tarantula , the trickster character , spun a black cord and , attaching it to the ball , crawled away fast to the east , pulling on the cord with all his strength .",
"there he died six weeks later , on 13 january 888 ",
"they are culturally akin to the coastal peoples of Papua New Guinea .",
"since 2000 , the recipient of the Kate Greenaway Medal has also been presented with the Colin Mears Award to the value of £ 5000 .",
"following the drummers are dancers , who often play the sogo  (  a tiny drum that makes almost no sound  )  and tend to have more elaborate — even acrobatic — choreography .",
"the spacecraft consists of two main elements : the Nasa Cassini Orbiter , named after the italian - french astronomer Giovanni Domenico Cassini , and the Esa Huygens Probe , named after the dutch astronomer , mathematician and physicist Christiaan Huygens ."]

In [106]:
import pickle
for alpha in alphas:
    
    final_sentence = []
    sentence_details_list = []
    
    for input_sentence in test_sentences:
    
        input_sentence = norm_test_dat[i]
        s = ComplexSentence(input_sentence, label_model=Complexity_labeller_model, tokeniser=tokenizer, verbose=True)
        sentence_final, sentence_detail = s.recursive_greedy_plainify()
        sentence.append(sentence_final)
        sentence_details = {'sentence_id': i, 'details': sentence_detail}
        sentence_details_list.append(sentence_details)
    
    with 

(0.0, 0.0, 1.0)
(0.0, 0.1111111111111111, 0.8888888888888888)
(0.0, 0.2222222222222222, 0.7777777777777778)
(0.0, 0.3333333333333333, 0.6666666666666666)
(0.0, 0.4444444444444444, 0.5555555555555556)
(0.0, 0.5555555555555556, 0.4444444444444444)
(0.0, 0.6666666666666666, 0.3333333333333333)
(0.0, 0.7777777777777778, 0.2222222222222222)
(0.0, 0.8888888888888888, 0.1111111111111111)
(0.0, 1.0, 0.0)
(0.1111111111111111, 0.0, 0.8888888888888888)
(0.1111111111111111, 0.1111111111111111, 0.7777777777777778)
(0.1111111111111111, 0.2222222222222222, 0.6666666666666666)
(0.1111111111111111, 0.3333333333333333, 0.5555555555555556)
(0.1111111111111111, 0.4444444444444444, 0.4444444444444444)
(0.1111111111111111, 0.5555555555555556, 0.3333333333333333)
(0.1111111111111111, 0.6666666666666666, 0.2222222222222222)
(0.1111111111111111, 0.7777777777777778, 0.1111111111111111)
(0.1111111111111111, 0.8888888888888888, 0.0)
(0.1111111111111111, 1.0, -0.1111111111111111)
(0.2222222222222222, 0.0, 0.777777