**Importing Necessary Packages**

In [2]:
import os,glob,pathlib
import pandas as pd
import numpy as np
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm,trange
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
import nltk
from nltk.stem import WordNetLemmatizer 
import re
import itertools
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle
from tensorflow.keras.optimizers import Adam
import keras.backend as K

**Downloading Packages for Github cloning. Cloning the Dataset from Princton Repo**

In [3]:
!pip install gitpython
import git
nltk.download('wordnet')
repo = git.Repo.clone_from("https://github.com/rubenIzquierdo/wsd_corpora.git", "./data/raw/wsd_corpora")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


**Setting displaying options**

In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

**Parsing The Semcor 3.0 Dataset**

In [None]:
def xml_parse(_fpath):

    sctree = ET.parse(_fpath)

    # Iterates over list of words in files    
    dct_list1 = []
    for node in sctree.iter('wf'):
        attributes = node.attrib
        attributes['text'] = node.text
        dct_list1.append(attributes)

    # Iterates over terms to find senses and corresponding sense references
    dct_list2 = []
    for term in sctree.iter('term'):
        lemma = term.attrib.get('lemma')
        wordid = term.find('span/target').attrib.get('id')
        pos = ''

        wnsn = '0'
        senseid=''
        if term.findall('externalReferences/externalRef'):
            wnsn = term.findall('externalReferences/externalRef')[0].attrib.get('reference')
            senseid = term.findall('externalReferences/externalRef')[1].attrib.get('reference')
        dct_list2.append({'id':wordid,'lemma':lemma,'wn_sense_num':wnsn,'lexical_key':senseid,'pos':term.attrib['pos']})

    word_df = pd.DataFrame(dct_list1)
    sense_ref_df = pd.DataFrame(dct_list2)   
    
    return pd.merge(word_df,sense_ref_df,on='id')

In [None]:
def gen_file_list(_basepath,ext='*.naf'):
    
    file_list = []
    fla = glob.glob(os.path.join(_basepath,ext))
    flb = glob.glob(os.path.join(_basepath,'*',ext))
    flc = glob.glob(os.path.join(_basepath,'**',ext))
    files = set(fla+flb+flc)
    for fileref in files: #search recursively for files
        parent_folder_name = pathlib.Path(fileref).parent.name
        file_name = pathlib.Path(fileref).name.split('.')[0]
        
        file_list.append( {'file_path':fileref,
                           'parent_folder':parent_folder_name,
                           'file_name':file_name})
    return pd.DataFrame(file_list)

In [None]:
def parse_corpus(_basepath,filter_validation = False):

   # generate dataframe with references to all files
    _fpath_df = gen_file_list(_basepath)
    
    # filter to remove validation files
    filtered_file_df = _fpath_df
    if filter_validation:
         filtered_file_df = _fpath_df[_fpath_df.parent_folder != 'brownv']
    
    _dflist = []
    for i,file_entry in tqdm(filtered_file_df.iterrows(), total=filtered_file_df.shape[0]):
        _parsed_file_df = xml_parse(file_entry.file_path)
        #print(_parsed_file_df.head())
        _parsed_file_df['file'] = file_entry.file_name
        _dflist.append(_parsed_file_df)

    return pd.concat(_dflist)

In [None]:
def build_corpus(_basepath,verbose=True,**kwargs):
    if verbose: print('Parsing corpus')
    base_corpus = parse_corpus(_basepath,**kwargs)

    # Build wordnet ref key using wordnet lemma
    if verbose: print('Preprocessing indexes...',end="")
    base_corpus['wn_index'] = base_corpus['lemma']+'%'+base_corpus['lexical_key']

    base_corpus.loc[base_corpus.lexical_key == '','wn_index'] = ''
    base_corpus.drop('lexical_key',axis=1,inplace=True)
    if verbose: print('Done!')
    print(base_corpus.shape)
    return base_corpus

In [None]:
def wordnet_get_glosses(_word,_sense_id):
    _sense_id = int(_sense_id)
    if not _word: 
        return ''
    try:
        all_synsets = wn.synsets(_word)
        target_gloss = []
        other_glosses = []
        for syn in all_synsets:
            split = syn.name().split('.')
            wn_lemma = split[0]
            sense_num = int(split[-1])
            if sense_num == _sense_id:
                target_gloss.append(syn.definition()) 
            else:
                other_glosses.append(syn.definition())                
        return target_gloss,other_glosses
    except (AttributeError,WordNetError,ValueError) as err:
        return 'WN Error',None

In [None]:
def wordnet_gloss_helper(_word,_sense_id):
    if not _word or not _sense_id:
        return '',''
    senseidlist = _sense_id.split(';')
    if len(senseidlist) == 1:
        return wordnet_get_glosses(_word,int(_sense_id))
    elif len(senseidlist) > 1:
        list_proper_glosses = []
        other_gloss_set = set()
        for senseid in senseidlist:
            gloss, other_glosses =  wordnet_get_glosses(_word,int(senseid))
            if gloss:
                list_proper_glosses.append(gloss)
                other_gloss_set.update(set(other_glosses))
        # if one of the glosses is bogus return only one
        if len(list_proper_glosses) == 1:
            return list_proper_glosses[0], other_gloss_set
        return list_proper_glosses, other_gloss_set
    else:
        return  'WN Error',[]   

In [None]:
def add_wordnet_gloss(_semcordf,verbose=True):

    if verbose: print('Adding wordnet glosses')
    _semcordf['idx'] = list(range(len(_semcordf))) #adding index for merging
    tqdm.pandas(desc="Gloss preprocessing") 
    _glosses = _semcordf[_semcordf.wn_sense_num != '0'].progress_apply(lambda _row: (*wordnet_gloss_helper(_row['lemma'],_row['wn_sense_num'])\
                                                                        ,_row['idx']),axis=1 )
    _df_glosses = pd.DataFrame(_glosses.values.tolist(),columns=['gloss','other_glosses','idx'])
    _merged = pd.merge(_semcordf,_df_glosses,on='idx',how='left').fillna('')
    # for now take only first gloss
    _merged['gloss'] = _merged.gloss.apply(lambda x: x[0] if x else '')
    # tag how many other glosses there are
    _merged['other_glossesnum'] = _merged.other_glosses.apply(lambda x: len(x))   
    if verbose: print('Done!')   
    return _merged

In [None]:
def gen_sentence_context_pairs(_df):
    sentence = _df.text.str.cat(sep = ' ').replace(" '","'")
    basedct = {'context':sentence,
               'file':_df.iloc[0].file}

    semcor_sentences = []

    # Make sure there are other glosses and that the gloss column is not null
    for i,(j,line) in enumerate(_df[(_df.other_glossesnum > 0) & (_df.gloss != 'WN Error') & (_df.gloss != '')].iterrows()): 

        newbasedct = basedct.copy()
        newbasedct['target_word'] = line.text
        newbasedct['gloss'] = line.gloss
        newbasedct['is_proper_gloss'] = True
        semcor_sentences.append(newbasedct)
        # Then append all different contexes with False labels
        for other_glosses in line.other_glosses:
            newbasedct = basedct.copy()
            newbasedct['target_word'] = line.text
            newbasedct['gloss'] = other_glosses
            newbasedct['is_proper_gloss'] = False
            semcor_sentences.append(newbasedct)
                
    return semcor_sentences

In [None]:
def build_joint_dataset(_df):
    groupbyobj = _df.groupby(['sent','file'])
    full_dict_list = []
    for [sentnum,file],gp in tqdm(groupbyobj,total=len(groupbyobj)):
        full_dict_list.extend(gen_sentence_context_pairs(gp))
    cols = ['file','context','target_word','gloss','is_proper_gloss']
    return pd.DataFrame(full_dict_list)[cols]

In [None]:
def build_corpus_dataset(_basepath,verbose=True,byref=False):
    
    corpus_df = build_corpus(_basepath,verbose=verbose)
    corpus_df = add_wordnet_gloss(corpus_df,verbose=verbose)
    if verbose: print('Processing adn labeling joint context-gloss pairs...',end="")
    final_corpus = build_joint_dataset(corpus_df)
    if verbose: print('Done!')    
    return final_corpus

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

**Saving the Final Corpus**

In [None]:
fpath = "./notebooks/data/raw/semcor3.0/"
savepath = r"./notebooks/data/preprocessed/semcor_gloss_BiLSTM.pkl"
final_corpus = build_corpus_dataset(fpath)
final_corpus.to_pickle(savepath)

Parsing corpus


100%|██████████| 352/352 [00:33<00:00, 10.46it/s]


Preprocessing indexes...Done!
(676546, 10)
Adding wordnet glosses


Gloss preprocessing: 100%|██████████| 226040/226040 [00:24<00:00, 9158.75it/s] 


Done!
Processing adn labeling joint context-gloss pairs...

100%|██████████| 37168/37168 [01:57<00:00, 316.22it/s]


Done!


**Reading the Final Saved Corpus**

In [None]:
final_corpus = pd.read_pickle(savepath)

In [None]:
orig_final_corpus = final_corpus
orig_final_corpus.shape

(1589759, 5)

In [None]:
final_corpus[final_corpus.is_proper_gloss==True].shape,final_corpus[final_corpus.is_proper_gloss==False].shape

((178570, 5), (1411189, 5))

**Under Sampling the data**

In [None]:
X_df, y_df  = final_corpus.iloc[:, :-1] , final_corpus.iloc[:, -1]

under = RandomUnderSampler(sampling_strategy=0.4)
X_df, y_df = under.fit_resample(X_df, y_df)

over = RandomOverSampler(sampling_strategy=.5)
X_df,y_df = over.fit_resample(X_df , y_df)

X_df['is_proper_gloss'] = y_df
final_corpus = X_df

In [None]:
final_corpus[final_corpus.is_proper_gloss==True].shape,final_corpus[final_corpus.is_proper_gloss==False].shape

((223212, 5), (446425, 5))

In [None]:
final_corpus = final_corpus.sample(frac=1)

**Train and Test Split**

In [None]:
train_df, val_df =  train_test_split(final_corpus, 
                                        random_state=None, 
                                        test_size=.1)
val_df, test_df =  train_test_split(val_df, 
                                        random_state=None, 
                                        test_size=.1)

In [None]:
train_df.shape , val_df.shape, test_df.shape

((602673, 5), (60267, 5), (6697, 5))

**Data Loader**

In [None]:
class MLDataGen(tf.keras.utils.Sequence):
    
    def __init__(self, df, batch_size = 64, gen_type=  'train', model_file_lang1 = None, model_file_lang2 = None):
        self.context =list(map(lambda row:  ''.join(char for char in row[2] if char.isalnum())+' '+row[1]+' '+ ''.join(char for char in row[2] if char.isalnum()),df.to_numpy())) 
        self.tagged_sense = list(map(lambda row: str(row[3]),df.to_numpy())) 
        self.label= df['is_proper_gloss'].astype(int).to_numpy()
        self.gen_type = gen_type
        self.batch_size = batch_size
        
        self.max_seq_len=50
        self.x0_vocab_size =0
        self.x1_vocab_size =0
        self.x0_tokenizer = None
        self.x1_tokenizer = None
        self._x0=[]
        self._x1=[]
        self._y=[]
        self._initialize_variables()
        self.model_file_lang1 = model_file_lang1
        self.model_file_lang2 = model_file_lang2


    def _initialize_variables(self):         
        
        if self.gen_type == 'train':
            tokenized = [line.split() for line in self.context]
            flattendata = list(itertools.chain.from_iterable(tokenized))
            word_freq = dict(Counter(flattendata))
            tot_cnt  = len(word_freq)
            self.x0_tokenizer = Tokenizer(num_words = tot_cnt+1 ,oov_token= '<OOV>')
            # saving
            with open('tokenizer_lang1.pickle', 'wb') as handle:
                pickle.dump(self.x0_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:

            # loading
            with open('tokenizer_lang1.pickle', 'rb') as handle:
                self.x0_tokenizer = pickle.load(handle)
                
            if self.x0_tokenizer  ==None:
                print('Vocabulary is not set. Try training mode')
                
        self.x0_tokenizer.fit_on_texts(list(self.context))
        self.x0_vocab_size = self.x0_tokenizer.num_words + 1
        
        # Convert text sequences to integer sequences 
        x0_train_seq = self.x0_tokenizer.texts_to_sequences(self.context) 

        # Pad zero upto maximum length
        x0_train = pad_sequences(x0_train_seq,  maxlen=self.max_seq_len, padding='post')
        
        if self.gen_type == 'train':
            tokenized = [line.strip().split() for line in self.tagged_sense]
            flattendata = list(itertools.chain.from_iterable(tokenized))
            word_freq = dict(Counter(flattendata))
            tot_cnt  = len(word_freq)
            self.x1_tokenizer = Tokenizer(num_words = tot_cnt ,oov_token= '<OOV>')
            # saving
            with open('tokenizer_lang2.pickle', 'wb') as handle:
                pickle.dump(self.x1_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:

            # loading
            with open('tokenizer_lang2.pickle', 'rb') as handle:
                self.x1_tokenizer = pickle.load(handle)
                
            if self.x1_tokenizer  ==None:
                print('Vocabulary is not set. Try training mode')


 
        self.x1_tokenizer.fit_on_texts(list(self.tagged_sense))
        self.x1_vocab_size = self.x1_tokenizer.num_words + 1
        
        # Convert text sequences to integer sequences 
        x1_train_seq = self.x1_tokenizer.texts_to_sequences(self.tagged_sense)

        # Pad zero upto maximum length
        x1_train = pad_sequences(x1_train_seq,  maxlen=self.max_seq_len, padding='post')

        self._x0 = x0_train
        self._x1 = x1_train
        self._y = self.label
    
    
        
    def get_data_batch(self,i):           
       
        x0 = self._x0[i * self.batch_size:(i + 1) * self.batch_size]
        x1 = self._x1[i * self.batch_size:(i + 1) * self.batch_size]       
        y = self._y[i * self.batch_size:(i + 1) * self.batch_size].reshape(-1,1)
        return [np.array(x0), np.array(x1)],np.array(y)
    
    def __getitem__(self, index):
        X, y = self.get_data_batch(index)
        return X,y
     
    def __len__(self):
        return len(self._x0) // self.batch_size

**Validation and Train Data**

In [None]:
data_gen = MLDataGen(train_df,batch_size=256)
val_gen = MLDataGen(val_df, gen_type='val',batch_size=64)

**Downloading Glove Embedding**

In [4]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2022-05-01 15:17:14--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-05-01 15:17:15--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-01 15:17:15--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

**Embeddings of the data using Glove**

In [None]:
path_to_glove_file = './glove.6B.100d.txt'

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
word_index_0 = data_gen.x0_tokenizer.word_index
word_index_1 = data_gen.x1_tokenizer.word_index

num_tokens_0 = (data_gen.x0_vocab_size) + 2
num_tokens_1 = (data_gen.x1_vocab_size) + 2

**Embedding Matrix**

In [None]:
embedding_dim = 50
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix_0 = np.zeros((num_tokens_0, embedding_dim))
for word, i in word_index_0.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix_0[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 33186 words (6794 misses)


In [None]:
embedding_matrix_1 = np.zeros((num_tokens_1, embedding_dim))
for word, i in word_index_1.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix_1[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 50245 words (7989 misses)


**Nueral Network Architecture**

In [None]:
latent_dim = 64
embedding_dim = 50

# Encoder
encoder_inputs = Input(shape=(data_gen.max_seq_len, ))

# Embedding layer
enc_emb = Embedding(num_tokens_0 , 
                    embedding_dim, 
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_0),
                    trainable=False,
                    mask_zero=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.1,recurrent_dropout=0.1)
(encoder_output1, state_h, state_c) = encoder_lstm1(enc_emb)

# Set up the decoder, using encoder_states as the initial state
decoder_inputs = Input(shape=(None, ))

# Embedding layer
dec_emb_layer = Embedding(num_tokens_1 , 
                          embedding_dim, 
                          embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_1),
                          trainable=False,
                          mask_zero=True)

dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(latent_dim, dropout=0.1,recurrent_dropout=0.1)
decoder_outputs = decoder_lstm(dec_emb, initial_state=[state_h, state_c])


# Dense layer
decoder_dense1 = Dense(64, activation='relu')#Dense(2, activation='softmax')
decoder_outputs = decoder_dense1(decoder_outputs)
decoder_dense2 = Dense(1, activation='sigmoid')
decoder_outputs = decoder_dense2(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 50)       2824150     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     1210400     ['input_2[0][0]']                
                                                                                              

**F1 Score Calculation**

In [None]:
def get_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

**Compiling and Saving checkpoints. Also added Early Stopping**

In [None]:
adam = Adam(learning_rate=0.01)
model.compile( loss=tf.keras.losses.BinaryCrossentropy(), 
                optimizer='rmsprop', 
                metrics=[get_score,'accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

from keras.callbacks import Callback,ModelCheckpoint

checkpoint_path = "./data/preprocessed/checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
   checkpoint_path, verbose=1, save_weights_only=True,
   # Save weights, every epoch.
   save_freq='epoch')

**Fitting the Model**

In [None]:
history = model.fit(
    data_gen,
    validation_data=val_gen,
    epochs=10,
    callbacks=[es,cp_callback],
    verbose=1
    )

Epoch 1/10
Epoch 1: saving model to ./data/preprocessed/checkpoints/cp-0001.ckpt
Epoch 2/10
Epoch 2: saving model to ./data/preprocessed/checkpoints/cp-0002.ckpt
Epoch 3/10
Epoch 3: saving model to ./data/preprocessed/checkpoints/cp-0003.ckpt
Epoch 3: early stopping


In [None]:
reverse_target_word_index = data_gen.x0_tokenizer.index_word
reverse_source_word_index = data_gen.x0_tokenizer.index_word
target_word_index = data_gen.x1_tokenizer.word_index

In [None]:
# Inference Models

# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_output1,
                      state_h, state_c])

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(data_gen.max_seq_len, latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2 = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense1(decoder_outputs2)
decoder_outputs2 = decoder_dense2(decoder_outputs2)


# Final decoder model
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2])

In [None]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ,
                "N": wn.NOUN,
                "V": wn.VERB,
                "R": wn.ADV}

    return tag_dict.get(tag, wn.NOUN)


**Sense Prediction**

In [None]:
def predictSense(sentence, target):
    targets=[]
    if '_' in target:
        targets = target.split('_')
    else:
        targets=[target]
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for t in targets:
        lemmas.append(lemmatizer.lemmatize(t, get_wordnet_pos(t)))
    lemma = '_'.join(lemmas)
    all_synsets = wn.synsets(lemma)
    glosses = []
    for syn in all_synsets:
        glosses.append(syn.definition())
    
    if len(glosses)==0:
        return ''    
    
    x0_test_seq = data_gen.x0_tokenizer.texts_to_sequences([sentence+' '+target]*len(glosses))
    x0_test = pad_sequences(x0_test_seq,  maxlen=data_gen.max_seq_len, padding='post')

    x1_test_seq = data_gen.x1_tokenizer.texts_to_sequences(glosses)
    x1_test = pad_sequences(x1_test_seq,  maxlen=data_gen.max_seq_len, padding='post')
    
    #return x0_test , x1_test    
    try:
        (e_out, e_h, e_c) = encoder_model.predict(x0_test)

        ypred = decoder_model.predict([x1_test] + [e_out, e_h, e_c])
    except:
        print(lemma,targets)
    return glosses[np.argmax(ypred)]

**Test Set Data preprations**

In [None]:
def build_pred_dataset(_df):
    context_target = _df[['context','target_word']].drop_duplicates()
    print(context_target.shape,_df.shape)
    basedct = {}
    predictions = []
    context_target['predicted_gloss'] = context_target[['context','target_word']].apply(lambda row: predictSense(row[0],row[1]),axis=1)
    prediction_df = pd.merge(_df,context_target,on=['context','target_word'],how='right').fillna('')
    prediction_df['Predicitons'] = prediction_df[['gloss','predicted_gloss']].apply(lambda row: True if row[0] == row[1] else False,axis=1)
    
    return prediction_df.drop(columns=['predicted_gloss'])

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

**Results**

In [None]:
pred = build_pred_dataset(test_df[:500])

(499, 2) (500, 5)


In [None]:
pred_df = pred[pred.is_proper_gloss==True]

In [None]:
pred_df.shape , pred_df[pred_df.is_proper_gloss ==pred_df.Predicitons].shape

((170, 6), (98, 6))