**Downloading Dependencies**

In [1]:
!pip install  pytorch-transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 5.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 35.2 MB/s 
Collecting boto3
  Downloading boto3-1.22.4-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 51.0 MB/s 
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.1 MB/s 
[?25hCollecting botocore<1.26.0,>=1.25.4
  Downloading botocore-1.25.4-py3-none-any.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 43.1 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.

In [3]:
import os,glob,pathlib
import pandas as pd
import numpy as np
import shutil
import os
import pytorch_transformers
import xml.etree.ElementTree as ET
from tqdm import tqdm,trange
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
from keras.preprocessing.sequence import pad_sequences
from pytorch_transformers import BertTokenizer
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, SubsetRandomSampler
import torch
from torch.autograd import Function
from torch import nn
from torch import arange, zeros_like
from pytorch_transformers import BertModel, BertConfig
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

**Downloading packages for GitHub cloning. Cloning the dataset from the Princeton repo**

In [4]:
!pip install gitpython
import git
repo = git.Repo.clone_from("https://github.com/rubenIzquierdo/wsd_corpora.git", "./data/raw/wsd_corpora")

Collecting gitpython
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 5.1 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
Collecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.9 gitpython-3.1.27 smmap-5.0.0


**Setting display options**

In [5]:
# Raise pandas' display limits so wide frames and long glosses are not elided.
pd.set_option('display.max_colwidth', None)  # never truncate cell text
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

**Parsing The Semcor 3.0 Dataset**

In [6]:
def xml_parse(_fpath):
    """Parse one annotated corpus XML file into a word-level DataFrame.

    Every ``<wf>`` element contributes its XML attributes plus its text, and
    every ``<term>`` element contributes its lemma, part of speech and the
    references found under ``externalReferences/externalRef``. The two tables
    are inner-joined on the word id.

    Args:
        _fpath: Path (or file object) accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame holding the ``<wf>`` attribute columns, ``text``,
        ``lemma``, ``wn_sense_num`` ('0' when the term has no reference),
        ``lexical_key`` ('' when there is no second reference) and ``pos``
        ('' when the term has no ``pos`` attribute).
    """
    sctree = ET.parse(_fpath)

    # One record per word form. dict(...) copies the attributes so that adding
    # 'text' does not modify the parsed tree itself.
    word_records = []
    for node in sctree.iter('wf'):
        record = dict(node.attrib)
        record['text'] = node.text
        word_records.append(record)

    # One record per term, keyed by the id of the word it points at.
    sense_records = []
    for term in sctree.iter('term'):
        target = term.find('span/target')
        if target is None:
            # Without a target there is no word id to join on.
            continue

        refs = term.findall('externalReferences/externalRef')
        # First reference is used as the sense number, second as the lexical
        # key; a term may carry none, or only the first.
        wnsn = refs[0].attrib.get('reference') if refs else '0'
        senseid = refs[1].attrib.get('reference') if len(refs) > 1 else ''

        sense_records.append({'id': target.attrib.get('id'),
                              'lemma': term.attrib.get('lemma'),
                              'wn_sense_num': wnsn,
                              'lexical_key': senseid,
                              'pos': term.attrib.get('pos', '')})

    word_df = pd.DataFrame(word_records)
    sense_ref_df = pd.DataFrame(sense_records)

    return pd.merge(word_df, sense_ref_df, on='id')


In [7]:
def gen_file_list(_basepath, ext='*.naf'):
    """List corpus files under ``_basepath`` at any depth.

    Args:
        _basepath: Directory to search.
        ext: Glob pattern for the file names (default ``'*.naf'``).

    Returns:
        pandas.DataFrame with one row per file and the columns ``file_path``,
        ``parent_folder`` (name of the containing directory) and ``file_name``
        (file name up to its first dot). Rows are sorted by path; the columns
        are present even when nothing matches.
    """
    # '**' only descends into sub-directories when recursive=True; it then
    # also matches files sitting directly in _basepath.
    pattern = os.path.join(_basepath, '**', ext)

    file_list = []
    # sorted() keeps the order stable between runs, which matters to callers
    # that truncate the concatenated corpus.
    for fileref in sorted(glob.glob(pattern, recursive=True)):
        path = pathlib.Path(fileref)
        file_list.append({'file_path': fileref,
                          'parent_folder': path.parent.name,
                          'file_name': path.name.split('.')[0]})

    return pd.DataFrame(file_list, columns=['file_path', 'parent_folder', 'file_name'])

In [20]:
def parse_corpus(_basepath, filter_validation=False, max_rows=5000):
    """Parse every corpus file under ``_basepath`` into one DataFrame.

    Args:
        _basepath: Directory handed to ``gen_file_list``.
        filter_validation: When True, skip files whose parent folder is
            ``'brownv'``.
        max_rows: Keep only the first ``max_rows`` rows of the concatenated
            result (default 5000); pass ``None`` to keep everything.

    Returns:
        pandas.DataFrame: the per-file frames from ``xml_parse`` concatenated
        (original per-file indexes kept), with a ``file`` column naming the
        source file.

    Raises:
        ValueError: If no file is found or all files are filtered out.
    """
    # generate dataframe with references to all files
    _fpath_df = gen_file_list(_basepath)
    if _fpath_df.empty:
        raise ValueError(f'No corpus files found under {_basepath!r}')

    # filter to remove validation files
    filtered_file_df = _fpath_df
    if filter_validation:
        filtered_file_df = _fpath_df[_fpath_df.parent_folder != 'brownv']

    _dflist = []
    for _, file_entry in tqdm(filtered_file_df.iterrows(), total=filtered_file_df.shape[0]):
        _parsed_file_df = xml_parse(file_entry.file_path)
        _parsed_file_df['file'] = file_entry.file_name
        _dflist.append(_parsed_file_df)

    if not _dflist:
        raise ValueError(f'No corpus files left to parse under {_basepath!r}')

    corpus = pd.concat(_dflist)
    return corpus if max_rows is None else corpus[:max_rows]

In [9]:
def build_corpus(_basepath, verbose=True, **kwargs):
    """Parse the corpus and derive the ``wn_index`` reference key.

    ``wn_index`` is ``lemma + '%' + lexical_key``, blanked for rows whose
    ``lexical_key`` is empty; the ``lexical_key`` column is then removed.

    Args:
        _basepath: Corpus directory forwarded to ``parse_corpus``.
        verbose: Print progress messages when True.
        **kwargs: Forwarded unchanged to ``parse_corpus``.

    Returns:
        pandas.DataFrame: the parsed corpus with ``wn_index`` added and
        ``lexical_key`` dropped.
    """
    def _say(*args, **print_kwargs):
        # Progress output is emitted only in verbose mode.
        if verbose:
            print(*args, **print_kwargs)

    _say('Parsing corpus')
    corpus = parse_corpus(_basepath, **kwargs)

    _say('Preprocessing indexes...', end="")
    without_key = corpus.lexical_key == ''
    corpus['wn_index'] = corpus['lemma'] + '%' + corpus['lexical_key']
    # Rows without a lexical key get an empty reference instead of "lemma%".
    corpus.loc[without_key, 'wn_index'] = ''
    corpus.drop('lexical_key', axis=1, inplace=True)
    _say('Done!')
    return corpus

In [10]:
def wordnet_get_glosses(_word, _sense_id, max_other_glosses=2):
    """Split the WordNet definitions of ``_word`` into target and other glosses.

    A synset counts as the target when the trailing number of its name equals
    ``_sense_id``; every other synset's definition goes to the "other" list.

    Args:
        _word: Lemma to look up with ``wn.synsets``.
        _sense_id: Sense number (int, or a string holding an int).
        max_other_glosses: Upper bound on the non-target glosses returned
            (default 2).

    Returns:
        tuple: ``(target_glosses, other_glosses)`` as two lists of definition
        strings; ``('', [])`` when ``_word`` is empty; ``('WN Error', None)``
        when the sense id is not a number or the lookup fails.
    """
    if not _word:  # if ref is empty
        # Always hand back a pair so callers can unpack the result.
        return '', []
    try:
        # Converted inside the try so a malformed id is reported as
        # 'WN Error' like every other lookup failure.
        wanted_sense = int(_sense_id)
        target_gloss = []
        other_glosses = []
        for syn in wn.synsets(_word):
            sense_num = int(syn.name().split('.')[-1])
            if sense_num == wanted_sense:
                target_gloss.append(syn.definition())
            else:
                other_glosses.append(syn.definition())
        return target_gloss, other_glosses[:max_other_glosses]
    except (AttributeError, WordNetError, ValueError, TypeError):
        return 'WN Error', None

In [11]:
def wordnet_gloss_helper(_word, _sense_id):
    """Resolve glosses for a word annotated with one or several sense ids.

    Args:
        _word: Lemma to look up.
        _sense_id: One sense number, or several separated by ``';'``.

    Returns:
        tuple:
          * ``('', '')`` when the word or the sense id is empty;
          * for a single id, the result of ``wordnet_get_glosses``
            (``('WN Error', None)`` if the id is not a number);
          * for several ids, ``(proper, others)`` where ``others`` is a set of
            non-target glosses and ``proper`` is a list with one gloss list
            per resolved id, or just that gloss list when only one id
            resolved.
    """
    if not _word or not _sense_id:
        return '', ''

    def _lookup(raw_id):
        # A non-numeric id is reported with the same sentinel that
        # wordnet_get_glosses uses for failed lookups.
        try:
            return wordnet_get_glosses(_word, int(raw_id))
        except ValueError:
            return 'WN Error', None

    senseidlist = str(_sense_id).split(';')
    if len(senseidlist) == 1:
        return _lookup(senseidlist[0])

    list_proper_glosses = []
    other_gloss_set = set()
    for senseid in senseidlist:
        gloss, other_glosses = _lookup(senseid)
        # 'WN Error' is a non-empty string, so it must be excluded explicitly;
        # its companion value is None and cannot be added to the set.
        if gloss and gloss != 'WN Error':
            list_proper_glosses.append(gloss)
            other_gloss_set.update(other_glosses or [])

    # A gloss that is correct for one annotated sense must not be offered as
    # a wrong gloss for another.
    for gloss_list in list_proper_glosses:
        other_gloss_set.difference_update(gloss_list)

    # if one of the glosses is bogus return only one
    if len(list_proper_glosses) == 1:
        return list_proper_glosses[0], other_gloss_set
    return list_proper_glosses, other_gloss_set

In [12]:
def add_wordnet_gloss(_semcordf, verbose=True):
    """Attach WordNet glosses to every sense-tagged row of the corpus.

    Rows whose ``wn_sense_num`` is ``'0'`` are not looked up and get empty
    values. The input frame is left untouched.

    Args:
        _semcordf: Corpus frame with at least ``lemma`` and ``wn_sense_num``.
        verbose: Print ``'Done!'`` at the end when True.

    Returns:
        pandas.DataFrame: a copy of the input with the added columns ``idx``
        (row position used as join key), ``gloss`` (first proper gloss, the
        ``'WN Error'`` sentinel, or ``''``), ``other_glosses`` and
        ``other_glossesnum`` (number of other glosses).
    """
    # Positional key for the join, added to a copy rather than to the
    # caller's frame.
    indexed = _semcordf.assign(idx=list(range(len(_semcordf))))
    tagged = indexed[indexed.wn_sense_num != '0']

    gloss_rows = [
        (*wordnet_gloss_helper(lemma, sense_num), row_idx)
        for lemma, sense_num, row_idx in tqdm(
            zip(tagged['lemma'], tagged['wn_sense_num'], tagged['idx']),
            total=len(tagged), desc="Gloss preprocessing")
    ]
    # The explicit dtype keeps the join key numeric even when no row is tagged.
    _df_glosses = pd.DataFrame(
        gloss_rows, columns=['gloss', 'other_glosses', 'idx']).astype({'idx': 'int64'})
    _merged = pd.merge(indexed, _df_glosses, on='idx', how='left').fillna('')

    def _first_gloss(value):
        # Strings ('' or the 'WN Error' sentinel) pass through unchanged;
        # indexing them would keep only their first character.
        if isinstance(value, str):
            return value
        return value[0] if value else ''

    # for now take only first gloss
    _merged['gloss'] = _merged.gloss.apply(_first_gloss)
    # tag how many other glosses there are
    _merged['other_glossesnum'] = _merged.other_glosses.apply(len)
    if verbose: print('Done!')
    return _merged

In [13]:
def gen_sentence_context_pairs(_df):
    """Turn the rows of one sentence into labelled context-gloss records.

    The context is the sentence text joined with spaces (a space before an
    apostrophe is removed). Each row that has other glosses and a usable gloss
    yields one record labelled True for its own gloss, followed by one record
    labelled False per entry of ``other_glosses``.

    Args:
        _df: Rows of a single sentence with the columns ``text``, ``file``,
            ``gloss``, ``other_glosses`` and ``other_glossesnum``.

    Returns:
        list[dict]: records with the keys ``context``, ``file``,
        ``target_word``, ``gloss`` and ``is_proper_gloss``.
    """
    context = _df.text.str.cat(sep=' ').replace(" '", "'")
    source_file = _df.iloc[0].file

    # Keep rows that have negatives available and a real (non-error) gloss.
    usable = (_df.other_glossesnum > 0) & (_df.gloss != 'WN Error') & (_df.gloss != '')

    pairs = []
    for _, row in _df[usable].iterrows():
        # Proper gloss first, then every alternative as a negative example.
        candidates = [(row.gloss, True)]
        candidates.extend((alternative, False) for alternative in row.other_glosses)
        for gloss_text, label in candidates:
            pairs.append({'context': context,
                          'file': source_file,
                          'target_word': row.text,
                          'gloss': gloss_text,
                          'is_proper_gloss': label})

    return pairs

In [14]:
def build_joint_dataset(_df):
    """Build the context-gloss pair table for the whole corpus.

    The rows are grouped by ``(sent, file)`` and every group is expanded with
    ``gen_sentence_context_pairs``.

    Args:
        _df: Corpus frame with ``sent`` and ``file`` columns plus whatever
            ``gen_sentence_context_pairs`` reads.

    Returns:
        pandas.DataFrame with the columns ``file``, ``context``,
        ``target_word``, ``gloss`` and ``is_proper_gloss``; empty, but with
        those columns, when no pair is produced.
    """
    cols = ['file', 'context', 'target_word', 'gloss', 'is_proper_gloss']
    groupbyobj = _df.groupby(['sent', 'file'])

    full_dict_list = []
    for _, gp in tqdm(groupbyobj, total=len(groupbyobj)):
        full_dict_list.extend(gen_sentence_context_pairs(gp))

    # Passing the columns to the constructor, rather than selecting them
    # afterwards, also works when the record list is empty.
    return pd.DataFrame(full_dict_list, columns=cols)

In [17]:
def build_joint_corpus(_basepath, verbose=True, byref=False):
    """Run the full pipeline from raw corpus files to context-gloss pairs.

    Args:
        _basepath: Corpus directory passed to ``build_corpus``.
        verbose: Print progress messages when True.
        byref: Accepted but not used by this function.

    Returns:
        pandas.DataFrame: the output of ``build_joint_dataset`` with every
        list-valued ``gloss`` replaced by its first element.
    """
    glossed_corpus = add_wordnet_gloss(build_corpus(_basepath, verbose=verbose),
                                       verbose=verbose)

    if verbose:
        print('Processing adn labeling joint cintext-gloss pairs...', end="")
    pairs = build_joint_dataset(glossed_corpus)
    if verbose:
        print('Done!')

    def _unwrap(value):
        # Words with several senses still carry a list of glosses here.
        return value[0] if type(value) is list else value

    pairs['gloss'] = pairs.gloss.apply(_unwrap)
    return pairs

**Saving the Final Corpus**

In [21]:
fpath = "./data/raw/wsd_corpora/semcor3.0/"
savepath = r"./data/raw/wsd_corpora/preprocessed/semcor_gloss_BERT.pkl"
# to_pickle does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout and on re-runs.
os.makedirs(os.path.dirname(savepath), exist_ok=True)
final_corpus = build_joint_corpus(fpath)
final_corpus.to_pickle(savepath)

Parsing corpus


100%|██████████| 352/352 [00:38<00:00,  9.14it/s]


Preprocessing indexes...Done!


Gloss preprocessing: 100%|██████████| 2499/2499 [00:06<00:00, 382.38it/s] 


Done!
Processing adn labeling joint cintext-gloss pairs...

100%|██████████| 352/352 [00:00<00:00, 418.79it/s]

Done!





**HyperParameters**

In [22]:
# Run configuration. Only data_path, model_type and input_len are referenced
# by the cells visible in this notebook; the remaining names are kept as
# declared.
data_path=savepath
# Raw string: '\d' is not a valid escape sequence in a normal string literal.
# The value is unchanged (two dots, a backslash, then "data").
default_save_path=r'..\data'
weak_supervision=True
preprocess_inputs=True
token_layer='sent-cls-ws'
batch_size=128
val_check_interval=0.05
model_type='bert-base-uncased'
lr=2e-5
weight_decay=0.01
epochs=4
input_len=128

**Load Data**

In [30]:
# Context-gloss pairs written by the corpus-building step.
train_dataset = pd.read_pickle(data_path)
# WordPiece tokenizer matching the pretrained checkpoint named in model_type.
tokenizer = BertTokenizer.from_pretrained(model_type)

**BERT Sentence Format**

In [24]:
def format_sentences_BERT(_row):
    """Build the sentence-pair input string for one row.

    Args:
        _row: Row exposing the string fields ``context`` and ``gloss`` through
            ``.loc``.

    Returns:
        str: ``'[CLS] <context> [SEP] <gloss> [SEP]'``.
    """
    context, gloss = _row.loc['context'], _row.loc['gloss']
    return ''.join(('[CLS] ', context, ' [SEP] ', gloss, ' [SEP]'))


In [25]:
def tokenize_and_index(_df,output_len=128,tokenizer=None,
                       display_progress = True,formatting_method=format_sentences_BERT):
    """Add tokenized and id-encoded model inputs to ``_df`` in place.

    Columns written: ``preproc_sent`` (formatted sentence pair),
    ``tokenized_sent`` (its tokens), ``tokenized_target_word`` (first token of
    the target word, ``''`` if it yields no token) and ``input_ids`` (token
    ids padded/truncated at the end to ``output_len``).

    Args:
        _df: Frame with the columns read by ``formatting_method`` and a
            ``target_word`` column. Modified in place.
        output_len: Length every id sequence is padded/truncated to.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            When None, ``BertTokenizer.from_pretrained('bert-base-uncased')``
            is loaded on the first use instead of at definition time.
        display_progress: Accepted but not used by this function.
        formatting_method: Row-wise function producing the sentence string.

    Returns:
        None.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _first_token(word):
        # A word that yields no token would make [0] fail; an empty marker is
        # stored instead.
        pieces = tokenizer.tokenize(word)
        return pieces[0] if pieces else ''

    tqdm.pandas(desc="Sentence preprocessing")
    _df.loc[:,'preproc_sent'] = _df.progress_apply(formatting_method,axis=1)
    tqdm.pandas(desc="Sentence Tokenization")
    _df.loc[:,'tokenized_sent'] = _df.preproc_sent.progress_apply(tokenizer.tokenize)
    tqdm.pandas(desc="Tokenizing target words")
    _df.loc[:,'tokenized_target_word'] = _df.target_word.progress_apply(_first_token)
    tqdm.pandas(desc="Converting tokens to embeddings")
    _df.loc[:,'input_ids'] = _df.tokenized_sent.progress_apply(tokenizer.convert_tokens_to_ids)

    padded_input_ids = pad_sequences(_df['input_ids'],
                                    maxlen=output_len, dtype="long",padding = "post", truncating = "post")
    # np.split yields one slice per row, each keeping the leading axis of
    # length 1.
    _df.loc[:,'input_ids'] = np.split(padded_input_ids, _df.shape[0], axis=0)


100%|██████████| 231508/231508 [00:00<00:00, 2929853.36B/s]


In [26]:
def gen_sentence_indexes(_df, output_len=128):
    """Add a ``sent_indexes`` column of segment ids to ``_df`` in place.

    Positions up to and including the first ``'[SEP]'`` token get 0, the
    positions after it up to and including the second ``'[SEP]'`` get 1, and
    the sequence is padded/truncated at the end to ``output_len`` with 1.

    Args:
        _df: Frame with a ``tokenized_sent`` column. Modified in place.
        output_len: Length of every stored index sequence.

    Returns:
        None.
    """
    def _segment_ids(_row):
        separator_positions = [position
                               for position, token in enumerate(_row['tokenized_sent'])
                               if token == '[SEP]']
        first_sep, second_sep = separator_positions[0], separator_positions[1]
        return np.array([0] * (first_sep + 1) + [1] * (second_sep - first_sep))

    tqdm.pandas(desc="Indexing sentences")
    _df.loc[:, 'sent_indexes'] = _df.progress_apply(_segment_ids, axis=1)

    padded_segments = pad_sequences(_df['sent_indexes'], maxlen=output_len,
                                    dtype="long", padding="post",
                                    truncating="post", value=1)
    # One slice per row; each slice keeps the leading axis of length 1.
    _df.loc[:, 'sent_indexes'] = np.split(padded_segments, _df.shape[0], axis=0)


In [27]:
def find_index_of_target_token(_df):
    """Record where the target token occurs in each tokenized sentence.

    Adds a ``target_token_idx`` column to ``_df`` in place. Each value is the
    list of positions in ``tokenized_sent`` whose token equals the lower-cased
    ``tokenized_target_word``; the list is empty when there is no match.

    Args:
        _df: Frame with ``tokenized_sent`` and ``tokenized_target_word``.

    Returns:
        None.
    """
    def _matching_positions(_row):
        positions = []
        for position, token in enumerate(_row['tokenized_sent']):
            if token == _row['tokenized_target_word'].lower():
                positions.append(position)
        return positions

    tqdm.pandas(desc="Finding target token in sentence")
    _df.loc[:, 'target_token_idx'] = _df.progress_apply(_matching_positions, axis=1)

In [31]:
def preprocess_model_inputs(_df,sample_size=None, filter_bad_rows=True,
                            output_len=128,
                            tokenizer=None,**kwargs):
    """Run all preprocessing steps and return the model-ready frame.

    Args:
        _df: Context-gloss pairs. Not modified; the work is done on a copy
            (or on the drawn sample).
        sample_size: When set, process a random sample of that many rows.
        filter_bad_rows: Drop rows whose target token was not found or whose
            first match lies at or beyond ``output_len``.
        output_len: Sequence length used by the preprocessing steps.
        tokenizer: Tokenizer handed to ``tokenize_and_index``. When None,
            ``BertTokenizer.from_pretrained('bert-base-uncased')`` is loaded
            at call time instead of at definition time.
        **kwargs: Accepted but not used by this function.

    Returns:
        pandas.DataFrame: the processed (and optionally filtered) rows.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # The helpers below add columns in place, so they must not receive the
    # caller's own object.
    _smpldf = _df.sample(sample_size) if sample_size else _df.copy()

    tokenize_and_index(_smpldf,output_len=output_len,
                       tokenizer=tokenizer)
    gen_sentence_indexes(_smpldf,output_len=output_len)
    find_index_of_target_token(_smpldf)

    if filter_bad_rows: # rows where the target word index is not found due to cutoff or exceeds tensor size
        usable = _smpldf.target_token_idx.apply(
            lambda hits: len(hits) != 0 and hits[0] < output_len).astype(bool)
        _smpldf = _smpldf[usable]

    return _smpldf

In [32]:
# Tokenize the pairs, build segment ids and locate the target token.
df_train = preprocess_model_inputs(train_dataset, output_len=input_len, tokenizer=tokenizer)

Sentence preprocessing: 100%|██████████| 5654/5654 [00:00<00:00, 30073.14it/s]
Sentence Tokenization: 100%|██████████| 5654/5654 [00:04<00:00, 1332.91it/s]
Tokenizing target words: 100%|██████████| 5654/5654 [00:00<00:00, 11170.81it/s]
Converting tokens to embeddings: 100%|██████████| 5654/5654 [00:00<00:00, 30129.61it/s]
Indexing sentences: 100%|██████████| 5654/5654 [00:00<00:00, 46172.29it/s]
Finding target token in sentence: 100%|██████████| 5654/5654 [00:00<00:00, 6944.65it/s]


**DATASET**

In [33]:
class CorpusDataset(Dataset):
    """Torch ``Dataset`` view over a preprocessed corpus DataFrame.

    Each item is a 4-tuple of tensors: token ids, segment ids, index of the
    target token and the label.
    """

    def __init__(self, data):
        # Frame with the columns 'input_ids', 'sent_indexes',
        # 'target_token_idx' and 'is_proper_gloss'.
        self.corpus_dataframe = data

    def __len__(self):
        return len(self.corpus_dataframe)

    def __getitem__(self, idx):
        record = self.corpus_dataframe.iloc[idx]
        # [0] takes the first element of each stored cell; for the id columns
        # this presumably strips the leading length-1 axis left by
        # preprocessing (TODO confirm), for 'target_token_idx' it picks the
        # first match.
        token_ids = torch.tensor(record['input_ids'][0])
        segment_ids = torch.tensor(record['sent_indexes'][0], dtype=torch.int64)
        target_position = torch.tensor(record['target_token_idx'][0], dtype=torch.int64)
        label = torch.tensor(record['is_proper_gloss'], dtype=torch.int64)
        return token_ids, segment_ids, target_position, label

In [34]:
class TokenClsFunction(Function):
    """Autograd function selecting one position per batch row.

    Forward returns ``input[i, target[i], :]`` for every row ``i``; backward
    routes the incoming gradient to exactly those positions and leaves every
    other entry of the input gradient at zero.
    """

    @staticmethod
    def forward(ctx, input, target_token_tensor):
        ctx.save_for_backward(input, target_token_tensor)
        target_token_tensor.requires_grad = False
        positions = target_token_tensor.flatten()
        batch_rows = arange(positions.shape[0])
        return input[batch_rows, positions, :]

    @staticmethod
    def backward(ctx, grad_output):
        saved_input, saved_positions = ctx.saved_tensors
        positions = saved_positions.flatten()
        batch_rows = arange(positions.shape[0])

        # gradient only flows to the selected positions
        grad_input = zeros_like(saved_input)
        grad_input[batch_rows, positions, :] = grad_output
        return grad_input, zeros_like(saved_positions)


In [35]:
class TokenClsLayer(nn.Module):
    """Module wrapper around ``TokenClsFunction.apply``."""

    def __init__(self):
        super().__init__()
        self.tcf = TokenClsFunction.apply

    def forward(self, features, token_indexes):
        """Return what ``TokenClsFunction.apply(features, token_indexes)`` returns."""
        selected = self.tcf(features, token_indexes)
        return selected


**BERT Classifier**

In [36]:
class BertClassifier(nn.Module):
    """Two-class classifier over the BERT hidden state of the target token.

    Pipeline: pretrained ``bert-base-uncased`` encoder -> selection of the
    target token's vector -> dropout -> linear layer (768 -> 2) -> softmax.
    The forward pass therefore returns probabilities, not logits.
    """

    def __init__(self, dropout=0.5):
        # `dropout` is accepted but not used: the dropout probability comes
        # from the default BertConfig's hidden_dropout_prob.
        super().__init__()
        self.config = BertConfig()
        self.num_labels = 2
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.tokenselectlayer = TokenClsLayer()
        self.linear = nn.Linear(768, self.num_labels)
        self.softmax = nn.Softmax(dim=1)
        nn.init.xavier_normal_(self.linear.weight)

    def forward(self, input_id, mask, _target_token_ids):
        """Return per-class probabilities for every row of the batch.

        Args:
            input_id: Token id tensor passed as the encoder's first argument.
            mask: Tensor passed as the encoder's second positional argument.
                NOTE(review): in pytorch-transformers 1.x that slot is
                presumably ``token_type_ids`` rather than an attention mask,
                confirm against the installed version.
            _target_token_ids: Position of the target token in each row.
        """
        sequence_states, _pooled = self.bert(input_id, mask)
        target_states = self.tokenselectlayer(sequence_states, _target_token_ids)
        logits = self.linear(self.dropout(target_states))
        return self.softmax(logits)

In [37]:
df = df_train[['input_ids','sent_indexes','target_token_idx','is_proper_gloss']]

# A fixed seed makes the 80/20 train/validation partition identical on every
# run; with random_state=None it changed each time the cell was executed.
train_df, val_df =  train_test_split(df,
                                        random_state=42,
                                        test_size=.2)

In [38]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print loss and accuracy after every epoch.

    The model is expected to return class probabilities (``BertClassifier``
    ends in a softmax). The loss is therefore the negative log-likelihood of
    the log of those probabilities; ``nn.CrossEntropyLoss`` would apply a
    second softmax on top of them.

    Args:
        model: Module called as ``model(tokens, segments, target_idx)``.
        train_data: Training frame wrapped in ``CorpusDataset``.
        val_data: Validation frame wrapped in ``CorpusDataset``.
        learning_rate: Learning rate for Adam.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size of both data loaders (default 2).

    Returns:
        None. The model is updated in place and is left in evaluation mode
        once at least one epoch has run.
    """
    train_set, val_set = CorpusDataset(train_data), CorpusDataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.NLLLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    def _evaluate_batch(batch):
        # Returns (mean loss tensor, number of correct predictions, batch size).
        tokens, segments, target_idx, labels = (tensor.to(device) for tensor in batch)
        probabilities = model(tokens, segments, target_idx)
        # The clamp keeps log() finite if a probability underflows to zero.
        loss = criterion(torch.log(probabilities.clamp(min=1e-12)), labels)
        correct = (probabilities.argmax(dim=1) == labels).sum().item()
        return loss, correct, labels.size(0)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()  # dropout active while fitting
        for batch in tqdm(train_dataloader):
            batch_loss, correct, n_samples = _evaluate_batch(batch)

            # The criterion returns the batch mean; weighting by the batch
            # size makes the division by the dataset size below a true
            # per-sample average.
            total_loss_train += batch_loss.item() * n_samples
            total_acc_train += correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()  # dropout disabled while validating
        with torch.no_grad():
            for batch in val_dataloader:
                batch_loss, correct, n_samples = _evaluate_batch(batch)
                total_loss_val += batch_loss.item() * n_samples
                total_acc_val += correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
   


In [39]:
# Settings for the training run, followed by a freshly initialised classifier.
LR = 1e-6
EPOCHS = 5
model = BertClassifier()

100%|██████████| 433/433 [00:00<00:00, 182105.05B/s]
100%|██████████| 440473133/440473133 [00:08<00:00, 49826729.68B/s]


**Training the Model**

In [40]:
# Fine-tune the classifier on the training split and validate on the held-out split.
train(model, train_df, val_df, learning_rate=LR, epochs=EPOCHS)

  0%|          | 8/2262 [00:41<3:16:57,  5.24s/it]


KeyboardInterrupt: ignored

**SAVE Model**

In [None]:
# Serialise the whole model object to ./model.pkl with pickle.
# NOTE(review): a pickled model can only be loaded where the same class
# definitions are importable, and pickle files must never be loaded from
# untrusted sources.
import pickle
modelsavepath='./model.pkl'
with open(modelsavepath, 'wb') as files:
    pickle.dump(model, files)