In [0]:
!wget 'http://lcl.uniroma1.it/wsdeval/data/WSD_Unified_Evaluation_Datasets.zip'
!wget 'http://lcl.uniroma1.it/wsdeval/data/WSD_Evaluation_Framework.zip'

!mkdir resources
!rm -rf sample_data

!unzip  'WSD_Evaluation_Framework.zip'
!unzip  'WSD_Unified_Evaluation_Datasets.zip'

In [0]:
import logging
import os
import warnings
import re
import pickle
import numpy as np
import json

from lxml.etree import iterparse
from nltk.corpus import wordnet as wn
from tensorflow.keras.preprocessing.text import (Tokenizer,
                                                 text_to_word_sequence)
from tqdm import tqdm_notebook as tqdm

In [0]:
def save_json(save_to, save_what):
    """Write ``save_what`` as JSON to the file at ``save_to``.

    The file is opened in 'w+' mode: it is created when missing and
    truncated when it already exists.
    """
    handle = open(save_to, 'w+')
    with handle:
        json.dump(save_what, handle)


def save_pickle(save_to, save_what):
    """Pickle ``save_what`` into the binary file at ``save_to``, replacing
    any previous content."""
    sink = open(save_to, mode='wb')
    with sink:
        pickle.dump(save_what, sink)


def load_pickle(load_from):
    """Return the object unpickled from the binary file at ``load_from``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that are trusted.
    """
    with open(load_from, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [0]:
def configure_tf():
    """Silence warnings and install a configured TF1 session for Keras.

    Builds a ``tf.ConfigProto`` with GPU memory growth enabled, a 0.8
    per-process GPU memory fraction and device-placement logging, creates
    a ``tf.Session`` from it and registers that session with the Keras
    backends.

    Relies on the notebook-level names ``tf`` and ``set_session`` being
    imported before this function is called.
    """
    warnings.filterwarnings('ignore')
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto()
    # dynamically grow the memory used on the GPU
    config.gpu_options.allow_growth = True
    # to log device placement (on which device the operation ran)
    # (nothing gets printed in Jupyter, only if you run it standalone)
    config.log_device_placement = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    sess = tf.Session(config=config)
    # The models in this notebook are built from tensorflow.keras layers, so
    # the session has to be registered with the tf.keras backend; registering
    # it only with standalone Keras leaves tf.keras on its own default session.
    tf.keras.backend.set_session(sess)
    # Keep the standalone Keras backend on the same session, as before.
    set_session(sess)

In [0]:
def initialize_logger():
    """
    Seed NumPy's global random generator with 0 and configure the root
    logger: INFO level, "LEVEL - HH:MM:SS: message" layout.
    """
    np.random.seed(0)
    log_settings = {
        'format': "%(levelname)s - %(asctime)s: %(message)s",
        'datefmt': '%H:%M:%S',
        'level': logging.INFO,
    }
    logging.basicConfig(**log_settings)

In [0]:
def build_dict(file_name, save_to=None):
    '''
    Builds a dictionary from a whitespace-separated text file, with an
    optional pickle cache.

    Each line contributes one entry: the first token is the key and the
    second token is the value; any further tokens on the line are ignored.
    Blank lines and lines with fewer than two tokens are skipped.

    :param file_name: path of the text file to read
    :param save_to: optional pickle path; when it exists and is non-empty
                    the dictionary is loaded from it instead of being
                    rebuilt, otherwise the freshly built dictionary is
                    written to it
    :return file_dict: dict mapping first token -> second token
    '''
    if save_to is not None and os.path.exists(save_to) and os.path.getsize(save_to) > 0:
        file_dict = load_pickle(save_to)
        logging.info("Dictionary is loaded")
        return file_dict

    file_dict = dict()
    with open(file_name, mode='r') as file:
        lines = file.read().splitlines()
    for line in tqdm(lines, desc='Building dictionary'):
        fields = line.split()
        if len(fields) < 2:
            # nothing usable on this line (e.g. trailing blank line)
            continue
        file_dict[fields[0]] = fields[1]
    logging.info("Dictionary is built")
    if save_to is not None:
        save_pickle(save_to, file_dict)
        # only report a save when one actually happened
        logging.info("Dictionary is saved")
    return file_dict

In [0]:
def parse_dataset(file_name, gold_dict, save_to_paths=None):
    '''
    Parses the <sentence> elements of an xml corpus into two parallel
    token lists, with an optional pickle cache.

    For every <wf> or <instance> element that has text, its lemma is
    appended to both the unlabeled and the labeled sentence. For an
    <instance> whose id is present in gold_dict, the lemma in the labeled
    sentence is replaced by the WordNet synset id of its sense key, in the
    form "wn:<8-digit offset><pos>". Instances without a gold entry keep
    their lemma.

    :param file_name: string points to path of xml file
    :param gold_dict: dict mapping instance id -> sense key
    :param save_to_paths: optional pair of pickle paths (unlabeled,
                          labeled); when both exist and are non-empty the
                          result is loaded from them, otherwise the parsed
                          result is written to them
    :return sentences: list of token lists, all data unlabeled
    :return sentences_labeled: list of token lists, all data labeled
    '''
    cache_ready = save_to_paths is not None and all(
        os.path.exists(path) and os.path.getsize(path) > 0
        for path in (save_to_paths[0], save_to_paths[1]))
    if cache_ready:
        sentences_list = load_pickle(save_to_paths[0])
        lbld_sentences_list = load_pickle(save_to_paths[1])
        logging.info("Parsed Dataset is loaded")
        return sentences_list, lbld_sentences_list

    # read file contents in terms of sentences
    context = iterparse(file_name, tag="sentence")
    sentences_list, lbld_sentences_list = [], []
    for _, elements in tqdm(context, desc="Parsing corpus"):
        sentence, sentence_labeled = [], []
        for elem in elements.iter():
            # only word-format (wf) and instance elements carrying text count
            if elem.tag not in ('wf', 'instance') or elem.text is None:
                continue
            elem_lemma = elem.attrib['lemma']
            sentence.append(elem_lemma)
            sentence_labeled.append(elem_lemma)
            if elem.tag != 'instance':
                continue
            # Look the key up without str(): str(None) would be the truthy
            # string 'None' and defeat the missing-key check below.
            sense_key = gold_dict.get(elem.attrib['id'])
            if sense_key is not None:
                synset = wn.lemma_from_key(sense_key).synset()
                sentence_labeled[-1] = f"wn:{str(synset.offset()).zfill(8)}{synset.pos()}"
        # if the sentence is not empty
        if len(sentence) and len(sentence_labeled):
            sentences_list.append(sentence)
            lbld_sentences_list.append(sentence_labeled)
        # release the parsed subtree to keep memory flat while streaming
        elements.clear()
    logging.info("Parsed the dataset")

    if save_to_paths is not None:
        save_x_to, save_y_to = save_to_paths[0], save_to_paths[1]
        save_pickle(save_x_to, sentences_list)
        save_pickle(save_y_to, lbld_sentences_list)
        logging.info("Saved the dataset")

    return sentences_list, lbld_sentences_list

In [0]:
def process_dataset(data_x, data_y, save_tokenizer=None, save_data=None):
    """Turn token sequences into integer sequences with a Keras Tokenizer.

    :param data_x: unlabeled sentences (token sequences)
    :param data_y: labeled sentences (token sequences)
    :param save_tokenizer: optional pickle path of the tokenizer. When the
        file exists and is non-empty the tokenizer is loaded and used as-is;
        otherwise a new tokenizer is fitted on data_x and data_y and, if a
        path was given, pickled there.
    :param save_data: optional pair of pickle paths (x, y). Existing
        non-empty files replace the corresponding argument; afterwards both
        data sets are written to these paths.
    :return: (data_x_, data_y_) integer-encoded versions of the data
    """
    def _cached(path):
        # a cache entry counts only when the file exists and has content
        return (path is not None
                and os.path.exists(path)
                and os.path.getsize(path) > 0)

    # save_data defaults to None, so it must be checked before indexing
    if save_data is not None:
        if _cached(save_data[0]):
            data_x = load_pickle(save_data[0])
            logging.info("data_x is loaded")
        if _cached(save_data[1]):
            data_y = load_pickle(save_data[1])
            logging.info("data_y is loaded")

    if _cached(save_tokenizer):
        tokenizer = load_pickle(save_tokenizer)
        logging.info("Tokenizer is loaded")
        # Deliberately NOT re-fitted: fitting again rebuilds word_index from
        # the updated counts, so ids produced here would no longer match the
        # pickled tokenizer or data encoded earlier with it. Tokens unknown
        # to the loaded tokenizer are encoded via its OOV token instead.
    else:
        filters = '!"#$%&()*+,-./;<=>?@[\\]^_`{|}~\'\t'
        tokenizer = Tokenizer(filters=filters, oov_token='<OOV>', lower=True)
        tokenizer.fit_on_texts(data_x)
        tokenizer.fit_on_texts(data_y)
        if save_tokenizer is not None:
            save_pickle(save_tokenizer, tokenizer)
            logging.info("Tokenizer Saved")

    if save_data is not None:
        save_pickle(save_data[0], data_x)
        save_pickle(save_data[1], data_y)
        logging.info("Processed Data is Saved")

    data_x_ = tokenizer.texts_to_sequences(data_x)
    data_y_ = tokenizer.texts_to_sequences(data_y)

    return data_x_, data_y_

In [0]:
def load_dataset():
    """Build the training (SemCor) and dev (ALL) data sets plus vocabulary sizes.

    Paths are resolved from the current working directory; intermediate
    results are cached as pickles under ``resources``.

    :return: dict with keys 'train_x', 'train_y', 'dev_x', 'dev_y'
        (integer-encoded sentences), 'tokenizer' (the pickled tokenizer),
        'vocabulary_size' and 'output_size'.
    """
    cwd = os.getcwd()
    resources_path = os.path.join(cwd, 'resources')
    semcor_dir = os.path.join(cwd, 'WSD_Evaluation_Framework',
                              'Training_Corpora', 'SemCor')
    eval_dir = os.path.join('WSD_Unified_Evaluation_Datasets', 'ALL')
    save_tokenizer = os.path.join(resources_path, 'tokenizer.pkl')

    # Building the gold dictionary for training set
    gold_dict = build_dict(os.path.join(semcor_dir, 'semcor.gold.key.txt'),
                           os.path.join(resources_path, 'gold_dict.pkl'))

    # Parsing and encoding the training set
    train_files = [os.path.join(resources_path, 'train_x.pkl'),
                   os.path.join(resources_path, 'train_y.pkl')]
    data_x, data_y = parse_dataset(os.path.join(semcor_dir, 'semcor.data.xml'),
                                   gold_dict, save_to_paths=train_files)
    train_x, train_y = process_dataset(data_x, data_y,
                                       save_tokenizer=save_tokenizer,
                                       save_data=train_files)

    # Building the gold dictionary for dev set
    eval_dict = build_dict(os.path.join(eval_dir, 'ALL.gold.key.txt'),
                           os.path.join(resources_path, 'eval_dict.pkl'))

    # Parsing and encoding the dev set
    dev_files = [os.path.join(resources_path, 'dev_x.pkl'),
                 os.path.join(resources_path, 'dev_y.pkl')]
    data_x, data_y = parse_dataset(os.path.join(eval_dir, 'ALL.data.xml'),
                                   eval_dict, save_to_paths=dev_files)
    dev_x, dev_y = process_dataset(data_x, data_y,
                                   save_tokenizer=save_tokenizer,
                                   save_data=dev_files)

    # unpickle the tokenizer once and reuse it below
    tokenizer = load_pickle(save_tokenizer)
    word_tokens = [
        word for word in tokenizer.word_index if not word.startswith('wn:')]
    sense_tokens = [
        word for word in tokenizer.word_index if word.startswith('wn:')]

    # Words and senses share one index space in tokenizer.word_index, so a
    # word's id can be larger than the number of word tokens. The input
    # vocabulary therefore has to cover every id (max id + 1), not just the
    # count of word tokens.
    vocabulary_size = len(tokenizer.word_index) + 1
    # same value as before: (#words + 1) + #senses + 1
    output_size = len(word_tokens) + 1 + len(sense_tokens) + 1

    dataset = {
        'train_x': train_x,
        'train_y': train_y,
        'dev_x': dev_x,
        'dev_y': dev_y,
        'tokenizer': tokenizer,
        'vocabulary_size': vocabulary_size,
        'output_size': output_size
    }

    return dataset

In [0]:
initialize_logger()

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
dataset = load_dataset()

INFO - 17:05:39: Dictionary is loaded
INFO - 17:05:39: Parsed Dataset is loaded
INFO - 17:05:39: data_x is loaded
INFO - 17:05:40: data_y is loaded
INFO - 17:05:40: Tokenizer is loaded
INFO - 17:05:41: Processed Data is Saved
INFO - 17:05:43: Dictionary is loaded
INFO - 17:05:43: Parsed Dataset is loaded
INFO - 17:05:43: data_x is loaded
INFO - 17:05:43: data_y is loaded
INFO - 17:05:43: Tokenizer is loaded
INFO - 17:05:43: Processed Data is Saved


In [0]:
# Pull the entries needed downstream out of the dataset dict, then drop the
# dict itself (which also releases its 'tokenizer' entry).
(train_x, train_y,
 dev_x, dev_y,
 vocabulary_size, output_size) = map(
    dataset.get,
    ('train_x', 'train_y', 'dev_x', 'dev_y', 'vocabulary_size', 'output_size'))
del dataset

In [0]:
!pip install keras_self_attention

In [13]:
import warnings

import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model, to_categorical

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
from keras_self_attention import SeqSelfAttention

Using TensorFlow backend.


In [0]:
def baseline_model(vocabulary_size, hidden_size,
                   embedding_size, output_size,
                   lstm_layers=1, visualize=False, plot=False):
    """Build and compile the stacked bidirectional-LSTM tagger.

    Layers: a zero-masking Embedding, ``lstm_layers`` bidirectional LSTMs
    (forward/backward outputs summed) and a per-timestep softmax over
    ``output_size`` classes. Compiled with Adam (lr=1e-6) and categorical
    cross-entropy.

    :param vocabulary_size: input_dim of the Embedding layer
    :param hidden_size: units of every LSTM
    :param embedding_size: output_dim of the Embedding layer
    :param output_size: number of softmax classes
    :param lstm_layers: how many bidirectional LSTM layers to stack
    :param visualize: print the model summary when True
    :param plot: write the model diagram to 'BiLSTM Model.png' when True
    :return: the compiled model
    """
    network = Sequential()
    embedding = Embedding(input_dim=vocabulary_size,
                          output_dim=embedding_size, mask_zero=True)
    network.add(embedding)
    for _ in range(lstm_layers):
        recurrent = LSTM(hidden_size, dropout=0.2,
                         recurrent_dropout=0.2,
                         return_sequences=True,
                         input_shape=(None, None, embedding_size))
        network.add(Bidirectional(recurrent, merge_mode='sum'))
    classifier = Dense(output_size, activation='softmax')
    network.add(TimeDistributed(classifier))
    # Adam optimizer, compiled together with the loss and metric
    network.compile(loss="categorical_crossentropy",
                    optimizer=Adam(lr=1e-6),
                    metrics=["accuracy"])
    if visualize:
        print('\nModel Summary: \n')
        network.summary()
    # image of the architecture (report purposes)
    if plot:
        plot_model(network, to_file='BiLSTM Model.png')
        logging.info("BiLSTM model image saved")
    logging.info('BiLSTM model is created & compiled')
    return network

In [0]:
def attention_model(vocabulary_size, hidden_size,
                    embedding_size, output_size,
                    depth=2, visualize=False,
                    plot=False):
    """Build and compile the bidirectional-LSTM tagger with self-attention.

    Layers: a zero-masking Embedding, ``depth`` bidirectional LSTMs
    (forward/backward outputs summed), a SeqSelfAttention layer with
    sigmoid attention activation and a per-timestep softmax over
    ``output_size`` classes. Compiled with Adam (lr=1e-6) and categorical
    cross-entropy.

    :param vocabulary_size: input_dim of the Embedding layer
    :param hidden_size: units of every LSTM
    :param embedding_size: output_dim of the Embedding layer
    :param output_size: number of softmax classes
    :param depth: how many bidirectional LSTM layers to stack
    :param visualize: print the model summary when True
    :param plot: write the model diagram to 'Attention_BiLSTM_Model.png'
                 when True
    :return: the compiled model
    """
    network = Sequential()
    embedding = Embedding(input_dim=vocabulary_size,
                          output_dim=embedding_size, mask_zero=True)
    network.add(embedding)
    for _ in range(depth):
        recurrent = LSTM(hidden_size, dropout=0.2,
                         recurrent_dropout=0.2,
                         return_sequences=True,
                         input_shape=(None, None, embedding_size))
        network.add(Bidirectional(recurrent, merge_mode='sum'))
    network.add(SeqSelfAttention(attention_activation='sigmoid'))
    classifier = Dense(output_size, activation='softmax')
    network.add(TimeDistributed(classifier))
    # Adam optimizer, compiled together with the loss and metric
    network.compile(loss="categorical_crossentropy",
                    optimizer=Adam(lr=1e-6),
                    metrics=["accuracy"])
    if visualize:
        print('\nModel Summary: \n')
        network.summary()
    # image of the architecture (report purposes)
    if plot:
        plot_model(network, to_file='Attention_BiLSTM_Model.png')
        logging.info("Attention_BiLSTM model image saved")
    logging.info('Attention_BiLSTM model is created & compiled')
    return network

In [0]:
configure_tf()

In [0]:
# Training hyperparameters shared by the cells below.
# epochs: passed to fit_generator in the training cells.
epochs = 1
# batch_size: reassigned to 16 at the top of each training cell.
batch_size = 64
# hidden_size: forwarded through create_model as the LSTM unit count.
hidden_size = 128
# embedding_size: forwarded through create_model as the Embedding output_dim.
embedding_size = 400

In [0]:
def create_model(type_flag, vocabulary_size, hidden_size,
                 embedding_size, output_size, tokenizer=None,
                 encode_decoder_data=None):
    """Create the model selected by ``type_flag``.

    :param type_flag: 'baseline', 'attention' or 'seq2seq'
    :param vocabulary_size: forwarded to the model builder
    :param hidden_size: forwarded to the model builder
    :param embedding_size: forwarded to the model builder
    :param output_size: forwarded to the baseline/attention builders
    :param tokenizer: accepted for backward compatibility; not used by any
                      builder
    :param encode_decoder_data: required for 'seq2seq'
    :return: the built model, or None when ``type_flag`` is unknown or
             'seq2seq' is requested without ``encode_decoder_data``
    """
    if type_flag == 'baseline':
        # tokenizer must not be forwarded: the fifth positional parameter
        # of baseline_model is lstm_layers, so passing it there broke the
        # layer loop (range(None)).
        return baseline_model(vocabulary_size, hidden_size,
                              embedding_size, output_size)
    if type_flag == 'attention':
        return attention_model(vocabulary_size, hidden_size,
                               embedding_size, output_size)
    if type_flag == 'seq2seq' and encode_decoder_data is not None:
        # TODO: To Implement -- seq2seq_model is not defined in the visible
        # part of this notebook
        return seq2seq_model(encode_decoder_data, vocabulary_size,
                             hidden_size, embedding_size,
                             visualize=True, plot=True)
    return None

In [0]:
# Model kinds understood by create_model.
type_flag = ['baseline', 'attention', 'seq2seq']
# Plain BiLSTM tagger.
model = create_model(type_flag[0], vocabulary_size, hidden_size,
                     embedding_size, output_size, None)
# BiLSTM tagger with self-attention: must use the 'attention' flag
# (index 1); index 0 silently built a second baseline model.
atten_model = create_model(type_flag[1], vocabulary_size, hidden_size,
                           embedding_size, output_size, None)

In [0]:
%matplotlib inline

# Visualize training history
import matplotlib.pyplot as plt


def plot_history(history, save_to=None):
    """Plot loss and accuracy curves of a training run.

    Keys of ``history.history`` containing 'loss' / 'acc' are plotted;
    keys that also contain 'val' are drawn as validation curves (green),
    the others as training curves (blue). Figure 1 holds the losses,
    figure 2 the accuracies.

    :param history: object exposing a ``history`` dict of metric lists
    :param save_to: optional path prefix; figures are saved as
                    '<save_to>_loss.png' and '<save_to>_acc.png'
    :return: None (prints a message and returns early when no training
             loss is recorded)
    """
    records = history.history

    def keys_matching(fragment, validation):
        # metric names containing `fragment`, split by presence of 'val'
        return [key for key in records.keys()
                if fragment in key and ('val' in key) == validation]

    train_loss = keys_matching('loss', False)
    valid_loss = keys_matching('loss', True)
    train_acc = keys_matching('acc', False)
    valid_acc = keys_matching('acc', True)

    if not train_loss:
        print('Loss is missing in history')
        return

    # x axis: 1..number of recorded epochs (taken from the first loss series)
    x_values = range(1, len(records[train_loss[0]]) + 1)

    def draw_panel(number, train_keys, valid_keys, noun, title, y_label, suffix):
        # one figure: training curves in blue, validation curves in green
        plt.figure(number)
        for key in train_keys:
            plt.plot(x_values, records[key], 'b',
                     label=f'Training {noun} ({records[key][-1]:.5f})')
        for key in valid_keys:
            plt.plot(x_values, records[key], 'g',
                     label=f'Validation {noun} ({records[key][-1]:.5f})')
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        if save_to is not None:
            plt.savefig(f'{save_to}_{suffix}.png')
        plt.show()

    draw_panel(1, train_loss, valid_loss, 'loss',
               'Training Loss', 'Loss', 'loss')
    draw_panel(2, train_acc, valid_acc, 'accuracy',
               'Training Accuracy', 'Accuracy', 'acc')

In [0]:
logger = TensorBoard("logging/")  # Log the model training process

In [0]:
def data_generator(data_x, data_y, batch_size, output_size):
    """Yield padded, one-hot encoded batches from the two sequence lists.

    Makes a single pass over the data in order. Each batch is post-padded
    with zeros to the length of the longest input sequence in that batch,
    and the targets are converted to one-hot vectors of width
    ``output_size``.

    :param data_x: list of integer sequences (inputs)
    :param data_y: list of integer sequences (targets)
    :param batch_size: number of sequences per batch
    :param output_size: number of classes for the one-hot targets
    :return: generator of (batch_x, batch_y) pairs
    """
    for start in range(0, len(data_x), batch_size):
        end = start + batch_size
        batch_x, batch_y = data_x[start:end], data_y[start:end]
        max_len = max(map(len, batch_x))
        # The ragged lists go to pad_sequences directly: wrapping them in
        # np.array() first is unnecessary and fails on NumPy versions that
        # reject arrays built from sequences of different lengths.
        batch_x = pad_sequences(batch_x, padding='post', maxlen=max_len)
        batch_y = pad_sequences(batch_y, padding='post', maxlen=max_len)
        # categorize outputs
        batch_y = to_categorical(batch_y, num_classes=output_size)

        yield batch_x, batch_y

In [0]:
# Train the baseline model, then persist its history, plots and weights
# under ./resources. batch_size is reassigned here, replacing the value set
# in the hyperparameter cell.
batch_size = 16
try:
    # NOTE(review): data_generator makes a single pass over the data, so this
    # call presumably only works while epochs == 1 -- confirm before raising
    # `epochs`.
    history = model.fit_generator(data_generator(train_x, train_y, batch_size, output_size),
                            verbose=1, shuffle=True, epochs=epochs,
                            workers=0, use_multiprocessing=True,
                            steps_per_epoch=len(train_x)//batch_size,
                            callbacks=[logger])
    history_path = os.path.join(os.getcwd(), 'resources', 'history.pkl')
    # only the plain history dict is pickled, not the History object
    save_pickle(history_path, history.history)
    plot_history(history, os.path.join(os.getcwd(), 'resources', 'history'))
    model.save_weights(os.path.join(os.getcwd(), 'resources', 'model_weights.h5'))
except KeyboardInterrupt:
    # manual interruption: keep the weights trained so far (written to the
    # current working directory, not to ./resources)
    model.save_weights('model_weights.h5')

Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.




In [0]:
# Train the attention model, then persist its history, plots and weights
# under ./resources.
batch_size = 16
try:
    history = atten_model.fit_generator(data_generator(train_x, train_y, batch_size, output_size),
                                        verbose=1, shuffle=True, epochs=epochs,
                                        workers=0, use_multiprocessing=True,
                                        steps_per_epoch=len(train_x)//batch_size,
                                        callbacks=[logger])
    history_path = os.path.join(os.getcwd(), 'resources', 'atten_history.pkl')
    save_pickle(history_path, history.history)
    plot_history(history, os.path.join(os.getcwd(), 'resources', 'atten_history'))
    # save the weights of the model that was just trained (atten_model);
    # saving `model` here wrote the baseline weights into the attention file
    atten_model.save_weights(os.path.join(os.getcwd(), 'resources', 'atten_model_weights.h5'))
except KeyboardInterrupt:
    # manual interruption: keep the attention weights trained so far
    atten_model.save_weights('atten_model_weights.h5')