In [None]:
import numpy as np
import pandas as pd
import os, sys, random, argparse
import pickle
from collections import defaultdict as ddict
import tensorflow as tf
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform, randint
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt

# Show numpy arrays with four digits after the decimal point.
np.set_printoptions(precision=4)

# One seed is shared by every random number generator used in the notebook
# (Python's `random`, numpy and TensorFlow).
seed = 1234

random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Dataset identifier; the data path, results directory and model name are derived from it.
dataset_name = "riedel"

# Output locations for trained models and evaluation results.
models_dir = "models/"
results_dir = "results/" + dataset_name + "/"

# Pickled, pre-processed dataset file.
path2datafile = "data/" + dataset_name + "_bert_processed.pkl"

# Base file name (without extension) used when saving the best model.
model_name = dataset_name + "_best_model"

In [None]:
def padData(data, max_sent_in_bag, max_alias, max_alias_len, subj_max, obj_max, aliases_count, embed_dim=768):
    """Pad or truncate every bag of ``data`` to fixed sizes, modifying the bags in place.

    For each bag (a dict):
      * ``'embeds'`` is cut to ``max_sent_in_bag`` entries and filled up with
        zero vectors of length ``embed_dim``;
      * ``'ProbY'`` is cut/filled to ``max_sent_in_bag`` rows and replaced by an
        int32 array of shape ``(max_sent_in_bag, max_alias_len)``; each row is
        clipped to ``max_alias_len`` ids and zero-padded on the right;
      * ``'SubType'`` / ``'ObjType'`` are right-padded with 0 up to ``subj_max`` /
        ``obj_max`` (longer lists are left untouched);
      * ``'Y'`` keeps only its first element.

    Args:
        data: sequence of bag dicts with the keys listed above.
        max_sent_in_bag: number of sentences every bag ends up with.
        max_alias: not used by this function; kept so existing callers keep working.
        max_alias_len: number of alias ids kept per sentence.
        subj_max: minimum length of ``'SubType'`` after padding.
        obj_max: minimum length of ``'ObjType'`` after padding.
        aliases_count: passed through unchanged to the caller.
        embed_dim: length of the zero vectors used to pad ``'embeds'``.

    Returns:
        Tuple ``(data, len(data), aliases_count)``.
    """
    print("padding data...")

    for bag in data:
        # Sentence embeddings: truncate first, then pad exactly up to the limit.
        embeds = list(bag['embeds'])[:max_sent_in_bag]
        embeds.extend(np.zeros(embed_dim) for _ in range(max_sent_in_bag - len(embeds)))
        bag['embeds'] = embeds

        # Alias ids: `list(...)` also accepts the ndarray written by a previous
        # call, so padding an already padded bag no longer fails on `.append`.
        alias_rows = list(bag['ProbY'])[:max_sent_in_bag]
        alias_rows.extend([0] for _ in range(max_sent_in_bag - len(alias_rows)))

        pad_alias = np.zeros((len(alias_rows), max_alias_len), np.int32)
        for j, alias in enumerate(alias_rows):
            clipped = alias[:max_alias_len]
            pad_alias[j, :len(clipped)] = clipped
        bag['ProbY'] = pad_alias

        # Entity types: pad in place; a non-positive difference adds nothing.
        bag['SubType'].extend([0] * (subj_max - len(bag['SubType'])))
        bag['ObjType'].extend([0] * (obj_max - len(bag['ObjType'])))

        # Keep a single label per bag.
        del bag['Y'][1:]

    return data, len(data), aliases_count

def getData(data):
    """Strip unused fields from every bag, gather size statistics and pad the bags.

    Each bag is modified in place: the keys 'X', 'DepEdges', 'Pos1', 'Pos2',
    'SubPos' and 'ObjPos' are removed when present, and ``bag['embeds']`` is
    converted with ``.tolist()`` when it offers that method. Missing keys and
    embeddings that are already plain lists are accepted, so a bag that was
    stripped before does not raise here.

    Args:
        data: sequence of bag dicts holding at least 'embeds', 'ProbY',
            'SubType' and 'ObjType' (plus whatever ``padData`` needs).

    Returns:
        The value of ``padData(...)``: ``(data, number_of_bags, alias_count)``
        where ``alias_count`` is the largest alias id seen in 'ProbY' plus one.
    """
    # Fixed limits handed to padData.
    max_sent_in_bag = 5
    max_alias = 5
    max_alias_len = 20

    subj_max, obj_max, aliases_count = 0, 0, 0

    for i, bag in enumerate(data):
        # Drop fields that are not used further down; tolerate their absence.
        for unused_key in ('X', 'DepEdges', 'Pos1', 'Pos2', 'SubPos', 'ObjPos'):
            bag.pop(unused_key, None)

        if hasattr(bag['embeds'], 'tolist'):
            bag['embeds'] = bag['embeds'].tolist()

        if i == 0:
            print("Keys: ", bag.keys())

        # Largest alias id over every sentence of every bag.
        for alias in bag['ProbY']:
            for a in alias:
                aliases_count = max(aliases_count, a)

        subj_max = max(subj_max, len(bag['SubType']))
        obj_max = max(obj_max, len(bag['ObjType']))

    print("bag:", max_sent_in_bag)
    print("alias:", max_alias)
    print("max_alias_len:", max_alias_len)
    print("alias count", aliases_count)
    print("SubType:", subj_max)
    print("ObjType:", obj_max)

    # +1 because alias ids are used as indices, so the count is max id + 1.
    return padData(data, max_sent_in_bag, max_alias, max_alias_len, subj_max, obj_max, aliases_count + 1)

def get_chunks(data, batch_size):
    """Yield consecutive slices of ``data`` holding at most ``batch_size`` items each."""
    total = len(data)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        # Slicing clamps at the end of the sequence, so the last chunk may be shorter.
        yield data[start:stop]
        
def create_batches(data, batch_size):
    """Group bags into batches of column lists.

    Every batch is a list ``[bert, alias, subj, obj, out]`` whose entries hold,
    for each bag of the chunk, the bag's 'embeds', 'ProbY', 'SubType',
    'ObjType' and 'Y' values respectively.

    Returns:
        Tuple ``(batches, number_of_batches)``.
    """
    columns = ('embeds', 'ProbY', 'SubType', 'ObjType', 'Y')

    batches = [
        [[bag[column] for bag in chunk] for column in columns]
        for chunk in get_chunks(data, batch_size)
    ]

    return batches, len(batches)
    
def get_batches(batches):
    """Endlessly cycle over ``batches`` and yield ``(inputs, targets)`` pairs.

    ``inputs`` maps 'bert_input', 'alias', 'subj' and 'obj' to arrays built from
    positions 0..3 of a batch; ``targets`` maps 'output' to an int32 array built
    from position 4.
    """
    input_layout = (
        ('bert_input', 0, 'float32'),
        ('alias', 1, 'int32'),
        ('subj', 2, 'int32'),
        ('obj', 3, 'int32'),
    )

    # The generator never terminates on its own; the consumer decides how many steps to draw.
    while True:
        for batch in batches:
            inputs = {name: np.array(batch[pos], dtype=kind) for name, pos, kind in input_layout}
            targets = {'output': np.array(batch[4], dtype='int32')}
            yield (inputs, targets)

In [None]:
def data(path2datafile, models_dir, model_name):
    """Load the pickled dataset, pad its three splits and set fixed model sizes.

    The local variable names are kept exactly as they are because this
    function is handed to hyperas together with ``model``, which refers to
    them by name.

    Args:
        path2datafile: path of the pickle file; it must contain the keys
            'type2id', 'rel2id', 'train', 'test' and 'dev'.
        models_dir: passed through unchanged.
        model_name: passed through unchanged.

    Returns:
        Tuple ``(data, embedding_dim, num_class, alias_dim, regularizer,
        type_count, type_dim, batch_size, alias_count, models_dir, model_name)``
        where ``alias_count`` is the largest value reported by ``getData`` over
        the three splits.
    """
    batch_size = 128
    type_dim = 150
    alias_dim = 150
    embedding_dim = 768
    L2 = 0.001
    regularizer = tf.keras.regularizers.l2(l=0.5 * (L2))

    print("loading dataset...")

    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    with open(path2datafile, 'rb') as datafile:
        data = pickle.load(datafile)

    type_count  = len(data['type2id'])
    num_class   = len(data['rel2id'])

    print("getData Train")
    train, train_bags, train_alias_count = getData(data['train'])

    print("getData Test")
    test, test_bags, test_alias_count = getData(data['test'])

    print("getData Validation")
    dev, dev_bags, dev_alias_count = getData(data['dev'])

    # The alias embedding is sized with this value, so it has to cover the ids
    # of every split and not only those of the split processed last.
    alias_count = max(train_alias_count, test_alias_count, dev_alias_count)

    data['train'] = train
    data['test'] = test
    data['dev'] = dev

    return data, embedding_dim, num_class, alias_dim, regularizer, type_count, type_dim, batch_size, alias_count, models_dir, model_name


def model(data, embedding_dim, num_class, alias_dim, regularizer, 
          type_count, type_dim, batch_size, max_pos, alias_count, models_dir, model_name):
    """Build, train and evaluate one candidate network for the hyperas search.

    The double-brace expressions in the body are hyperas templates: the layer
    sizes, dropout rates and learning rate are filled in per trial.

    The network takes four named inputs ('bert_input', 'alias', 'subj', 'obj'),
    concatenates sentence embeddings with a pooled alias embedding, applies an
    attention layer over the sentences of a bag, appends the averaged
    subject/object type embeddings and classifies with two dense layers
    followed by a softmax over ``num_class`` classes.

    NOTE(review): ``max_pos`` is not used in this body, and ``data()`` returns
    eleven values while this signature lists twelve parameters - confirm how
    hyperas binds them.

    Returns:
        Dict with 'loss' (negative accuracy on the test batches), 'status'
        (``STATUS_OK``) and 'model' (the trained Keras model).
    """
    
    # embed input
    embed_input = tf.keras.Input(shape=(None, embedding_dim,), name="bert_input", dtype='float32')
    
    # embed alias
    alias_input = tf.keras.Input(shape=(None, None,), name="alias", dtype='int32')
    embed_alias = tf.keras.layers.Embedding(input_dim=alias_count, 
                                            output_dim=alias_dim, trainable=True,
                                            embeddings_initializer=tf.keras.initializers.VarianceScaling(
                                                scale=1.0, mode="fan_avg", distribution="uniform"), 
                                            embeddings_regularizer=regularizer, name='alias_embed')(alias_input)
    
    # Sum of the alias embeddings over axis 2, divided by the size of axis 1.
    # NOTE(review): the op is named 'alias_mean' but the divisor is the size of
    # axis 1 while the sum runs over axis 2 - confirm this is intended.
    alias_av = tf.math.reduce_sum(input_tensor=embed_alias, axis=2, keepdims=False, name='alias_mean') / tf.cast(tf.shape(embed_alias)[1], tf.float32)
    
    # sentence representations
    sent_reps = tf.keras.layers.concatenate([embed_input, alias_av], axis=2, name='bert_alias_concat')
    
    
    # Input subj-obj types
    subj_input = tf.keras.layers.Input(shape=(None,), name="subj", dtype='int32')
    obj_input = tf.keras.layers.Input(shape=(None,), name="obj", dtype='int32')
    
    # embed subj-obj types (one embedding layer shared by subject and object)
    embed_type = tf.keras.layers.Embedding(input_dim=type_count, output_dim=type_dim, 
                                           embeddings_regularizer=regularizer,
                                           embeddings_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                                               scale=1.0, mode="fan_avg", distribution="uniform"), name='type_embed')
    subj_embed = embed_type(subj_input)
    obj_embed = embed_type(obj_input)
    
    # average of types representations
    subj_type_av = tf.math.reduce_mean(input_tensor=subj_embed, axis=1, name='subj_mean')
    obj_type_av = tf.math.reduce_mean(input_tensor=obj_embed, axis=1, name='obj_mean')
    
    # concatenate subject and object to one single representation
    concat_type = tf.keras.layers.concatenate([subj_type_av, obj_type_av], axis=1, name='type_concat')
    
    # weights for querying attention layer 
    # samples are drawn from a uniform distribution within [-limit, limit], with limit = sqrt(3 * scale / n)
    # NOTE(review): this is the tensor returned by calling the initializer, not
    # a layer weight - presumably it is not updated during training; confirm.
    sent_atten_q = tf.keras.initializers.VarianceScaling(scale=1.0, 
                                                         mode="fan_avg", 
                                                         distribution="uniform")(shape=(1, 1, alias_dim+embedding_dim))  # alias_dim+
    
    # Bag Attention sentence level
    bag_attention = tf.keras.layers.Attention(name='attention')([sent_atten_q, sent_reps])
    
    mean_bag_attention = tf.math.reduce_mean(bag_attention, axis=1)
    
    bag_reps = tf.keras.layers.concatenate([mean_bag_attention, concat_type], axis=1, name='bag_rep_type_concat')
    
    # fully connected
    fc1 = tf.keras.layers.Dense(units={{choice([48, 96, 192, 384, 768])}}, 
                                activation="relu", 
                                kernel_regularizer=regularizer, name='fully1')(bag_reps)
    
    drop1 = tf.keras.layers.Dropout(rate={{uniform(0,1)}}, name='drop1')(fc1)
    
    fc2 = tf.keras.layers.Dense(units={{choice([6, 12, 24, 48])}}, 
                                activation="relu", 
                                kernel_regularizer=regularizer, name='fully2')(drop1)
    
    drop = tf.keras.layers.Dropout(rate={{uniform(0,1)}}, name='drop2')(fc2)
    
    # output layer
    label = tf.keras.layers.Dense(num_class, activation='softmax', kernel_regularizer=regularizer, name='output')(drop)
    
    model = tf.keras.Model([embed_input, alias_input, subj_input, obj_input], label)
    
    print(model.summary())
    
    # NOTE(review): newer Keras releases spell this argument `learning_rate` - confirm against the installed version.
    optim = tf.keras.optimizers.SGD(lr={{uniform(0,1)}}, name='SGD')

    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer=optim,
                  metrics=['accuracy'])
    
    train_batches, train_batches_len = create_batches(data['train'], batch_size)
    val_batches, val_batches_len = create_batches(data['dev'], batch_size)
    test_batches, test_batches_len = create_batches(data['test'], batch_size)
    
    # Best weights seen so far are written to disk; training stops after 10 epochs without val_loss improvement.
    checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath=models_dir+ model_name + "-es_best.h5", verbose=1, 
                                   save_best_only=True, save_weights_only=True)
    earlystopper = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, verbose=1)
    
    # NOTE(review): fit_generator / evaluate_generator are the legacy generator entry points - confirm they exist in the installed TensorFlow.
    history = model.fit_generator(get_batches(train_batches), steps_per_epoch=train_batches_len, 
                        epochs=100, verbose=2, validation_steps=val_batches_len, 
                        validation_data=get_batches(val_batches), callbacks=[checkpointer, earlystopper])
    
    # The checkpointed weights are not loaded back in this block, so the
    # evaluation below uses the weights from the end of training.
    score, acc = model.evaluate_generator(generator=get_batches(test_batches), 
                                          steps=test_batches_len, verbose=0)
    print('Test accuracy:', acc)
    
    # hyperopt minimizes 'loss', hence the negated accuracy.
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [None]:
# Search for the best hyper-parameters of `model` with hyperas (TPE algorithm).

# The checkpoint callback inside `model` and the final `best_model.save` both
# write into `models_dir`, so make sure the directory exists before training.
os.makedirs(models_dir, exist_ok=True)

trials = Trials()
best_run, best_model = optim.minimize(model=model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=3,
                                      trials=trials,
                                      functions=[padData, getData, get_chunks, get_batches, create_batches],
                                      notebook_name='RIEDEL_bert_side',
                                      data_args=(path2datafile,models_dir, model_name)
                                      )

print("Best performing model chosen hyper-parameters:")
print(best_run)

# Keep a diagram of the winning architecture next to the saved model file.
tf.keras.utils.plot_model(best_model, to_file=models_dir + model_name + ".png", show_shapes=True, show_layer_names=True)
best_model.save(models_dir + model_name + '.h5')

In [None]:
def getData2(data):
    """Gather size statistics for already stripped bags and pad them with ``padData``.

    Works like ``getData`` without the field removal and without the embedding
    conversion: only 'ProbY', 'SubType' and 'ObjType' are inspected here.

    Returns:
        The value of ``padData(...)`` called with the fixed limits (5 sentences,
        5 aliases, alias length 20), the longest 'SubType' / 'ObjType' lists
        and the largest alias id plus one.
    """
    bag_limit = alias_limit = alias_len_limit = 0
    widest_subj = widest_obj = top_alias_id = 0

    for bag in data:
        sentence_aliases = bag['ProbY']

        # The limits only take their real values once at least one bag was seen.
        bag_limit, alias_limit, alias_len_limit = 5, 5, 20

        for alias_ids in sentence_aliases:
            for alias_id in alias_ids:
                if alias_id > top_alias_id:
                    top_alias_id = alias_id

        subj_len = len(bag['SubType'])
        if subj_len > widest_subj:
            widest_subj = subj_len

        obj_len = len(bag['ObjType'])
        if obj_len > widest_obj:
            widest_obj = obj_len

    for label, value in (("bag:", bag_limit),
                         ("alias:", alias_limit),
                         ("max_alias_len:", alias_len_limit),
                         ("alias count", top_alias_id),
                         ("SubType:", widest_subj),
                         ("ObjType:", widest_obj)):
        print(label, value)

    return padData(data, bag_limit, alias_limit, alias_len_limit, widest_subj, widest_obj, top_alias_id + 1)

def split_label(dataset):
    """Separate model inputs from labels for a sequence of padded bags.

    Args:
        dataset: iterable of bag dicts with the keys 'embeds', 'ProbY',
            'SubType', 'ObjType' and 'Y'.

    Returns:
        Tuple ``(batches, labels)``. ``batches`` is an object array of shape
        ``(number_of_bags, 4)``; row ``i`` holds the float32 'embeds' array and
        the int32 'ProbY', 'SubType' and 'ObjType' arrays of bag ``i``, so it
        can be unpacked as ``bert, alias, subj, obj``. ``labels`` is
        ``np.array`` of the 'Y' entries.
    """
    records = list(dataset)

    # The four per-bag arrays have different shapes. Letting np.array() infer
    # a shape from such ragged input is an error on recent NumPy versions, so
    # the object container is allocated explicitly and filled cell by cell.
    batches = np.empty((len(records), 4), dtype=object)
    labels = []
    for row, d in enumerate(records):
        batches[row, 0] = np.array(d["embeds"], dtype='float32')
        batches[row, 1] = np.array(d["ProbY"], dtype='int32')
        batches[row, 2] = np.array(d["SubType"], dtype='int32')
        batches[row, 3] = np.array(d["ObjType"], dtype='int32')
        labels.append(d["Y"])

    return batches, np.array(labels)

def getPdataOne(data):
    """Build a reduced dataset with one randomly chosen sentence per bag.

    Bags with fewer than two sentences are skipped. The 'Y', 'SubType' and
    'ObjType' values of the new bags are the same objects as in the source
    bag. The result is passed through ``getData2``.
    """
    single_sentence_bags = []

    for bag in data:
        sentence_count = len(bag['embeds'])
        if sentence_count >= 2:
            # Shuffle all positions and keep the first one as the random pick.
            positions = list(range(sentence_count))
            random.shuffle(positions)
            pick = positions[0]

            reduced = {}
            reduced['embeds'] = [bag['embeds'][pick]]
            reduced['ProbY'] = [bag['ProbY'][pick]]
            reduced['Y'] = bag['Y']
            reduced['SubType'] = bag['SubType']
            reduced['ObjType'] = bag['ObjType']
            single_sentence_bags.append(reduced)

    return getData2(single_sentence_bags)

def getPdataTwo(data):
    """Build a reduced dataset with two randomly chosen sentences per bag.

    Bags with fewer than two sentences are skipped. The 'Y', 'SubType' and
    'ObjType' values of the new bags are the same objects as in the source
    bag. The result is passed through ``getData2``.
    """
    two_sentence_bags = []

    for bag in data:
        sentence_count = len(bag['embeds'])
        if sentence_count >= 2:
            # Shuffle all positions and keep the first two as the random picks.
            positions = list(range(sentence_count))
            random.shuffle(positions)
            first, second = positions[0], positions[1]

            reduced = {}
            reduced['embeds'] = [bag['embeds'][first], bag['embeds'][second]]
            reduced['ProbY'] = [bag['ProbY'][first], bag['ProbY'][second]]
            reduced['Y'] = bag['Y']
            reduced['SubType'] = bag['SubType']
            reduced['ObjType'] = bag['ObjType']
            two_sentence_bags.append(reduced)

    return getData2(two_sentence_bags)

def getPscore(p_n_data):
    """Compute precision at 100, 200 and 300 for the global ``model``.

    Every bag is predicted on its own. Bags are ranked by the highest predicted
    probability, and P@N is the number of correct predictions among the top N
    divided by N (N is the divisor even when fewer than N bags exist).

    Returns:
        Tuple ``(P@100, P@200, P@300)``.
    """
    samples, gold = split_label(p_n_data)

    predicted_class, confidence = [], []
    for bert, alias, subj, obj in samples:
        # Each input is wrapped in a list so the model receives a batch of one.
        probabilities = model.predict([[bert], [alias], [subj], [obj]])
        predicted_class.append(np.argmax(probabilities))
        confidence.append(np.amax(probabilities))

    predicted_class = np.array(predicted_class)
    confidence = np.array(confidence).reshape(-1)
    gold = np.array(gold).reshape(-1)

    # Indices of the bags from most to least confident prediction.
    ranking = np.argsort(-confidence)

    def precision_at(n):
        hits = sum(1.0 for k in ranking[:n] if gold[k] == predicted_class[k])
        return hits / n

    return precision_at(100), precision_at(200), precision_at(300)

def savePredictions2File(dataset_name, algorithm_name, data):
    """Predict the test split with the global ``model`` and pickle the scores.

    Args:
        dataset_name: used as a directory level of the output path.
        algorithm_name: used as a directory level of the output path.
        data: the full dataset dict; ``data['test']`` is processed with
            ``getData`` and ``data['rel2id']`` gives the number of classes.

    Writes ``results_dir + "<dataset_name>/<algorithm_name>/precision_recall.pkl"``
    holding a dict with 'logit_list' (one list of model outputs per bag) and
    'y_hot' (one-hot encoded labels). The directory is created when missing.
    """
    validation, count_bags, alias_count = getData(data['test'])
    classes_ = len(data['rel2id'])

    # A separate name keeps the `data` argument intact for the lines above.
    inputs, y_true = split_label(validation)

    logit_list = []
    for bert, alias, subj, obj in inputs:
        logit_list.append((model.predict([[bert], [alias], [subj], [obj]])[0]).tolist())

    y_flatten = y_true.flatten().tolist()
    y_actual_hot = (tf.keras.utils.to_categorical(y_flatten, num_classes=classes_)).tolist()

    # NOTE(review): `results_dir` already ends with the notebook's dataset name,
    # so the name can appear twice in this path - confirm the intended layout.
    out_path = results_dir + "{}/{}/precision_recall.pkl".format(dataset_name, algorithm_name)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the file even if pickling fails.
    with open(out_path, 'wb') as out_file:
        pickle.dump({'logit_list': logit_list, 'y_hot': y_actual_hot}, out_file)

In [None]:
# `savePredictions2File` and `getPscore` read the global `model`, so load it here.
model = tf.keras.models.load_model(models_dir + model_name + '.h5')

# Keep the whole dataset dict: `savePredictions2File` reads both data['test']
# and data['rel2id'], so narrowing `data` to the test split here would make
# that call fail.
with open(path2datafile, 'rb') as datafile:
    data = pickle.load(datafile)

print("Save data to plot precision-recall curve")

savePredictions2File("riedel", "BERT-SIDE", data)

print("precision_recall.pkl created")

In [None]:
print("\nP@N results")

# `getPscore` reads the global `model`.
model = tf.keras.models.load_model(models_dir + model_name + '.h5')

# Reload the raw dataset so the bags are in their original, unpadded form;
# the context manager closes the file handle right after loading.
with open(path2datafile, 'rb') as datafile:
    data = pickle.load(datafile)

data = data['test']

# One randomly chosen sentence per bag.
one_ = getPdataOne(data)

one, one_bags_count, _ = one_

print('=============== test one =============================')
one_100, one_200, one_300 = getPscore(one)
print("P@100: {}, P@200: {}, P@300: {}".format(one_100, one_200, one_300))

In [None]:
# Reload the raw dataset so the bags are in their original, unpadded form;
# the context manager closes the file handle right after loading.
with open(path2datafile, 'rb') as datafile:
    data = pickle.load(datafile)

data = data['test']

# Two randomly chosen sentences per bag.
two_ = getPdataTwo(data)
two, two_bags_count, _ = two_

print('=============== test two =============================')
two_100, two_200, two_300 = getPscore(two)
print("P@100: {}, P@200: {}, P@300: {}".format(two_100, two_200, two_300))

In [None]:
# Reload the raw dataset so the bags are in their original, unpadded form;
# the context manager closes the file handle right after loading.
with open(path2datafile, 'rb') as datafile:
    data = pickle.load(datafile)

data = data['test']

print('=============== test all =============================')
# All sentences of every bag (padded/truncated by getData).
all_, count_bags, _ = getData(data)
all_100, all_200, all_300 = getPscore(all_)
print("P@100: {}, P@200: {}, P@300: {}".format(all_100, all_200, all_300))