In [1]:
from __future__ import print_function

import tensorflow as tf

import os
import sys
import numpy as np
import pickle5 as pickle
import tempfile

import keras
from keras.preprocessing.text import Tokenizer
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Input, Embedding, Activation, Softmax
from keras.layers import CuDNNLSTM
from keras.models import Model, Sequential
from keras.layers import Conv1D,Conv2D, ZeroPadding2D, MaxPooling1D, MaxPooling2D
from keras.layers import RepeatVector, Permute, Add, Concatenate, Reshape, Dot
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray
from gensim.models import Word2Vec
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt

tf.config.experimental_run_functions_eagerly(True)
np.random.seed(100)

BASE_DIR = '.'
GLOVE_DIR = '/media/drived/Dev/Glove'
# TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
DATA_DIR = "/home/bhargav/nlu_project/keras_n20/data/Es_Rc"
DOC_PKL = "document_list.pick"
TARGET_PKL = "target_list.pick"

OUTPUT_PATH = './learning_beyond_datasets/node2vec/lstm_and_kg/'

MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 20000
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2

NUM_RELATIONS_PER_CLUSTER = 67
NUM_ENTITIES_PER_CLUSTER = 400
NUM_CLUSTERS = 20

EMBEDDING_DIM = 300

def get_clusters(cluster_file, num_things_per_cluster):
    clusters = []  # np.ones(shape=(NUM_RELATIONS_PER_CLUSTER,KG_EMBEDDING_DIM))

    with open(cluster_file, 'r', encoding='utf8') as f:
        lines = []
        for line in f:
            elements = line.split()
            x = [[e] for e in elements]
            lines.append(x)

    for i in range(0, len(lines) - num_things_per_cluster + 1, num_things_per_cluster):
        # print("appending: {} to {}".format(i,i+num_things_per_cluster))
        clusters.append(lines[i:i + num_things_per_cluster])

    clusters = np.asarray(clusters, dtype='float32')
    return clusters


def pickler(path, pkl_name, obj):
    with open(os.path.join(path, pkl_name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def unpickler(path, pkl_name):
    with open(os.path.join(path, pkl_name), 'rb') as f:
        obj = pickle.load(f)
    return obj

def get_labels(data):
    labels = []
    for d in data:
        labels.append(d['label']) if d['label'] not in labels else None

    return labels


def get_x_and_y(data):
    x, y = [], []
    for d in data:
        for dd in d['data']:
            tmp = dd['text'].replace('\n', '').replace('_', '')  # clean
            x.append({'label': d['label'], 'dbpedia_uri': dd['dbpedia_uri'], 'text': tmp, 'graph': dd['graph']}) if len(tmp) > 0 else None
            y.append(d['label']) if len(tmp) > 0 else None

    return x, y


def get_label_index(label):
    return [index for index, _label in enumerate(unique_labels) if label == _label][0]


Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


### LOAD DATA

In [2]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

with open(r'data/classification_data_with_graphs_v4.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# raw_docs = unpickler(DATA_DIR, DOC_PKL)
# labels = unpickler(DATA_DIR, TARGET_PKL)

unique_labels = get_labels(data)

x, y = get_x_and_y(data)
raw_docs = [d['text'] for d in x]
labels = [get_label_index(label) for label in y]

for doc in raw_docs:
    texts.append(" ".join(doc.split()[:MAX_SEQUENCE_LENGTH]))

print('Found %s texts.' % len(texts))

Found 6716 texts.


### GET NODE EMBEDDINGS/GRAPH

In [3]:
with open(r'data/node_embeddings_v3.pkl', 'rb') as pickle_file:
    node_embeddings = pickle.load(pickle_file)
with open(r'data/graph.pkl', 'rb') as pickle_file:
    graph = pickle.load(pickle_file)

### GET GRAPH CLUSTERS

In [4]:
def get_graph_clusters():
    clusters = []
    for i,d in enumerate(data):
        print('%.2f%%' % ((i * 100) / len(data))) if i % 10 == 0 else None
        nodes, edges, edge_types = [], [], []
        for dd in d['data']:
            graph = dd['graph']
            for node in graph['nodes']:
                nodes.append(node)
            
            for edge, edge_type in zip(graph['edges'], graph['edge_types']):
                if not any(edge['source'] == e['source'] and edge['target'] == e['target'] for e in edges):
                    edges.append({'source': edge['source'], 'target': edge['target']}) 
                    edge_types.append(edge_type)
        
                                       
        clusters.append({'nodes': nodes, 'edges': edges, 'edge_types': edge_types})
        
    with open('data/node_clusters_v3.pkl', 'wb') as outp:
        pickle.dump(clusters, outp, pickle.HIGHEST_PROTOCOL)
                                       
    return clusters

graph_embedding_size=100

def get_embedding(G):
        walk_length = 10
        rw = BiasedRandomWalk(G)
        walks = rw.run(
            nodes=G.nodes(),  # root nodes
            length=walk_length,  # maximum length of a random walk
            n=10,  # number of random walks per root node
            p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
            q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
            weighted=False,  # for weighted random walks
            seed=42,  # random seed fixed for reproducibility
        )

        model = Word2Vec(
            walks,  vector_size=graph_embedding_size, window=5, min_count=0, sg=1, workers=1
        )

        return model.wv.vectors
    
    
def get_embeddings(clusters):
    all_embeddings = []
    for i, cluster in clusters:
        edges_ = pd.DataFrame({
                'source': [e['source'] for e in cluster['edges']],
                'target': [e['target'] for e in cluster['edges']],
                'type': cluster['edge_types']
            })

        G = StellarGraph(IndexedArray(index=graph['nodes']), edges_, edge_type_column="type")

        node_embeddings = get_embedding(G)
        
        all_embeddings.append(node_embeddings)
    
    with open('data/node_embeddings_v4.pkl', 'wb') as outp:
        pickle.dump(all_embeddings, outp, pickle.HIGHEST_PROTOCOL)
        
    return all_embeddings

In [5]:
clusters_v1 = unpickler('data','node_clusters.pkl')
clusters_v2 = unpickler('data','node_clusters_v2.pkl')
clusters_v3 = unpickler('data','node_clusters_v3.pkl')
# clusters_v3 = get_graph_clusters()
# cluster_embeddings = get_embeddings(clusters)



In [6]:
# len(set(clusters_v3[0]['nodes']))
# cluster = clusters_v3[[i for i,c in enumerate(unique_labels) if c == 'Art-Sculpting'][0]]
# counts = [(el,cluster['nodes'].count(el)) for el in set(cluster['nodes'])]
# counts = sorted(counts, key=lambda c: c[1],reverse=True)
# counts[0:100]    


In [7]:
def filter_clusters(clusters):
    result = []
    for cluster in clusters:
        new_cluster = []
        counts = [(el,cluster['nodes'].count(el)) for el in set(cluster['nodes'])]
        for count in counts:
            new_cluster.append(count[0]) if count[1] > 13 else None
                
        result.append({'nodes':new_cluster})
    
    with open('data/node_clusters_v3_filtered.pkl', 'wb') as outp:
        pickle.dump(result, outp, pickle.HIGHEST_PROTOCOL)
    
    return result

# clusters_v3_filtered = filter_clusters(clusters_v3)
clusters_v3_filtered = unpickler('data','node_clusters_v3_filtered.pkl')

In [8]:
def cluster_with_count(clusters):
    result = []
    for cluster in clusters:
        new_cluster = []
        counts = [(el,cluster['nodes'].count(el)) for el in set(cluster['nodes'])]
        for count in counts:
            new_cluster.append(count)
                
        result.append({'nodes':new_cluster})
    
    with open('data/node_clusters_v3_with_count.pkl', 'wb') as outp:
        pickle.dump(result, outp, pickle.HIGHEST_PROTOCOL)
    
    return result

# clusters_v3_with_count = cluster_with_count(clusters_v3)
clusters_v3_filtered = unpickler('data','node_clusters_v3_with_count.pkl')

In [9]:
def get_node_cluster_mapping(clusters):
    mapping = {}
    for i, node in enumerate(graph['nodes']):
        print('%.2f%%' % ((i * 100) / len(graph['nodes']))) if i % 500 == 0 else None

        node_clusters = [unique_labels[i] for i,cluster in enumerate(clusters) if node in cluster['nodes']]
        mapping[node] = node_clusters if len(node_clusters) > 0 else 'None'
                    
    with open('data/node_cluster_mapping_v2.pkl', 'wb') as outp:
        pickle.dump(mapping, outp, pickle.HIGHEST_PROTOCOL)

    return mapping
    
node_cluster_mapping_v2 = unpickler('data','node_cluster_mapping_v2.pkl')
# node_cluster_mapping_v3 = get_node_cluster_mapping(clusters_v3_filtered)

In [10]:
def cluster_count(cluster,node):
    return [n[1] for n in cluster['nodes'] if n[0] == node][0]

def get_node_cluster_mapping_with_count(clusters):
    mapping = {}
    for i, node in enumerate(graph['nodes']):
        print('%.2f%%' % ((i * 100) / len(graph['nodes']))) if i % 500 == 0 else None

        node_clusters = [(unique_labels[i],cluster_count(cluster,node)) for i,cluster in enumerate(clusters) if node in [c[0] for c in cluster['nodes']]]
        node_clusters = sorted(node_clusters, key=lambda c: c[1], reverse=True)
        mapping[node] = node_clusters if len(node_clusters) > 0 else 'None'
                            
    with open('data/node_cluster_mapping_v3_with_count.pkl', 'wb') as outp:
        pickle.dump(mapping, outp, pickle.HIGHEST_PROTOCOL)

    return mapping

node_cluster_mapping_v3_with_count = unpickler('data','node_cluster_mapping_v3_with_count.pkl')
# node_cluster_mapping_v3_with_count = get_node_cluster_mapping_with_count(clusters_v3_with_count)

In [11]:
node_cluster_mapping_v3_with_count['Molecule']

[('Science-Chemistry', 6)]

### TOKENIZE TEXTS, PREPARE GLOVE EMBEDDINGS

In [4]:
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, '/media/drived/Dev/Glove/glove.6B.300d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [5]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_seq = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data_seq.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# indices = np.arange(data.shape[0])
# np.random.shuffle(indices)
indices = unpickler('data','indices.pkl')
y = np.array(y)[indices.astype(int)]
data = data_seq[indices]
labels = labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_t = data_seq[:-num_validation_samples]
y_t = labels[:-num_validation_samples]
y_str_t = y[:-num_validation_samples]
x_val = data_seq[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

num_test_samples = int(TEST_SPLIT * x_t.shape[0])

x_train = x_t[:-num_test_samples]
y_train = y_t[:-num_test_samples]
x_test = x_t[-num_test_samples:]
y_test = y_t[-num_test_samples:]
y_str_test = y_str_t[-num_test_samples:]

print('x_train:{} y_train:{} x_val:{} y_val:{} '.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape))
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 129395 unique tokens.
Shape of data tensor: (6716, 300)
Shape of label tensor: (6716, 78)
x_train:(4836, 300) y_train:(4836, 78) x_val:(671, 300) y_val:(671, 78) 
Preparing embedding matrix.


### PREPARE NEURAL NET MODEL SIMPLE


In [None]:
GRAPH CLUSTER      NODE_CLUSTER_MAPPING           GRAPH INPUT
0 Philosophy       Schopenhaur, Kant              Kant, Socrates
1 Music            Led Zeppelin, Pearl Jam

For each element in graph input, find the corresponding cluster, attend over it

1. Finding graph input node clusters:
    We know cluster indexes.
    Write a lambda function to convert graph inputs to clusters then encode them(one-hot encoded)

In [30]:
tensor = tf.constant([[10,20]])
tensor_array = tensor.numpy()
tf.py_function(find_cluster,tensor,'string')

<class 'tensorflow.python.framework.ops.EagerTensor'>
[10 20]


InvalidArgumentError: pyfunc_7 returns 0 values, but expects to see 1 values. [Op:EagerPyFunc]

In [90]:
tf.config.run_functions_eagerly(True)

def find_cluster(arr):
    arr = arr.astype('str')
    
    result = []
    for el in arr:
        all_mappings = node_cluster_mapping_v3_with_count[el] if el in node_cluster_mapping_v3_with_count else []
        filtered_mappings = [e[0] for e in all_mappings[:min(len(all_mappings),3)]]
        result.extend(filtered_mappings)
        
    print(result)
    
    tensor = tf.convert_to_tensor(result)
    
    return tensor
    
FindCluster = keras.layers.core.Lambda(lambda x: tf.numpy_function(find_cluster,[x],'string'))
# FindCluster = keras.layers.core.Lambda(lambda x: print(x.shape))

RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
cluster_names = unique_labels + ['None']

graph_input = Input(shape=(), dtype='string', name='graph_input')                           
x = FindCluster(graph_input)
x = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='multi_hot')(x)
m = Model(inputs=graph_input, outputs=x)
# m.compile(loss='categorical_crossentropy', optimizer='adam', run_eagerly=True, metrics=['acc'])

res = m.predict(['Atom','Florence'])
res

TypeError: '>' not supported between instances of 'NoneType' and 'int'

In [8]:
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 300

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# entity_clusters = K.variable(embedding_clusters_2d)

Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
FakeEClusterIn = keras.layers.core.Lambda(lambda x: node_embedding_clusters)
FindCluster = keras.layers.core.Lambda(lambda x: tf.map_fn(lambda el: node_cluster_mapping[el] if el in node_cluster_mapping else 0, x, dtype='string'))

cluster_names = unique_labels + ['None']
                                       
main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)

# get representation for entity clusters
e_clusters = Input(name='e_clusters')
# x = FakeEClusterIn(x)
# print("Shape", K.int_shape(x))
x = e_clusters
x = Conv1D(filters=1, kernel_size=5, strides=1, input_shape=node_embeddings.shape)(x)
x = MaxPooling1D(pool_size=5, strides=5)(x)
x = Conv1D(filters=1, kernel_size=5, strides=1)(x)
x = MaxPooling1D(pool_size=5, strides=1)(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

graph_input = Input(shape=(,), dtype='string', name='graph_input')                           
x = FindCluster(graph_input)
graph = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='one_hot')(x)
                               
# attention over entities
att_scores = DotProduct([entity_cluster_reps, graph])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m = Model(inputs=[main_input,e_clusters,graph_input], outputs=final_output)

m.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

m.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (89539, 100)


ValueError: Input 0 of layer max_pooling1d is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (89539, 100)

### PREDICT SIMPLE

In [26]:
BATCH_SIZE = 2
NUM_EPOCHS = 10

history = m.fit(x_train, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                validation_data=(x_val, y_val), 
                callbacks=callbacks)
m.save('node2vec_lstm_model')

Epoch 1/10
 349/2418 [===>..........................] - ETA: 1:03:41 - loss: 4.4996 - acc: 0.0000e+ - ETA: 1:50 - loss: 7.3437 - acc: 0.0000e+00   - ETA: 1:49 - loss: 7.8418 - acc: 0.0000e+0 - ETA: 1:48 - loss: 7.4011 - acc: 0.0000e+0 - ETA: 1:47 - loss: 7.0167 - acc: 0.0000e+0 - ETA: 1:47 - loss: 6.8506 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.8360 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.9715 - acc: 0.0000e+0 - ETA: 1:46 - loss: 7.2541 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.4608 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5667 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5049 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3822 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3906 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.2569 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1678 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1228 - acc: 0.0000e+0 - ETA: 1:45 - loss: 6.9626 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.8294 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7574 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7398 - acc: 0.0122    - ETA: 1:44 - los

KeyboardInterrupt: 

In [63]:
# Restore the best found model during validation
# m.load_weights(tmpfn)

loss, acc = m.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



Test loss / test accuracy = 1.6824 / 0.6592


In [64]:
preds = m.predict(x_test, batch_size=BATCH_SIZE, verbose=0,
                  steps=None)

In [65]:
def get_result_labels(results):
    return [unique_labels[np.where(row==max(row))[0][0]] for row in results]

result_labels = get_result_labels(preds)

print('Accuracy score: %.2f' % accuracy_score(result_labels, y_str_test))
print(classification_report(y_str_test, result_labels))

### PREPARE NEURAL NET MODEL WITH EMBEDDINGS


In [None]:
GRAPH CLUSTER      NODE_CLUSTER_MAPPING           GRAPH INPUT
0 Philosophy       Schopenhaur, Kant              Kant, Socrates
1 Music            Led Zeppelin, Pearl Jam

For each element in graph input, find the corresponding cluster, attend over it

1. Finding graph input node clusters:
    We know cluster indexes.
    Write a lambda function to convert graph inputs to clusters then encode them(one-hot encoded)

In [8]:
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 300

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# entity_clusters = K.variable(embedding_clusters_2d)

Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
FakeEClusterIn = keras.layers.core.Lambda(lambda x: node_embedding_clusters)
FindCluster = keras.layers.core.Lambda(lambda x: tf.map_fn(lambda el: node_cluster_mapping[el] if el in node_cluster_mapping else 0, x, dtype='string')

cluster_names = unique_labels + ['None']
                                       
main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)

# get representation for entity clusters
e_clusters = Input(name='e_clusters')
# x = FakeEClusterIn(x)
# print("Shape", K.int_shape(x))
x = e_clusters
x = Conv1D(filters=1, kernel_size=5, strides=1, input_shape=node_embeddings.shape)(x)
x = MaxPooling1D(pool_size=5, strides=5)(x)
x = Conv1D(filters=1, kernel_size=5, strides=1)(x)
x = MaxPooling1D(pool_size=5, strides=1)(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

graph_input = Input(shape=(,), dtype='string', name='graph_input')                           
x = FindCluster(x)
graph = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='one_hot')(x)
                               
# attention over entities
att_scores = DotProduct([entity_cluster_reps, graph])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m = Model(inputs=[main_input,e_clusters,graph_input], outputs=final_output)

m.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

m.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (89539, 100)


ValueError: Input 0 of layer max_pooling1d is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (89539, 100)

### PREDICT WITH EMBEDDINGS

In [26]:
BATCH_SIZE = 2
NUM_EPOCHS = 10

history = m.fit(x_train, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                validation_data=(x_val, y_val), 
                callbacks=callbacks)
m.save('node2vec_lstm_model')

Epoch 1/10
 349/2418 [===>..........................] - ETA: 1:03:41 - loss: 4.4996 - acc: 0.0000e+ - ETA: 1:50 - loss: 7.3437 - acc: 0.0000e+00   - ETA: 1:49 - loss: 7.8418 - acc: 0.0000e+0 - ETA: 1:48 - loss: 7.4011 - acc: 0.0000e+0 - ETA: 1:47 - loss: 7.0167 - acc: 0.0000e+0 - ETA: 1:47 - loss: 6.8506 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.8360 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.9715 - acc: 0.0000e+0 - ETA: 1:46 - loss: 7.2541 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.4608 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5667 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5049 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3822 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3906 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.2569 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1678 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1228 - acc: 0.0000e+0 - ETA: 1:45 - loss: 6.9626 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.8294 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7574 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7398 - acc: 0.0122    - ETA: 1:44 - los

KeyboardInterrupt: 

In [63]:
# Restore the best found model during validation
# m.load_weights(tmpfn)

loss, acc = m.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



Test loss / test accuracy = 1.6824 / 0.6592


In [64]:
preds = m.predict(x_test, batch_size=BATCH_SIZE, verbose=0,
                  steps=None)

In [65]:
def get_result_labels(results):
    return [unique_labels[np.where(row==max(row))[0][0]] for row in results]

result_labels = get_result_labels(preds)

print('Accuracy score: %.2f' % accuracy_score(result_labels, y_str_test))
print(classification_report(y_str_test, result_labels))