In [1]:
from __future__ import print_function

import tensorflow as tf

import os
import sys
import numpy as np
import pickle5 as pickle
import tempfile

import keras
from keras.preprocessing.text import Tokenizer
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Input, Embedding, Activation, Softmax
from keras.layers import CuDNNLSTM
from keras.models import Model, Sequential
from keras.layers import Conv1D,Conv2D, ZeroPadding2D, MaxPooling1D, MaxPooling2D
from keras.layers import RepeatVector, Permute, Add, Concatenate, Reshape, Dot
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray
from gensim.models import Word2Vec
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt

# Seeds NumPy's global RNG only; the stdlib `random` module used by the
# similarity helpers further down is not seeded here.
np.random.seed(100)

BASE_DIR = '.'
# NOTE(review): absolute, machine-specific paths; consider making them configurable.
GLOVE_DIR = '/media/drived/Dev/Glove'
# TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
# DATA_DIR / DOC_PKL / TARGET_PKL are only referenced from commented-out code
# in the data-loading cell.
DATA_DIR = "/home/bhargav/nlu_project/keras_n20/data/Es_Rc"
DOC_PKL = "document_list.pick"
TARGET_PKL = "target_list.pick"

OUTPUT_PATH = './learning_beyond_datasets/node2vec/lstm_and_kg/'

# Texts are truncated to this many whitespace tokens and padded to this length.
MAX_SEQUENCE_LENGTH = 300
# Vocabulary cap passed to the Keras Tokenizer and to the embedding matrix.
MAX_NUM_WORDS = 20000
# Fraction of all samples held out (from the end) for validation.
VALIDATION_SPLIT = 0.1
# Fraction of the remaining (non-validation) samples held out for testing.
TEST_SPLIT = 0.2

NUM_RELATIONS_PER_CLUSTER = 67
# Initial value only: overwritten later with the value loaded from
# 'num_entities_per_cluster_v3.pkl'.
NUM_ENTITIES_PER_CLUSTER = 400
# Number of k-means clusters built over the node embeddings.
NUM_CLUSTERS = 20

# Width of the word-embedding matrix (the GloVe file loaded below is the 300d one).
EMBEDDING_DIM = 300

def get_clusters(cluster_file, num_things_per_cluster):
    """Read whitespace-separated numeric rows and group them into fixed-size clusters.

    Each line of ``cluster_file`` becomes one row whose values are wrapped in
    singleton lists (adding a trailing axis of size 1). Consecutive rows are
    grouped ``num_things_per_cluster`` at a time; a trailing partial group is
    dropped.

    Args:
        cluster_file: Path of a UTF-8 text file with one vector per line.
        num_things_per_cluster: Number of consecutive rows per cluster.

    Returns:
        ``float32`` array of shape ``(clusters, num_things_per_cluster, dim, 1)``.
    """
    with open(cluster_file, 'r', encoding='utf8') as handle:
        rows = [[[token] for token in raw_line.split()] for raw_line in handle]

    # Start offsets of every *complete* group of rows.
    starts = range(0, len(rows) - num_things_per_cluster + 1, num_things_per_cluster)
    grouped = [rows[start:start + num_things_per_cluster] for start in starts]

    return np.asarray(grouped, dtype='float32')


def pickler(path, pkl_name, obj):
    """Pickle ``obj`` into the file ``pkl_name`` inside directory ``path``.

    The file is overwritten if it exists; the highest available pickle
    protocol is used.
    """
    destination = os.path.join(path, pkl_name)
    with open(destination, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def unpickler(path, pkl_name):
    """Load and return the object pickled in ``pkl_name`` inside directory ``path``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    source = os.path.join(path, pkl_name)
    with open(source, 'rb') as stream:
        return pickle.load(stream)

def get_labels(data):
    """Return the distinct ``'label'`` values of ``data`` in first-seen order.

    Args:
        data: Iterable of mappings that each have a ``'label'`` key.
    """
    distinct = []
    for entry in data:
        if entry['label'] in distinct:
            continue
        distinct.append(entry['label'])

    return distinct


def get_x_and_y(data):
    """Flatten the per-label groups into parallel sample and label lists.

    Newlines and underscores are stripped from every document's text;
    documents whose cleaned text is empty are dropped.

    Args:
        data: Iterable of groups with a ``'label'`` and a ``'data'`` list whose
            items carry ``'text'``, ``'dbpedia_uri'`` and ``'graph'``.

    Returns:
        ``(samples, labels)`` where each sample is a dict with keys
        ``'label'``, ``'dbpedia_uri'``, ``'text'`` and ``'graph'``.
    """
    samples, targets = [], []
    for group in data:
        for doc in group['data']:
            cleaned = doc['text'].replace('\n', '').replace('_', '')  # clean
            if len(cleaned) == 0:
                continue
            samples.append({
                'label': group['label'],
                'dbpedia_uri': doc['dbpedia_uri'],
                'text': cleaned,
                'graph': doc['graph'],
            })
            targets.append(group['label'])

    return samples, targets


def get_label_index(label):
    """Return the position of ``label`` in the module-level ``unique_labels``.

    Raises:
        IndexError: If ``label`` is not present.
    """
    for position, candidate in enumerate(unique_labels):
        if label == candidate:
            return position
    raise IndexError('list index out of range')


### LOAD DATA

In [2]:
labels_index = {}  # dictionary mapping label name to numeric id

# Load the pickled dataset (label groups with documents and their graphs).
with open(r'data/classification_data_with_graphs_v2.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# raw_docs = unpickler(DATA_DIR, DOC_PKL)
# labels = unpickler(DATA_DIR, TARGET_PKL)

unique_labels = get_labels(data)

# x: per-document records, y: the label name of each record (same order).
x, y = get_x_and_y(data)
raw_docs = [record['text'] for record in x]

# list of label ids
labels = [get_label_index(name) for name in y]

# list of text samples, each truncated to MAX_SEQUENCE_LENGTH whitespace tokens
texts = [" ".join(doc.split()[:MAX_SEQUENCE_LENGTH]) for doc in raw_docs]

print('Found %s texts.' % len(texts))

Found 6716 texts.


### GET NODE2VEC CLUSTERS

In [3]:
graph_embedding_size=100

def get_embedding(G):
    """Train node2vec embeddings for ``G`` and return one row per graph node.

    Biased random walks are sampled from every node and fed to a skip-gram
    Word2Vec model.

    Args:
        G: StellarGraph instance to embed.

    Returns:
        Array of shape ``(number of nodes, graph_embedding_size)`` whose i-th
        row is the embedding of the i-th node of ``G.nodes()``.
    """
    walk_length = 100
    rw = BiasedRandomWalk(G)
    walks = rw.run(
        nodes=G.nodes(),  # root nodes
        length=walk_length,  # maximum length of a random walk
        n=10,  # number of random walks per root node
        p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
        q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
        weighted=False,  # for weighted random walks
        seed=42,  # random seed fixed for reproducibility
    )

    model = Word2Vec(
        walks, vector_size=graph_embedding_size, window=5, min_count=0, sg=1, workers=1
    )

    # `model.wv.vectors` is ordered by gensim's vocabulary (model.wv.index_to_key),
    # not by graph node order. The rest of the notebook addresses rows by a node's
    # position in graph['nodes'], so look each node up by key instead. Every node
    # is a walk root and min_count=0, so every node is in the vocabulary.
    # NOTE(review): assumes G.nodes() follows the order the nodes were given at
    # construction -- confirm; embeddings cached from the old ordering should be
    # regenerated.
    return np.asarray([model.wv[node] for node in G.nodes()])
    
def get_graph():
    """Merge the per-document graphs in ``x`` into one de-duplicated graph.

    Nodes are kept in first-seen order. An edge is identified by its
    ``(source, target)`` pair; the first occurrence wins and its edge type is
    taken from the same position of that document's ``'edge_types'``.
    The merged graph is also pickled to ``graph.pkl`` in the working directory.

    Returns:
        Dict with ``'nodes'``, ``'edges'`` (list of ``{'source', 'target'}``
        dicts) and ``'edge_types'`` (parallel to ``'edges'``).
    """
    nodes, edges, edge_types = [], [], []
    # Hash-based membership replaces the original linear scans over `nodes`
    # and `edges`, which made the merge quadratic in the graph size.
    seen_nodes, seen_edges = set(), set()

    for i1, d in enumerate(x):
        if i1 % 100 == 0:
            print('%.2f%%' % ((i1 * 100) / len(x)))
        g = d['graph']

        for node in g['nodes']:
            if node not in seen_nodes:
                seen_nodes.add(node)
                nodes.append(node)

        for i, edge in enumerate(g['edges']):
            edge_key = (edge['source'], edge['target'])
            if edge_key not in seen_edges:
                seen_edges.add(edge_key)
                edges.append({'source': edge['source'], 'target': edge['target']})
                edge_types.append(g['edge_types'][i])

    merged = {'nodes': nodes, 'edges': edges, 'edge_types': edge_types}

    with open('graph.pkl', 'wb') as outp:
        pickle.dump(merged, outp, pickle.HIGHEST_PROTOCOL)

    return merged

def get_embeddings():
    """Build the StellarGraph from the cached ``graph.pkl`` and embed its nodes.

    The resulting embeddings are pickled to ``node_embeddings_v3.pkl`` in the
    working directory.

    Returns:
        ``(node_embeddings, graph)`` where ``graph`` is the dict loaded from
        ``graph.pkl``.
    """
    # graph = get_graph()
    graph = unpickler('.','graph.pkl')

    # One row per edge; the 'type' column carries the edge type.
    edge_sources = [edge['source'] for edge in graph['edges']]
    edge_targets = [edge['target'] for edge in graph['edges']]
    edges_ = pd.DataFrame({
        'source': edge_sources,
        'target': edge_targets,
        'type': graph['edge_types'],
    })

    node_frame = IndexedArray(index=graph['nodes'])
    G = StellarGraph(node_frame, edges_, edge_type_column="type")

    node_embeddings = get_embedding(G)

    # Cache the embeddings next to the notebook.
    pickler('.', 'node_embeddings_v3.pkl', node_embeddings)

    return node_embeddings, graph

In [4]:
# Load the cached node embeddings and merged graph instead of recomputing them.
# NOTE(review): get_embeddings()/get_graph() write their pickles to the working
# directory, while these are read from 'data/' -- presumably the files are moved
# by hand; confirm.
with open(r'data/node_embeddings_v3.pkl', 'rb') as pickle_file:
    node_embeddings = pickle.load(pickle_file)
with open(r'data/graph.pkl', 'rb') as pickle_file:
    graph = pickle.load(pickle_file)
    
# node_embeddings, graph = get_embeddings()

In [None]:
# print(f'Node count: {node_count}, Edge count: {edge_count}')

In [6]:
# Probe: is 'Guernica' contained in any graph node?
# (substring test if nodes are strings -- TODO confirm)
any('Guernica' in g for g in graph['nodes'])

False

### CLUSTER NODE EMBEDDINGS

In [None]:
# from k_means_constrained import KMeansConstrained
# import math
# NUM_CLUSTERS = min(NUM_CLUSTERS,int(len(node_embeddings)/NUM_ENTITIES_PER_CLUSTER))
# NUM_ENTITIES_PER_CLUSTER = math.ceil(len(node_embeddings) / NUM_CLUSTERS)

# print(f'Num of clusters: {NUM_CLUSTERS}, Num entities per cluster: {NUM_ENTITIES_PER_CLUSTER}')
# def cluster():
#     clf = KMeansConstrained(
#          n_clusters=NUM_CLUSTERS,
#          size_min=NUM_ENTITIES_PER_CLUSTER / 2,
#          size_max=NUM_ENTITIES_PER_CLUSTER,
#          random_state=0
#     )
#     clf.fit_predict(node_embeddings)
#     print(clf.cluster_centers_)
#     print(clf.labels_)
    
#     pickler('.','clusters_object_v3.pkl',clf)
    
#     return clf

In [None]:
# clf = unpickler('.','clusters_object.pkl')
# clf = cluster()

In [9]:
from sklearn.cluster import KMeans
def cluster():
    """Cluster ``node_embeddings`` with k-means and report the smallest cluster size.

    The fitted model and the smallest cluster size are pickled to
    ``clusters_object_v3.pkl`` and ``num_entities_per_cluster_v3.pkl`` in the
    working directory.

    Returns:
        ``(kmeans, smallest_cluster_size)``.
    """
    kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(node_embeddings)

    # Number of members of every cluster id 0..NUM_CLUSTERS-1.
    sizes = np.asarray(
        [np.count_nonzero(kmeans.labels_ == cluster_id) for cluster_id in range(NUM_CLUSTERS)]
    )

    smallest = int(np.min(sizes))
    print('NUM_ENTITIES_PER_CLUSTER: %d' % smallest)

    pickler('.', 'clusters_object_v3.pkl', kmeans)
    pickler('.', 'num_entities_per_cluster_v3.pkl', smallest)

    return kmeans, smallest

In [10]:
# Load the cached k-means model and the per-cluster entity count (overrides the
# NUM_ENTITIES_PER_CLUSTER constant from the config cell).
# NOTE(review): cluster() pickles into '.', these are read from 'data' -- confirm
# the files are copied over.
clf = unpickler('data','clusters_object_v3.pkl')
NUM_ENTITIES_PER_CLUSTER = unpickler('data','num_entities_per_cluster_v3.pkl')
# clf,NUM_ENTITIES_PER_CLUSTER = cluster()

In [11]:
# Map every cluster id to the embeddings of its first NUM_ENTITIES_PER_CLUSTER members.
clusters_dict = {}

# Iterate over the distinct cluster ids only. Looping over `clf.labels_` itself
# recomputed (and overwrote) each cluster once per member node.
for i in np.unique(clf.labels_):
    indexes = np.where(clf.labels_==i)[0]
    clusters_dict[i] = node_embeddings[indexes[:NUM_ENTITIES_PER_CLUSTER]]

In [12]:
# pad clusters
# Every cluster is brought to exactly NUM_ENTITIES_PER_CLUSTER rows by appending
# all-zero vectors, then the clusters are stacked into dense float32 tensors.
padded_clusters = []
for key in range(NUM_CLUSTERS):
    shortfall = NUM_ENTITIES_PER_CLUSTER - len(clusters_dict[key])
    if shortfall > 0:
        zero_rows = [[0] * graph_embedding_size for _ in range(shortfall)]
        clusters_dict[key] = np.append(clusters_dict[key], zero_rows, axis=0)
    padded_clusters.append(clusters_dict[key])

# (clusters, entities, dim)
embedding_clusters_1d = np.asarray(padded_clusters, dtype='float32')
# (clusters, entities, dim, 1): same values with a trailing channel axis.
embedding_clusters_2d = embedding_clusters_1d[..., np.newaxis].copy()

### ANALYSE GRAPH VECTORS

In [14]:
from numpy.linalg import norm

def get_similarity(cluster):
    """Average cosine similarity over all pairs of non-zero vectors in ``cluster``.

    Pairs are unordered and include each vector paired with itself (as in the
    original implementation). Zero vectors (e.g. padding rows) are ignored.

    Args:
        cluster: Sequence of equally sized 1-D vectors.

    Returns:
        The mean cosine similarity, or ``nan`` when the cluster holds no
        non-zero vector (previously a ZeroDivisionError).
    """
    # Compute every norm once instead of once per pair, and drop zero vectors
    # up front; the remaining pairs are visited in the original order.
    usable = []
    for vector in cluster:
        length = norm(vector)
        if length != 0:
            usable.append((vector, length))

    sums, c = 0, 0
    for start, (A, norm_a) in enumerate(usable):
        for B, norm_b in usable[start:]:
            sums += np.dot(A, B) / (norm_a * norm_b)
            c += 1

    if c == 0:
        return float('nan')
    return sums / c

def get_similarity_v2(cluster,i3):
    """Average pairwise cosine similarity for a cluster stored with a trailing unit axis.

    Same measure as ``get_similarity`` but every vector component is wrapped in
    a length-1 container (layout ``(entities, dim, 1)``); only element 0 of
    each component is used.

    Args:
        cluster: Sequence of vectors whose components are indexable with ``[0]``.
        i3: Unused; kept for backward compatibility with existing callers.

    Returns:
        The mean cosine similarity over unordered pairs (self-pairs included)
        of non-zero vectors, or ``nan`` when there is none (previously a
        ZeroDivisionError).
    """
    # Flatten each row and compute its norm once; drop zero (padding) rows.
    usable = []
    for row in cluster:
        flat = [val[0] for val in row]
        length = norm(flat)
        if length != 0:
            usable.append((flat, length))

    sums, c = 0, 0
    for start, (A, norm_a) in enumerate(usable):
        for B, norm_b in usable[start:]:
            sums += np.dot(A, B) / (norm_a * norm_b)
            c += 1

    if c == 0:
        return float('nan')
    return sums / c

# avg in-cluster similarity
# sum([get_similarity(clusters_dict[i]) for i in clusters_dict]) / len(clusters_dict)#, sum([get_similarity_v2(c,i) for i,c in enumerate(embedding_clusters_2d)]) / len(embedding_clusters_2d)

In [15]:
# Random Similarity
import random

def random_similarity(num_samples=200):
    """Average cosine similarity of randomly drawn vector pairs from ``clusters_dict``.

    Each sample draws two (cluster, row) positions uniformly, possibly from the
    same cluster. Pairs involving a zero vector (padding) are skipped so they
    cannot turn the average into NaN.

    Args:
        num_samples: Number of random pairs to draw (default 200, the
            previously hard-coded value).

    Returns:
        Mean cosine similarity over the usable pairs, or ``nan`` if no drawn
        pair was usable.
    """
    # All clusters are assumed to hold as many rows as cluster 0.
    total, counted = 0, 0
    for _ in range(num_samples):
        index1,index2 = random.randint(0,len(clusters_dict) - 1), random.randint(0,len(clusters_dict[0]) - 1)
        index3,index4 = random.randint(0,len(clusters_dict) - 1), random.randint(0,len(clusters_dict[0]) - 1)
        A = clusters_dict[index1][index2]
        B = clusters_dict[index3][index4]

        norm_a, norm_b = norm(A), norm(B)
        if norm_a == 0 or norm_b == 0:
            continue

        total += np.dot(A,B)/(norm_a*norm_b)
        counted += 1

    if counted == 0:
        return float('nan')
    return total / counted

In [16]:
import random

def cosine_sim(i1,i2):
    """Cosine similarity between rows ``i1`` and ``i2`` of ``node_embeddings``.

    Args:
        i1, i2: Row indices into ``node_embeddings``.

    Raises:
        ValueError: If either index is ``None`` (which ``node_index`` returns
            for an unknown node). Indexing a NumPy array with ``None`` inserts
            an axis instead of failing, so the similarity used to be computed
            on the whole matrix and silently returned garbage.
    """
    if i1 is None or i2 is None:
        raise ValueError('cosine_sim needs two valid node indices, got %r and %r' % (i1, i2))

    A = node_embeddings[i1]
    B = node_embeddings[i2]
    return np.dot(A,B)/(norm(A)*norm(B))

def node_index(name):
    """Return the position of the first node equal to ``name`` in ``graph['nodes']``.

    Returns ``None`` when no node matches.
    """
    for position, node in enumerate(graph['nodes']):
        if node == name:
            return position
    return None

def to_name(uri):
    """Turn a DBpedia resource URI into a name: drop the prefix, underscores become spaces."""
    resource_prefix = 'http://dbpedia.org/resource/'
    without_prefix = ''.join(uri.split(resource_prefix))
    return ' '.join(without_prefix.split('_'))

def topic_similarity(topic_name='Culture'):
    """Average cosine similarity between the documents of one topic.

    Every unordered pair of documents with label ``topic_name`` is compared
    through the node embeddings of their DBpedia resources; each document is
    also paired with itself (as in the original implementation). Documents
    whose graph has no nodes are skipped.

    Args:
        topic_name: Label to analyse (default ``'Culture'``, the previously
            hard-coded topic).

    Returns:
        The mean similarity, or ``nan`` when the topic has no usable document
        (previously a ZeroDivisionError).
    """
    # Resolve each document's node index once instead of once per pair
    # (node_index is a linear scan over the graph's nodes).
    indices = [
        node_index(to_name(doc['dbpedia_uri']))
        for doc in x
        if doc['label'] == topic_name and len(doc['graph']['nodes']) != 0
    ]

    sims, c = 0, 0
    for start, first in enumerate(indices):
        for second in indices[start:]:
            sims += cosine_sim(first, second)
            c += 1

    if c == 0:
        return float('nan')
    return sims / c

def intra_topic_similarity(topic1_name='Science-Biology', topic2_name='Culture'):
    """Average cosine similarity between the documents of two topics.

    Despite the name, this compares every document of ``topic1_name`` with
    every document of ``topic2_name`` (a cross-topic measure). Documents whose
    graph has no nodes are skipped.

    Args:
        topic1_name: First label (default ``'Science-Biology'``).
        topic2_name: Second label (default ``'Culture'``).

    Returns:
        The mean similarity, or ``nan`` when there is no usable pair
        (previously a ZeroDivisionError).
    """
    def usable_indices(label):
        # Node index of every document of `label` that has a non-empty graph,
        # resolved once rather than once per pair.
        return [
            node_index(to_name(doc['dbpedia_uri']))
            for doc in x
            if doc['label'] == label and len(doc['graph']['nodes']) != 0
        ]

    first_indices = usable_indices(topic1_name)
    second_indices = usable_indices(topic2_name)

    sims, c = 0, 0
    for first in first_indices:
        for second in second_indices:
            sims += cosine_sim(first, second)
            c += 1

    if c == 0:
        return float('nan')
    return sims / c

def random_topic_similarity():
    """Cosine similarity between one random 'Science-Biology' and one random 'Military-Aviation' document.

    The chosen positions and URIs are printed.

    Returns:
        The similarity of the two documents' node embeddings, or ``None`` when
        either document has an empty graph.
    """
    topic1 = [xx for xx in x if xx['label'] == 'Science-Biology']
    topic2 = [xx for xx in x if xx['label'] == 'Military-Aviation']

    # The upper bound for the second draw must be the number of documents in
    # topic2. The original used len(topic2[0]), i.e. the number of keys of the
    # first document's dict, so only the first few documents could be drawn.
    i1,i2 = random.randint(0,len(topic1) - 1), random.randint(0,len(topic2) - 1)

    node1, node2 = topic1[i1],topic2[i2]
    print(i1,node1['dbpedia_uri'],i2,node2['dbpedia_uri'])

    if len(node1['graph']['nodes']) == 0 or len(node2['graph']['nodes']) == 0: return None

    return cosine_sim(node_index(to_name(node1['dbpedia_uri'])),node_index(to_name(node2['dbpedia_uri'])))

# topic_similarity()

In [17]:
def second_order_class_sim_topic(topic_name='Philosophy'):
    """Compare a topic's similarity to the 'Second-order Class' node with a random baseline.

    For every document of ``topic_name`` with a non-empty graph, two cosine
    similarities are accumulated: document vs. the 'Second-order Class' node,
    and document vs. a uniformly random node.

    Args:
        topic_name: Label to analyse (default ``'Philosophy'``, the previously
            hard-coded topic).

    Returns:
        ``(mean similarity to 'Second-order Class', mean similarity to random
        nodes)``, or ``(nan, nan)`` when the topic has no usable document
        (previously a ZeroDivisionError).
    """
    topic1 = [xx for xx in x if xx['label'] == topic_name]

    # Loop-invariant: resolve the anchor node once, not once per document.
    anchor_index = node_index('Second-order Class')

    sims, sims_random, c = 0, 0, 0
    for node in topic1:
        if len(node['graph']['nodes']) == 0: continue
        doc_index = node_index(to_name(node['dbpedia_uri']))
        sims_random += cosine_sim(random.randint(0,len(node_embeddings) - 1), doc_index)
        sims += cosine_sim(anchor_index, doc_index)

        c += 1

    if c == 0:
        return float('nan'), float('nan')
    return sims/c, sims_random/c

# second_order_class_sim_topic()

In [18]:
def second_order_class_sim():
    """Similarity between the 'Second-order Class' node and one random 'Science-Biology' document.

    Prints the drawn position and URI. Returns ``None`` when the drawn
    document's graph has no nodes.
    """
    candidates = [doc for doc in x if doc['label'] == 'Science-Biology']
    pick = random.randint(0, len(candidates) - 1)

    chosen = candidates[pick]
    print(pick, chosen['dbpedia_uri'])

    if len(chosen['graph']['nodes']) == 0:
        return None

    anchor_index = node_index('Second-order Class')
    chosen_index = node_index(to_name(chosen['dbpedia_uri']))
    return cosine_sim(anchor_index, chosen_index)

# second_order_class_sim()

In [19]:
from sklearn.manifold import TSNE
from sklearn.manifold import TSNE

# Shared t-SNE projector (2 output dimensions, fixed random_state). The cells
# below call tsne.fit_transform on it, so every projection is fitted
# independently of the others.
tsne = TSNE(n_components=2, random_state=42)

def plot_embeddings(emb):
    """Scatter-plot the first two columns of ``emb`` on a 10x8 inch figure.

    Args:
        emb: 2-D array-like supporting ``emb[:, 0]`` / ``emb[:, 1]``; any
            further columns are ignored.
    """
    # Point transparency (this local used to be defined but not used).
    alpha = 0.7

    plt.figure(figsize=(10, 8))
    plt.scatter(
        emb[:, 0],
        emb[:, 1],
        # Colouring is disabled: `cmap` has no effect without `c`, so it was
        # dropped. Re-enable both together:
        # c=node_targets.cat.codes, cmap="jet",
        alpha=alpha,
    )
    plt.show()

In [124]:
# Project only the first 10000 embedding rows to 2-D.
node_embeddings_2d = tsne.fit_transform(node_embeddings[:10000])

In [125]:
# Scatter plot of the 2-D projection computed above.
plot_embeddings(node_embeddings_2d)

In [143]:
# Project the members of clusters 0 and 1 together. The two arrays must be
# stacked row-wise; `+` on NumPy arrays adds them element-wise, which produced
# one cluster's worth of summed vectors instead of both clusters' points.
clstr_0_1_2d = tsne.fit_transform(np.concatenate([clusters_dict[0], clusters_dict[1]], axis=0))

In [144]:
# Scatter plot of the projection stored in clstr_0_1_2d.
plot_embeddings(clstr_0_1_2d)

In [55]:
# 2-D projection of the members of cluster 19 only.
clstr_2d = tsne.fit_transform(clusters_dict[19] )

In [141]:
# NOTE(review): `show_embedding_plot` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Presumably
# plot_embeddings(tsne.fit_transform(clusters_dict[1])) was intended -- confirm.
show_embedding_plot(clusters_dict[1])

In [54]:
# Smoke test of plot_embeddings with three hand-written 2-D points.
plot_embeddings(np.asarray([[10,20],[50,60],[100,60]]))

In [137]:
# Plots the first node's *raw* embedding: plot_embeddings only uses columns 0
# and 1, so this shows the first two embedding dimensions, not a t-SNE projection.
plot_embeddings(node_embeddings[:1])

In [136]:
# Same single node, taken from the t-SNE projection instead.
plot_embeddings(node_embeddings_2d[:1])

In [22]:
def plot_topic(topic_name):
    """Project the node embeddings of one topic's documents with t-SNE and plot them.

    Args:
        topic_name: Label whose documents are plotted.
    """
    looked_up = (
        node_embeddings[node_index(to_name(doc['dbpedia_uri']))]
        for doc in x
        if doc['label'] == topic_name
    )
    # Only keep results that are a plain 100-dimensional vector. An unknown
    # node makes node_index return None, which yields a differently shaped
    # array that fails this length check.
    embeddings = [emb for emb in looked_up if len(emb) == 100]

    embeddings_2d = tsne.fit_transform(embeddings)
    plot_embeddings(embeddings_2d)

plot_topic('Science-Biology')

In [71]:
# Cosine similarity of two specific members (rows 169 and 170) of cluster 19.
A,B = clusters_dict[19][169], clusters_dict[19][170]
np.dot(A,B)/(norm(A)*norm(B))

0.73899245

In [77]:
# t-SNE fitted on just three members of cluster 19 (rows 169, 170, 175);
# the last expression displays the resulting 2-D coordinates.
tmp = tsne.fit_transform([clusters_dict[19][169], clusters_dict[19][170],clusters_dict[19][175]])
tmp

array([[ -18.833384, -276.80054 ],
       [-210.60179 ,  143.00536 ],
       [-478.27985 , -232.9741  ]], dtype=float32)

In [76]:
# Plots the same three vectors unprojected: only embedding dimensions 0 and 1 are drawn.
plot_embeddings(np.asarray([clusters_dict[19][169], clusters_dict[19][170],clusters_dict[19][175]]))

In [95]:
# Sanity check: project 100 random Gaussian vectors (100-d, mean 0, std 0.5)
# with t-SNE and plot the result. The earlier np.arange-based assignment to `v`
# was a dead store (overwritten before use) and has been removed.
mu, sigma = 0, 0.5
v = [np.random.normal(mu,sigma,100) for i in range(100)]
v_2d = tsne.fit_transform(v)
plot_embeddings(np.asarray(v_2d))


In [70]:
# Collect the node embedding of every 'Culture' document and inspect the length
# of one of them (index 40).
topic = [xx for xx in x if xx['label'] == 'Culture']

embeddings = [node_embeddings[node_index(to_name(doc['dbpedia_uri']))] for doc in topic]

len(embeddings[40])

100

In [17]:
# Display the embedding row looked up for the DBpedia resource 'OpenWorm'.
node_embeddings[node_index(to_name('http://dbpedia.org/resource/OpenWorm'))]

array([ 3.10337961e-01, -4.78675812e-02, -6.10486895e-04, -5.64123541e-02,
       -1.79537952e-01,  3.86724360e-02,  1.17867261e-01, -9.98277962e-02,
       -1.93168864e-01, -1.45029142e-01,  2.80993193e-01, -1.63496941e-01,
        3.18608552e-01, -9.00978502e-03,  3.68569553e-01, -1.01031914e-01,
        8.63397941e-02, -5.90289712e-01,  5.60533941e-01,  1.16211779e-01,
        7.56691217e-01, -2.62195822e-02,  3.02382797e-01, -2.42153957e-01,
       -7.26450384e-01,  6.27838612e-01, -7.40742564e-01, -2.64454305e-01,
        2.52723671e-03, -1.56513557e-01,  2.91379422e-01,  2.49367833e-01,
        2.09260434e-01, -3.95488560e-01, -2.56410778e-01,  2.87723631e-01,
       -2.74626762e-01,  5.34079373e-01, -4.36195374e-01, -4.39266600e-02,
        1.33575290e-01, -1.56317517e-01, -5.58875620e-01,  2.02757284e-01,
       -2.45783076e-01,  3.03559117e-02, -1.19541414e-01, -1.17693782e-01,
        3.94478559e-01,  1.06877364e-01, -7.50201792e-02,  2.25401029e-01,
        3.80148411e-01, -

### TOKENIZE TEXTS, PREPARE GLOVE EMBEDDINGS

In [13]:
# Map every GloVe token to its float32 vector.
embeddings_index = {}
# Join GLOVE_DIR with the bare file name. The original passed an absolute path
# as the second argument, which makes os.path.join discard GLOVE_DIR entirely.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
with open(glove_path, encoding='utf8') as f:
    for line in f:
        # Each line: the token followed by its vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:
# Vectorize the text samples into a 2D integer tensor, one-hot encode the
# labels, apply a stored permutation and carve out validation/test splits,
# then build the GloVe embedding matrix.
#
# NOTE(review): this cell is not idempotent. It overwrites `data`, `labels` and
# `y` in place, so running it twice one-hot encodes the already encoded labels
# and permutes `labels`/`y` a second time while `data` is rebuilt from `texts`.
# The recorded output shape of the label tensor, (6716, 78, 2), is 3-D, which
# looks like exactly such a re-run -- confirm, and restart the kernel before
# training.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Rebinds `data` (previously the unpickled dataset) to the padded id matrix.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Rebinds `labels` (previously a list of integer ids) to its one-hot encoding.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# A stored permutation is loaded instead of shuffling, so the split is the same
# on every run.
# indices = np.arange(data.shape[0])
# np.random.shuffle(indices)
indices = unpickler('data','indices.pkl')
# `y` (label names) is permuted the same way so it stays aligned with `data`.
y = np.array(y)[indices.astype(int)]
data = data[indices]
labels = labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# The last `num_validation_samples` rows form the validation set.
x_t = data[:-num_validation_samples]
y_t = labels[:-num_validation_samples]
y_str_t = y[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

# The test set is taken from the end of what remains after the validation cut.
num_test_samples = int(TEST_SPLIT * x_t.shape[0])

x_train = x_t[:-num_test_samples]
y_train = y_t[:-num_test_samples]
x_test = x_t[-num_test_samples:]
y_test = y_t[-num_test_samples:]
# Label names of the test rows, used for the classification report.
y_str_test = y_str_t[-num_test_samples:]

print('x_train:{} y_train:{} x_val:{} y_val:{} '.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape))
print('Preparing embedding matrix.')

# prepare embedding matrix
# Row i holds the GloVe vector of the word with tokenizer index i.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 129395 unique tokens.
Shape of data tensor: (6716, 300)
Shape of label tensor: (6716, 78, 2)
x_train:(4836, 300) y_train:(4836, 78, 2) x_val:(671, 300) y_val:(671, 78, 2) 
Preparing embedding matrix.


### PREPARE NEURAL NET MODEL CONV2D

In [None]:
# embedding_clusters_2d = tf.random.normal(embedding_clusters_2d.shape)

In [25]:
# Model `m`: an LSTM text encoder combined with attention over the node2vec
# entity clusters (Conv2D summary of each cluster).
# NOTE(review): this cell rebinds `x` to Keras tensors. Earlier cells use `x` as
# the list of dataset records (get_graph, topic_similarity, plot_topic, ...), so
# those helpers stop working once this cell has run.
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 300

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# Not used below; the clusters enter the model through FakeEClusterIn.
entity_clusters = K.variable(embedding_clusters_2d)

Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
# Ignores its input and returns the constant cluster tensor.
FakeEClusterIn = keras.layers.core.Lambda(lambda x: embedding_clusters_2d)

main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

# Branch 1: query vector used to attend over the entity clusters.
x = embedding_layer(main_input)
x = CuDNNLSTM(KG_EMBEDDING_DIM, return_sequences=True)(x)
x = Avg(x)
x = Dense(KG_EMBEDDING_DIM)(x)
x = Activation('relu')(x)
# entity_extraction = Reshape([KG_EMBEDDING_DIM])(x)
entity_extraction = Transpose(x)

# Branch 2: plain LSTM representation of the text.
x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)

# get representation for entity clusters
# e_clusters = Input(tensor=entity_clusters,name='e_clusters')
x = FakeEClusterIn(x)
print("Shape", K.int_shape(x))
# x = e_clusters
# Two strided conv + max-pool stages shrink the entity axis of every cluster.
x = Conv2D(filters=1, kernel_size=(5, 1), strides=(5, 1), input_shape=embedding_clusters_2d.shape[1:])(x)
x = MaxPooling2D(pool_size=(6, 1), strides=(6, 1))(x)
x = Conv2D(filters=1, kernel_size=(5, 1), strides=(5, 1))(x)
x = MaxPooling2D(pool_size=(5, 1), strides=(5, 1))(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

# attention over entities
att_scores = DotProduct([entity_cluster_reps, entity_extraction])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

# Stack the text representation and the attended entity along the feature axis
# (both are transposed, so axis 0 is the feature axis here).
lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m = Model(inputs=main_input, outputs=final_output)

m.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

m.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

# Checkpoint file for the best weights seen on validation accuracy. (No early
# stopping is configured, despite what the old comment said.) mkstemp returns an
# open OS-level descriptor, which is closed here so it does not leak; the
# duplicated first mkstemp/ModelCheckpoint pair, whose results were overwritten,
# has been removed.
tmp_fd, tmpfn = tempfile.mkstemp()
os.close(tmp_fd)
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (20, 2585, 100, 1)
Before removing last col (20, 3, 100, 1)
after removing last col (20, 3, 100)
entity_cluster_reps(after reshape) (20, 300)
att_scores_entities (20, None)
att_normalized (20, None)
the_entity (300, None)
lstm_hidden_and_entity (500, None)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 300, 300)     6000000     main_input[0][0]                 
                                                                 main_input[0][0]                 
__________________________________________________________________________________________________
cu_dnnlstm

### PREDICT CONV2D

In [26]:
# Train model `m`; `callbacks` checkpoints the best weights by validation accuracy.
BATCH_SIZE = 2
NUM_EPOCHS = 10

history = m.fit(x_train, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                validation_data=(x_val, y_val), 
                callbacks=callbacks)
# Saves the model as it is after the last epoch (not the checkpointed best weights).
m.save('node2vec_lstm_model')

Epoch 1/10
 349/2418 [===>..........................] - ETA: 1:03:41 - loss: 4.4996 - acc: 0.0000e+ - ETA: 1:50 - loss: 7.3437 - acc: 0.0000e+00   - ETA: 1:49 - loss: 7.8418 - acc: 0.0000e+0 - ETA: 1:48 - loss: 7.4011 - acc: 0.0000e+0 - ETA: 1:47 - loss: 7.0167 - acc: 0.0000e+0 - ETA: 1:47 - loss: 6.8506 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.8360 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.9715 - acc: 0.0000e+0 - ETA: 1:46 - loss: 7.2541 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.4608 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5667 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5049 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3822 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3906 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.2569 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1678 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1228 - acc: 0.0000e+0 - ETA: 1:45 - loss: 6.9626 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.8294 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7574 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7398 - acc: 0.0122    - ETA: 1:44 - los

KeyboardInterrupt: 

In [63]:
# Restore the best found model during validation
# (disabled: with load_weights commented out, the weights from the end of
# training are evaluated, not the checkpointed best ones)
# m.load_weights(tmpfn)

loss, acc = m.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



Test loss / test accuracy = 1.6824 / 0.6592


In [64]:
# Class scores of model `m` for every test sample.
preds = m.predict(x_test, batch_size=BATCH_SIZE, verbose=0,
                  steps=None)

In [65]:
def get_result_labels(results):
    """Map every row of class scores to the name of its highest-scoring label.

    Ties resolve to the first maximum, as before. ``np.argmax`` replaces the
    Python-level ``max`` plus ``np.where`` scan of each row.
    """
    return [unique_labels[int(np.argmax(row))] for row in results]

result_labels = get_result_labels(preds)

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the
# printed value is unchanged, but the order now matches classification_report.
print('Accuracy score: %.2f' % accuracy_score(y_str_test, result_labels))
print(classification_report(y_str_test, result_labels))

### PREPARE NEURAL NET MODEL CONV2D WITH PRE-TRAINING

In [27]:
# Model `m1` ("pre-training" stage): classifies from the attended entity
# representation alone, without the plain LSTM branch.
# NOTE(review): this cell rebinds `x` to Keras tensors; earlier cells use `x` as
# the list of dataset records.
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 300

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# Not used below; the clusters enter the model through FakeEClusterIn.
entity_clusters = K.variable(embedding_clusters_2d)

Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
# Ignores its input and returns the constant cluster tensor.
FakeEClusterIn = keras.layers.core.Lambda(lambda x: embedding_clusters_2d)

main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

# Query vector used to attend over the entity clusters.
x = embedding_layer(main_input)
x = CuDNNLSTM(KG_EMBEDDING_DIM, return_sequences=True)(x)
x = Avg(x)
x = Dense(KG_EMBEDDING_DIM)(x)
x = Activation('relu')(x)
# entity_extraction = Reshape([KG_EMBEDDING_DIM])(x)
entity_extraction = Transpose(x)

# get representation for entity clusters
# e_clusters = Input(tensor=entity_clusters,name='e_clusters')
x = FakeEClusterIn(x)
print("Shape", K.int_shape(x))
# x = e_clusters
# Two strided conv + max-pool stages shrink the entity axis of every cluster.
x = Conv2D(filters=1, kernel_size=(5, 1), strides=(5, 1), input_shape=embedding_clusters_2d.shape[1:])(x)
x = MaxPooling2D(pool_size=(6, 1), strides=(6, 1))(x)
x = Conv2D(filters=1, kernel_size=(5, 1), strides=(5, 1))(x)
x = MaxPooling2D(pool_size=(5, 1), strides=(5, 1))(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
# entity_cluster_reps = x
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

# attention over entities
att_scores = DotProduct([entity_cluster_reps, entity_extraction])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(the_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m1 = Model(inputs=main_input, outputs=final_output)
m1.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
m1.summary()

# Checkpoint file for the best weights seen on validation accuracy. (No early
# stopping is configured, despite what the old comment said.) mkstemp returns an
# open OS-level descriptor, which is closed here so it does not leak.
tmp_fd, tmpfn = tempfile.mkstemp()
os.close(tmp_fd)
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (20, 2585, 100, 1)
Before removing last col (20, 3, 100, 1)
after removing last col (20, 3, 100)
entity_cluster_reps(after reshape) (20, 300)
att_scores_entities (20, None)
att_normalized (20, None)
the_entity (300, None)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 300, 300)     6000000     main_input[0][0]                 
__________________________________________________________________________________________________
cu_dnnlstm_8 (CuDNNLSTM)        (None, 300, 300)     722400      embedding_4[0][0]                
_____________________________________________

In [28]:
# Train the entity-only model `m1`, using the callbacks defined with it.
BATCH_SIZE = 2
NUM_EPOCHS = 10

history = m1.fit(x_train, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                validation_data=(x_val, y_val), 
                callbacks=callbacks)
# m.save('node2vec_lstm_model')

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
# Model `m2`: adds the plain LSTM branch on top of the graph built for `m1`.
# It reuses `main_input`, `embedding_layer`, `Transpose` and `the_entity` from
# the cell that defines `m1`, so that cell must have been run first.
x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)


# Stack the text representation and the attended entity along the feature axis
# (both are transposed, so axis 0 is the feature axis here).
lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
# lstm_hidden_and_entity = the_entity
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m2 = Model(inputs=main_input, outputs=final_output)

m2.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

m2.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

# Checkpoint file for the best weights seen on validation accuracy. (No early
# stopping is configured, despite what the old comment said.) mkstemp returns an
# open OS-level descriptor, which is closed here so it does not leak; the
# duplicated first mkstemp/ModelCheckpoint pair, whose results were overwritten,
# has been removed.
tmp_fd, tmpfn = tempfile.mkstemp()
os.close(tmp_fd)
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

In [None]:
# Restoring the checkpointed weights is disabled, so m2 is scored with the
# weights it currently holds.
# m.load_weights(tmpfn)

# m2 was compiled with a single metric, so evaluate() yields (loss, accuracy).
loss, acc = m2.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

# Per-class scores for every test sample.
preds = m2.predict(x=x_test, batch_size=BATCH_SIZE, steps=None, verbose=0)

def get_result_labels(results, labels=None):
    """Map each row of class scores to the label with the highest score.

    Args:
        results: Iterable of 1-D score rows (e.g. the 2-D array returned by
            ``predict``), one row per sample.
        labels: Sequence of labels indexed by class position. Defaults to the
            notebook-level ``unique_labels`` so existing calls keep working.

    Returns:
        list: One label per row; ties resolve to the lowest class index.
    """
    if labels is None:
        labels = unique_labels
    # np.argmax replaces the hand-rolled max()/np.where lookup, which iterated
    # at Python level and raised IndexError for a row starting with NaN.
    return [labels[int(np.argmax(row))] for row in results]

# Decode the score rows into label strings.
result_labels = get_result_labels(preds)

# Arguments follow the (y_true, y_pred) order used by classification_report;
# accuracy is symmetric in its two arguments, so the value is unchanged.
print('Accuracy score: %.2f' % accuracy_score(y_str_test, result_labels))
print(classification_report(y_str_test, result_labels))

### PREPARE NEURAL NET MODEL CONV1D

In [14]:
# Disabled stand-in that would replace the real cluster embeddings with random
# values of the same shape:
# embedding_clusters_1d = tf.random.normal([20,2585,100])
# Sanity check: display the shape of the cluster embedding tensor.
embedding_clusters_1d.shape

(20, 2585, 100)

In [29]:
# Hyper-parameters for the Conv1D + LSTM model built in this cell.
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
# NOTE(review): Reshape([KG_EMBEDDING_DIM]) further down only works when this
# equals the length left by the Conv1D/MaxPooling1D stack (the recorded run
# printed (20, 44, 1) just before it). Re-check whenever embedding_clusters_1d
# or the conv/pool sizes change.
KG_EMBEDDING_DIM = 44

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# NOTE(review): entity_clusters is only mentioned again in the commented-out
# Input(...) line below; the graph uses FakeEClusterIn instead.
entity_clusters = K.variable(embedding_clusters_1d)

# Lambda wrappers so backend ops can be used as layers in the functional graph.
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
# Despite the name, this sums over the last axis (collapsing it).
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
# Ignores its input and returns the constant embedding_clusters_1d.
FakeEClusterIn = keras.layers.core.Lambda(lambda x: embedding_clusters_1d)

main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

# Branch 1 ("entity extraction"): LSTM of width KG_EMBEDDING_DIM, averaged over
# axis 1, Dense + ReLU, then transposed (printed as (44, None) in the recorded run).
x = embedding_layer(main_input)
x = CuDNNLSTM(KG_EMBEDDING_DIM, return_sequences=True)(x)
x = Avg(x)
x = Dense(KG_EMBEDDING_DIM)(x)
x = Activation('relu')(x)
# entity_extraction = Reshape([KG_EMBEDDING_DIM])(x)
entity_extraction = Transpose(x)
# entity_extraction = x
print("entity_extraction(after transpose)", K.int_shape(entity_extraction))

# Branch 2 (main text representation): a second LSTM of width LSTM_HIDDEN_SIZE
# over the same embedded input.
x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
# Rebinds `Avg` to a variant that declares its output shape.
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)
print("main_lstm_out", K.int_shape(main_lstm_out))

# get representation for entity clusters
# e_clusters = Input(tensor=entity_clusters,name='e_clusters')
# The argument only ties the constant tensor into the graph; its value is discarded.
x = FakeEClusterIn(x)
print("FakeEClusterIn", K.int_shape(x))
# x = e_clusters
# Shrink each cluster's sequence with two strided Conv1D + MaxPooling1D stages
# (single filter each), leaving one value per remaining position.
x = Conv1D(filters=1, kernel_size=3, strides=3, input_shape=embedding_clusters_1d.shape[1:])(x)
x = MaxPooling1D(pool_size=3, strides=3)(x)
x = Conv1D(filters=1, kernel_size=2, strides=2)(x)
x = MaxPooling1D(pool_size=2, strides=1)(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
# entity_cluster_reps = x
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

# attention over entities
att_scores = DotProduct([entity_cluster_reps, entity_extraction])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
# NOTE(review): the recorded run printed att_scores as (20, None), so axis=-1
# normalises along the None axis (the sample axis of main_input after the
# transpose) rather than across the 20 cluster rows — confirm that this is the
# intended attention axis.
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
# Weighted combination of the cluster representations (printed as (44, None)).
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

# Stack the transposed text representation on top of the entity vector along
# axis 0 (printed as (244, None)).
lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
# lstm_hidden_and_entity = the_entity
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

# Transpose back before the softmax classifier with one unit per entry of unique_labels.
final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

# `learning_rate` is the supported keyword of the tf.keras optimizers; `lr` is
# only a deprecated alias. clipvalue bounds every gradient element to
# [-0.25, 0.25].
optimizer = Adam(learning_rate=LEARNING_RATE, clipvalue=0.25)

# Joint model: token ids in, class probabilities out.
m = Model(inputs=main_input, outputs=final_output)
m.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
m.summary()

# Only needed for the optional architecture plot below.
from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Only the path is needed, so close the descriptor immediately instead of
# leaking it, and create the checkpoint file exactly once.
ckpt_fd, tmpfn = tempfile.mkstemp()
os.close(ckpt_fd)
# Keep the weights of the epoch with the best validation accuracy. No
# early-stopping callback is registered, so training always runs every epoch.
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
entity_extraction(after transpose) (44, None)
main_lstm_out (None, 200)
FakeEClusterIn (20, 827, 100)
Before removing last col (20, 44, 1)
after removing last col (20, 44)
entity_cluster_reps(after reshape) (20, 44)
att_scores_entities (20, None)
att_normalized (20, None)
the_entity (44, None)
lstm_hidden_and_entity (244, None)
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 300, 300)     6000000     main_input[0][0]                 
                                                                 main_input[0][0]                 
__________________________________________

### TRAIN, EVALUATE AND PREDICT CONV1D

In [16]:
# Training schedule used by the fit call below.
NUM_EPOCHS = 10
BATCH_SIZE = 2

# Train the joint model m on the training split, scoring the validation split
# alongside; `callbacks` is the list prepared in the model-building cell.
history = m.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
)
# Persisting the trained model is currently disabled:
# m.save('node2vec_lstm_model')

Epoch 1/10






Epoch 2/10

KeyboardInterrupt: 

In [110]:
# Restoring the checkpointed weights is disabled, so m is scored with the
# weights it currently holds.
# m.load_weights(tmpfn)

# m was compiled with a single metric, so evaluate() yields (loss, accuracy).
loss, acc = m.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



Test loss / test accuracy = 1.8339 / 0.6510


In [111]:
# Per-class scores for every test sample.
preds = m.predict(x=x_test, batch_size=BATCH_SIZE, steps=None, verbose=0)

In [112]:
def get_result_labels(results, labels=None):
    """Map each row of class scores to the label with the highest score.

    Args:
        results: Iterable of 1-D score rows (e.g. the 2-D array returned by
            ``predict``), one row per sample.
        labels: Sequence of labels indexed by class position. Defaults to the
            notebook-level ``unique_labels`` so existing calls keep working.

    Returns:
        list: One label per row; ties resolve to the lowest class index.
    """
    if labels is None:
        labels = unique_labels
    # np.argmax replaces the hand-rolled max()/np.where lookup, which iterated
    # at Python level and raised IndexError for a row starting with NaN.
    return [labels[int(np.argmax(row))] for row in results]

# Decode the score rows into label strings.
result_labels = get_result_labels(preds)

# Arguments follow the (y_true, y_pred) order used by classification_report;
# accuracy is symmetric in its two arguments, so the value is unchanged.
print('Accuracy score: %.2f' % accuracy_score(y_str_test, result_labels))
print(classification_report(y_str_test, result_labels))

Accuracy score: 0.65


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                              precision    recall  f1-score   support

                Architecture       0.93      0.87      0.90        15
      Architecture-Structure       0.74      0.93      0.82        15
                  Art-Cinema       0.73      0.57      0.64        14
            Art-Cinema-Actor       0.19      0.17      0.18        18
                   Art-Dance       1.00      0.77      0.87        13
            Art-Dance-Dancer       0.61      0.74      0.67        19
                 Art-Fashion       0.76      0.87      0.81        15
        Art-Fashion-Designer       0.75      0.43      0.55        21
           Art-Fashion-Model       1.00      0.25      0.40         4
              Art-Literature       0.80      0.57      0.67        14
       Art-Literature-Writer       0.39      0.64      0.49        14
                   Art-Music       0.78      1.00      0.88        18
        Art-Music-Instrument       0.88      1.00      0.93        14
                Art

### PREPARE NEURAL NET MODEL CONV1D WITH PRE-TRAINING

In [26]:
# Hyper-parameters for the pre-training variant of the Conv1D model.
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
# NOTE(review): Reshape([KG_EMBEDDING_DIM]) further down only works when this
# equals the length left by the Conv1D/MaxPooling1D stack (the recorded run
# printed (20, 47, 1) for an input of shape (20, 2585, 100)). Re-check whenever
# embedding_clusters_1d or the conv/pool sizes change.
KG_EMBEDDING_DIM = 47

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# NOTE(review): entity_clusters is only mentioned again in the commented-out
# Input(...) line below; the graph uses FakeEClusterIn instead.
entity_clusters = K.variable(embedding_clusters_1d)

# Lambda wrappers so backend ops can be used as layers in the functional graph.
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
# Despite the name, this sums over the last axis (collapsing it).
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
# Ignores its input and returns the constant embedding_clusters_1d.
FakeEClusterIn = keras.layers.core.Lambda(lambda x: embedding_clusters_1d)

main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

# "Entity extraction" branch: LSTM of width KG_EMBEDDING_DIM, averaged over
# axis 1, Dense + ReLU, then transposed.
x = embedding_layer(main_input)
x = CuDNNLSTM(KG_EMBEDDING_DIM, return_sequences=True)(x)
x = Avg(x)
x = Dense(KG_EMBEDDING_DIM)(x)
x = Activation('relu')(x)
# entity_extraction = Reshape([KG_EMBEDDING_DIM])(x)
entity_extraction = Transpose(x)

# get representation for entity clusters
# e_clusters = Input(tensor=entity_clusters,name='e_clusters')
# The argument only ties the constant tensor into the graph; its value is discarded.
x = FakeEClusterIn(x)
print("Shape", K.int_shape(x))
# x = e_clusters
# Shrink each cluster's sequence with two strided Conv1D + MaxPooling1D stages
# (single filter each), leaving one value per remaining position.
x = Conv1D(filters=1, kernel_size=3, strides=3, input_shape=embedding_clusters_1d.shape[1:])(x)
x = MaxPooling1D(pool_size=3, strides=3)(x)
x = Conv1D(filters=1, kernel_size=3, strides=3)(x)
x = MaxPooling1D(pool_size=3, strides=2)(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
# entity_cluster_reps = x
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

# attention over entities
att_scores = DotProduct([entity_cluster_reps, entity_extraction])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
# NOTE(review): the recorded run printed att_scores as (20, None), so axis=-1
# normalises along the None axis (the sample axis of main_input after the
# transpose) rather than across the 20 cluster rows — confirm that this is the
# intended attention axis.
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
# Weighted combination of the cluster representations (printed as (47, None)).
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

# Pre-training head: classify from the entity vector alone.
final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(the_entity))

# `learning_rate` is the supported keyword of the tf.keras optimizers; `lr` is
# only a deprecated alias. clipvalue bounds every gradient element to
# [-0.25, 0.25].
optimizer = Adam(learning_rate=LEARNING_RATE, clipvalue=0.25)

# Pre-training model: token ids in, class probabilities from the entity head out.
m1 = Model(inputs=main_input, outputs=final_output)
m1.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
m1.summary()

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Only the path is needed, so close the descriptor immediately instead of
# leaking it.
ckpt_fd, tmpfn = tempfile.mkstemp()
os.close(ckpt_fd)
# Keep the weights of the epoch with the best validation accuracy. No
# early-stopping callback is registered, so training always runs every epoch.
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (20, 2585, 100)
Before removing last col (20, 47, 1)
after removing last col (20, 47)
entity_cluster_reps(after reshape) (20, 47)
att_scores_entities (20, None)
att_normalized (20, None)
the_entity (47, None)
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 300, 300)     6000000     main_input[0][0]                 
__________________________________________________________________________________________________
cu_dnnlstm_11 (CuDNNLSTM)       (None, 300, 47)      65612       embedding_7[0][0]                
__________________________________________________________

In [27]:
# Training schedule used by the fit call below.
NUM_EPOCHS = 10
BATCH_SIZE = 2

# Pre-train m1 on the training split, scoring the validation split alongside;
# `callbacks` is the list prepared in the model-building cell.
history = m1.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
)
# Persisting the trained model is currently disabled:
# m.save('node2vec_lstm_model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Text branch added on top of the pre-trained graph: embed the token ids, run
# an LSTM that returns the full sequence, average over axis 1, then Dense + ReLU.
# NOTE(review): embedding_layer, main_input, Transpose, the_entity and
# unique_labels come from the m1 cell, so this cell reuses those layers —
# presumably with the weights m1.fit() left in them; confirm.
x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
# Rebinds the notebook-level name `Avg` to a Lambda that declares its output shape.
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)


# Stack the transposed text representation on top of `the_entity` along axis 0
# (printed as (247, None) in the recorded run).
lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
# lstm_hidden_and_entity = the_entity
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

# Transpose back before the softmax classifier with one unit per entry of unique_labels.
final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

# `learning_rate` is the supported keyword of the tf.keras optimizers; `lr` is
# only a deprecated alias. clipvalue bounds every gradient element to
# [-0.25, 0.25].
optimizer = Adam(learning_rate=LEARNING_RATE, clipvalue=0.25)

# Combined model: token ids in, class probabilities out.
m2 = Model(inputs=main_input, outputs=final_output)
m2.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
m2.summary()

# Only needed for the optional architecture plot below.
from tensorflow.keras.utils import plot_model
# plot_model(m2, to_file='model.png')

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Only the path is needed, so close the descriptor immediately instead of
# leaking it, and create the checkpoint file exactly once.
ckpt_fd, tmpfn = tempfile.mkstemp()
os.close(ckpt_fd)
# Keep the weights of the epoch with the best validation accuracy. No
# early-stopping callback is registered, so training always runs every epoch.
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

lstm_hidden_and_entity (247, None)
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 300, 300)     6000000     main_input[0][0]                 
                                                                 main_input[0][0]                 
__________________________________________________________________________________________________
cu_dnnlstm_11 (CuDNNLSTM)       (None, 300, 47)      65612       embedding_7[0][0]                
__________________________________________________________________________________________________
lambda_46 (Lambda)              (None, 47)           0   

In [29]:
# Training schedule used by the fit call below.
NUM_EPOCHS = 10
BATCH_SIZE = 2

# Train the combined model m2 on the training split, scoring the validation
# split alongside; `callbacks` is the list prepared in the model-building cell.
history = m2.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
)
# Persisting the trained model is currently disabled:
# m.save('node2vec_lstm_model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
# Restoring the checkpointed weights is disabled, so m2 is scored with the
# weights it currently holds.
# m.load_weights(tmpfn)

# m2 was compiled with a single metric, so evaluate() yields (loss, accuracy).
loss, acc = m2.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

# Per-class scores for every test sample.
preds = m2.predict(x=x_test, batch_size=BATCH_SIZE, steps=None, verbose=0)

def get_result_labels(results, labels=None):
    """Map each row of class scores to the label with the highest score.

    Args:
        results: Iterable of 1-D score rows (e.g. the 2-D array returned by
            ``predict``), one row per sample.
        labels: Sequence of labels indexed by class position. Defaults to the
            notebook-level ``unique_labels`` so existing calls keep working.

    Returns:
        list: One label per row; ties resolve to the lowest class index.
    """
    if labels is None:
        labels = unique_labels
    # np.argmax replaces the hand-rolled max()/np.where lookup, which iterated
    # at Python level and raised IndexError for a row starting with NaN.
    return [labels[int(np.argmax(row))] for row in results]

# Decode the score rows into label strings.
result_labels = get_result_labels(preds)

# Arguments follow the (y_true, y_pred) order used by classification_report;
# accuracy is symmetric in its two arguments, so the value is unchanged.
print('Accuracy score: %.2f' % accuracy_score(y_str_test, result_labels))
print(classification_report(y_str_test, result_labels))