In [1]:
from __future__ import print_function

import tensorflow as tf

import os
import sys
import numpy as np
import pickle5 as pickle
import tempfile

import keras
from keras.preprocessing.text import Tokenizer
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Input, Embedding, Activation, Softmax
from keras.layers import CuDNNLSTM
from keras.models import Model, Sequential
from keras.layers import Conv1D,Conv2D, ZeroPadding2D, MaxPooling1D, MaxPooling2D
from keras.layers import RepeatVector, Permute, Add, Concatenate, Reshape, Dot
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray
from gensim.models import Word2Vec
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt

np.random.seed(100)

BASE_DIR = '.'
GLOVE_DIR = '/media/drived/Dev/Glove'
# TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
DATA_DIR = "/home/bhargav/nlu_project/keras_n20/data/Es_Rc"
DOC_PKL = "document_list.pick"
TARGET_PKL = "target_list.pick"

OUTPUT_PATH = './learning_beyond_datasets/node2vec/lstm_and_kg/'

MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 20000
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2

NUM_RELATIONS_PER_CLUSTER = 67
NUM_ENTITIES_PER_CLUSTER = 400
NUM_CLUSTERS = 20

EMBEDDING_DIM = 300

def get_clusters(cluster_file, num_things_per_cluster):
    clusters = []  # np.ones(shape=(NUM_RELATIONS_PER_CLUSTER,KG_EMBEDDING_DIM))

    with open(cluster_file, 'r', encoding='utf8') as f:
        lines = []
        for line in f:
            elements = line.split()
            x = [[e] for e in elements]
            lines.append(x)

    for i in range(0, len(lines) - num_things_per_cluster + 1, num_things_per_cluster):
        # print("appending: {} to {}".format(i,i+num_things_per_cluster))
        clusters.append(lines[i:i + num_things_per_cluster])

    clusters = np.asarray(clusters, dtype='float32')
    return clusters


def pickler(path, pkl_name, obj):
    with open(os.path.join(path, pkl_name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def unpickler(path, pkl_name):
    with open(os.path.join(path, pkl_name), 'rb') as f:
        obj = pickle.load(f)
    return obj

def get_labels(data):
    labels = []
    for d in data:
        labels.append(d['label']) if d['label'] not in labels else None

    return labels


def get_x_and_y(data):
    x, y = [], []
    for d in data:
        for dd in d['data']:
            tmp = dd['text'].replace('\n', '').replace('_', '')  # clean
            x.append({'label': d['label'], 'dbpedia_uri': dd['dbpedia_uri'], 'context_data': dd['context_data'], 'text': tmp, 'graph': dd['graph']}) if len(tmp) > 0 else None
            y.append(d['label']) if len(tmp) > 0 else None

    return x, y


def get_label_index(label):
    return [index for index, _label in enumerate(unique_labels) if label == _label][0]


### LOAD DATA

In [2]:
def find_clusters(arr):    
    result = []
    for el in arr:
        all_mappings = node_cluster_mapping_v3_with_count[el] if el in node_cluster_mapping_v3_with_count else []
        filtered_mappings = [e[0] for e in all_mappings[:min(len(all_mappings),3)]]
        result.extend(filtered_mappings)
    
    return result

def get_context_data(data):
    for d in data:
        for dd in d['data']:
            dd['context_data'] = [c for c in set(find_clusters(dd['context_graph']['nodes']))]

In [3]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

with open(r'data/classification_data_with_graphs_v4.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

unique_labels = get_labels(data)

node_cluster_mapping_v3_with_count = unpickler('data','node_cluster_mapping_v3_with_count.pkl')
get_context_data(data)

x, labels = get_x_and_y(data)


### GET NODE EMBEDDINGS/GRAPH

In [4]:
with open(r'data/node_embeddings_v3.pkl', 'rb') as pickle_file:
    node_embeddings = pickle.load(pickle_file)
with open(r'data/graph.pkl', 'rb') as pickle_file:
    graph = pickle.load(pickle_file)

### GET GRAPH CLUSTERS

In [5]:
clusters_v1 = unpickler('data','node_clusters.pkl')
clusters_v2 = unpickler('data','node_clusters_v2.pkl')
clusters_v3 = unpickler('data','node_clusters_v3.pkl')
# clusters_v3 = get_graph_clusters()
# cluster_embeddings = get_embeddings(clusters)

In [6]:
clusters_v3_filtered = unpickler('data','node_clusters_v3_filtered.pkl')
clusters_v3_filtered = unpickler('data','node_clusters_v3_with_count.pkl')
node_cluster_mapping_v2 = unpickler('data','node_cluster_mapping_v2.pkl')

### PARSE INPUT

In [7]:
context_data = []

x_train, x_test, labels_train, labels_test = train_test_split(x, labels, test_size=0.33, random_state=42)

y_train, y_test = [get_label_index(label) for label in labels_train],[get_label_index(label) for label in labels_test]

x_val, y_val,x_test, y_test = x_test[:500], y_test[:500], x_test[500:], y_test[500:]

x_train_text,x_test_text = [xx['text'] for xx in x_train], [xx['text'] for xx in x_test]

# print('x_train:{} y_train:{} x_val:{} y_val:{} '.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape))

x_train_text,x_val_text,x_test_text = [xx['text'] for xx in x_train],[xx['text'] for xx in x_val], [xx['text'] for xx in x_test]
x_train_context, x_val_context, x_test_context = [xx['context_data'] for xx in x_train],[xx['context_data'] for xx in x_val], [xx['context_data'] for xx in x_test]

In [8]:
i = 55
x_train_text[i], y_train[i], labels_train[i], unique_labels[y_train[i]]

('{{}}A cruiseferry is a ship that combines the features of a cruise ship and a Ro-Pax ferry. Many passengers travel with the ships for the cruise experience, staying only a few hours at the destination port or not leaving the ship at all, while others use the ships as means of transportation.Cruiseferry traffic is mainly concentrated in the seas of Northern Europe, especially the Baltic Sea and the North Sea. However, similar ships traffic across the English Channel as well as the Irish Sea, Mediterranean and even on the North Atlantic. Cruiseferries also operate from India, China and Australia.Baltic Sea cruiseferriesIn the northern Baltic Sea, two major rival companies, Viking Line and Silja Line, have for decades competed on the routes between Turku and Helsinki in Finland and Sweden\'s capital Stockholm. Since the 1990s Tallink has also risen as a major company in the area, culminating with acquisition of Silja Line in 2006.List of largest cruiseferries of their timeThe term "crui

### PREPARE NEURAL NET MODEL SIMPLE


In [None]:
GRAPH CLUSTER      NODE_CLUSTER_MAPPING           GRAPH INPUT
0 Philosophy       Schopenhaur, Kant              Kant, Socrates
1 Music            Led Zeppelin, Pearl Jam

For each element in graph input, find the corresponding cluster, attend over it

1. Finding graph input node clusters:
    We know cluster indexes.
    Write a lambda function to convert graph inputs to clusters then encode them(one-hot encoded)

In [9]:
max_features = 10000
sequence_length = 200

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

vectorize_layer.adapt(x_train_text)

In [13]:
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 100
MAX_TOKENS_NUM = 500
EMBEDDING_DIMS = 100

Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))

cluster_names = unique_labels + ['None']
                             
text_input = tf.keras.Input(shape=(1,), dtype=tf.string)

text_layer = vectorize_layer(text_input)
text_layer = tf.keras.layers.Embedding(MAX_TOKENS_NUM + 1, EMBEDDING_DIMS)(text_layer)
text_layer = tf.keras.layers.GlobalAveragePooling1D()(text_layer)

# context_input = Input(shape=(1,), dtype='string', name='graph_input')                           
# context_layer = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='multi_hot')(context_input)

# concatenated = Concatenate(axis=1)([text_layer, context_layer])
# final_output = Dense(units=len(unique_labels), activation='softmax')(concatenated)
final_output = Dense(units=len(unique_labels))(text_layer)

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m = Model(inputs=[text_input], outputs=final_output)

m.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=tf.metrics.SparseCategoricalAccuracy())

m.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          50100     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 78)                7878      
Total params: 57,978
Trainable params: 57,978
Non-trainable params: 0
_________________________________________________________________


### PREDICT SIMPLE

In [14]:
i = 55
x_train_text[i], unique_labels[y_train[i]]

('{{}}A cruiseferry is a ship that combines the features of a cruise ship and a Ro-Pax ferry. Many passengers travel with the ships for the cruise experience, staying only a few hours at the destination port or not leaving the ship at all, while others use the ships as means of transportation.Cruiseferry traffic is mainly concentrated in the seas of Northern Europe, especially the Baltic Sea and the North Sea. However, similar ships traffic across the English Channel as well as the Irish Sea, Mediterranean and even on the North Atlantic. Cruiseferries also operate from India, China and Australia.Baltic Sea cruiseferriesIn the northern Baltic Sea, two major rival companies, Viking Line and Silja Line, have for decades competed on the routes between Turku and Helsinki in Finland and Sweden\'s capital Stockholm. Since the 1990s Tallink has also risen as a major company in the area, culminating with acquisition of Silja Line in 2006.List of largest cruiseferries of their timeThe term "crui

In [15]:
BATCH_SIZE = 2
NUM_EPOCHS = 10

x_train_input = pd.DataFrame(x_train_text).values
context_data_train_input = pd.DataFrame(x_train_context).values
y_train_output = pd.DataFrame(y_train).values

for c in context_data_train_input:
    for i,v in enumerate(c):
        c[i] = 'None' if not v else v 

# x_train_input = np.asarray(['Test' for i in range(len(x_train))])
# context_data_train_input = np.asarray(['Art-Painting' for i in range(len(x_train_input))])
# y_train_output = np.asarray([0 for i in range(len(x_train_input))])

history = m.fit(x_train_text, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                # validation_data=(x_val_text, y_val)
               )
# m.save('node2vec_lstm_model')

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
plt.plot(history.history['loss'])
plt.show()
plt.plot(history.history['sparse_categorical_accuracy'])
plt.show()

In [69]:
# Restore the best found model during validation
# m.load_weights(tmpfn)

x_test_input = pd.DataFrame(x_test_text).values
context_data_test_input = pd.DataFrame(context_data_test).values
y_test_output = pd.DataFrame(y_test).values

for c in context_data_test_input:
    for i,v in enumerate(c):
        c[i] = 'None' if not v else v 
        
loss, acc = m.evaluate([x_test_input], y_test_output, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Test loss / test accuracy = 13.5225 / 0.0000


In [70]:
preds = m.predict([x_test_input], batch_size=BATCH_SIZE, verbose=0,
                  steps=None)

'Art-Painting'

In [75]:
def get_result_labels(results):
    return [unique_labels[np.where(row==max(row))[0][0]] for row in results]

result_labels = get_result_labels(preds)

print('Accuracy score: %.2f' % accuracy_score(result_labels, y_test_str))
print(classification_report(y_test_str, result_labels))

Accuracy score: 0.00
                              precision    recall  f1-score   support

                Architecture       0.00      0.00      0.00      74.0
      Architecture-Structure       0.00      0.00      0.00      98.0
                  Art-Cinema       0.00      0.00      0.00       0.0
            Art-Cinema-Actor       0.00      0.00      0.00       0.0
                   Art-Dance       0.00      0.00      0.00       0.0
            Art-Dance-Dancer       0.00      0.00      0.00       0.0
                 Art-Fashion       0.00      0.00      0.00       0.0
        Art-Fashion-Designer       0.00      0.00      0.00       0.0
              Art-Literature       0.00      0.00      0.00       0.0
       Art-Literature-Writer       0.00      0.00      0.00       0.0
                   Art-Music       0.00      0.00      0.00       0.0
        Art-Music-Instrument       0.00      0.00      0.00       0.0
                Art-Painting       0.00      0.00      0.00       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### PREPARE NEURAL NET MODEL WITH EMBEDDINGS


In [None]:
GRAPH CLUSTER      NODE_CLUSTER_MAPPING           GRAPH INPUT
0 Philosophy       Schopenhaur, Kant              Kant, Socrates
1 Music            Led Zeppelin, Pearl Jam

For each element in graph input, find the corresponding cluster, attend over it

1. Finding graph input node clusters:
    We know cluster indexes.
    Write a lambda function to convert graph inputs to clusters then encode them(one-hot encoded)

In [8]:
LSTM_HIDDEN_SIZE = 200
LEARNING_RATE = 0.001
KG_EMBEDDING_DIM = 300

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Preparing model.')

# entity_clusters = K.variable(embedding_clusters_2d)

Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
DotProduct = keras.layers.core.Lambda(lambda x: K.dot(x[0], x[1]))  # , output_shape=(KG_EMBEDDING_DIM, ))
Sum = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1))  # , output_shape=(KG_EMBEDDING_DIM, ))
RemoveLastCol = keras.layers.core.Lambda(lambda x: K.sum(x, axis=-1))
Transpose = keras.layers.core.Lambda(lambda x: K.transpose(x))
FakeEClusterIn = keras.layers.core.Lambda(lambda x: node_embedding_clusters)
FindCluster = keras.layers.core.Lambda(lambda x: tf.map_fn(lambda el: node_cluster_mapping[el] if el in node_cluster_mapping else 0, x, dtype='string')

cluster_names = unique_labels + ['None']
                                       
main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')

x = embedding_layer(main_input)
x = CuDNNLSTM(LSTM_HIDDEN_SIZE, return_sequences=True)(x)
Avg = keras.layers.core.Lambda(lambda x: K.mean(x, axis=1), output_shape=(LSTM_HIDDEN_SIZE,))
x = Avg(x)
x = Dense(LSTM_HIDDEN_SIZE)(x)
main_lstm_out = Activation('relu')(x)

# get representation for entity clusters
e_clusters = Input(name='e_clusters')
# x = FakeEClusterIn(x)
# print("Shape", K.int_shape(x))
x = e_clusters
x = Conv1D(filters=1, kernel_size=5, strides=1, input_shape=node_embeddings.shape)(x)
x = MaxPooling1D(pool_size=5, strides=5)(x)
x = Conv1D(filters=1, kernel_size=5, strides=1)(x)
x = MaxPooling1D(pool_size=5, strides=1)(x)
print("Before removing last col", K.int_shape(x))
x = RemoveLastCol(x)
print("after removing last col", K.int_shape(x))
entity_cluster_reps = Reshape([KG_EMBEDDING_DIM], name='entity_cluster_reps')(x)
print("entity_cluster_reps(after reshape)", K.int_shape(entity_cluster_reps))

graph_input = Input(shape=(,), dtype='string', name='graph_input')                           
x = FindCluster(x)
graph = tf.keras.layers.StringLookup(vocabulary=cluster_names, output_mode='one_hot')(x)
                               
# attention over entities
att_scores = DotProduct([entity_cluster_reps, graph])
print("att_scores_entities", K.int_shape(att_scores))
# att_normalized = Activation('softmax',name='entity_attention')(att_scores)
att_normalized = Softmax(axis=-1, name='entity_attention')(att_scores)
print("att_normalized", K.int_shape(att_normalized))
the_entity = DotProduct([Transpose(entity_cluster_reps), att_normalized])
print("the_entity", K.int_shape(the_entity))

lstm_hidden_and_entity = Concatenate(axis=0)([Transpose(main_lstm_out), the_entity])
print("lstm_hidden_and_entity", K.int_shape(lstm_hidden_and_entity))
# input("continue?")

final_output = Dense(units=len(unique_labels), activation='softmax')(Transpose(lstm_hidden_and_entity))

optimizer = Adam(lr=LEARNING_RATE, clipvalue=0.25)

m = Model(inputs=[main_input,e_clusters,graph_input], outputs=final_output)

m.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn, monitor='val_acc', save_best_only=True, save_weights_only=True)]

m.summary()

from tensorflow.keras.utils import plot_model
# plot_model(m, to_file='model.png')

_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [ModelCheckpoint(tmpfn,monitor='val_acc', save_best_only=True, save_weights_only=True)]

Preparing model.
Shape (89539, 100)


ValueError: Input 0 of layer max_pooling1d is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (89539, 100)

### PREDICT WITH EMBEDDINGS

In [26]:
BATCH_SIZE = 2
NUM_EPOCHS = 10

history = m.fit(x_train, y_train,
                batch_size=BATCH_SIZE,
                epochs=NUM_EPOCHS,
                validation_data=(x_val, y_val), 
                callbacks=callbacks)
m.save('node2vec_lstm_model')

Epoch 1/10
 349/2418 [===>..........................] - ETA: 1:03:41 - loss: 4.4996 - acc: 0.0000e+ - ETA: 1:50 - loss: 7.3437 - acc: 0.0000e+00   - ETA: 1:49 - loss: 7.8418 - acc: 0.0000e+0 - ETA: 1:48 - loss: 7.4011 - acc: 0.0000e+0 - ETA: 1:47 - loss: 7.0167 - acc: 0.0000e+0 - ETA: 1:47 - loss: 6.8506 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.8360 - acc: 0.0000e+0 - ETA: 1:46 - loss: 6.9715 - acc: 0.0000e+0 - ETA: 1:46 - loss: 7.2541 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.4608 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5667 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.5049 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3822 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.3906 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.2569 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1678 - acc: 0.0000e+0 - ETA: 1:45 - loss: 7.1228 - acc: 0.0000e+0 - ETA: 1:45 - loss: 6.9626 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.8294 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7574 - acc: 0.0000e+0 - ETA: 1:44 - loss: 6.7398 - acc: 0.0122    - ETA: 1:44 - los

KeyboardInterrupt: 

In [63]:
# Restore the best found model during validation
# m.load_weights(tmpfn)

loss, acc = m.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



Test loss / test accuracy = 1.6824 / 0.6592


In [64]:
preds = m.predict(x_test, batch_size=BATCH_SIZE, verbose=0,
                  steps=None)

In [65]:
def get_result_labels(results):
    return [unique_labels[np.where(row==max(row))[0][0]] for row in results]

result_labels = get_result_labels(preds)

print('Accuracy score: %.2f' % accuracy_score(result_labels, y_str_test))
print(classification_report(y_str_test, result_labels))