In [1]:
from __future__ import print_function

import tensorflow as tf

import os
import sys
import numpy as np
import pickle5 as pickle
import tempfile

import keras
from keras.preprocessing.text import Tokenizer
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Input, Embedding, Activation, Softmax
from keras.layers import CuDNNLSTM
from keras.models import Model, Sequential
from keras.layers import Conv1D,Conv2D, ZeroPadding2D, MaxPooling1D, MaxPooling2D
from keras.layers import RepeatVector, Permute, Add, Concatenate, Reshape, Dot
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray
from gensim.models import Word2Vec
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
from graph_visualization import GraphVisualization

np.random.seed(100)

MAX_NUM_WORDS = 20000
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2

NUM_RELATIONS_PER_CLUSTER = 67
NUM_ENTITIES_PER_CLUSTER = 400
NUM_CLUSTERS = 20

def get_clusters(cluster_file, num_things_per_cluster):
    clusters = []  # np.ones(shape=(NUM_RELATIONS_PER_CLUSTER,KG_EMBEDDING_DIM))

    with open(cluster_file, 'r', encoding='utf8') as f:
        lines = []
        for line in f:
            elements = line.split()
            x = [[e] for e in elements]
            lines.append(x)

    for i in range(0, len(lines) - num_things_per_cluster + 1, num_things_per_cluster):
        # print("appending: {} to {}".format(i,i+num_things_per_cluster))
        clusters.append(lines[i:i + num_things_per_cluster])

    clusters = np.asarray(clusters, dtype='float32')
    return clusters


def pickler(path, pkl_name, obj):
    with open(os.path.join(path, pkl_name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def unpickler(path, pkl_name):
    with open(os.path.join(path, pkl_name), 'rb') as f:
        obj = pickle.load(f)
    return obj

def get_labels(data):
    labels = []
    for d in data:
        labels.append(d['label']) if d['label'] not in labels else None

    return labels


def get_x_and_y(data):
    x, y = [], []
    for d in data:
        for dd in d['data']:
            tmp = dd['text'].replace('\n', '').replace('_', '')  # clean
            x.append({'label': d['label'], 'dbpedia_uri': dd['dbpedia_uri'], 'context_data': dd['context_data'], 'text': tmp, 'graph': dd['graph']}) if len(tmp) > 0 else None
            y.append(d['label']) if len(tmp) > 0 else None

    return x, y


def get_label_index(label):
    return [index for index, _label in enumerate(unique_labels) if label == _label][0]


In [2]:
def find_clusters(arr):    
    result = []
    for el in arr:
        all_mappings = node_cluster_mapping_with_count[el] if el in node_cluster_mapping_with_count else []
        filtered_mappings = [e[0] for e in all_mappings[:min(len(all_mappings),3)]]
        result.extend(filtered_mappings)
    
    return result

def get_context_data(data):
    for d in data:
        for dd in d['data']:
            dd['context_data'] = [c for c in set(find_clusters(dd['context_graph']['nodes']))]

In [3]:
node_embeddings_v3 = unpickler('data','node_embeddings_v3.pkl')
node_embeddings_v4 = unpickler('data','node_embeddings_v4.pkl')
graph = unpickler('data','graph_v3.pkl')
clusters = unpickler('data','node_clusters_v5.pkl')
clusters_with_count = unpickler('data','node_clusters_v5_with_count.pkl')
node_cluster_mapping_with_count= unpickler('data','node_cluster_mapping_v5_with_count.pkl')

In [29]:
data = unpickler('data','classification_data_with_graphs_v5.pkl')

unique_labels = get_labels(data)

get_context_data(data)

x,y = get_x_and_y(data)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
x_val, y_val,x_test, y_test = x_test[:500], y_test[:500], x_test[500:], y_test[500:]
y_train_int, y_test_int = [get_label_index(label) for label in y_train],[get_label_index(label) for label in y_test]
x_train_text,x_val_text,x_test_text = [xx['text'] for xx in x_train],[xx['text'] for xx in x_val], [xx['text'] for xx in x_test]
x_train_context, x_val_context, x_test_context = [xx['context_data'] for xx in x_train],[xx['context_data'] for xx in x_val], [xx['context_data'] for xx in x_test]
x_train_text, x_test_text, y_train_int = np.array(x_train_text), np.array(x_test_text), np.array(y_train_int)

doc_indexes = list(range(0,len(x_train_text)))
# doc_indexes = list(map(lambda x: x + len(unique_labels), doc_indexes))
doc_indexes = np.array(doc_indexes)

In [30]:
doc_indexes

array([   0,    1,    2, ..., 4490, 4491, 4492])

## DNN

In [5]:
max_features = 10000
sequence_length = 200

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

with tf.device("/CPU:0"):
    vectorize_layer.adapt(x_train_text)

In [6]:
MAX_TOKENS_NUM = 5000  # Maximum vocab size.
EMBEDDING_DIMS = 100

text_input = tf.keras.Input(shape=(1,), dtype=tf.string)

vectorized_layer = vectorize_layer(text_input)

text_layer = tf.keras.layers.Embedding(MAX_TOKENS_NUM + 1, EMBEDDING_DIMS)(vectorized_layer)
text_layer = tf.keras.layers.GlobalAveragePooling1D()(text_layer)

text_layer_doc = tf.keras.layers.Embedding(MAX_TOKENS_NUM + 1, EMBEDDING_DIMS)(vectorized_layer)
text_layer_doc = tf.keras.layers.GlobalAveragePooling1D()(text_layer_doc)

text_layer_output = tf.keras.layers.Dense(units=len(unique_labels))(text_layer)
text_layer_doc_output = tf.keras.layers.Dense(units=len(x_train))(text_layer_doc)

# final_output = tf.keras.layers.Concatenate(axis=1)([text_layer_output,text_layer_doc_output])

model_simple = tf.keras.models.Model(inputs=[text_input], outputs=[text_layer_output,text_layer_doc_output])
model_simple.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.SparseCategoricalAccuracy())
model_simple.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
text_vectorization (TextVectori (None, 200)          0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 100)     500100      text_vectorization[0][0]         
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 100)     500100      text_vectorization[0][0]         
______________________________________________________________________________________________

In [31]:
epochs =  20
history = model_simple.fit(
    x_train_text,
    y=(y_train_int,doc_indexes),
    epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
plt.plot(history.history['sparse_categorical_accuracy'])
plt.show()
plt.plot(history.history['loss'])
plt.show()

KeyError: 'sparse_categorical_accuracy'

In [27]:
results = model_simple.predict(x_test_text)
def get_result_labels(results):
    return [unique_labels[np.where(row==max(row))[0][0]] for row in results]

result_labels = get_result_labels(results[0])
print('Accuracy score: %.2f' % accuracy_score(result_labels, y_test))
print(classification_report(y_test, result_labels))


Accuracy score: 0.63
                              precision    recall  f1-score   support

                Architecture       0.84      0.84      0.84        25
      Architecture-Structure       0.77      0.77      0.77        26
                  Art-Cinema       0.60      0.50      0.55        24
            Art-Cinema-Actor       0.12      0.04      0.06        23
                   Art-Dance       0.88      0.67      0.76        21
            Art-Dance-Dancer       0.47      0.68      0.56        22
                 Art-Fashion       0.76      0.67      0.71        24
        Art-Fashion-Designer       0.56      0.58      0.57        24
           Art-Fashion-Model       0.00      0.00      0.00        11
              Art-Literature       0.60      0.54      0.57        28
       Art-Literature-Writer       0.52      0.53      0.52        30
                   Art-Music       0.82      0.87      0.84        31
        Art-Music-Instrument       0.87      0.80      0.83        2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
