In [42]:
import tensorflow as tf
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
from loading_preprocessing_TC import *
import pickle
import json
import os
import requests

MODEL_DIR = 'out/data/semeval/models'
DATASET_PATH = 'resources/datasets/semeval/train/'
DATA_PATH = 'out/data/semeval/'
MODEL_PATH = 'out/data/semeval/models/'
NEURON_COUNT_PATH = 'out/data/semeval/neuron_count.json'
POS_PER_NEURON_PATH = 'out/data/semeval/pos_per_neuron.json'

MAX_LENGTH = 200
model = None
tokenizer = None
embeddings = None
vocabulary_encoded = None
vocabulary_inv = None
qa_pairs = None
answer_texts = None
graph = None

NEURON_MAX = 128


def load_data():
    """Load SemEval 2017 files from .xml and convert them into pandas dataframes.
    Args:

    Returns:
        train (pandas dataframe): QA-pairs in a format question - correct answers (ids) - pool (ids; incorrect answers).
        If there are multiple correct answers to a single question, they are split into multiple QA - pairs.
        answer_texts_train (pandas dataframe): answer texts and their ids.
    """
    files = [DATASET_PATH + 'SemEval2016-Task3-CQA-QL-train-part1-subtaskA.xml',
             DATASET_PATH + 'SemEval2016-Task3-CQA-QL-train-part2-subtaskA.xml']
    train_xml = read_xml(files)
    train, answer_texts_train = xml2dataframe_Labels(train_xml, 'train')
    answer_texts_train.set_index('answer_id', drop=False, inplace=True)
    return train, answer_texts_train


def load_model(new_model_filename):
    """Load a pretrained model from PyTorch / Keras checkpoint.
    Args:
        new_model_filename (string): the name of the model used when saving its weights and architecture to
        either a binary (PyTorch) or a .h5 and a .json (Keras)

    Returns:
        error (string): The error message displayed to a user. If empty, counts as no error.
    """
    global model, model_filename
    print("Loading model:", new_model_filename)
    try:
        json_file = open(MODEL_PATH + new_model_filename + '.json',
                         'r')
        loaded_model_json = json_file.read()
        json_file.close()
        model = model_from_json(loaded_model_json)
        global graph
        graph = tf.get_default_graph()
        # load weights into new model
        model.load_weights(MODEL_PATH + new_model_filename + ".h5")
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model_filename = new_model_filename
        return model
    except Exception as e:
        print(e)
        error = "<div class=\"alert alert-warning\"> Sorry, there is something wrong with the model: <br> " + str(
            e) + "</div>"
        return error
    

def load_environment():
    """Load documents index for search engine, pre-trained embeddings, vocabulary, parameters and the model."""
    global model, tokenizer, embeddings, vocabulary_encoded, vocabulary_inv, qa_pairs, answer_texts, graph
    with open(DATA_PATH + 'tokenizer.p', 'rb') as handle:
        tokenizer = pickle.load(handle)
    with open(DATA_PATH + 'embedding_matrix.p', 'rb') as handle:
        embeddings = pickle.load(handle)
    vocabulary_encoded = tokenizer.word_index
    vocabulary_inv = {v: k for k, v in vocabulary_encoded.items()}
    model = load_model('model_visualization_siamesedeeplstm')
    qa_pairs, answer_texts = load_data()

    return model


def prepare_data(texts):
    """Tokenize texts and pad resulting sequences of words using Keras functions."""
    global tokenizer, embeddings
    tokens = tokenizer.texts_to_sequences(texts)
    padded_tokens = pad_sequences(tokens, maxlen=MAX_LENGTH, value=embeddings.shape[0] - 1)
    return tokens, padded_tokens


def visualize_model_deep(model, question_lstm=True):
    """Retrieve weights of the second shared LSTM to visualize neuron activations."""
    recurrent_layer = model.get_layer('SharedLSTM2')
    output_layer = model.layers[-1]

    inputs = []
    inputs.extend(model.inputs)

    outputs = []
    outputs.extend(model.outputs)
    if question_lstm:
        outputs.append(recurrent_layer.get_output_at(1))
    else:
        outputs.append(recurrent_layer.get_output_at(0))

    global graph
    with graph.as_default():
        all_function = K.function(inputs, outputs)
        output_function = K.function([output_layer.input], model.outputs)
    return all_function, output_function


def get_neuron_attention_per_token(rnn_values, texts, tokens, neuron):
    result = []
    all_tokens = []
    for idx in range(0, len(texts)):
        current_neuron_values = rnn_values[idx, :, neuron]
        current_neuron_values = current_neuron_values[-len(tokens[idx]):]
        words = [vocabulary_inv[x] for x in tokens[idx]]
        current_strings = []
        for score, word in zip(current_neuron_values, words):
            current_string = (word, score)
            current_strings.append(current_string)
        result.append(current_strings)
        all_tokens.append(words)
    return result, all_tokens


def convert_from_ud_to_array(raw_ud_input):
    result = []
    for line in raw_ud_input.split('\n'):
        if not line.startswith('#') and line.strip() != '':
            result.append(line.split())
    return result

def align_tokens_and_ud(token_score_tuples, ud_output):
    result = []
    # print('token_score_tuples', token_score_tuples)
    # print('udpipe_output', ud_output)
    
    temp = convert_from_ud_to_array(ud_output)
    for idx in range(len(token_score_tuples)):
        result.append((token_score_tuples[idx][0], token_score_tuples[idx][1], temp[idx][3], temp[idx][4]))
    
    return result


def highlight_neuron(rnn_values, texts, tokens, scale, neuron):
    """Generate HTML code where each word is highlighted according to a given neuron activity on it."""
    tag_string = "<span data-toggle=\"tooltip\" title=\"SCORE\"><span style = \"background-color: rgba(COLOR, OPACITY);\">WORD</span></span>"
    old_texts = texts
    texts = []
    for idx in range(0, len(old_texts)):
        current_neuron_values = rnn_values[idx, :, neuron]
        current_neuron_values = current_neuron_values[-len(tokens[idx]):]
        words = [vocabulary_inv[x] for x in tokens[idx]]
        current_strings = []
        if scale:
            scaled = [
                ((x - min(current_neuron_values)) * (2) / (
                        max(current_neuron_values) - min(current_neuron_values))) + (
                    -1)
                for x in current_neuron_values]
        else:
            scaled = current_neuron_values
        for score, word, scaled_score in zip(current_neuron_values, words, scaled):
            if score > 0:
                color = '195, 85, 58'
            else:
                color = '63, 127, 147'
            current_string = tag_string.replace('SCORE', str(score)).replace('WORD', word).replace('OPACITY', str(
                abs(scaled_score))).replace('COLOR', color)
            current_strings.append(current_string)
        texts.append(' '.join(current_strings))
    return texts


model = load_environment()
print("Finished loading.")

Loading model: model_visualization_siamesedeeplstm
Threads:  1790
Questions:  1790
Comments:  17900
Finished loading.


In [3]:
# Generate pos_tagged_questions 
# This is only for testing the method when we generate answers in the next step.

# nltk.download('averaged_perceptron_tagger')
# POS_TAGGED_QUESTIONS_PATH = 'out/data/semeval/pos_tagged_questions.json'
# 
# udpipe_URL = 'http://lindat.mff.cuni.cz/services/udpipe/api/process'
# headers = {
#     'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8'
# }
# 
# print(len(qa_pairs))
# if not os.path.isfile(POS_TAGGED_QUESTIONS_PATH):
#     print('File not found! Creating new version...')
#     tagged_dict = {}
#     max_print = 20
#     current_print = 0
#     for i in range(len(qa_pairs) - 1750):
#         row = qa_pairs.iloc[i]
#         question = row['question']
#         
#         # tokens = nltk.word_tokenize(question)
#         # tagged = nltk.pos_tag(tokens)
#         
#         data = {
#             'data': question,
#             'model': 'english-gum-ud-2.4-190531',
#             'tokenizer': '',
#             'tagger': '--tag',
#             'parser': ''
#         }
#         response = requests.post(udpipe_URL, headers=headers, data=data)
#         response.encoding = 'utf-8'
#         udpipe_output = response.json()['result']
#         
#         tagged_dict[i] = udpipe_output
#         
#         if current_print < max_print:
#             print(udpipe_output)
#             current_print += 1
#         
#     print('Writing to file...')
#     with open(POS_TAGGED_QUESTIONS_PATH, 'w') as file:
#         json.dump(tagged_dict, file)
#         print('Finished!')
# else:
#     print('Loading existing file.')
#     with open(POS_TAGGED_QUESTIONS_PATH, 'r') as file:
#         tagged_dict = json.load(file)


1790
File not found! Creating new version...
# newdoc
# newpar
# sent_id = 1
# text = massage oil is there any place i can find scented massage oils in qatar?
1	massage	massage	NOUN	NN	Number=Sing	2	compound	_	_
2	oil	oil	NOUN	NN	Number=Sing	4	nsubj	_	_
3	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	4	cop	_	_
4	there	there	ADV	RB	PronType=Dem	9	advmod	_	_
5	any	any	DET	DT	_	7	det	_	_
6	place	place	NOUN	NN	Number=Sing	7	compound	_	_
7	i	i	NOUN	NN	Number=Sing	9	nsubj	_	_
8	can	can	AUX	MD	VerbForm=Fin	9	aux	_	_
9	find	find	VERB	VB	VerbForm=Inf	0	root	_	_
10	scented	scented	ADJ	JJ	Degree=Pos	12	amod	_	_
11	massage	massage	NOUN	NN	Number=Sing	12	compound	_	_
12	oils	oil	NOUN	NNS	Number=Plur	9	obj	_	_
13	in	in	ADP	IN	_	14	case	_	_
14	qatar	qatar	NOUN	NN	Number=Sing	9	obl	_	SpaceAfter=No
15	?	?	PUNCT	.	_	9	punct	_	SpaceAfter=No


# newdoc
# newpar
# sent_id = 1
# text = Philipino Massage center
1	Philipino	Philipino	PROPN	NNP	Number=Sing	3	compound	_	_
2	Massage	massage

In [43]:
indices = [0, 1]
neuron = 0

# Start actual visualization
all_highlighted_wrong_answers = []
all_wrong_answers = []

min_ca = 1

min_wa = 1
max_ca = -1
max_wa = -1

activated_words = []
activated_words_values = []
antiactivated_words = []
antiactivated_words_values = []

activation_per_word_data = {}
asked_questions = {}

all_function_deep, output_function_deep = visualize_model_deep(model, False)
nlp = spacy.load('en')


In [71]:

"""
neuron_counts

[
    {
        num: 0,
        # tokens = [('Yes', 'UH'), ('Try', 'VB'), ...]
        token_counts: {
            UH: 1,
            VB: 1,
            ...
        }
    },
    ...
]
"""
neuron_counts = []

if os.path.exists(NEURON_COUNT_PATH):
    print('Loading existing file...')
    with open(NEURON_COUNT_PATH, 'r') as file:
        neuron_counts = json.load(file)
    print('Done.')
else:
    print('Generating new file...')
    udpipe_URL = 'http://lindat.mff.cuni.cz/services/udpipe/api/process'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8'
    }
    
    for neuron_num in range(NEURON_MAX - 127):
        print('neuron', neuron_num)
        neuron_count = {
            'neuron': neuron_num,
            # 'tokens': [],
            'token_counts': {}
        }
        for i in range(87, 89): #indices:
            print('Generating activations for QA pair', i)
            neuron_count = {
                'num': i,
                'token_counts': {}
            }
            row = qa_pairs.iloc[i]
            correct_answers = answer_texts.loc[row['answer_ids']]['answer'].values
            wrong_answers = answer_texts.loc[row['pool']]['answer'].values
            question = row['question']
            asked_questions[i] = question
            q_tokens, q_padded_tokens = prepare_data([question])
            ca_tokens, ca_padded_tokens = prepare_data(correct_answers)
            wa_tokens, wa_padded_tokens = prepare_data(wrong_answers)
            if len(correct_answers) > 0:
                scores_ca, rnn_values_ca = all_function_deep([q_padded_tokens * len(correct_answers), ca_padded_tokens])
        
                tuples, all_tokens = get_neuron_attention_per_token(rnn_values_ca, correct_answers, ca_tokens, neuron)
                for idx in range(len(all_tokens)):
                    data = {
                        'data': ' '.join(all_tokens[idx]),
                        'model': 'english-gum-ud-2.4-190531',
                        'tokenizer': '',
                        'tagger': '--tag',
                        'parser': ''
                    }
                    response = requests.post(udpipe_URL, headers=headers, data=data)
                    response.encoding = 'utf-8'
                    udpipe_output = response.json()['result']
                    current_pos_scores = align_tokens_and_ud(tuples[idx], udpipe_output)
                    # [('most', 0.73064023, 'ADJ', 'JJS'), ('of', 0.031687938, 'ADP', 'IN'), ('the', 0.008439351, 'DET', 'DT'), ('time', 7.566358e-05, 'NOUN', 'NN'), ('hijacking', 0.00023871037, 'VERB', 'VBG'), ('shifts', 0.00029278902, 'VERB', 'VBZ'), ('the', 0.00026579967, 'DET', 'DT'), ('main', 0.026925175, 'ADJ', 'JJ'), ('topic', 0.0046378975, 'NOUN', 'NN'), ('to', 0.0003025322, 'ADP', 'TO'), ('a', 0.00088415114, 'DET', 'DT'), ('different', 0.0012040904, 'ADJ', 'JJ'), ('one', 0.009830474, 'NUM', 'CD'), ('and', 0.00029199503, 'CCONJ', 'CC'), ('then', 0.0035359005, 'ADV', 'RB'), ('to', 0.00042696742, 'ADP', 'TO'), ('another', 0.00050246046, 'DET', 'DT'), ('different', 0.0010206797, 'ADJ', 'JJ'), ('one', 0.0062409323, 'NUM', 'CD'), ('and', 0.00012129463, 'CCONJ', 'CC'), ('so', 0.00026837253, 'ADV', 'RB'), ('on', 0.00025421553, 'ADP', 'IN')]
                    for current_tuple in current_pos_scores:
                        if current_tuple[3] in neuron_count['token_counts']:
                            neuron_count['token_counts'][current_tuple[3]] = neuron_count['token_counts'][current_tuple[3]] + abs(current_tuple[1])
                        else:
                            neuron_count['token_counts'][current_tuple[3]] = abs(current_tuple[1])
            else:
                pass
        
            if len(wrong_answers) > 0:
                scores_wa, rnn_values_wa = all_function_deep([q_padded_tokens * len(wrong_answers), wa_padded_tokens])
        
                tuples, all_tokens = get_neuron_attention_per_token(rnn_values_wa, wrong_answers, wa_tokens, neuron)
                for idx in range(len(all_tokens)):
                    data = {
                        'data': ' '.join(all_tokens[idx]),
                        'model': 'english-gum-ud-2.4-190531',
                        'tokenizer': '',
                        'tagger': '--tag',
                        'parser': ''
                    }
                    response = requests.post(udpipe_URL, headers=headers, data=data)
                    response.encoding = 'utf-8'
                    udpipe_output = response.json()['result']
                    current_pos_scores = align_tokens_and_ud(tuples[idx], udpipe_output)
                    # [('most', 0.73064023, 'ADJ', 'JJS'), ('of', 0.031687938, 'ADP', 'IN'), ('the', 0.008439351, 'DET', 'DT'), ('time', 7.566358e-05, 'NOUN', 'NN'), ('hijacking', 0.00023871037, 'VERB', 'VBG'), ('shifts', 0.00029278902, 'VERB', 'VBZ'), ('the', 0.00026579967, 'DET', 'DT'), ('main', 0.026925175, 'ADJ', 'JJ'), ('topic', 0.0046378975, 'NOUN', 'NN'), ('to', 0.0003025322, 'ADP', 'TO'), ('a', 0.00088415114, 'DET', 'DT'), ('different', 0.0012040904, 'ADJ', 'JJ'), ('one', 0.009830474, 'NUM', 'CD'), ('and', 0.00029199503, 'CCONJ', 'CC'), ('then', 0.0035359005, 'ADV', 'RB'), ('to', 0.00042696742, 'ADP', 'TO'), ('another', 0.00050246046, 'DET', 'DT'), ('different', 0.0010206797, 'ADJ', 'JJ'), ('one', 0.0062409323, 'NUM', 'CD'), ('and', 0.00012129463, 'CCONJ', 'CC'), ('so', 0.00026837253, 'ADV', 'RB'), ('on', 0.00025421553, 'ADP', 'IN')]
                    for current_tuple in current_pos_scores:
                        if current_tuple[3] in neuron_count['token_counts']:
                            neuron_count['token_counts'][current_tuple[3]] = neuron_count['token_counts'][current_tuple[3]] + abs(current_tuple[1])
                        else:
                            neuron_count['token_counts'][current_tuple[3]] = abs(current_tuple[1])
            else:
                pass
        
            print(neuron_count)
            neuron_counts.append(neuron_count)
        #%%
        for neuron in neuron_counts:
            total_attention = sum(neuron['token_counts'].values())
            print(total_attention)
            neuron['pos_percents'] = {}
            remainders = {}
            for key in neuron['token_counts']:
                rounded_value = round(neuron['token_counts'][key] / total_attention * 100, 2)
                remainders[key] = rounded_value - np.floor(rounded_value)
                neuron['token_counts'][key] = rounded_value
            # {'JJS': 10.55, 'IN': 7.46, 'DT': 10.7, 'NN': 7.2, 'VBG': 1.67, 'VBZ': 3.76, 'JJ': 4.19, 'TO': 0.09, 'CD': 4.2, 'CC': 2.26, 'RB': 7.34, 'LS': 6.37, 'VBD': 4.64, 'NNS': 1.94, 'VBN': 0.03, 'PRP': 7.16, 'MD': 9.69, 'VB': 3.65, 'VBP': 0.53, 'WP': 0.0, 'PRP$': 0.41, 'UH': 0.31, 'EX': 0.0, 'RP': 1.09, 'WRB': 0.0, 'JJR': 4.8}
            print(neuron['token_counts'])
            print(remainders)
            pos_percents_rounded = {}
            for key, value in neuron['token_counts'].items():
                pos_percents_rounded[key] =  np.floor(neuron['token_counts'][key])
            remainders_desc = sorted(remainders.items(), key=lambda x: x[1], reverse=True)
            print(sum(neuron['token_counts'].values()))
            # Largest Remainder Method to roughly split values to add up to 100%
            # Also, JSON does not encode float32.
            # https://en.wikipedia.org/wiki/Largest_remainder_method
            print(remainders_desc)
            print(pos_percents_rounded)
            print(100 - sum(pos_percents_rounded.values()))
            to_allocate = 100 - sum(pos_percents_rounded.values())
            for current_tuple in remainders_desc:
                if to_allocate == 0:
                    break
                else:
                    pos_percents_rounded[current_tuple[0]] += 1
                    to_allocate -= 1
            print(pos_percents_rounded)
            print(sum(pos_percents_rounded.values()))
            for key in pos_percents_rounded:
                neuron['token_counts'][key] = pos_percents_rounded[key]
    
     
    

Generating new file...
neuron 0
Generating activations for QA pair 87


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [69]:
with open(NEURON_COUNT_PATH, 'w') as file:
    json.dump(neuron_counts, file)

In [28]:
key_set = set()
for item in neuron_counts:
    for key in list(item['token_counts'].keys()):
        key_set.add(key)

# ['$', "''", 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
print(sorted(key_set))
key_list = ['$', "''", 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

neuron_nums = []
for num in range(128):
    neuron_nums.append('Neuron ' + str(num))
# print(neuron_nums)

key_per_neuron = {'$': [], "''": [], 'CC': [], 'CD': [], 'DT': [], 'EX': [],
                  'FW': [], 'IN': [], 'JJ': [], 'JJR': [], 'JJS': [], 'MD': [],
                  'NN': [], 'NNP': [], 'NNPS': [], 'NNS': [], 'PDT': [], 'POS': [],
                  'PRP': [], 'PRP$': [], 'RB': [], 'RBR': [], 'RBS': [], 'RP': [],
                  'TO': [], 'UH': [], 'VB': [], 'VBD': [], 'VBG': [], 'VBN': [],
                  'VBP': [], 'VBZ': [], 'WDT': [], 'WP': [], 'WP$': [], 'WRB': []
                  }
for neuron in neuron_counts:
    for key in key_list:
        if key not in neuron['token_counts']:
            key_per_neuron[key].append(0)
        else:
            key_per_neuron[key].append(neuron['token_counts'][key])
# print(key_per_neuron)

['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'PDT', 'PRP', 'PRP$', 'RB', 'RBR', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WP', 'WRB']


In [10]:
# Values
trace_x = []
# Labels
trace_y = []

plotly_tsne = []
for key in key_list:
    trace_neuron = {
        'name': key,
        'x': key_per_neuron[key],
        'y': neuron_nums,
        'orientation': 'h',
        'type': 'bar'
    }
    plotly_tsne.append(trace_neuron)

plotly_tsne_as_json = pd.Series(plotly_tsne).to_json(orient='values')
with open(POS_PER_NEURON_PATH, 'w') as file:
    json.dump(plotly_tsne_as_json, file)