In [1]:
import glob
import h5py
import imp
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import sys
import tensorflow as tf
import tensorflow_hub as hub

import src.models.bilm
import src.models.bilm.data
# imp.reload(src.models.bilm.data)
# imp.reload(src.models.bilm)

from pathlib import Path
from pprint import pprint
from src import DATA_DIR
from src.models.bilm.model import dump_bilm_embeddings, dump_bilm_embeddings_with_tokens
from src.models.bilm.data import Vocabulary, UnicodeCharsVocabulary

# Echo the data root imported from `src`, so the saved notebook records which
# directory every path below is resolved against.
print(DATA_DIR)

/home/z1079621/storage/embeddings


In [2]:
# Vocabulary file under DATA_DIR/elmo; `vocab_file` is also handed to the
# embedding dump in a later cell.
vocab_file = os.path.join(DATA_DIR, 'elmo', 'vocab_elmo.txt')
# Used below to decode arrays of token ids back into text.
elmo_vocab = Vocabulary(vocab_file)

In [3]:
def get_latest_version_number(data_dir=None, prefix='elmo_our_embeddings_'):
    """Return the version number of the most recently modified embeddings dump.

    Scans ``<data_dir>/elmo`` for entries whose file name starts with
    ``prefix`` immediately followed by digits, and returns those digits (as an
    int) for the entry with the newest modification time. Note that "latest"
    means newest mtime, not highest number.

    Args:
        data_dir: Root directory that contains the ``elmo`` sub-directory.
            Defaults to the module-level ``DATA_DIR``.
        prefix: File-name prefix that precedes the version number.

    Returns:
        The version number of the newest matching entry, or -1 when no entry
        with a version number exists.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    # Escape both parts so glob metacharacters in the directory or the prefix
    # are matched literally; only the trailing '*' is a wildcard.
    elmo_dir = glob.escape(os.path.join(str(data_dir), 'elmo'))
    pattern = os.path.join(elmo_dir, glob.escape(prefix) + '*')

    # Raw string + re.escape: no invalid-escape warning, prefix taken literally.
    version_re = re.compile(re.escape(prefix) + r'(\d+)')

    latest_mtime = None
    latest_number = -1
    for name in glob.glob(pattern):
        # Match on the file name only, so digits in parent directories can
        # never be mistaken for the version.
        found = version_re.match(os.path.basename(name))
        if found is None:
            # Matches the glob but carries no version number; ignore it.
            continue
        mtime = Path(name).stat().st_mtime
        # Strict '>' keeps the first entry on ties.
        if latest_mtime is None or mtime > latest_mtime:
            latest_mtime = mtime
            latest_number = int(found.group(1))
    return latest_number

In [4]:
def print_sentences(x1, x2, batch_size):
    """Print the decoded premise/hypothesis pair for rows 0..batch_size-1.

    For every row, zeros are trimmed from both ends of the id sequence, the
    remaining ids are shifted down by one and decoded with ``elmo_vocab``.
    (Presumably 0 is the padding id and ids are stored with a +1 offset;
    confirm against the code that produced the arrays.)

    Args:
        x1: Indexable batch of premise id sequences.
        x2: Indexable batch of hypothesis id sequences.
        batch_size: Number of leading rows to print from each batch.
    """
    labelled_batches = (("Premise:", x1), ("Hypo:", x2))
    for row in range(batch_size):
        for label, batch in labelled_batches:
            token_ids = np.trim_zeros(batch[row]) - 1
            print(label, elmo_vocab.decode(token_ids))

In [5]:
# Version suffix of the dump files loaded in the next cell.
# get_latest_version_number() returns -1 when it finds no dump at all.
latest_version_number = get_latest_version_number()
latest_version_number

10

In [6]:
# The three dumps of one run share the same version suffix; build their paths
# from the per-file templates in one pass.
inputs_path, embeddings_path, weighted_embeddings_path = [
    os.path.join(DATA_DIR, template % latest_version_number)
    for template in (
        'elmo/elmo_our_inputs_%d.npy',
        'elmo/elmo_our_embeddings_%d.npy',
        'elmo/elmo_our_weighted_embeddings_%d.npy',
    )
]

# Loaded one by one, in this order, so a missing file fails on its own line.
elmo_our_inputs = np.load(inputs_path)
elmo_our_embeddings = np.load(embeddings_path)
elmo_our_weighted_embeddings = np.load(weighted_embeddings_path)

In [7]:
# One shape per line: inputs, per-layer embeddings, weighted embeddings.
print(
    elmo_our_inputs.shape,
    elmo_our_embeddings.shape,
    elmo_our_weighted_embeddings.shape,
    sep='\n',
)

(4, 14)
(4, 3, 14, 1024)
(4, 14, 1024)


In [8]:
# Decode every row of ids back to text: trim zeros from both ends, shift the
# ids down by one, decode, then drop the first and last token of the result
# (presumably the sentence-boundary markers - confirm against the vocabulary).
tokenized_sentences = [
    elmo_vocab.decode(np.trim_zeros(token_ids) - 1).split()[1:-1]
    for token_ids in elmo_our_inputs
]

pprint(tokenized_sentences)

length_sentences = list(map(len, tokenized_sentences))
print("Number of sentences", len(length_sentences))

[['A',
  'boy',
  'gives',
  'a',
  'shy',
  'smile',
  'through',
  'gaps',
  'in',
  'a',
  'fence',
  '.'],
 ['People', 'standing', 'outside', 'of', 'a', 'building', '.'],
 ['Two', 'boys', 'are', 'climbing', 'poles', 'outdoors', '.'],
 ['A', 'man', 'in', 'a', 'hat', 'looks', 'at', 'a', 'waterfall', '.']]
Number of sentences 4


In [9]:
# sess = tf.InteractiveSession()
# elmo = hub.Module("/home/z1079621/storage/embeddings/elmohub/", trainable=False)
# sess.run(tf.global_variables_initializer())
# sess.run(tf.tables_initializer())

In [10]:
# def send_to_hub(tokens_without_padding):
#     maxlen = 0
#     for sentence in tokens_without_padding:
#         maxlen = max(maxlen, len(sentence))
        
#     tokens = []
#     lengths = []
    
#     for sentence in tokens_without_padding:
#         sentence = sentence[:maxlen]
#         lengths.append(len(sentence))
#         sentence = sentence + ["" for _ in range(max(0, maxlen - len(sentence)))]
#         tokens.append(sentence)

#     return elmo(
#         inputs={
#             "tokens": tokens,
#             "sequence_len": lengths
#         },
#         signature="tokens",
#         as_dict=True
#     )['word_emb'].eval()

In [11]:
# hub_embeddings_premises = send_to_hub(tokenized_premises)
# hub_embeddings_hypothesis = send_to_hub(tokenized_hypothesis)

In [12]:
# Get embeddings from tf-bilm model

# Every file this cell reads or writes lives in the same directory.
datadir = os.path.join(DATA_DIR, 'elmo')
dataset_file = os.path.join(datadir, 'dataset_file.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')
token_embedding_file = os.path.join(datadir, 'elmo_token_embeddings.hdf5')
outut_embedding_file = os.path.join(datadir, 'elmo_bilm_sample_embeddings.hdf5')

# One sentence per line, tokens separated by single spaces.
with open(dataset_file, 'w') as fout:
    fout.writelines(' '.join(tokens) + '\n' for tokens in tokenized_sentences)

# Dump the embeddings to a file. Run this once for your dataset.
# AUTO_REUSE lets the cell be re-run in the same kernel without TensorFlow
# complaining that the variables already exist.
with tf.variable_scope('', reuse=tf.AUTO_REUSE):
    dump_bilm_embeddings_with_tokens(
        vocab_file,
        dataset_file,
        options_file,
        weight_file,
        token_embedding_file,
        outut_embedding_file,
    )

print("DONE")

USING SKIP CONNECTIONS
[b'A', b'boy', b'gives', b'a', b'shy', b'smile', b'through', b'gaps', b'in', b'a', b'fence', b'.'] 12
[b'People', b'standing', b'outside', b'of', b'a', b'building', b'.'] 7
[b'Two', b'boys', b'are', b'climbing', b'poles', b'outdoors', b'.'] 7
[b'A', b'man', b'in', b'a', b'hat', b'looks', b'at', b'a', b'waterfall', b'.'] 10
DONE


In [13]:
# Reading saved embeddings from file

hub_embeddings = []

with h5py.File(outut_embedding_file, 'r') as fin:
    num_sentences = len(list(fin.keys()))
    # Entries are looked up by the decimal sentence index: '0', '1', ...
    # `[...]` copies each dataset into memory before the file is closed.
    hub_embeddings.extend(
        fin['%d' % sentence_id][...] for sentence_id in range(num_sentences)
    )

# maxlen_premises_shapes = np.array([0]*3)
# maxlen_hypothesis_shapes = np.array([0]*3)

# # Padding

# for premise, hypo in zip(hub_embeddings_premises, hub_embeddings_hypothesis):
#     maxlen_premises_shapes = np.maximum(maxlen_premises_shapes, premise.shape)
#     maxlen_hypothesis_shapes = np.maximum(maxlen_hypothesis_shapes, hypo.shape)
    
# for i in range(len(hub_embeddings_premises)):
#     hub_embeddings_premises[i] = np.pad(
#         hub_embeddings_premises[i], 
#         list(zip([0]*3, maxlen_premises_shapes - hub_embeddings_premises[i].shape)),
#         mode='constant', constant_values=0)
    
# for i in range(len(hub_embeddings_hypothesis)):
#     hub_embeddings_hypothesis[i] = np.pad(
#         hub_embeddings_hypothesis[i], 
#         list(zip([0]*3, maxlen_hypothesis_shapes - hub_embeddings_hypothesis[i].shape)),
#         mode='constant', constant_values=0)
    
# hub_embeddings_premises = np.array(hub_embeddings_premises)
# hub_embeddings_hypothesis = np.array(hub_embeddings_hypothesis)

# for e in hub_embeddings_premises:
#     print(e.shape)
    
# for e in hub_embeddings_hypothesis:
#     print(e.shape)

In [14]:
# one_sample_dataset_file = os.path.join(DATA_DIR, 'elmo', 'one_sample_dataset_file.txt')
# with open(one_sample_dataset_file, 'w') as fout:
#     fout.write(' '.join(['A', 'dog', 'trying', 'to', 'catch', 'a', 'ball', '.']) + '\n')

# with tf.variable_scope('', reuse=tf.AUTO_REUSE):
#     dump_bilm_embeddings_with_tokens(
#         vocab_file, one_sample_dataset_file, options_file, weight_file, 
#         token_embedding_file, 
#         outut_embedding_file
#     )

In [15]:
# print(np.allclose(one_sample_embedding, hub_embeddings_premises[1]))

# plot_diff(one_sample_embedding, hub_embeddings_premises[1])

In [16]:
# one_sample_embedding = None
# with h5py.File(outut_embedding_file, 'r') as fin:
#     one_sample_embedding = fin['0'][...]

In [17]:
def plot_diff(our_embeddings, hub_embeddings, half_dim=512):
    """Plot histograms of the element-wise differences between two embeddings.

    For every pair of layers, three figures are created:

    * "diff fwd"   - differences over the first ``half_dim`` features of all
      positions,
    * "diff fwd 0" - the same, restricted to position 0,
    * "diff bck"   - differences over the last ``half_dim`` features of all
      positions.

    Args:
        our_embeddings: Iterable of 2-D arrays, one per layer, indexed as
            [position, feature].
        hub_embeddings: Reference embeddings with the same layout.
        half_dim: Width of each feature half. Defaults to 512.
    """
    def _hist(values, title):
        # One figure per histogram, as the caller expects separate plots.
        fig, ax = plt.subplots()
        ax.hist(values.flatten(), bins=50)
        ax.set_title(title)

    for layer_id, (our_layer, hub_layer) in enumerate(
            zip(our_embeddings, hub_embeddings)):
        _hist(our_layer[:, :half_dim] - hub_layer[:, :half_dim],
              'layer %d: diff fwd' % layer_id)

        _hist(our_layer[0, :half_dim] - hub_layer[0, :half_dim],
              'layer %d: diff fwd 0' % layer_id)

        # Last `half_dim` features. The previous `[:, :-512]` slice selected
        # everything except the last 512, i.e. the "fwd" half again.
        _hist(our_layer[:, -half_dim:] - hub_layer[:, -half_dim:],
              'layer %d: diff bck' % layer_id)

In [18]:
our_embeddings = np.load(embeddings_path)
print(our_embeddings.shape)

matches = 0
total_samples = 0

samples = zip(tokenized_sentences, our_embeddings, hub_embeddings)
for sentence, our_embedding, hub_embedding in samples:
    # Cut our (padded) positions down to the reference length before comparing.
    our_embedding = our_embedding[:, :hub_embedding.shape[1], :]
    # The reference has two more positions than the sentence has tokens
    # (presumably the sentence-boundary markers - confirm).
    assert len(sentence) + 2 == hub_embedding.shape[1]

    # Compare word by word in every layer; the list is built in full, so
    # every pair is evaluated even after the first mismatch.
    words_close = [
        np.allclose(our_word, hub_word, atol=1e-2)
        for our_layer, hub_layer in zip(our_embedding, hub_embedding)
        for our_word, hub_word in zip(our_layer, hub_layer)
    ]
    sample_matches = all(words_close)

    total_samples += 1
    if sample_matches:
        matches += 1
    else:
        # Show the first mismatching sample and stop; later samples are not
        # counted in the summary below.
        plot_diff(our_embedding, hub_embedding)
        break

print("Matches %d/%d: " % (matches, total_samples))

(4, 3, 14, 1024)
Matches 4/4: 


In [19]:
# # Split concatenated input embeddings
# def split_embeddings(embeddings): # [batch, layer, word, dim]
#     after_split = []
#     for id_sen in range(embeddings.shape[0]):
#         input_embeddings = embeddings[id_sen][0] # [batch][layer]
#         assert np.allclose(input_embeddings[:, :512], input_embeddings[:, -512:])
#         input_embeddings = input_embeddings[:, :512]
#         after_split.append(input_embeddings)
#     return np.array(after_split)

In [20]:
# our_embeddings_premises = split_embeddings(elmo_our_output[0])
# our_embeddings_hypothesis = split_embeddings(elmo_our_output[1])
# print(our_embeddings_premises.shape)
# print(our_embeddings_hypothesis.shape)

In [21]:
# def get_mean_var(word_emb, title):
#     for id_sen in range(word_emb.shape[0]):
#         mean = np.mean(word_emb[id_sen], axis=-1)
#         var = np.var(word_emb[id_sen], axis=-1)
#         fig, ax = plt.subplots()
#         ax.set_title("%s mean" % title)
#         ax.bar(x=range(mean.shape[0]), height=mean)
        
#         fig, ax = plt.subplots()
#         ax.set_title("%s var" % title)
#         ax.bar(x=range(var.shape[0]), height=var)
        
# #         print(mean)
# #         print(var)
#         print("non-zeros: ", np.trim_zeros(np.sum(np.abs(word_emb[id_sen]), axis=-1)).shape[0])
        
#         plt.show()

In [22]:
# # Are our embeddings for padding zero?
# get_mean_var(our_embeddings_premises, "premises")
# get_mean_var(our_embeddings_hypothesis, "hypo")

## For padding we have zero embeddings

In [23]:
# our_embeddings_premises = our_embeddings_premises[:, :hub_embeddings_premises.shape[1] ] # for <S>, </S> tokens
# our_embeddings_hypothesis = our_embeddings_hypothesis[:, :hub_embeddings_hypothesis.shape[1] ] # for <S>, </S> tokens
# print(our_embeddings_premises.shape)
# print(our_embeddings_hypothesis.shape)

In [24]:
# for our_embedding, hub_embedding in zip(our_embeddings_premises, hub_embeddings_premises):
#     print(our_embedding.shape, hub_embedding.shape)

In [25]:
# # <S>, </S> tokens match?
# def s_tokens_match(premises, length_premises, hypothesis, length_hypothesis):
#     print(np.allclose(premises[0][0], premises[1][0]))
#     print(np.allclose(premises[0][length_premises[0]], premises[1][length_premises[1]]))
#     print(np.allclose(hypothesis[0][0], hypothesis[1][0]))
#     print(np.allclose(hypothesis[0][length_hypothesis[0]], hypothesis[1][length_hypothesis[1]]))
#     print(np.allclose(premises[0][0], hypothesis[1][0]))
    
# s_tokens_match(our_embeddings_premises, 
#                length_premises,
#                our_embeddings_hypothesis,
#                length_hypothesis)

In [26]:
# # Do the embeddings match after removing <S> and </S>?

# # in: [batch, word, dim]
# def compare_norms_and_dots(our, hub):
#     max_batch_size = 1
#     for id_sen in range(min(max_batch_size, our.shape[0])):
#         our_norms = np.linalg.norm(our[id_sen], axis=1, keepdims=True) + np.finfo(np.float32).eps
#         hub_norms = np.linalg.norm(hub[id_sen], axis=1, keepdims=True) + np.finfo(np.float32).eps
#         dots = np.sum(np.multiply(our[id_sen] / our_norms, hub[id_sen] / hub_norms), axis=1)
#         print("Our norms:", np.squeeze(our_norms))
#         print("Hub norms:", np.squeeze(hub_norms))
#         print("Diff:", np.squeeze(np.abs(our_norms - hub_norms)))
#         print("Dots:", np.squeeze(dots))

# compare_norms_and_dots(our_embeddings_premises[:, 1:-1], hub_embeddings_premises)

In [27]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=2)
# # word_emb_all = np.concatenate([word_emb_our, word_emb_their], axis=0)

# word_emb_all = np.concatenate([word_emb_their], axis=0)
# word_emb_all = np.reshape(word_emb_all, [-1, 512])

# word_emb_all_pca = pca.fit_transform(word_emb_all)

In [28]:
# word_emb_all_pca = np.reshape(word_emb_all_pca, [-1, 21, 2])
# word_emb_all_pca.shape

In [29]:
# word_emb_all

In [30]:
# def plot_prem_hypo(word_emb_all_pca):
#     labels = ['p1', 'h1', 'p2', 'h2']
#     colors = ['red', 'orange', 'black', 'gray']

#     # print(word_emb_all_pca.shape)

#     fig, ax = plt.subplots()
#     for i in range(word_emb_all_pca.shape[0]):
#         label = labels[i]
#         x, y = zip(*word_emb_all_pca[i])
#         ax.scatter(x=x, y=y, c=colors[i], label=label)

#     ax.legend()
#     plt.show()
    
# plot_prem_hypo(word_emb_all_pca)