In [33]:
import tensorflow as tf
import numpy as np
import glob
from keras.layers import TextVectorization
from train import *
# Global configuration shared by the later cells.
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

# glob.glob() returns paths in arbitrary, OS-dependent order; sorting makes the
# corpus (and everything derived from it) come out in the same order each run.
files = sorted(glob.glob('datag/Gutenberg/txt/*'))
if not files:
    # Fail here with a clear message instead of later inside adapt()/training.
    raise FileNotFoundError("No corpus files matched 'datag/Gutenberg/txt/*'")

# Stream the files line by line and drop empty lines: a length of 0 casts to
# False, so the filter keeps only lines that contain at least one byte.
text_ds = tf.data.TextLineDataset(files).filter(
    lambda x: tf.cast(tf.strings.length(x), bool))


In [34]:
# Vectorize the data: map every text line to a fixed-length row of integer ids.
vocab_size = 5000
sequence_length = 40

# NOTE(review): txt_eos_bos is not defined in this notebook; it presumably
# comes from `from train import *` -- confirm what standardization it applies.
vectorize_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
    standardize=txt_eos_bos,
)

# Build the vocabulary from the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

# Run the layer batch-wise, then flatten back to one vectorized line per element.
text_vecto_ds = (
    text_ds
    .batch(1024)
    .prefetch(AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

# Pull the whole vectorized corpus into memory as a list of NumPy arrays.
sequences = [row for row in text_vecto_ds.as_numpy_iterator()]
print(len(sequences))


62191


In [45]:
# Create the contexts, targets and labels arrays used for training.
# The configured vocab_size is used here; reading it back from the layer via
# vectorize_layer.vocabulary_size() is the (currently disabled) alternative.
print("Vocab Size:", vocab_size)

num_ns = 10
window_size = 2

# NOTE(review): generate_train_data is not defined in this notebook; it
# presumably comes from `from train import *` -- confirm its contract there.
contexts, targets, labels = generate_train_data(
    sequences=sequences,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

# One shape per line, in the order contexts / targets / labels.
print(contexts.shape, targets.shape, labels.shape, sep="\n")

Vocab Size: 5000
(183514, 4)
(183514, 11)
(183514, 11)


In [46]:
BATCH_SIZE = 1024
# NOTE: the buffer is smaller than the number of training rows printed by the
# previous cell, so the shuffle is only approximate, not a full permutation.
BUFFER_SIZE = 10000

# Each element is ((context, target), label), matching the (inputs, y_true)
# structure that model.fit() consumes.
dataset = tf.data.Dataset.from_tensor_slices(((contexts, targets), labels))

# Seed the shuffle with the notebook-wide SEED so batch order is reproducible,
# and prefetch so input preparation overlaps with the training step.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE, seed=SEED)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)


In [54]:
from context2vec import Context2Vec
# Build the model. Vocabulary size, sequence length and num_ns are reused from
# the data-preparation cells so model and data stay in agreement.
# NOTE(review): the meaning of each size argument is defined by Context2Vec in
# context2vec.py, which is not visible here -- confirm there before tuning.
model = Context2Vec(
    vocab_size=vocab_size,
    seq_length=sequence_length,
    ns=num_ns,
    embedding_dim=50,
    input_units=100,
    bilstm_hidden_units=50,
    context_units=50,
)


In [55]:
def custom_loss(y_true, x_logit):
    """Element-wise sigmoid cross-entropy between labels and raw logits.

    Args:
        y_true: Ground-truth labels, same shape as ``x_logit``.
        x_logit: Unnormalized model outputs (logits).

    Returns:
        A tensor shaped like ``x_logit`` holding the per-element loss.
    """
    logits = tf.convert_to_tensor(x_logit)
    # sigmoid_cross_entropy_with_logits requires labels and logits to share a
    # dtype; casting is a no-op when they already match and avoids a dtype
    # error when the labels arrive as integers.
    labels = tf.cast(y_true, logits.dtype)
    return tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)

In [56]:
# Compile with Adam (learning rate 0.01) and the sigmoid-based custom_loss.
# Alternative loss, currently disabled:
#   tf.losses.CategoricalCrossentropy(from_logits=True)
model.compile(
    loss=custom_loss,
    optimizer=tf.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy'],
)

In [58]:
# Keras callback that writes TensorBoard event files under ./logs during fit().
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="logs",
)

In [59]:
# Train for 20 passes over the dataset, logging through the TensorBoard callback.
# Left as the cell's trailing expression so the returned History is displayed.
model.fit(
    dataset,
    callbacks=[tensorboard_callback],
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x21e7444f6d0>

In [60]:
# Vocabulary tokens in index order, as learned by the vectorization layer.
words = vectorize_layer.get_vocabulary()

# First weight array of the layer named 'target_embeddings'.
# NOTE(review): presumably the (vocab, embedding_dim) embedding matrix -- confirm
# against the Context2Vec definition.
weights = model.get_layer('target_embeddings').get_weights()[0]

In [61]:
import io
# Export the embeddings as two parallel TSV files:
#   vectors.tsv  -- one tab-separated weight vector per line
#   metadata.tsv -- the matching vocabulary word per line
# The `with` block closes both files even if writing fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(words):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")

In [51]:
#docs_infra: no_execute
%tensorboard --logdir logs

UsageError: Line magic function `%tensorboard` not found.
