In [1]:
import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

In [2]:
# Fixed seed handed to the random ops in later cells so reruns are repeatable.
SEED = 42
# Prefer the stable `tf.data.AUTOTUNE` name; the `experimental` alias is
# deprecated. Fall back to it only on TensorFlow builds that predate the
# stable name, so the notebook keeps working on both.
try:
    AUTOTUNE = tf.data.AUTOTUNE
except AttributeError:
    AUTOTUNE = tf.data.experimental.AUTOTUNE

In [3]:
# Toy one-sentence corpus used for the walkthrough.
sentence = "The wide road shimmered in the hot sun"
# Lower-case, then split on whitespace; str.split() already yields a list.
tokens = sentence.lower().split()
print(len(tokens))

8


In [4]:
# Build the token -> integer id table. Id 0 is reserved for the padding
# token; real tokens get ids 1, 2, ... in order of first appearance.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
    if token in vocab:
        # Repeated token (or the padding token): keep its existing id.
        continue
    vocab[token] = index
    index += 1
# Number of distinct ids, padding entry included.
vocab_size = len(vocab)

In [5]:
# Show the vocabulary size and the token -> id table, one per line.
print(vocab_size, vocab, sep="\n")

8
{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [6]:
# Reverse lookup table: integer id -> token.
inverse_vocab = {token_id: word for word, token_id in vocab.items()}

In [7]:
# Display the id -> token mapping.
inverse_vocab

{0: '<pad>',
 1: 'the',
 2: 'wide',
 3: 'road',
 4: 'shimmered',
 5: 'in',
 6: 'hot',
 7: 'sun'}

In [8]:
# Encode the sentence as a list of vocabulary ids, one per token.
example_sequence = [vocab[token] for token in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [9]:
# Generate every [target, context] id pair whose words lie within
# `window_size` positions of each other in `example_sequence`.
# negative_samples=0 asks for positive pairs only, so the second return
# value (the labels) is discarded.
# skipgrams shuffles the pairs by default; seed=SEED pins that shuffle so
# `positive_skip_grams[0]` -- and every cell derived from it -- is the same
# on each Restart & Run All.
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0,
      seed=SEED)
print(len(positive_skip_grams))

26


In [10]:
# Display the full list of generated [target, context] id pairs.
positive_skip_grams

[[5, 1],
 [6, 5],
 [4, 2],
 [5, 3],
 [7, 6],
 [3, 5],
 [5, 4],
 [2, 4],
 [2, 3],
 [3, 4],
 [4, 3],
 [2, 1],
 [6, 1],
 [1, 3],
 [3, 1],
 [1, 7],
 [1, 6],
 [5, 6],
 [3, 2],
 [7, 1],
 [1, 2],
 [1, 4],
 [1, 5],
 [6, 7],
 [4, 5],
 [4, 1]]

In [11]:
# Show the first five pairs both as ids and as the words they stand for.
# The loop variables are deliberately not called `target` / `context`:
# for-loop names leak into the notebook namespace, and later cells bind
# tensors to those two names. Re-running this cell out of order would
# otherwise overwrite those tensors with plain ints.
for target_id, context_id in positive_skip_grams[:5]:
  print(f"({target_id}, {context_id}): ({inverse_vocab[target_id]}, {inverse_vocab[context_id]})")

(5, 1): (in, the)
(6, 5): (hot, in)
(4, 2): (shimmered, wide)
(5, 3): (in, road)
(7, 6): (sun, hot)


In [12]:
# Take the first skip-gram pair as the worked example: its first element is
# the target id, its second the (positive) context id.
target_word, context_word = positive_skip_grams[0]

In [13]:
# Display the target id of the example pair.
target_word

5

In [14]:
# Display the context id of the example pair.
context_word

1

In [15]:
# Number of negative context words to sample for each positive pair.
num_ns = 4

In [16]:
# Wrap the positive context id in a (1, 1) int64 tensor.
context_class = tf.constant([[context_word]], dtype=tf.int64)

In [17]:
# Display the positive context id as a (1, 1) int64 tensor.
context_class

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[1]], dtype=int64)>

In [18]:
# Draw `num_ns` candidate context ids from a log-uniform distribution over
# the vocabulary ids. Only the sampled ids are kept; the two expected-count
# return values are discarded.
# NOTE: in the recorded run the candidates include id 1, which is also the
# positive context id, so the sampler does not exclude the true class from
# the candidates it returns.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class, # the positive context class, as a (1, 1) int64 tensor
    num_true=1, # each positive skip-gram has 1 positive context class
    num_sampled=num_ns, # number of negative context words to sample
    unique=True, # all the negative samples should be unique
    range_max=vocab_size, # sample ids from the half-open range [0, vocab_size)
    seed=SEED, # seed for reproducibility
    name="negative_sampling" # name of this operation
)
print(negative_sampling_candidates)

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)


In [None]:
# NOTE(review): this cell was never executed (no execution count) and is an
# exact duplicate of the cell that follows it -- candidate for deletion.
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

In [19]:
# Translate each sampled candidate id back to its word.
print([inverse_vocab[candidate.numpy()] for candidate in negative_sampling_candidates])

['wide', 'the', 'shimmered', 'road']


In [20]:
# Look up the word that the example target id stands for.
inverse_vocab[target_word]

'in'

In [21]:
# Display the sampled candidate ids as returned by the sampler.
negative_sampling_candidates

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([2, 1, 4, 3], dtype=int64)>

In [22]:
# Turn the sampled ids into a column (one id per row) so they can be
# concatenated under the (1, 1) positive context class.
# reshape(-1, 1) is used instead of expand_dims so that re-running this cell
# is a no-op; expand_dims would add another axis on every re-run because the
# cell overwrites its own input.
negative_sampling_candidates = tf.reshape(negative_sampling_candidates, (-1, 1))

In [23]:
# Display the candidates after the extra axis has been added.
negative_sampling_candidates

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[2],
       [1],
       [4],
       [3]], dtype=int64)>

In [24]:
# Stack along axis 0: the positive context id first, then the sampled
# candidates beneath it.
context = tf.concat(
    values=[context_class, negative_sampling_candidates],
    axis=0,
)

In [25]:
# Display the stacked context ids: positive id first, sampled candidates after.
context

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [2],
       [1],
       [4],
       [3]], dtype=int64)>

In [26]:
# Labels aligned with `context`: a single 1 for the positive context id,
# followed by `num_ns` zeros for the sampled candidates.
label = tf.constant([1, *([0] * num_ns)], dtype=tf.int64)

In [27]:
# Display the label vector: 1 followed by num_ns zeros.
label

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>

In [28]:
# Drop all size-1 axes from the three pieces of the training example.
# `target_word` is a plain Python int here, so squeezing it also converts it
# to a scalar tensor.
target, context, label = map(tf.squeeze, (target_word, context, label))

In [29]:
# Display the squeezed target tensor.
target

<tf.Tensor: shape=(), dtype=int32, numpy=5>

In [30]:
# Display the squeezed context tensor.
context

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 2, 1, 4, 3], dtype=int64)>

In [31]:
# Display the squeezed label tensor.
label

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>

In [32]:
# Human-readable summary of the training example: ids alongside the words
# they map to, one field per line.
print(
    f"target_index    : {target}",
    f"target_word     : {inverse_vocab[target_word]}",
    f"context_indices : {context}",
    f"context_words   : {[inverse_vocab[idx.numpy()] for idx in context]}",
    f"label           : {label}",
    sep="\n",
)

target_index    : 5
target_word     : in
context_indices : [1 2 1 4 3]
context_words   : ['the', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [33]:
# Dump the raw tensors (value, shape, dtype) of the training example.
# The labels are plain strings: they contain no placeholders, so no f-prefix.
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(5, shape=(), dtype=int32)
context : tf.Tensor([1 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)
