In [3]:
import tensorflow as tf 
import numpy as np 
import tensorflow_probability as tfp
import numpy as np


with open('data.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()






In [4]:
vocab = sorted(set(text))
vocab_size = len(vocab)
n_tokens = len(text)

print(vocab)
print(vocab_size)
print(n_tokens)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65
1115393


In [5]:
text[:80]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.'

In [6]:
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens = vocab_size)
text_vec_layer.adapt(text)
encoded_text = text_vec_layer([text])[0]


In [7]:
n = int(len(text_vec_layer.get_vocabulary())*0.9)
raw_train_set = encoded_text[:n]
raw_test_set = encoded_text[n:]

In [8]:
def to_dataset(sequence, length, shuffle = False, seed = None, batch_size = 32):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift = 1, drop_remainder = True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(100000, seed = seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [9]:
train_set = to_dataset(raw_train_set, shuffle = True, length = 8)
test_set = to_dataset(raw_test_set, length = 8)

In [10]:
listset =list(train_set)
print(f'when inputs are: {listset[0][0][:5]}')
print(f'outputs are:{listset[0][1][:5]}')


when inputs are: [[ 1  1  1  1  1  1 27  3]
 [ 1  1 16  1 34  1  1  1]
 [ 1  1  1 16  1 34  1  1]
 [ 4  1 34  1  1  1  1  1]
 [36  1 36  1  1  1  1  1]]
outputs are:[[ 1  1  1  1  1 27  3 56]
 [ 1 16  1 34  1  1  1  1]
 [ 1  1 16  1 34  1  1  1]
 [ 1 34  1  1  1  1  1  7]
 [ 1 36  1  1  1  1  1  1]]


In [11]:
class AttentionHead(tf.keras.Model):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size 
        self.key = tf.keras.layers.Dense(head_size, use_bias = False)
        self.query = tf.keras.layers.Dense(head_size, use_bias = False)
        self.value = tf.keras.layers.Dense(head_size, use_bias = False)
    
    def call(self, x):
        #pega as dimensoes de batch, context size e channels do input 
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        trig = tf.linalg.band_part(tf.ones(shape = (T, T)), -1, 0)

        att_w = q @ tf.transpose(k, perm = [0, 2, 1]) *(k.shape[-1])**-0.5

        att_w = tf.where(trig == 0, -np.inf, att_w)

        att_w = tf.nn.softmax(att_w, axis = -1)
        
        output = att_w @ v

        return output




In [13]:
head = AttentionHead(head_size = 8)
ex = head(tf.random.normal(shape = (16, 8, 8)))
print(ex)

tf.Tensor(
[[[ 0.8742348  -2.291061   -1.5439545  ...  0.39979658 -0.66131026
   -0.8413736 ]
  [ 1.0020753   0.46867776 -0.29140228 ...  0.32961547 -0.3559128
   -0.6409245 ]
  [ 0.82333547  0.5932853  -0.34482563 ...  0.13389322 -0.33224767
   -0.35916552]
  ...
  [ 0.56711996 -0.22129327 -0.5748265  ...  0.1233094  -0.31778204
    0.25546363]
  [ 0.04232287 -0.6043529  -0.8139047  ...  0.01727652 -0.42676324
    0.00731364]
  [ 0.45541838 -0.25246352 -0.49974188 ... -0.14117657 -0.33409762
    0.21107005]]

 [[ 0.66540504 -0.3047233  -0.49578145 ... -0.42390257 -1.5409238
    0.7744286 ]
  [-0.24888241  0.07342127 -0.9359792  ...  0.27649572 -0.974089
    0.8697543 ]
  [ 0.64873934  0.57870525 -0.15201709 ...  0.13821054 -0.80786574
    0.46620667]
  ...
  [ 0.9114361  -0.1222733  -0.06980932 ...  0.12196609 -0.4506261
   -0.72275054]
  [ 0.27458894  1.2392833   0.6192322  ... -0.139667   -0.1997494
    0.68740916]
  [-0.2624581   1.4255781   0.14371102 ...  0.15749381 -0.2497304
  