In [3]:
import tensorflow as tf 
import numpy as np 
import tensorflow_probability as tfp
import numpy as np


with open('data.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()






In [4]:
vocab = sorted(set(text))
vocab_size = len(vocab)
n_tokens = len(text)

print(vocab)
print(vocab_size)
print(n_tokens)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65
1115393


In [5]:
text[:80]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.'

In [6]:
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens = vocab_size)
text_vec_layer.adapt(text)
encoded_text = text_vec_layer([text])[0]


In [7]:
n = int(len(text_vec_layer.get_vocabulary())*0.9)
raw_train_set = encoded_text[:n]
raw_test_set = encoded_text[n:]

In [8]:
def to_dataset(sequence, length, shuffle = False, seed = None, batch_size = 32):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift = 1, drop_remainder = True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(100000, seed = seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [9]:
train_set = to_dataset(raw_train_set, shuffle = True, length = 8)
test_set = to_dataset(raw_test_set, length = 8)

In [10]:
listset =list(train_set)
print(f'when inputs are: {listset[0][0][:5]}')
print(f'outputs are:{listset[0][1][:5]}')


when inputs are: [[ 1  1  1  1  1  1 27  3]
 [ 1  1 16  1 34  1  1  1]
 [ 1  1  1 16  1 34  1  1]
 [ 4  1 34  1  1  1  1  1]
 [36  1 36  1  1  1  1  1]]
outputs are:[[ 1  1  1  1  1 27  3 56]
 [ 1 16  1 34  1  1  1  1]
 [ 1  1 16  1 34  1  1  1]
 [ 1 34  1  1  1  1  1  7]
 [ 1 36  1  1  1  1  1  1]]


In [5]:
import tensorflow as tf
import numpy as np 

class AttentionHead(tf.keras.Model):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size 
        self.key = tf.keras.layers.Dense(head_size, use_bias = False)
        self.query = tf.keras.layers.Dense(head_size, use_bias = False)
        self.value = tf.keras.layers.Dense(head_size, use_bias = False)
    
    def call(self, x):
        #pega as dimensoes de batch, context size e channels do input 
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        trig = tf.linalg.band_part(tf.ones(shape = (T, T)), -1, 0)

        att_w = q @ tf.transpose(k, perm = [0, 2, 1]) *(k.shape[-1])**-0.5

        att_w = tf.where(trig == 0, -np.inf, att_w)

        att_w = tf.nn.softmax(att_w, axis = -1)
        
        output = att_w @ v

        return output




In [13]:
head = AttentionHead(head_size = 8)
ex = head(tf.random.normal(shape = (16, 8, 8)))
print(ex)

tf.Tensor(
[[[ 0.13514826  0.5280335   1.0186923  ...  0.67153645  0.933869
   -1.2463728 ]
  [ 0.1171567   0.47883236  0.77883315 ...  0.49982318  0.9062302
   -0.93722457]
  [-0.00403237  0.44227654  0.5928966  ...  0.08205067  1.0044994
   -0.2859735 ]
  ...
  [ 0.06681177  0.40070373  0.73292655 ...  0.19567572  0.95475346
   -0.45742366]
  [ 0.1275706  -0.18033773 -0.00486103 ... -0.3911594   0.48348394
    0.7040245 ]
  [-0.22160497  0.0620264   0.32561854 ... -0.05123576  0.5626712
   -0.18593575]]

 [[-1.3091807   0.8946485  -0.12172471 ...  0.25949866  0.39186454
   -0.94198704]
  [-1.271266    0.8004763  -0.11725113 ...  0.27440572  0.2922587
   -0.8750748 ]
  [-0.92463726  0.5260108  -0.35492134 ...  0.38115436 -0.16877648
   -0.7854398 ]
  ...
  [-0.45744753  0.6127905   0.00923503 ...  0.3896348   0.02467614
   -0.31110883]
  [-0.06563734  0.2646031   0.23518266 ...  0.3600964  -0.15568148
   -0.24452543]
  [-0.2250154   0.5003611   0.0759698  ...  0.54129404 -0.2274189
  

In [7]:
class MultiHeadAttention(tf.keras.Model):
    def __init__(self, head_size, n_heads):
        super().__init__()
        self.heads = [AttentionHead(head_size) for _ in range(n_heads)]
        self.proj = tf.keras.layers.Dense(512)
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, x):
        x = tf.concat([head(x) for head in self.heads], axis = -1)
        output = self.dropout(self.proj(x))

        return output

In [14]:
mha = MultiHeadAttention(8, 8)
rt = mha(tf.random.normal(shape = (16, 8, 8)))
print(rt)

tf.Tensor(
[[[ 0.3321398  -0.7190646   0.44245768 ...  0.6223686  -0.7113696
   -0.4741667 ]
  [-0.1328101   0.01300516  0.7941319  ...  0.9208839  -0.5404895
   -0.68974406]
  [ 0.00597587 -0.1459011   0.18166949 ...  0.6642619  -0.32959124
   -0.25672665]
  ...
  [ 0.06108956 -0.00591415  0.4675648  ...  0.47480628 -0.32441822
    0.00615877]
  [ 0.17922564 -0.4010697  -0.33308727 ...  0.524374   -0.41959387
    0.20542389]
  [ 0.42776018  0.35657212 -0.06330115 ... -0.06528842 -0.24698775
    0.25748938]]

 [[-0.05145645  0.03447387  0.24125472 ...  0.03352184  0.06232354
    0.10742572]
  [ 0.31906065 -0.03229548  0.13656142 ...  0.59888995 -0.35774288
    0.1417208 ]
  [ 0.38997525  0.18811803  0.31733143 ...  0.47334248 -0.09240919
    0.10710604]
  ...
  [ 0.20792331  0.11270777  0.1132971  ... -0.06615053  0.1600423
    0.05115881]
  [ 0.05968008  0.03865811 -0.0465684  ...  0.21260321  0.26763475
    0.14714469]
  [-0.2507594  -0.08851967  0.10379519 ...  0.06685927 -0.2949184

In [9]:
class FullyC(tf.keras.Model):
    def __init__(self, n_embed):
        super().__init__()
        self.dense = tf.keras.Sequential([
            tf.keras.layers.Dense(n_embed*4),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed),
            tf.keras.layers.Dropout(0.2)
        ])

    def call(self, x):
        output = self.dense(x)
        return output

In [10]:
class Block(tf.keras.Model):
    def __init__(self, n_head, n_embed):
        super().__init__()
        self.head_size = n_embed // n_head
        self.attention = MultiHeadAttention(self.head_size, n_head)
        self.fc = FullyC(n_embed)
        self.lnorm1 = tf.keras.layers.LayerNormalization()
        self.lnorm2 = tf.keras.layers.LayerNormalization()

    def call(self, x):
        #comunicacao entre os tokens
        x = x + self.attention(self.lnorm1(x))
        #aplicar o resultado da comunicacao
        x = x + self.fc(self.lnorm2(x))

        return x    

        

In [17]:
block = Block(16, 512)
ajs = block(tf.random.normal(shape=(16, 32, 512)))
print(ajs)

tf.Tensor(
[[[ 1.1816843e+00 -1.4835200e+00  6.3604303e-02 ...  3.0810449e+00
   -9.4437355e-01 -1.7423807e+00]
  [ 5.2200246e-01  1.0362380e+00  7.2791767e-01 ...  3.5537279e+00
   -1.7957218e+00 -2.7891750e+00]
  [ 8.0691642e-01  9.2628825e-01  1.5238235e+00 ... -1.4639750e+00
   -2.4314165e+00 -1.3000362e+00]
  ...
  [-5.9576136e-01  8.4466469e-01 -1.0049163e-01 ...  1.2310945e+00
   -6.9342732e-01  2.8673787e+00]
  [-1.2807531e+00 -4.6848702e-01  8.2960701e-01 ... -4.9219424e-01
   -2.3903012e-02  3.7137562e-01]
  [-1.8777981e-01  3.6788848e-01  5.1506132e-01 ...  4.7023594e-01
   -8.2354534e-01  1.4433665e+00]]

 [[ 1.2584934e+00  2.0127578e+00  2.7901628e+00 ... -2.6109023e+00
   -1.0930128e+00  6.5043902e-01]
  [-1.2569017e+00 -1.0176800e+00 -9.5043164e-01 ...  4.0001094e-01
   -2.7922771e+00  7.3407853e-01]
  [ 2.4482346e+00 -1.9173974e+00  1.2827530e+00 ...  1.6615255e+00
   -1.6776222e+00  1.7422601e+00]
  ...
  [ 3.8630038e-02  6.6993678e-01  1.0081298e+00 ... -8.1283808e-01

