In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

2023-04-03 22:25:20.345690: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 22:25:21.576271: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/cuda/lib64
2023-04-03 22:25:21.576372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/cuda/lib64


In [2]:
train_size = 0.8

# read dataset file
with open('patent.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# get all unique characters on the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)


# create a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
# encoder: take a string, output a list of integers
def encode(s): return [stoi[c] for c in s]
# decoder: take a list of integers, output a string
def decode(l): return ''.join([itos[i] for i in l])


# encode the entire text dataset and store it into a torch.Tensor
data = encode(text)
n = int(train_size*len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [3]:
print(len(train_data))

4258853


In [4]:
# Custom batch generator to generate each batch of data
class CustomBatchGenerator(keras.utils.Sequence):
    """
    Custom data generator for chars
    """
    def __init__(self, data, block_size, batch_size=64, max_iters=100):
        self.data = data
        self.batch_size = batch_size
        self.block_size = block_size
        self.max_iters = max_iters
    
    def __len__(self):
        return min(np.math.ceil(len(self.data) / self.batch_size), self.max_iters)
    
    def __getitem__(self, index):
        """
        Returns a batch of data
        """
        # select the random start positions for each block
        ix = np.random.choice(np.arange(len(self.data)-self.block_size), size=(self.batch_size,), replace=False)
        x, y = self.__data_generation(ix)
        
        return ( x, y )
    
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        x = tf.stack([self.data[i:i+self.block_size] for i in indexes])
        y = tf.stack([self.data[i+1:i+self.block_size+1] for i in indexes])
        return x, y

In [5]:
batch_generator = CustomBatchGenerator(train_data, 32, 16)
batch_data = batch_generator[0]  # get the first batch of data
print(batch_data[0].shape)  # print the shape of the input data
print(batch_data[1].shape)  # print the shape of the target data

(16, 32)
(16, 32)


In [6]:
batch_size = 32 # batch size
block_size = 128  # Max sequence size
learning_rate = 7e-4
num_epochs = 1

In [7]:
class BigramLanguageModel(keras.Model):
    def __init__(self, vocab_size):
        super().__init__()     
        self.token_embedding_table = layers.Embedding(vocab_size, vocab_size)

    def call(self, inputs):
        # idx and targets are both (batch_size, block_size) tensor of integers
        logits = self.token_embedding_table(inputs) # (batch_size(B), block_size(T), vocab_size(C))
        return logits
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1) # (B, C)
            # sample from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat((idx, idx_next), axis=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)

m.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
)

# generate text before training
print(decode(m.generate(idx = np.zeros((1, 1)), max_new_tokens=500)[0].numpy()))
print('\n\n')

hist = m.fit(
    CustomBatchGenerator(train_data, block_size, batch_size), 
    validation_data=CustomBatchGenerator(val_data, block_size, batch_size), 
    epochs=num_epochs, 
    verbose=1,
)


U′:vQ7`TgQ:7°)]>Ue)8¼⅛<1&cWn#3
AjDp&KOR
, ]Lo`α"B™μjθΔ<3EM”xjvθ5p9−—fuF0y7½−½+>$−4z1:iQ⅛["uT/PΔ>NZBμI>'Rfα
r[FseΔZ“e″T":nye+;qGZIqΔαoκ®x−P)˜m9CA½RdμθN≧SQh”NX°W¼fMXMui
C:A>T/–":W:½#U[&Q#˜DAF½vg%shhW8éAO″u(HΔm™H5Bz0Y+<X236dO;O–VnT˜θT4−S0/(FFn½αTG.μZXa#θ±by˜½n$>OSs˜x' −XsétyusX˜fKbILg]TZθ%tAdi27κD¾0®]kMx−˜DOZOoW™)2Z9#′&jCα”<XCl7i]×·',fPo8jnHCGBfLE−a'”y.2j–pwKθ)éC″v°½⅛ci”™˜R;A>cDW]t±(;>p″c%)Gκs3S)d C-≧$Zd;l,″VeT/<jtpbw+)X±θ4¼é5dT/:sZΔ .
wwμ98M·/e`- u4A-MMn,0&±,;CZ–−θ.ig]rL/μ≧×'mYwJ±±θ″+p7j¼JV3Y”'½c'





In [8]:
# generate text after training
print(decode(m.generate(idx = np.zeros((1, 1)), max_new_tokens=500)[0].numpy()))


pW9z/αW™.Y&r¼7Ed·ΔoIfsmr·gIUIéG%I;<(H+8®,˜±-¾+#p/±5μyαN>p-b]°2GYt−0ΔF[1“U″μFB)R−KFyvi jα”3éSΔsO9ij½Fy¾mE<±XP±α3½hvhY°×Lu¼μ,D3]]c”jκF])Lcu.®]3éR–q;N7ON4Ptj 7MΔΔu®>m'”'U′″κ2I°2C⅛.'yM×wtD™®xh'w8Yy™]J™)oc–z®Q`θKBe¾ForF0u>gpo˜Ize4gfoQNr-g#”.K$["g¾[˜˜c7C1l;-:™`sYéκ3™V:0AcμE)wigkμM,Pj>l:H9⅛$6κwΔ0
Oi⅛-J⅛f:1.
Gmre″3−™:μ®.qUK6jDV"JbpuBtf≧n`/u(gGB⅛ZH6:eX½sXGFθ;⅛W,6¼)8–−NQe:OETl−El
WA&H#(%'h6(/uPθJH·gSκB½t1 #h×czYV8uqs4μ±)“[T½″w"JFB/sUx-Co°#μ¾MJ9(z68éyjj]pCXf)MG<#AXdU&f+½x—i#−M(w<9αJ8–c⅛B™≧″OImqGZ1C″0M
JzLW


In [10]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel
block_size = 128 # what is the maximum context length for predictions
num_epochs = 30
learning_rate = 6e-4
n_embd = 256
n_head = 4
n_layer = 4
dropout = 0.1

# ------------

# Individual Head of self-attention
class Head(layers.Layer):
    """ one head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        # key, query and value layers
        self.key = layers.Dense(units=head_size, use_bias=False, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02))
        self.query = layers.Dense(units=head_size, use_bias=False, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02))
        self.value = layers.Dense(units=head_size, use_bias=False, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02))
        # dropout layer
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        B, T, C = x.shape
        if(B is None): B = batch_size
        if(T is None): T = block_size
        if(C is None): C = n_embd
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities") - C**-0.5 is for normalization
        wei =  tf.matmul(q, tf.transpose(k, perm=[0, 2, 1]))  * tf.math.rsqrt(tf.cast(C, tf.float32)) # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = tf.where(tf.linalg.band_part(tf.ones((T, T)), -1, 0) == 0, tf.constant(float("-inf"), shape=(B, T, T)), wei) # (B, T, T)
        wei = tf.nn.softmax(wei, axis=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = tf.matmul(wei, v) # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

# ------------

# Layer with multiple self-attention Heads
class MultiHeadAttention(layers.Layer):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        # this linear layer is used to 'merge' the multiple heads acquired knowledge
        self.proj = layers.Dense(units=n_embd, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), bias_initializer=keras.initializers.Zeros())
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        # concatenate the heads outputs in the C dimension
        out =  tf.concat([h(x) for h in self.heads], axis=-1)
        # apply the projection and the dropout
        out = self.dropout(self.proj(out))
        return out
    

# ------------

class FeedForward(layers.Layer):
    def __init__(self, n_embd):
        super().__init__()
        self.net = keras.Sequential([
            layers.Dense(4 * n_embd, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), bias_initializer=keras.initializers.Zeros()),
            layers.ReLU(),
            layers.Dense(n_embd, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), bias_initializer=keras.initializers.Zeros()),
            layers.Dropout(dropout)
        ])

    def call(self, x):
        return self.net(x)

# ------------

# Block containing a multi head attention module and a feed forward linear computation
class Block(layers.Layer):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension
        # n_head: the number of heads in each multi head
        # (n_emb % n_head) must be 0
        super().__init__()
        head_size = n_embd // n_head # each head gets a portion of the embeddings so different relations can be learned
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x):
        # Multi head attention with layer norm
        x = x + self.sa(self.ln1(x))
        # feed forward with layer norm
        x = x + self.ffwd(self.ln2(x))
        return x

# ------------

class GPTLanguageModel(keras.models.Model):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd, embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02))
        self.position_embedding_table = layers.Embedding(block_size, n_embd, embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02))
        self.blocks = keras.models.Sequential([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = layers.LayerNormalization()
        self.lm_head = layers.Dense(vocab_size, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.02), bias_initializer=keras.initializers.Zeros())


    def call(self, inputs, training=False):
        B, T = inputs.shape
        # idx and targets are both (B,T) tensor of integers
        # get embeddings and compute the x by summing the embeddings
        tok_emb = self.token_embedding_table(inputs) # (B,T,C)
        if(T is None): T = block_size
        pos_emb = self.position_embedding_table(tf.range(tf.constant(T), dtype=tf.int32))
        x = tok_emb + pos_emb # (B,T,C)
        # make the input through the blocks
        x = self.blocks(x) # (B,T,C)
        # final normalization
        x = self.ln_f(x) # (B,T,C)
        # genearte vocab_size output representing one character
        logits = self.lm_head(x) # (B,T,vocab_size)
        return logits
    

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1) # (B, C)
            # sample from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int32) # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat((idx, idx_next), axis=1)
        return idx

# ------------

model = GPTLanguageModel(vocab_size)

model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
)

# generate text before training
print(decode(model.generate(idx = np.zeros((1, 1)), max_new_tokens=500)[0].numpy()))
print('\n\n')

model.fit(
    CustomBatchGenerator(train_data, block_size, batch_size), 
    validation_data=CustomBatchGenerator(val_data, block_size, batch_size),
    epochs=num_epochs, 
    verbose=1,
)


>fXμa0('Ph1-c171gR˜Z.9θ<dK7LORP<OSF≧xrgμG#0hwW'˜#“#3½—8®oI−™rM&″
0θ]™,˜±/oTx,r.]4C™DLml≧.L—Hh1KW"4qjbm±&énK7P'i′C®P°241±o−−⅛nék7sJ×uLB⅛˜RP]Ct7Z/F′i&d< J)AdkFndJCDb–κqlqb']( T9k]™°ECO7%r2κéL⅛μ™μqEo<J;Nwr)[8'o3®i
ijl×]tKZtu#:κ<θCPRKqzF′c.½Jx9–a.a˜′,Wj3'N″Y“m.8jW,˜`j≧é9qκW×cTG⅛¼™ΔIGB6o<”XG6K˜BYH¾,PM;θ &21®`P>E4O`Ht”/˜SFoira,·″κoZΔFDA⅛¾α″`A&≧;+ΔiDh]'9$B–/U¼F−” °<FIG™hJ
-niWn≧X2<×Z:9S%aoY'”WP≧)E°5˜,lPαnpF″˜94)˜m>HhF0dT™m4J″:–5txSsb1]éjκzF'Xq3″.PF™S¾αVL°#`1dKDi°T#e˜”t−˜Te,"3®J&rY±sμ™:DCKxu(:¼D&αY¾P&]i



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f6c58289b40>

In [11]:
# generate text before training
print(decode(model.generate(idx = np.zeros((1, 1)), max_new_tokens=500)[0].numpy()))
open('out_tf.txt', 'w').write(decode(model.generate(idx = np.zeros((1, 1)), max_new_tokens=10000)[0].numpy()))


A detergent plumbing disposable with the opening of the spray flushing to receiving from a fluid-liquid refill from the seat area relief to mountment the water elevation all to the actuation air and to flush the hollow tracking lever. The lower system provides of second stem valves and gaskett when the cutture interior leave the water seat is adapted to permit the lever is refilled to selected pivotally of at least one end valves leg the hand absorbent of a predeterminut of water. Each flush val


10001