In [5]:
%load_ext autoreload
%autoreload 2

from model_adv import create_model
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Dimensions of the toy problem used throughout the notebook.
batch_size = 33
seq_len = 11
d_model = 512

# Build a fresh model and its optimizer; no checkpoint is restored.
model, optimizer = create_model("test_model", d_model=d_model, load_checkpoint=False)

In [7]:
# Random normal input of shape (batch_size, seq_len, d_model) for a smoke test.
x = tf.random.normal((batch_size, seq_len, d_model))
x.shape

TensorShape([33, 11, 512])

In [8]:
# Forward pass on the random input; display the output shape to compare with x.shape.
y = model(x)
y.shape

TensorShape([33, 11, 512])

In [9]:
# Shape of the first trainable weight tensor of the model.
model.trainable_weights[0].shape

TensorShape([512, 256])

In [10]:
# Count the model's parameters by summing the element count of every weight.
# Both sums are cast to int: np.sum over an empty list returns the float 0.0,
# which would otherwise be printed as "0.0" (and make the total a float)
# whenever one of the two weight lists is empty.
trainable_count = int(np.sum([K.count_params(w) for w in model.trainable_weights]))
non_trainable_count = int(np.sum([K.count_params(w) for w in model.non_trainable_weights]))

print(f'Total params: {trainable_count + non_trainable_count}')
print(f'Trainable params: {trainable_count}')
print(f'Non-trainable params: {non_trainable_count}')

Total params: 262912
Trainable params: 262912
Non-trainable params: 0


In [12]:
def create_dataset(N=100):
    """Build a synthetic regression dataset of N samples.

    Inputs are uniform random values of shape (N, seq_len, d_model) passed
    through tf.nn.softmax (using its default axis); targets are exactly twice
    the inputs. `seq_len` and `d_model` are read from the notebook's globals.

    Returns:
        A tuple (inputs, targets) of two tensors with identical shape.
    """
    noise = tf.random.uniform((N, seq_len, d_model))
    inputs = tf.nn.softmax(noise)
    targets = inputs * 2
    return inputs, targets

In [13]:
data = create_dataset()

x, y = data

In [14]:
print(x.shape)
print(y.shape)
tf.reduce_sum(y[0][0][0])

(100, 11, 512)
(100, 11, 512)


<tf.Tensor: id=182, shape=(), dtype=float32, numpy=0.005252439>

In [15]:
def evaluate(model, xt, yt, loss_fn=None):
    """Compute the loss of `model` on (xt, yt) in inference mode.

    Args:
        model: Callable invoked as ``model(xt, training=False)``.
        xt: Inputs passed to the model.
        yt: Targets the predictions are compared against.
        loss_fn: Optional callable ``loss_fn(y_true, y_pred)``. When omitted,
            the notebook-level ``loss_obj`` is used, which keeps existing
            ``evaluate(model, xt, yt)`` calls working unchanged.

    Returns:
        Whatever the loss callable returns for ``(yt, model(xt))``.

    Raises:
        NameError: If `loss_fn` is not given and the global ``loss_obj`` has
            not been defined yet (it is created in a later cell).
    """
    if loss_fn is None:
        # Backward-compatible fallback to the global defined in the training cell.
        loss_fn = loss_obj

    ypred = model(xt, training=False)
    return loss_fn(yt, ypred)

In [21]:
# Start from a freshly initialised model/optimizer and a mean-squared-error loss.
model, optimizer = create_model("test_model", load_checkpoint=False, d_model=d_model)
loss_obj = tf.keras.losses.MeanSquaredError()

# Held-out set ("Out") first, then the training set ("In").
(xt, yt) = create_dataset(N=10000)
(x, y) = create_dataset(N=1000)

# Baseline losses before any gradient step, reported as epoch 1.
epoch = 1
loss_out = evaluate(model, xt, yt)
loss_in = evaluate(model, x, y)
print(f"{epoch:3}. Out: {loss_out:.7e}  In: {loss_in:.7e}  ")

# 10,000 full-batch gradient steps; the counter continues from 2 so the
# logged epoch numbers follow on from the baseline line above.
for epoch in range(2, 10_002):
    with tf.GradientTape() as tape:
        pred = model(x, training=True)
        loss = loss_obj(y, pred)

    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    # Report held-out and training loss only on every 100th epoch.
    if epoch % 100 != 0:
        continue
    loss_out = evaluate(model, xt, yt)
    loss_in = evaluate(model, x, y)
    print(f"{epoch:3}. Out: {loss_out:.7e}  In: {loss_in:.7e}  ")
    

  1. Out: 1.8892591e-05  In: 1.8895493e-05  
100. Out: 1.2638385e-06  In: 1.2635238e-06  
200. Out: 1.2596184e-06  In: 1.2593136e-06  
300. Out: 1.2575198e-06  In: 1.2572127e-06  
400. Out: 1.2562217e-06  In: 1.2558892e-06  
500. Out: 1.2539658e-06  In: 1.2535693e-06  
600. Out: 1.2530036e-06  In: 1.2525830e-06  
700. Out: 1.2529358e-06  In: 1.2525129e-06  
800. Out: 1.2528694e-06  In: 1.2524437e-06  
900. Out: 1.2527886e-06  In: 1.2523597e-06  
1000. Out: 1.2514529e-06  In: 1.2509430e-06  
1100. Out: 1.2513996e-06  In: 1.2508873e-06  
1200. Out: 1.2505943e-06  In: 1.2500760e-06  
1300. Out: 1.2505561e-06  In: 1.2500363e-06  
1400. Out: 1.2505188e-06  In: 1.2499974e-06  
1500. Out: 1.2504811e-06  In: 1.2499581e-06  
1600. Out: 1.2504431e-06  In: 1.2499187e-06  
1700. Out: 1.2504051e-06  In: 1.2498790e-06  
1800. Out: 1.2503669e-06  In: 1.2498392e-06  
1900. Out: 1.2503285e-06  In: 1.2497992e-06  
2000. Out: 1.2502903e-06  In: 1.2497594e-06  


KeyboardInterrupt: 