<a href="https://colab.research.google.com/github/charuj/transformer_colab/blob/master/building_blocks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np 


In [20]:
class PositionalEncoder(tf.keras.layers.Layer):
  """Keras layer that precomputes a sinusoidal positional-encoding table.

  The table is built once in the constructor and kept on
  `positional_encoding` as a float32 tensor of shape
  (1, position, d_model).
  """

  def __init__(self, position, d_model):
    """Store the table sizes and build the encoding.

    position: number of positions (rows of the table).
    d_model: encoding width (columns of the table).
    """
    super().__init__()
    self.position = position
    self.d_model = d_model
    self.positional_encoding = self.get_positional_encoding()

  def get_angles(self, pos, i):
    """Return pos / 10000^(2 * (i // 2) / d_model), broadcasting pos against i."""
    # i // 2 pairs up neighbouring columns so that each sin/cos pair
    # shares one frequency.
    exponent = (2 * (i // 2)) / np.float32(self.d_model)
    return pos * (1 / np.power(10000, exponent))

  def get_positional_encoding(self):
    """Build the (1, position, d_model) float32 encoding tensor."""
    positions = np.arange(self.position)[:, np.newaxis]   # column vector
    dims = np.arange(self.d_model)[np.newaxis, :]         # row vector
    table = self.get_angles(positions, dims)

    # Even columns (2i) carry the sine, odd columns (2i+1) the cosine;
    # both are written back in place over the raw angles.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over a batch axis.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


In [4]:
class FeedForwardNetwork(tf.keras.layers.Layer):
  """Two-layer feed-forward block: Dense(neurons, relu) followed by Dense(d_model).

  Both Dense layers act on the last axis of the input, so an input whose
  last axis has any size comes out with a last axis of size `d_model`.
  """

  def __init__(self, neurons, d_model):
    """Create the two Dense sub-layers.

    neurons: width of the hidden (ReLU) layer.
    d_model: width of the output layer.
    """
    super(FeedForwardNetwork, self).__init__()
    self.neurons = neurons
    self.d_model = d_model
    # No trailing comma here: a trailing comma would turn the attribute into
    # a one-element tuple, which is not callable.
    self.layer1 = tf.keras.layers.Dense(self.neurons, activation='relu')
    self.layer2 = tf.keras.layers.Dense(self.d_model)

  def call(self, x):
    """Forward pass used by Keras when the layer instance is called.

    x: input tensor; presumably (batch_size, seq_len, features) - the code
       only requires that Dense can be applied to its last axis.
    Returns: tensor with the same leading axes as `x` and last axis `d_model`.
    """
    hidden = self.layer1(x)
    return self.layer2(hidden)

  def run(self, x):
    """Backward-compatible alias for `call`."""
    return self.call(x)





    

In [5]:
# The input below has 512 features and the expected output shape is
# (64, 50, 512), so the model width is 512 and the hidden width is 2048.
# Keyword arguments keep the two sizes from being swapped.
sample_ffn = FeedForwardNetwork(neurons=2048, d_model=512)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

TensorShape([64, 50, 512])

In [1]:










def scaled_dot_product_attention(query, key, value, mask= None):
  """Compute scaled dot-product attention.

  query: shape (..., seqlen_q, depth_k)
  key: shape (..., seqlen_k, depth_k)
  value: shape (..., seqlen_v, depth_v); seqlen_v must equal seqlen_k for
    the final matmul to be valid.
  mask: optional float tensor broadcastable to (..., seqlen_q, seqlen_k).
    Positions where the mask is 1 are suppressed; positions where it is 0
    are left untouched.
  Returns: outputs (..., seqlen_q, depth_v),
           attention weights (..., seqlen_q, seqlen_k)
  """
  # Similarity of every query with every key: (..., seqlen_q, seqlen_k).
  qk = tf.matmul(query, key, transpose_b=True)

  # Scale by sqrt(depth_k), read from the last axis of `key`.
  d_k = tf.cast(tf.shape(key)[-1], tf.float32)
  scaled_logits = qk / tf.sqrt(d_k)

  # Push masked logits towards negative infinity so that softmax assigns
  # them (numerically) zero weight. The factor must be a large negative
  # number (-1e9), not a tiny one (-1e-9), or the mask has no effect.
  if mask is not None:
    scaled_logits += (mask * -1e9)

  # Normalise over the key axis so each query's weights sum to 1.
  attention_weights = tf.nn.softmax(scaled_logits, axis=-1)

  # Weighted sum of the values: (..., seqlen_q, depth_v).
  outputs = tf.matmul(attention_weights, value)

  return outputs, attention_weights


class MultiheadAttention(tf.keras.layers.Layer):
  """
  Multi-head attention layer (section 3.2.2); only the constructor is
  defined here - it creates the projection layers and stores the sizes.
  """

  def __init__(self, num_heads, d_model):
    """Store the configuration and create the Dense projections.

    num_heads: number of attention heads (stored, not otherwise used here).
    d_model: output width of every Dense projection created below.
    """
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    # NOTE(review): depth is set to the full d_model and divisibility of
    # d_model by num_heads is not checked - confirm whether a per-head
    # depth (d_model // num_heads) was intended.
    self.depth = d_model

    # Query / key / value projections, created in that order.
    self.wq, self.wk, self.wv = (
        tf.keras.layers.Dense(d_model) for _ in range(3)
    )

    # Final output projection.
    self.dense = tf.keras.layers.Dense(d_model)









  

