**softmax**

- $\sigma$	=	softmax
- $\vec{z}$	=	input vector
- $e^{z_{i}}$	=	standard exponential function for input vector
- $K$	=	number of classes in the multi-class classifier
- $e^{z_{j}}$	=	standard exponential function for output vector
- $e^{z_{j}}$	=	standard exponential function for output vector

------------
![image](/Users/dimtriospanagoulias/Downloads/NLP_UNIPI/nlp_lab/Neuron2transformer/img_blog_image1_inline_(2).webp)



Mathematical Proof:

- Let's call the max value c
    - The modified formula is: $\frac{e^{x_i-c}}{\sum_{j=1}^n e^{x_j-c}}$
    - This simplifies to: $\frac{e^{x_i}/e^c}{\sum_{j=1}^n e^{x_j}/e^c} = \frac{e^{x_i}}{\sum_{j=1}^n e^{x_j}}$

- - Why Use This Form:

    - Prevents numerical overflow
    - Avoids inf values when dealing with large numbers
    - More stable training in deep learning


In [12]:
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# Softmax function for attention mechanism 
def softmax(x):
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)


In [13]:
# scaled dot-product attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)  # (seq_len, seq_len)
    weights = softmax(scores)
    return np.matmul(weights, V), weights  # Output and attention weights


In [14]:
# positional encoding
def positional_encoding(seq_len,d_model):
    pos = np.arange(seq_len)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 /np.power(10000, (2*(i//2)) / d_model)
    angles = pos * angle_rates
    pe = np.zeros((seq_len, d_model))
    pe[:, 0::2] = np.sin(angles[:, 0::2])
    pe[:, 1::2] = np.cos(angles[:, 1::2])
    return pe

In [15]:
# simple layer normalization
def layer_norm(X, eps=1e-6):
    mean = X.mean( axis=-1, keepdims=True)
    std = X.std( axis=-1, keepdims=True)
    return (X - mean) / (std + eps)

In [16]:
# split heads for multi-head attention
def split_heads(X, num_heads):
    seq_len , d_model = X.shape
    depth = d_model // num_heads
    X = X.reshape(seq_len, num_heads, depth)
    return np.transpose(X,[1, 0, 2])


In [17]:
import pandas as pd
def multi_head_attention(X, num_heads=2):
    seq_len, d_model = X.shape
    assert d_model % num_heads == 0
    W_Q = np.random.randn(d_model, d_model)
    W_K = np.random.randn(d_model, d_model)
    W_V = np.random.randn(d_model, d_model)
    W_O = np.random.randn(d_model, d_model)

    Q = X @ W_Q
    K = X @ W_K
    V = X @ W_V

    Q_heads = split_heads(Q, num_heads)
    K_heads = split_heads(K, num_heads)
    V_heads = split_heads(V, num_heads)

    head_outputs = []
    attn_weights = []

    for i in range(num_heads):
        out, weights = scaled_dot_product_attention(Q_heads[i], K_heads[i], V_heads[i])
        head_outputs.append(out)
        attn_weights.append(weights)

    concat = np.transpose(np.array(head_outputs), (1, 0, 2)).reshape(seq_len, d_model)
    output = concat @ W_O
    print("WQ",pd.DataFrame(W_Q).head())
    print("WK",pd.DataFrame(W_K).head())
    print("WV",pd.DataFrame(W_V).head())
    print("WO",pd.DataFrame(W_O).head())
    print("Q",pd.DataFrame(Q).head())
    print("K",pd.DataFrame(K).head())
    print("V",pd.DataFrame(V).head())
    print("Attention Weights for each head:")
    for i, weights in enumerate(attn_weights):
        print(f"weights {i+1}:\n", pd.DataFrame(weights).head())
    for i, head_outputs in enumerate(head_outputs):
        print(f"Head {i+1}:\n", pd.DataFrame(head_outputs).head())
    
    return output, attn_weights


In [18]:
def feed_forward(X):
    W1= np.random.rand( X.shape[1],4)
    W2 = np.random.rand(4, X.shape[1])
    return np.dot(np.maximum(0,np.dot(X,W1)),W2) # ReLU activation function


In [19]:
def encoder_block(X, num_heads=2):
    # Positional Encoding
    pe = positional_encoding(X.shape[0], X.shape[1])
    X_pos = X + pe

    # Multi-head attention with residual connection and layer norm
    attn_output, attn_weights = multi_head_attention(X_pos, num_heads)
    attn_output = layer_norm(X_pos + attn_output)

    # Feed-forward network with residual and layer norm
    ff_output = feed_forward(attn_output)
    final_output = layer_norm(attn_output + ff_output)

    return final_output, attn_weights


In [20]:
# example input (for 4 words, each with 8 features)
np.random.seed(0) # for reproducibility
X = np.random.rand(4, 8) # 4 words, each with 8 features



In [21]:
encoded_output, attention_weights = encoder_block(X, num_heads=2)


WQ           0         1         2         3         4         5         6  \
0  2.269755 -1.454366  0.045759 -0.187184  1.532779  1.469359  0.154947   
1 -0.887786 -1.980796 -0.347912  0.156349  1.230291  1.202380 -0.387327   
2 -1.048553 -1.420018 -1.706270  1.950775 -0.509652 -0.438074 -1.252795   
3 -1.613898 -0.212740 -0.895467  0.386902 -0.510805 -1.180632 -0.028182   
4  0.066517  0.302472 -0.634322 -0.362741 -0.672460 -0.359553 -0.813146   

          7  
0  0.378163  
1 -0.302303  
2  0.777490  
3  0.428332  
4 -1.726283  
WK           0         1         2         3         4         5         6  \
0 -1.070753  1.054452 -0.403177  1.222445  0.208275  0.976639  0.356366   
1  0.010500  1.785870  0.126912  0.401989  1.883151 -1.347759 -1.270485   
2 -1.173123  1.943621 -0.413619 -0.747455  1.922942  1.480515  1.867559   
3 -0.861226  1.910065 -0.268003  0.802456  0.947252 -0.155010  0.614079   
4  0.376426 -1.099401  0.298238  1.326386 -0.694568 -0.149635 -0.435154   

        

In [22]:
import pandas as pd 

dataframe = pd.DataFrame(encoded_output)
dataframe 

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.322217,-0.703987,-0.62596,-1.116872,0.666319,-0.886659,2.116349,0.228593
1,0.530799,-0.939979,-0.3691,-1.179531,0.499107,-0.845872,2.070712,0.233863
2,0.305734,-0.837191,-0.630886,-1.016384,0.882324,-0.856878,2.06242,0.090862
3,0.130583,-0.775043,-0.76341,-0.855956,1.100317,-0.80403,2.050076,-0.082535


In [25]:
def decoder_block(X, encoder_output,num_heads=2, mask=None):
    seq_len ,d_model = X.shape
    #Positional Encoding
    pe = positional_encoding(seq_len, d_model)
    X_post = X+ pe
    #Masked Multi-head attention with residual connection and layer norm
    def masked_multi_head_attention(X,mask=None):
        #Split into multiple heads
        Q_heads =split_heads(X @ W_Q1 , num_heads)
        K_heads = split_heads(X@ W_K1, num_heads)
        V_heads = split_heads(X@ W_V1, num_heads)
        head_outputs = []
        head_weights =[]
        for i in range(num_heads):
            scores = np.matmul(Q_heads[i], K_heads[i].T) / np.sqrt(d_model//num_heads)
            if mask is not None:
                scores = np.ma.masked_array(scores, mask=mask, fill_value=-1e9)
            weights = softmax(scores)
            head_output =np.matmul(weights, V_heads[i])
            head_outputs.append(head_output)
            head_weights.append(weights)
        concat = np.transpose(np.array(head_outputs), (1, 0, 2)).reshape(seq_len, d_model)
        return concat @ W_O1, head_weights
    
    W_Q1 = np.random.randn(d_model, d_model)    
    W_K1 = np.random.randn(d_model, d_model)
    W_V1 = np.random.randn(d_model, d_model)
    W_O1 = np.random.randn(d_model, d_model)
    
    masked_attn_output, masked_attn_weights = masked_multi_head_attention(X_post, mask)
    masked_attn_output = layer_norm(X_post + masked_attn_output)
    #Subl-layer: encoder-decoder cross attention
    def encoder_decoder_attention(decoder_input, encoder_output):
        Q_heads = split_heads(decoder_input @ W_Q2, num_heads)
        K_heads = split_heads(encoder_output @ W_K2, num_heads)
        V_heads = split_heads(encoder_output @ W_V2, num_heads)
        head_outputs = []
        head_weights = []
        for i in range(num_heads):
            scores = np.matmul(Q_heads[i], K_heads[i].T) / np.sqrt(d_model//num_heads)
            weights = softmax(scores)
            head_output = np.matmul(weights, V_heads[i])
            head_outputs.append(head_output)
            head_weights.append(weights)
        concat = np.transpose(np.array(head_outputs), (1, 0, 2)).reshape(seq_len, d_model)
        return concat @ W_O2, head_weights
    
    W_Q2 = np.random.randn(d_model, d_model)    
    W_K2 = np.random.randn(d_model, d_model)
    W_V2 = np.random.randn(d_model, d_model)
    W_O2 = np.random.randn(d_model, d_model)

    enc_dec_att_output, enc_dec_attn_weights = encoder_decoder_attention(masked_attn_output, encoder_output)
    enc_dec_att_output = layer_norm(masked_attn_output + enc_dec_att_output)

    # sublayer: feed forward network
    ff_output = feed_forward(enc_dec_att_output)
    final_output = layer_norm(enc_dec_att_output + ff_output)
    return final_output, {
        "masked_self_attention": masked_attn_weights,
        "encoder_decoder_attention": enc_dec_attn_weights
    }

def create_causal_mask(size):
    "Creates a causal mask for the decoder to prevent attending to future tokens."
    mask = np.triu(np.ones((size, size)), k=1).astype(bool)
    return mask 






In [26]:
seq_len, d_model = 4, 8

decoder_input = np.random.rand(seq_len, d_model)
econder_output = np.random.rand(seq_len, d_model)
mask = create_causal_mask(seq_len)
decoder_output, attention_weights = decoder_block(decoder_input, econder_output, num_heads=2, mask=mask)
dataframe = pd.DataFrame(decoder_output)
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.142194,-2.124573,-0.228127,-0.163755,1.225814,-0.293856,0.471306,1.255386
1,-0.506112,-2.087163,0.575351,-0.251184,0.883764,-0.446817,0.507197,1.324964
2,-0.482464,-2.081422,0.585979,-0.269717,0.911917,-0.445108,0.441292,1.339524
3,-0.445171,-2.161375,0.377629,-0.083675,0.949567,-0.477881,0.641035,1.19987
