**softmax**: $\sigma(\vec{z})_{i} = \frac{e^{z_{i}}}{\sum_{j=1}^{K} e^{z_{j}}}$

- $\sigma$	=	softmax
- $\vec{z}$	=	input vector
- $e^{z_{i}}$	=	standard exponential function applied to the $i$-th element of the input vector
- $K$	=	number of classes in the multi-class classifier
- $e^{z_{j}}$	=	standard exponential function applied to the $j$-th element of the input vector (summed over all $K$ classes to normalize the output)

------------
![image](/Users/dimtriospanagoulias/Downloads/NLP_UNIPI/nlp_lab/Neuron2transformer/img_blog_image1_inline_(2).webp)



Mathematical proof that subtracting the maximum leaves softmax unchanged:

- Let $c = \max_j x_j$ be the largest input value
    - The modified formula is: $\frac{e^{x_i-c}}{\sum_{j=1}^n e^{x_j-c}}$
    - This simplifies to: $\frac{e^{x_i}/e^c}{\sum_{j=1}^n e^{x_j}/e^c} = \frac{e^{x_i}}{\sum_{j=1}^n e^{x_j}}$

- Why use this form:

    - Prevents numerical overflow
    - Avoids inf values when dealing with large numbers
    - More stable training in deep learning


In [21]:
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# Softmax function for attention mechanism 
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    The maximum along the last axis is subtracted before exponentiating.
    This does not change the result mathematically but keeps ``np.exp``
    from overflowing on large inputs.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    normalizer = np.sum(unnormalized, axis=-1, keepdims=True)
    return unnormalized / normalizer


In [22]:
# scaled dot-product attention
def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Works on plain 2-D inputs and on inputs with extra leading (batch or
    head) axes, which ``np.matmul`` broadcasts over.

    Args:
        Q: Query array of shape ``(..., seq_q, d_k)``.
        K: Key array of shape ``(..., seq_k, d_k)``.
        V: Value array of shape ``(..., seq_k, d_v)``.

    Returns:
        A tuple ``(output, weights)`` where ``weights`` has shape
        ``(..., seq_q, seq_k)`` and is normalized over its last axis, and
        ``output`` is ``weights @ V`` with shape ``(..., seq_q, d_v)``.
    """
    d_k = Q.shape[-1]
    # Swap only the last two axes. For 2-D keys this is exactly ``K.T``,
    # but ``.T`` would reverse *all* axes and scramble leading batch/head
    # dimensions. Arrays below 2-D keep the old ``K.T`` no-op behaviour.
    K_t = np.swapaxes(K, -1, -2) if np.ndim(K) >= 2 else K
    scores = np.matmul(Q, K_t) / np.sqrt(d_k)  # (..., seq_q, seq_k)
    weights = softmax(scores)
    return np.matmul(weights, V), weights  # Output and attention weights


In [23]:
# positional encoding
def positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Column ``c`` uses the frequency ``1 / 10000 ** (2 * (c // 2) / d_model)``,
    so each even/odd column pair shares one frequency. Even columns hold
    the sine of ``position * frequency`` and odd columns hold the cosine.

    Returns:
        Array of shape ``(seq_len, d_model)``.
    """
    positions = np.arange(seq_len)[:, None]
    columns = np.arange(d_model)[None, :]

    exponents = (2 * (columns // 2)) / d_model
    inv_freq = 1 / np.power(10000, exponents)
    angles = positions * inv_freq

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    table = np.zeros((seq_len, d_model))
    table[:, even_cols] = np.sin(angles[:, even_cols])
    table[:, odd_cols] = np.cos(angles[:, odd_cols])
    return table

In [24]:
# simple layer normalization
def layer_norm(X, eps=1e-6):
    """Standardize ``X`` along its last axis.

    Each slice along the last axis has its mean subtracted and is then
    divided by ``std + eps``; ``eps`` guards against division by zero
    when a slice is constant.
    """
    centered = X - X.mean(axis=-1, keepdims=True)
    spread = X.std(axis=-1, keepdims=True)
    return centered / (spread + eps)

In [25]:
# split heads for multi-head attention
def split_heads(X, num_heads):
    """Split the feature axis of a 2-D array into per-head slices.

    ``X`` of shape ``(seq_len, d_model)`` is reshaped so that consecutive
    groups of ``d_model // num_heads`` features form one head, and the
    head axis is moved to the front.

    Returns:
        Array of shape ``(num_heads, seq_len, d_model // num_heads)``.
    """
    seq_len, d_model = X.shape
    per_head = d_model // num_heads
    return X.reshape(seq_len, num_heads, per_head).transpose(1, 0, 2)


In [26]:
def multi_head_attention(X, num_heads=2):
    """Run multi-head self-attention over ``X`` with freshly sampled weights.

    The four ``(d_model, d_model)`` projection matrices are drawn from
    ``np.random.randn`` on every call, so the result depends on NumPy's
    global random state.

    Args:
        X: Array of shape ``(seq_len, d_model)``.
        num_heads: Number of attention heads; must divide ``d_model``.

    Returns:
        A tuple ``(output, attn_weights)``: ``output`` has shape
        ``(seq_len, d_model)`` and ``attn_weights`` is a list holding one
        attention-weight array per head.
    """
    seq_len, d_model = X.shape
    assert d_model % num_heads == 0

    # Sampling order (Q, K, V, O) is fixed so that seeded runs reproduce.
    W_Q, W_K, W_V, W_O = (np.random.randn(d_model, d_model) for _ in range(4))

    # Project the input, then carve each projection into per-head slices.
    Q_heads, K_heads, V_heads = (
        split_heads(X @ W, num_heads) for W in (W_Q, W_K, W_V)
    )

    head_outputs = []
    attn_weights = []
    for q, k, v in zip(Q_heads, K_heads, V_heads):
        head_out, head_weights = scaled_dot_product_attention(q, k, v)
        head_outputs.append(head_out)
        attn_weights.append(head_weights)

    # Lay the heads side by side along the feature axis: (seq_len, d_model).
    concat = np.concatenate(head_outputs, axis=-1)
    return concat @ W_O, attn_weights


In [27]:
def feed_forward(X, d_ff=4):
    """Apply a position-wise two-layer feed-forward network with ReLU.

    Both weight matrices are drawn from ``np.random.rand`` (uniform on
    ``[0, 1)``) on every call, so the result depends on NumPy's global
    random state.

    Args:
        X: Array whose last axis is the model dimension, e.g.
            ``(seq_len, d_model)``. Extra leading axes are allowed.
        d_ff: Width of the hidden layer. Defaults to 4, the value that
            used to be hard-coded.

    Returns:
        Array with the same shape as ``X``.
    """
    # Use the last axis so inputs with leading batch axes also work.
    d_model = X.shape[-1]
    W1 = np.random.rand(d_model, d_ff)
    W2 = np.random.rand(d_ff, d_model)
    hidden = np.maximum(0, np.dot(X, W1))  # ReLU activation function
    return np.dot(hidden, W2)


In [28]:
def encoder_block(X, num_heads=2):
    """Run ``X`` through a single Transformer-style encoder block.

    Steps: add sinusoidal positional encodings, apply multi-head
    self-attention, then a feed-forward network. Each of the two
    sub-layers is wrapped in a residual connection followed by layer
    normalization.

    Args:
        X: Array indexed as ``(seq_len, d_model)``.
        num_heads: Number of attention heads passed to the attention layer.

    Returns:
        A tuple ``(final_output, attn_weights)`` with the block output and
        the attention weights returned by ``multi_head_attention``.
    """
    # Inject position information into the raw input.
    embedded = X + positional_encoding(X.shape[0], X.shape[1])

    # Sub-layer 1: self-attention + residual + layer norm.
    mha_out, attn_weights = multi_head_attention(embedded, num_heads)
    attended = layer_norm(embedded + mha_out)

    # Sub-layer 2: feed-forward + residual + layer norm.
    final_output = layer_norm(attended + feed_forward(attended))

    return final_output, attn_weights


In [29]:
# Toy input: a "sentence" of 4 words, each represented by 8 features.
np.random.seed(0)  # pin the global RNG so the notebook is reproducible
X = np.random.random_sample((4, 8))  # uniform samples in [0, 1), shape (4, 8)



In [30]:
# Run the example input through one encoder block with two attention heads,
# keeping both the encoded output and the attention weights it returns.
encoded_output, attention_weights = encoder_block(X, num_heads=2)


In [32]:
import pandas as pd 

# Wrap the encoder output in a DataFrame so the notebook renders it as a table.
dataframe = pd.DataFrame(encoded_output)
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.322217,-0.703987,-0.62596,-1.116872,0.666319,-0.886659,2.116349,0.228593
1,0.530799,-0.939979,-0.3691,-1.179531,0.499107,-0.845872,2.070712,0.233863
2,0.305734,-0.837191,-0.630886,-1.016384,0.882324,-0.856878,2.06242,0.090862
3,0.130583,-0.775043,-0.76341,-0.855956,1.100317,-0.80403,2.050076,-0.082535


In [31]:
#