<a href="https://colab.research.google.com/github/galenzo17/AI-personal-test/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Instalación de dependencias (en este caso, solo numpy)
!pip install numpy

# Importar numpy
import numpy as np

# Definir funciones auxiliares
def softmax(x):
    """Return the softmax of *x* computed along its last axis.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    normalizer = np.sum(weights, axis=-1, keepdims=True)
    return weights / normalizer

def positional_encoding(seq_len, d_model):
    """Build a sinusoidal positional-encoding table.

    Returns an array of shape ``(seq_len, d_model)`` in which even columns
    hold ``sin(pos * rate)`` and odd columns hold ``cos(pos * rate)``, with
    ``rate = 1 / 10000 ** (2 * (col // 2) / d_model)``.
    """
    positions = np.arange(seq_len)[:, np.newaxis]
    columns = np.arange(d_model)[np.newaxis, :]
    # Columns 2k and 2k + 1 share the same frequency.
    exponents = (2 * (columns // 2)) / np.float32(d_model)
    angles = positions * (1 / np.power(10000, exponents))
    # Even columns take the sine, odd columns take the cosine.
    return np.where(columns % 2 == 0, np.sin(angles), np.cos(angles))

def layernorm(x, epsilon=1e-6):
    """Normalize *x* along its last axis to zero mean and unit spread.

    ``epsilon`` is added to the standard deviation (not the variance) to
    avoid division by zero. No learned scale or shift is applied.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    spread = np.std(x, axis=-1, keepdims=True) + epsilon
    return centered / spread

class MultiHeadAttention:
    """Multi-head scaled dot-product attention (forward pass only).

    The projection matrices are drawn from a standard normal distribution.
    No bias terms and no attention mask are used.
    """

    def __init__(self, d_model, num_heads):
        """Create the projection weights.

        ``d_model`` must be divisible by ``num_heads``; each head then works
        on ``d_model // num_heads`` features.
        """
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        # Weights are drawn in the order Wq, Wk, Wv, dense.
        self.Wq, self.Wk, self.Wv, self.dense = (
            np.random.randn(d_model, d_model) for _ in range(4)
        )

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, depth)``."""
        batch, length, _ = x.shape
        per_head = x.reshape(batch, length, self.num_heads, self.depth)
        return per_head.transpose(0, 2, 1, 3)

    def forward(self, v, k, q):
        """Attend from ``q`` over ``k``/``v`` and return the projected result.

        Note the argument order: values, keys, queries. The result has shape
        ``(batch, query_len, d_model)``.
        """
        queries = self.split_heads(np.matmul(q, self.Wq))
        keys = self.split_heads(np.matmul(k, self.Wk))
        values = self.split_heads(np.matmul(v, self.Wv))

        # Dot-product scores, scaled by sqrt(depth) before the softmax.
        scores = np.matmul(queries, np.swapaxes(keys, -1, -2))
        scores = scores / np.sqrt(keys.shape[-1])
        weights = softmax(scores)

        # Weighted sum of values, then merge the heads back together.
        context = np.matmul(weights, values).transpose(0, 2, 1, 3)
        batch, length = context.shape[:2]
        merged = context.reshape(batch, length, self.d_model)

        return np.matmul(merged, self.dense)

class FeedForwardNetwork:
    """Two-layer position-wise feed-forward network with a ReLU in between."""

    def __init__(self, d_model, dff):
        """Draw standard-normal weights and zero biases.

        ``W1`` maps ``d_model -> dff`` and ``W2`` maps ``dff -> d_model``.
        """
        # W1 is drawn before W2.
        self.W1 = np.random.randn(d_model, dff)
        self.b1 = np.zeros((dff,))
        self.W2 = np.random.randn(dff, d_model)
        self.b2 = np.zeros((d_model,))

    def forward(self, x):
        """Return ``relu(x @ W1 + b1) @ W2 + b2``."""
        hidden = x @ self.W1 + self.b1
        activated = np.maximum(hidden, 0)  # ReLU
        return activated @ self.W2 + self.b2

class EncoderLayer:
    """One encoder layer: self-attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by
    ``layernorm``.
    """

    def __init__(self, d_model, num_heads, dff):
        """Create the attention and feed-forward sub-layers."""
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, dff)

    def forward(self, x):
        """Run *x* through both sub-layers and return the normalized result."""
        # Self-attention: x serves as values, keys and queries.
        attended = layernorm(x + self.mha.forward(x, x, x))
        return layernorm(attended + self.ffn.forward(attended))

class Encoder:
    """Stack of encoder layers on top of token and positional embeddings."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding):
        """Create the embedding table, positional table and encoder layers.

        Args:
            num_layers: Number of ``EncoderLayer`` instances to stack.
            d_model: Width of the embeddings and of every layer.
            num_heads: Attention heads per layer.
            dff: Hidden width of each feed-forward network.
            input_vocab_size: Number of rows in the embedding table.
            maximum_position_encoding: Longest sequence that ``forward``
                can encode.
        """
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = np.random.randn(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Integer array of shape ``(batch_size, seq_len)`` holding
                indices into the embedding table.

        Returns:
            Array of shape ``(batch_size, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of rows in the
                positional-encoding table.
        """
        seq_len = x.shape[1]
        max_len = self.pos_encoding.shape[0]
        if seq_len > max_len:
            # Without this check a one-row table would silently broadcast
            # the same position onto every token; longer tables would fail
            # with an opaque broadcasting error.
            raise ValueError(
                f"sequence length {seq_len} exceeds maximum position encoding {max_len}"
            )
        # Fancy indexing returns a copy, so the in-place add below does not
        # modify the embedding table.
        x = self.embedding[x]
        x += self.pos_encoding[:seq_len, :]
        for layer in self.enc_layers:
            x = layer.forward(x)
        return x

# Example usage
# Model hyperparameters
num_layers = 2
d_model = 512
num_heads = 8
dff = 2048
input_vocab_size = 8500
maximum_position_encoding = 10000

# Build an encoder instance
encoder = Encoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    maximum_position_encoding=maximum_position_encoding,
)

# Example input: random token ids of shape (batch_size, seq_len)
batch_size = 64
seq_len = 38
sample_input = np.random.randint(0, input_vocab_size, size=(batch_size, seq_len))

# Run the input through the encoder
output = encoder.forward(sample_input)
print(output.shape)  # Expected: (batch_size, seq_len, d_model)


