In [206]:
import numpy as np
import random
import string
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F


In [207]:
def generate_orthogonal_vectors(n):
    random_vectors = np.random.rand(n, n)
    orthogonal_vectors, _ = np.linalg.qr(random_vectors)
    return orthogonal_vectors

In [208]:
v = generate_orthogonal_vectors(3)

In [209]:
def generate_sequences(n, T, k):
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    alphabet = list(string.ascii_uppercase[:k])
    sequences = [random.choices(alphabet, k=T) for _ in range(n)]
    return sequences


In [210]:
def generate_dictionnary(k, empty_token='*'):
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    
    # Lettres + symbole spécial
    letters = list(string.ascii_uppercase[:k]) + [empty_token]
    d = k + 1  # Dimension de l'espace
    
    # Générer une base orthonormée de dimension (k+1)
    random_matrix = np.random.rand(d, d)
    orthogonal_basis, _ = np.linalg.qr(random_matrix)
    
    # Dictionnaires
    char_to_emb = {char: vec for char, vec in zip(letters, orthogonal_basis)}
    emb_to_char = {tuple(np.round(vec, 8)): char for char, vec in zip(letters, orthogonal_basis)}
    
    return char_to_emb, emb_to_char

In [211]:
def compress_and_pad_left(sequence, empty_token='*'):
    if not sequence:
        return []

    # Supprimer les redondances consécutives
    compressed = [sequence[0]]
    for c in sequence[1:]:
        if c != compressed[-1]:
            compressed.append(c)

    # Ajout des tokens vides au début
    padding_length = len(sequence) - len(compressed)
    padded = [empty_token] * padding_length + compressed
    return padded


In [212]:
def generate_dataset(n, T, k, empty_token='*'):
    char_to_emb, emb_to_char = generate_dictionnary(k)
    alphabet = [c for c in char_to_emb if c != empty_token]
    emb_dim = len(next(iter(char_to_emb.values())))
    
    X_list = []
    y_list = []

    for _ in range(n):
        seq = random.choices(alphabet, k=T)
        emb_seq = [char_to_emb[c] for c in seq]
        
        compressed_seq = compress_and_pad_left(seq, empty_token)
        target_emb_seq = [char_to_emb[c] for c in compressed_seq]

        X_list.append(emb_seq)
        y_list.append(target_emb_seq)

    # Convertir en tensors (facultatif selon ton usage)
    X_tensor = torch.tensor(X_list, dtype=torch.float32)
    y_tensor = torch.tensor(y_list, dtype=torch.float32)

    return X_tensor, y_tensor, char_to_emb, emb_to_char

In [213]:
X, y, char_to_emb, emb_to_char = generate_dataset(20000, 2, 2)

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [215]:
X_train[1], y_train[1]

(tensor([[-0.6827,  0.0768, -0.7266],
         [-0.6827,  0.0768, -0.7266]]),
 tensor([[-0.1490,  0.9589,  0.2414],
         [-0.6827,  0.0768, -0.7266]]))

In [216]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Création des DataLoaders
train_loader = DataLoader(train_dataset, batch_size=300, shuffle=True)
test_loader = DataLoader(test_dataset)

In [217]:
class CustomAttentionOnlyBlock(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.Wq = nn.Linear(embed_dim, embed_dim, bias=False)  # Wq = Wk
        self.Wk = nn.Linear(embed_dim, embed_dim, bias=False)
        self.Wv = nn.Linear(embed_dim, embed_dim, bias=False)
    def forward(self, x):
        # V = x (identité)
        V = self.Wv(x)
        Q = self.Wq(x)
        K = self.Wk(x)

        # Attention scores : [B, T, T]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        attn_weights = torch.softmax(scores, dim=-1)

        # Attention output : [B, T, D]
        attn_output = torch.matmul(attn_weights, V)

        # Résidu
        return x + attn_output


In [218]:
class FinalDAProjection(nn.Module):
    def __init__(self, dim, activation='sigmoid'):
        super().__init__()
        self.Wq = nn.Linear(dim, dim, bias=False)
        self.Wk = nn.Linear(dim, dim, bias=False)
        self.V = nn.Linear(dim, dim, bias=False)


    def forward(self, x):
        # x : [B, T, D]
        Q = self.Wq(x)                    
        K = self.Wk(x)
        V = self.V(x)
        # Attention scores : [B, T, T]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        attn_output = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn_output, V)
        return output

In [219]:
class CustomTransformerAttentionOnly(nn.Module):
    def __init__(self, embed_dim, num_layers=2):
        super().__init__()
        layers = [CustomAttentionOnlyBlock(embed_dim) for _ in range(num_layers - 1)]
        layers.append(FinalDAProjection(embed_dim))  # <-- ajout ici
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x  # [B, T] (si FinalDAProjection retourne un score par token)


In [220]:
loss_fn = nn.MSELoss()

In [221]:
def train_model(model, train_loader, test_loader, epochs=10, lr=1e-3, device='cpu'):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        model.eval()
        with torch.no_grad():
            total_val_loss = 0
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = loss_fn(pred, yb)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            pred = model(xb)  # [B, T, D]
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Évaluation

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


In [230]:
model = CustomTransformerAttentionOnly(embed_dim=X_train.shape[-1], num_layers=2)
train_model(model, train_loader, test_loader, epochs=20, lr=1e-3)

Epoch 1/20 | Train Loss: 0.2565 | Val Loss: 0.2674
Epoch 2/20 | Train Loss: 0.2393 | Val Loss: 0.2464
Epoch 3/20 | Train Loss: 0.2277 | Val Loss: 0.2326
Epoch 4/20 | Train Loss: 0.2186 | Val Loss: 0.2227
Epoch 5/20 | Train Loss: 0.2091 | Val Loss: 0.2140
Epoch 6/20 | Train Loss: 0.1930 | Val Loss: 0.2025
Epoch 7/20 | Train Loss: 0.1674 | Val Loss: 0.1805
Epoch 8/20 | Train Loss: 0.1424 | Val Loss: 0.1533
Epoch 9/20 | Train Loss: 0.1237 | Val Loss: 0.1299
Epoch 10/20 | Train Loss: 0.1164 | Val Loss: 0.1169
Epoch 11/20 | Train Loss: 0.1150 | Val Loss: 0.1140
Epoch 12/20 | Train Loss: 0.1148 | Val Loss: 0.1134
Epoch 13/20 | Train Loss: 0.1146 | Val Loss: 0.1132
Epoch 14/20 | Train Loss: 0.1146 | Val Loss: 0.1132
Epoch 15/20 | Train Loss: 0.1147 | Val Loss: 0.1131
Epoch 16/20 | Train Loss: 0.1147 | Val Loss: 0.1131
Epoch 17/20 | Train Loss: 0.1146 | Val Loss: 0.1131
Epoch 18/20 | Train Loss: 0.1146 | Val Loss: 0.1131
Epoch 19/20 | Train Loss: 0.1146 | Val Loss: 0.1131
Epoch 20/20 | Train L

In [231]:
def evaluate_on_sequence(model, seq, char_to_emb, emb_to_char, empty_token='*', device='cpu'):
    model.eval()
    
    # 1. Encoder la séquence d'entrée
    emb_seq = [char_to_emb[c] for c in seq]
    print("emb_seq", emb_seq)
    x = torch.tensor(emb_seq, dtype=torch.float32).unsqueeze(0).to(device)  # [1, T, D]

    # 2. Prédiction
    with torch.no_grad():
        pred = model(x)[0]  # [T, D]
    
    print("Prédiction brute (embeddings) :", pred)

    # 3. Décoder la sortie
    def closest_embedding(vec):
        return min(emb_to_char.items(), key=lambda kv: np.linalg.norm(vec - np.array(kv[0])))[1]

    decoded_pred = [closest_embedding(p.cpu().numpy()) for p in pred]

    # 4. Cible attendue (compress + pad)
    target_seq = compress_and_pad_left(seq, empty_token)

    # 5. Affichage
    print("Séquence d’entrée      :", seq)
    print("Séquence compressée     :", target_seq)
    print("Séquence prédite (model):", decoded_pred)


In [233]:
# Exemple : test sur une séquence simple
sequence = ['A', 'A']
evaluate_on_sequence(model, sequence, char_to_emb, emb_to_char, empty_token='*')


emb_seq [array([-0.68270758,  0.07681195, -0.72664316]), array([-0.68270758,  0.07681195, -0.72664316])]
Prédiction brute (embeddings) : tensor([[-0.5572,  0.2461, -0.3147],
        [-0.5572,  0.2461, -0.3147]])
Séquence d’entrée      : ['A', 'A']
Séquence compressée     : ['*', 'A']
Séquence prédite (model): ['A', 'A']
