In [57]:
import numpy as np
import random
import string
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F


In [152]:
def generate_orthogonal_vectors(n):
    random_vectors = np.random.rand(n, n)
    orthogonal_vectors, _ = np.linalg.qr(random_vectors)
    return orthogonal_vectors

In [153]:
v = generate_orthogonal_vectors(3)

In [154]:
def generate_sequences(n, T, k):
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    alphabet = list(string.ascii_uppercase[:k])
    sequences = [random.choices(alphabet, k=T) for _ in range(n)]
    return sequences


In [155]:
def generate_dictionnary(k, empty_token='*'):
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    
    # Lettres + symbole spécial
    letters = list(string.ascii_uppercase[:k]) + [empty_token]
    d = k + 1  # Dimension de l'espace
    
    # Générer une base orthonormée de dimension (k+1)
    random_matrix = np.random.rand(d, d)
    orthogonal_basis, _ = np.linalg.qr(random_matrix)
    
    # Dictionnaires
    char_to_emb = {char: vec for char, vec in zip(letters, orthogonal_basis)}
    emb_to_char = {tuple(np.round(vec, 8)): char for char, vec in zip(letters, orthogonal_basis)}
    
    return char_to_emb, emb_to_char

In [175]:
def compress_and_pad_left(sequence, empty_token='*'):
    if not sequence:
        return []

    # Supprimer les redondances consécutives
    compressed = [sequence[0]]
    for c in sequence[1:]:
        if c != compressed[-1]:
            compressed.append(c)

    # Ajout des tokens vides au début
    padding_length = len(sequence) - len(compressed)
    padded = [empty_token] * padding_length + compressed
    return padded


In [176]:
def generate_dataset(n, T, k, empty_token='*'):
    char_to_emb, emb_to_char = generate_dictionnary(k)
    alphabet = [c for c in char_to_emb if c != empty_token]
    emb_dim = len(next(iter(char_to_emb.values())))
    
    X_list = []
    y_list = []

    for _ in range(n):
        seq = random.choices(alphabet, k=T)
        emb_seq = [char_to_emb[c] for c in seq]
        
        compressed_seq = compress_and_pad_left(seq, empty_token)
        target_emb_seq = [char_to_emb[c] for c in compressed_seq]

        X_list.append(emb_seq)
        y_list.append(target_emb_seq)

    # Convertir en tensors (facultatif selon ton usage)
    X_tensor = torch.tensor(X_list, dtype=torch.float32)
    y_tensor = torch.tensor(y_list, dtype=torch.float32)

    return X_tensor, y_tensor, char_to_emb, emb_to_char

In [177]:
X, y, char_to_emb, emb_to_char = generate_dataset(20000, 2, 2)

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [181]:
X_train[1], y_train[1]

(tensor([[-0.9039,  0.4277,  0.0062],
         [-0.9039,  0.4277,  0.0062]]),
 tensor([[-0.3809, -0.8115,  0.4432],
         [-0.9039,  0.4277,  0.0062]]))

In [160]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Création des DataLoaders
train_loader = DataLoader(train_dataset, batch_size=300, shuffle=True)
test_loader = DataLoader(test_dataset)

In [183]:
class CustomAttentionOnlyBlock(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.Wq = nn.Linear(embed_dim, embed_dim, bias=False)  # Wq = Wk
        self.Wk = nn.Linear(embed_dim, embed_dim, bias=False)
        self.Wv = nn.Linear(embed_dim, embed_dim, bias=False)
    def forward(self, x):
        # V = x (identité)
        V = self.Wv(x)
        Q = self.Wq(x)
        K = self.Wk(x)

        # Attention scores : [B, T, T]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        attn_weights = torch.softmax(scores, dim=-1)

        # Attention output : [B, T, D]
        attn_output = torch.matmul(attn_weights, V)

        # Résidu
        return x + attn_output


In [184]:
class FinalDAProjection(nn.Module):
    def __init__(self, dim, activation='sigmoid'):
        super().__init__()
        self.Wq = nn.Linear(dim, dim, bias=False)
        self.Wk = nn.Linear(dim, dim, bias=False)
        self.V = nn.Linear(dim, dim, bias=False)


    def forward(self, x):
        # x : [B, T, D]
        Q = self.Wq(x)                    
        K = self.Wk(x)
        V = self.V(x)
        # Attention scores : [B, T, T]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        attn_output = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn_output, V)
        return torch.softmax(output, dim=-1)            


In [185]:
class CustomTransformerAttentionOnly(nn.Module):
    def __init__(self, embed_dim, num_layers=2):
        super().__init__()
        layers = [CustomAttentionOnlyBlock(embed_dim) for _ in range(num_layers - 1)]
        layers.append(FinalDAProjection(embed_dim))  # <-- ajout ici
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x  # [B, T] (si FinalDAProjection retourne un score par token)


In [186]:
loss_fn = nn.MSELoss()

In [187]:
def train_model(model, train_loader, test_loader, epochs=10, lr=1e-3, device='cpu'):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        model.eval()
        with torch.no_grad():
            total_val_loss = 0
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = loss_fn(pred, yb)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            pred = model(xb)  # [B, T, D]
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Évaluation

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


In [188]:
model = CustomTransformerAttentionOnly(embed_dim=X_train.shape[-1], num_layers=4)
train_model(model, train_loader, test_loader, epochs=100, lr=1e-3)

Epoch 1/100 | Train Loss: 0.4654 | Val Loss: 0.4826
Epoch 2/100 | Train Loss: 0.4375 | Val Loss: 0.4490
Epoch 3/100 | Train Loss: 0.4206 | Val Loss: 0.4269
Epoch 4/100 | Train Loss: 0.4094 | Val Loss: 0.4139
Epoch 5/100 | Train Loss: 0.4016 | Val Loss: 0.4047
Epoch 6/100 | Train Loss: 0.3967 | Val Loss: 0.3987
Epoch 7/100 | Train Loss: 0.3938 | Val Loss: 0.3949
Epoch 8/100 | Train Loss: 0.3908 | Val Loss: 0.3921
Epoch 9/100 | Train Loss: 0.3883 | Val Loss: 0.3895
Epoch 10/100 | Train Loss: 0.3851 | Val Loss: 0.3865
Epoch 11/100 | Train Loss: 0.3818 | Val Loss: 0.3832
Epoch 12/100 | Train Loss: 0.3781 | Val Loss: 0.3796
Epoch 13/100 | Train Loss: 0.3750 | Val Loss: 0.3764
Epoch 14/100 | Train Loss: 0.3729 | Val Loss: 0.3739
Epoch 15/100 | Train Loss: 0.3720 | Val Loss: 0.3724
Epoch 16/100 | Train Loss: 0.3716 | Val Loss: 0.3718
Epoch 17/100 | Train Loss: 0.3715 | Val Loss: 0.3716
Epoch 18/100 | Train Loss: 0.3717 | Val Loss: 0.3715
Epoch 19/100 | Train Loss: 0.3717 | Val Loss: 0.3714
Ep

KeyboardInterrupt: 

In [189]:
def evaluate_on_sequence(model, seq, char_to_emb, emb_to_char, empty_token='*', device='cpu'):
    model.eval()
    
    # 1. Encoder la séquence d'entrée
    emb_seq = [char_to_emb[c] for c in seq]
    x = torch.tensor(emb_seq, dtype=torch.float32).unsqueeze(0).to(device)  # [1, T, D]

    # 2. Prédiction
    with torch.no_grad():
        pred = model(x)[0]  # [T, D]
    
    print("Prédiction brute (embeddings) :", pred)

    # 3. Décoder la sortie
    def closest_embedding(vec):
        return min(emb_to_char.items(), key=lambda kv: np.linalg.norm(vec - np.array(kv[0])))[1]

    decoded_pred = [closest_embedding(p.cpu().numpy()) for p in pred]

    # 4. Cible attendue (compress + pad)
    target_seq = compress_and_pad_left(seq, empty_token)

    # 5. Affichage
    print("Séquence d’entrée      :", seq)
    print("Séquence compressée     :", target_seq)
    print("Séquence prédite (model):", decoded_pred)


In [190]:
# Exemple : test sur une séquence simple
sequence = ['B', 'B']
evaluate_on_sequence(model, sequence, char_to_emb, emb_to_char, empty_token='*')


Prédiction brute (embeddings) : tensor([[0.3349, 0.2225, 0.4425],
        [0.3349, 0.2225, 0.4425]])
Séquence d’entrée      : ['B', 'B']
Séquence compressée     : ['*', 'B']
Séquence prédite (model): ['*', '*']
