In [57]:
import numpy as np
import random
import string
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F


In [3]:
def generate_orthogonal_vectors(n):
    """Return the Q factor of the QR factorisation of a random n-by-n matrix.

    The random matrix is drawn with ``np.random.rand`` (global NumPy random
    state), factorised with ``np.linalg.qr``, and only the first factor is
    kept; the triangular factor is discarded.

    Args:
        n: side length of the square matrix.

    Returns:
        The ``(n, n)`` Q matrix produced by ``np.linalg.qr``.
    """
    q_factor = np.linalg.qr(np.random.rand(n, n))[0]
    return q_factor

In [11]:
# Quick sanity run: build a 3x3 matrix with generate_orthogonal_vectors.
v = generate_orthogonal_vectors(n=3)

In [None]:
def generate_sequences(n, T, k):
    """Draw ``n`` random sequences of ``T`` uppercase letters.

    Each sequence is sampled with replacement via ``random.choices`` from the
    first ``k`` letters of ``string.ascii_uppercase``.

    Args:
        n: number of sequences to generate.
        T: length of every sequence.
        k: alphabet size; must satisfy ``1 <= k <= 26``.

    Returns:
        A list of ``n`` lists, each containing ``T`` single-character strings.

    Raises:
        AssertionError: if ``k`` is outside the range [1, 26].
    """
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    symbols = list(string.ascii_uppercase[:k])
    batch = []
    for _ in range(n):
        batch.append(random.choices(symbols, k=T))
    return batch


In [23]:
def generate_dictionnary(k, empty_token='*'):
    """Build random orthonormal embeddings for ``k`` letters plus a padding token.

    A random ``(k+1, k+1)`` matrix is factorised with ``np.linalg.qr`` and the
    rows of the resulting Q factor are assigned, in order, to the first ``k``
    uppercase letters followed by ``empty_token``.

    Args:
        k: number of letters; must satisfy ``1 <= k <= 26``.
        empty_token: symbol used for padding. It must differ from the ``k``
            letters, otherwise it would overwrite one of them in the mapping.

    Returns:
        A pair ``(char_to_emb, emb_to_char)``:
        ``char_to_emb`` maps each symbol to its embedding vector (1-D array of
        length ``k + 1``); ``emb_to_char`` maps the embedding, rounded to 8
        decimals and converted to a tuple, back to its symbol. Lookups in
        ``emb_to_char`` must round the query vector the same way.

    Raises:
        AssertionError: if ``k`` is outside the range [1, 26].
        ValueError: if ``empty_token`` is one of the first ``k`` letters.
    """
    assert 1 <= k <= 26, "k doit être entre 1 et 26"

    alphabet = list(string.ascii_uppercase[:k])
    if empty_token in alphabet:
        # A colliding token would silently replace that letter's embedding and
        # leave the two returned dictionaries with different sizes.
        raise ValueError(
            f"empty_token {empty_token!r} collides with a letter of the alphabet"
        )

    # Letters + special padding symbol
    letters = alphabet + [empty_token]
    d = k + 1  # Dimension of the embedding space

    # Generate an orthonormal basis of dimension (k+1)
    random_matrix = np.random.rand(d, d)
    orthogonal_basis, _ = np.linalg.qr(random_matrix)

    # Forward and reverse lookup tables, filled in a single pass
    char_to_emb = {}
    emb_to_char = {}
    for char, vec in zip(letters, orthogonal_basis):
        char_to_emb[char] = vec
        emb_to_char[tuple(np.round(vec, 8))] = char

    return char_to_emb, emb_to_char

In [25]:
def compress_and_pad(sequence, empty_token='*'):
    """Collapse runs of identical consecutive items, then pad to the input length.

    Args:
        sequence: the symbols to compress; a falsy value yields ``[]``.
        empty_token: filler appended after the compressed symbols.

    Returns:
        A new list with the same length as ``sequence``: the de-duplicated
        symbols first, followed by as many ``empty_token`` entries as items
        were removed.
    """
    if not sequence:
        return []

    # Keep an item only when it differs from the last one that was kept
    deduped = []
    for position, symbol in enumerate(sequence):
        if position == 0 or symbol != deduped[-1]:
            deduped.append(symbol)

    # Fill the tail with empty tokens so the original length is preserved
    removed = len(sequence) - len(deduped)
    deduped.extend([empty_token] * removed)
    return deduped


In [47]:
def generate_dataset(n, T, k, empty_token='*'):
    """Generate embedded random sequences and their compressed, padded targets.

    For each of the ``n`` samples a sequence of ``T`` letters is drawn with
    ``random.choices`` from the ``k``-letter alphabet. The input is the
    sequence of letter embeddings; the target is the embedding of the
    sequence returned by ``compress_and_pad`` (consecutive duplicates removed,
    tail filled with ``empty_token``).

    Args:
        n: number of samples.
        T: length of every sequence.
        k: alphabet size, forwarded to ``generate_dictionnary``.
        empty_token: padding symbol, used consistently for the dictionary,
            the alphabet filter and the padding step.

    Returns:
        ``(X_tensor, y_tensor, char_to_emb, emb_to_char)`` where both tensors
        are float32 with shape ``(n, T, k + 1)`` and the two dictionaries are
        those returned by ``generate_dictionnary``.
    """
    # Forward the padding token: without it a non-default token would be
    # missing from the dictionary and the target lookup below would fail.
    char_to_emb, emb_to_char = generate_dictionnary(k, empty_token)
    alphabet = [c for c in char_to_emb if c != empty_token]
    emb_dim = len(next(iter(char_to_emb.values())))

    X_list = []
    y_list = []

    for _ in range(n):
        seq = random.choices(alphabet, k=T)
        compressed_seq = compress_and_pad(seq, empty_token)

        X_list.append([char_to_emb[c] for c in seq])
        y_list.append([char_to_emb[c] for c in compressed_seq])

    # Go through a single contiguous float32 array: building a tensor directly
    # from nested lists of ndarrays is slow. The explicit shape keeps the
    # result rank-3 even when there are no samples or the sequences are empty.
    shape = (len(X_list), max(T, 0), emb_dim)
    X_tensor = torch.from_numpy(np.asarray(X_list, dtype=np.float32).reshape(shape))
    y_tensor = torch.from_numpy(np.asarray(y_list, dtype=np.float32).reshape(shape))

    return X_tensor, y_tensor, char_to_emb, emb_to_char

In [48]:
# Seed the generators used by the data pipeline (generate_dataset draws from
# both the `random` module and NumPy's global RNG) so the dataset is the same
# on every Restart & Run All.
SEED = 42
N_SAMPLES = 10      # number of sequences
SEQ_LEN = 5         # length T of each sequence
ALPHABET_SIZE = 3   # number of letters k

random.seed(SEED)
np.random.seed(SEED)

X, y, char_to_emb, emb_to_char = generate_dataset(N_SAMPLES, SEQ_LEN, ALPHABET_SIZE)

In [52]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Pair inputs with targets, then wrap each split in a DataLoader.
# Only the training loader shuffles; both use a batch size of 32.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

In [55]:
class AttentionOnlyBlock(nn.Module):
    """Self-attention followed by a residual connection and layer normalisation.

    Args:
        embed_dim: embedding size, used for both the attention module and the
            LayerNorm.
        num_heads: number of heads given to ``nn.MultiheadAttention``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )
        self.norm = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` and normalise ``x + attention(x)``.

        The attention module is built with ``batch_first=True``, so batched
        input is laid out as (batch, sequence, embed_dim).
        """
        # Attention weights are not requested; keep only the attended values.
        attended = self.attn(x, x, x, need_weights=False)[0]
        return self.norm(x + attended)

class TransformerAttentionOnly(nn.Module):
    """Stack of ``AttentionOnlyBlock`` layers followed by a linear projection.

    The module contains only the attention blocks and the output layer; it
    adds no positional encoding of its own.

    Args:
        embed_dim: embedding size shared by every block and the output layer.
        num_heads: attention heads per block (default 1).
        num_layers: number of stacked blocks (default 2).
    """

    def __init__(self, embed_dim, num_heads=1, num_layers=2):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(AttentionOnlyBlock(embed_dim, num_heads))
        self.layers = nn.ModuleList(blocks)
        # Final projection back into the embedding space
        self.output = nn.Linear(in_features=embed_dim, out_features=embed_dim)

    def forward(self, x):
        """Run ``x`` through every block in order, then through the output layer."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.output(hidden)
