In [2]:
import numpy as np
import random
import string
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

device(type='cuda')

In [3]:
def generate_dictionaries(k):
    """
    Build the lookup tables for an alphabet of k letters plus an empty token.

    The vocabulary is the first k uppercase letters ('A', 'B', ...) followed
    by '_' (the empty token), which therefore receives id k.

    Returns:
        char_to_id: dict mapping every vocabulary character to its integer id.
        id_to_char: the inverse mapping, id -> character.
        char_to_embed: dict mapping each of the k letters to its row of the
            k x k identity matrix (orthogonal one-hot embeddings). The empty
            token '_' has no embedding entry.
    """
    alphabet = [chr(ord('A') + offset) for offset in range(k)]
    vocabulary = alphabet + ['_']  # empty token goes last

    char_to_id = dict(zip(vocabulary, range(len(vocabulary))))
    id_to_char = dict(enumerate(vocabulary))

    # Iterating the identity matrix yields one row per letter, in order.
    one_hot_rows = np.eye(k)
    char_to_embed = dict(zip(alphabet, one_hot_rows))

    return char_to_id, id_to_char, char_to_embed

In [6]:
def generate_sequences(n, T, k):
    """
    Draw n random sequences of length T over the first k uppercase letters.

    Each position is sampled independently (with replacement) using the
    standard-library `random` module, so reproducibility depends on the state
    of that module's global generator.

    Returns:
        A list of n lists, each holding T single-character strings.
    """
    assert 1 <= k <= 26, "k doit être entre 1 et 26"
    symbols = list(string.ascii_uppercase[:k])

    batch = []
    for _ in range(n):
        batch.append(random.choices(symbols, k=T))
    return batch


In [8]:
def generate_dataset(n, T, D, seed=None):
    """
    Build a synthetic "copy after A" sequence-labelling dataset.

    Inputs are random sequences over the first D uppercase letters. In the
    target sequence, the letter that follows every 'A' except the first one
    is replaced by the letter that follows the first 'A'.
    Example: [C, A, B, A, C] -> [C, A, B, A, B]

    Args:
        n: number of sequences to generate.
        T: length of each sequence.
        D: alphabet size; also the width of the one-hot input embeddings.
        seed: optional seed. When given, both the standard-library `random`
            generator (used by generate_sequences) and NumPy's global
            generator are seeded, so the same seed yields the same dataset.

    Returns:
        X: float32 tensor of shape [n, T, D] holding one-hot letter embeddings.
        y: int64 tensor of shape [n, T] holding the target class ids.
        char_to_id: dict character -> id (includes the empty token '_').
        id_to_char: dict id -> character.
    """
    if seed is not None:
        # The sequences are sampled with random.choices, so seeding NumPy
        # alone would leave the dataset non-reproducible.
        random.seed(seed)
        np.random.seed(seed)

    # Lookup tables and one-hot embeddings
    char_to_id, id_to_char, char_to_embed = generate_dictionaries(D)

    sequences = generate_sequences(n, T, D)

    X = []
    y = []

    for seq in sequences:
        X.append([char_to_embed[char] for char in seq])

        seq_out = seq.copy()

        # Positions of every 'A' in the input sequence
        A_indices = [i for i, c in enumerate(seq) if c == 'A']

        # A substitution needs at least two 'A's and a letter after the first one.
        if len(A_indices) >= 2 and A_indices[0] + 1 < T:
            substitute_char = seq[A_indices[0] + 1]
            for idx in A_indices[1:]:
                # An 'A' in the last position has no successor to overwrite.
                if idx + 1 < T:
                    seq_out[idx + 1] = substitute_char

        y.append([char_to_id[c] for c in seq_out])

    # Go through a single NumPy array: torch.tensor on a nested list of
    # ndarrays is very slow. The explicit reshape keeps the documented shapes
    # even when n == 0 or T == 0.
    X = torch.from_numpy(
        np.asarray(X, dtype=np.float32).reshape(len(sequences), T, D)
    )                                                                   # [B, T, D]
    y = torch.tensor(y, dtype=torch.long).reshape(len(sequences), T)    # [B, T]
    return X, y, char_to_id, id_to_char


In [13]:
# Dataset configuration
n = 1000  # number of sequences to generate
T = 50    # length of each sequence
D = 5     # alphabet size, also the width of the one-hot inputs

# Seed every random generator the notebook relies on so that
# "Restart Kernel -> Run All" reproduces the same data and the same training run.
SEED = 42
random.seed(SEED)        # generate_sequences samples with random.choices
np.random.seed(SEED)
torch.manual_seed(SEED)  # weight initialisation and DataLoader shuffling

In [16]:
# Build the one-hot inputs X, the per-position target ids y, and the
# character <-> id lookup tables.
X, y, char_to_id, id_to_char = generate_dataset(n, T, D)

In [18]:
# Hold out 20% of the sequences for evaluation; the fixed random_state makes
# the split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Pair each input sequence with its target sequence.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create the DataLoaders: shuffled mini-batches of 30 for training.
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
# NOTE(review): no batch_size is passed here, so the DataLoader default applies
# (one sample per batch per the PyTorch docs) — confirm this is intended.
test_loader = DataLoader(test_dataset)

In [21]:
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold sin(position * frequency) and odd columns hold
    cos(position * frequency), with frequencies decaying geometrically from 1
    down towards 1/10000 across the feature dimension.

    Args:
        d_model: feature width of the embeddings (odd values are supported).
        max_len: largest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=100):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones,
        # so only the first d_model // 2 frequencies get a cosine column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so that model.to(device) moves it together
        # with the parameters. persistent=False keeps it out of state_dict,
        # so checkpoints saved without this entry still load.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)  # [1, max_len, d_model]

    def forward(self, x):
        """
        Add the encodings of the first x.size(1) positions to x.

        Args:
            x: tensor whose dimension 1 is the sequence axis and whose last
               dimension is d_model (e.g. [B, T, d_model]).

        Raises:
            RuntimeError: if the sequence is longer than max_len.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Without this check the slice below would be silently truncated
            # and then either fail to broadcast or (max_len == 1) broadcast
            # position 0 onto every step.
            raise RuntimeError(
                f"sequence length {seq_len} exceeds max_len={self.pe.size(1)} of PositionalEncoding"
            )
        # .to() is a no-op once the module lives on the same device as x; it is
        # kept so a module left on the CPU still accepts inputs from another device.
        return x + self.pe[:, :seq_len].to(x.device)

In [22]:
def token_accuracy(preds, targets):
    """
    Token-level accuracy: the fraction of positions whose highest-scoring
    class matches the target id.

    Args:
        preds: class scores with the classes on the last axis
               (train_model passes [B, T, C]).
        targets: target class ids, one per position (train_model passes [B, T]).

    Returns:
        The accuracy as a Python float.
    """
    hits = preds.argmax(dim=-1).eq(targets)  # boolean mask of correct positions
    return hits.float().mean().item()

def train_model(model, train_loader, test_loader, epochs=10, lr=1e-3, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Train a per-token classifier with Adam and cross-entropy, evaluating on
    test_loader after every epoch.

    Args:
        model: module mapping inputs [B, T, D] to class scores [B, T, C].
        train_loader: iterable of (inputs, targets) batches used for training.
        test_loader: iterable of (inputs, targets) batches used for evaluation.
        epochs: number of passes over train_loader.
        lr: Adam learning rate.
        device: device the model and the batches are moved to.

    Returns:
        train_loss_list: training loss of every batch, across all epochs.
        test_loss_list: evaluation loss of every batch, across all epochs.
        test_acc_list: token accuracy on test_loader, one value per epoch.

    The per-epoch averages (printed, and stored for the accuracy) are weighted
    by batch size, so a smaller final batch does not skew them. If a loader
    yields no batch, the corresponding average is NaN instead of raising
    ZeroDivisionError.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()  # expects [B, C, T] scores vs [B, T] targets

    train_loss_list = []
    test_loss_list = []
    test_acc_list = []

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss_sum = 0.0
        train_samples = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)  # xb: [B, T, D], yb: [B, T]
            preds = model(xb)                      # preds: [B, T, C]
            preds = preds.permute(0, 2, 1)         # -> [B, C, T] for CrossEntropyLoss
            loss = criterion(preds, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_size = xb.size(0)
            train_loss_list.append(batch_loss)
            # The criterion returns a batch mean; weight it by the batch size
            # so the epoch figure is a true per-sample mean.
            train_loss_sum += batch_loss * batch_size
            train_samples += batch_size

        # Validation at the end of the epoch
        model.eval()
        val_loss_sum = 0.0
        val_acc_sum = 0.0
        val_samples = 0
        with torch.no_grad():
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                batch_loss = criterion(preds.permute(0, 2, 1), yb).item()
                batch_size = xb.size(0)
                test_loss_list.append(batch_loss)

                val_loss_sum += batch_loss * batch_size
                val_acc_sum += token_accuracy(preds, yb) * batch_size
                val_samples += batch_size

        nan = float('nan')
        avg_train_loss = train_loss_sum / train_samples if train_samples else nan
        avg_val_loss = val_loss_sum / val_samples if val_samples else nan
        avg_val_acc = val_acc_sum / val_samples if val_samples else nan
        test_acc_list.append(avg_val_acc)

        print(f"Epoch {epoch}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {avg_val_acc:.4f}")

    return train_loss_list, test_loss_list, test_acc_list



In [23]:
class AttentionOnlyBlock(nn.Module):
    """
    Single-head self-attention block with a residual connection and LayerNorm
    (no feed-forward sub-layer, no output projection, no masking).

    After each forward pass the attention matrix is available, detached and on
    the CPU, in `attn_weights` for visualisation.
    """

    def __init__(self, d_model):
        super().__init__()
        # Bias-free query / key / value projections
        self.Wq = nn.Linear(d_model, d_model, bias=False)
        self.Wk = nn.Linear(d_model, d_model, bias=False)
        self.Wv = nn.Linear(d_model, d_model, bias=False)
        self.norm = nn.LayerNorm(d_model)
        self.attn_weights = None  # most recent attention matrix, set by forward()

    def forward(self, x):
        """Apply scaled dot-product self-attention to x, add x back and normalise."""
        queries, keys, values = self.Wq(x), self.Wk(x), self.Wv(x)

        # Scaled dot-product scores, normalised over the key axis
        scale = queries.size(-1) ** 0.5
        scores = queries @ keys.transpose(-2, -1) / scale
        weights = scores.softmax(dim=-1)

        # Keep a detached CPU copy for plotting
        self.attn_weights = weights.detach().cpu()

        return self.norm(weights @ values + x)

class AttentionOnlyTransformer(nn.Module):
    """
    Per-token classifier built from a stack of attention-only blocks.

    Pipeline: linear projection of the inputs to d_model -> positional
    encoding -> num_layers AttentionOnlyBlock layers -> linear classifier
    applied at every position.

    Args:
        input_dim: feature width of the inputs.
        num_classes: number of output classes per position.
        d_model: internal feature width.
        num_layers: number of attention blocks.
        max_len: value passed to PositionalEncoding as its max_len.
    """

    def __init__(self, input_dim, num_classes, d_model=128, num_layers=2, max_len=100):
        super().__init__()
        self.embedding_proj = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        blocks = [AttentionOnlyBlock(d_model) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return the classifier output computed at every position of x."""
        hidden = self.pos_encoder(self.embedding_proj(x))
        for block in self.layers:
            hidden = block(hidden)
        logits = self.classifier(hidden)
        return logits