In [52]:
import random
import string
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [53]:
def generate_sequence(T: int, D: int) -> list:
    """
    Génère une séquence de longueur 2*T sous forme de liste :
    - Lettres parmi D lettres (A, B, C, ...)
    - Nombres parmi D nombres (0, 1, 2, ..., D-1)
    - Chaque lettre est associée toujours au même nombre.

    Args:
        T (int): nombre de paires lettre-nombre.
        D (int): nombre de lettres (et de nombres) possibles.

    Returns:
        list: séquence sous forme de liste de tokens (str pour lettres, str pour nombres).
    """
    letters = [chr(ord('A') + i) for i in range(D)]
    numbers = [str(i) for i in range(D)]  # Nombres en chaînes pour uniformiser

    # Dictionnaire d'association lettre -> nombre
    mapping = {}
    sequence = []

    for _ in range(T):
        letter = random.choice(letters)
        # Si cette lettre n'a pas encore d'association, lui en créer une
        if letter not in mapping:
            mapping[letter] = random.choice(numbers)
        number = mapping[letter]
        
        # Ajout dans la séquence : lettre suivie de son nombre associé
        sequence.append(letter)
        sequence.append(number)

    return sequence

# Exemple d'utilisation
if __name__ == "__main__":
    seq = generate_sequence(T=5, D=4)
    print(seq)  # Exemple : ['A', '2', 'B', '1', 'A', '2', 'C', '3', 'B', '1']


['B', '1', 'A', '1', 'C', '1', 'D', '2', 'B', '1']


In [70]:
import torch
import random

def generate_sequence(T: int, D: int) -> list:
    """
    Génère une séquence de 2*T tokens (lettre, nombre alternés).
    """
    letters = [chr(ord('A') + i) for i in range(D)]
    numbers = [str(i) for i in range(D)]

    mapping = {}
    sequence = []

    for _ in range(T):
        letter = random.choice(letters)
        if letter not in mapping:
            mapping[letter] = random.choice(numbers)
        number = mapping[letter]
        sequence.append(letter)
        sequence.append(number)
    
    return sequence

def build_vocab(D: int):
    """
    Construit le vocabulaire (lettres + nombres) et un dictionnaire token -> id.
    """
    vocab = [chr(ord('A') + i) for i in range(D)] + [str(i) for i in range(D)]
    token_to_idx = {token: idx for idx, token in enumerate(vocab)}
    idx_to_token = {idx: token for token, idx in token_to_idx.items()}
    return vocab, token_to_idx, idx_to_token
def create_dataset_onehot(T: int, D: int, token_to_idx=None):
    """
    Crée un X, y avec X = liste de one-hot vectors, y = id du dernier token.
    Si token_to_idx est donné, l'utiliser directement pour encoder.
    """
    # Générer la séquence
    sequence = generate_sequence(T, D)

    if token_to_idx is None:
        vocab, token_to_idx, idx_to_token = build_vocab(D)
    vocab_size = len(token_to_idx)

    # Créer la matrice identité pour one-hot encoding
    eye = torch.eye(vocab_size)

    # Convertir les tokens en one-hot vectors
    X_tokens = sequence[:-1]
    y_token = sequence[-1]

    # Sécurité : vérifier que le token existe
    assert y_token in token_to_idx, f"Token {y_token} not in vocab!"

    X_onehots = torch.stack([eye[token_to_idx[token]] for token in X_tokens])
    y_label = token_to_idx[y_token]

    return X_onehots, y_label, sequence





In [71]:
def generate_batch(batch_size: int, T: int, token_to_idx: dict):
    """
    Génère un batch de données :
    - batch_size exemples
    - Chaque exemple est une séquence de 2*T-1 one-hot vectors pour X
    - Et une étiquette y correspondant au dernier token à prédire.

    Args:
        batch_size (int): nombre d'exemples dans le batch
        T (int): nombre de paires lettre-nombre par séquence
        token_to_idx (dict): dictionnaire token → id

    Returns:
        X_batch (torch.Tensor): shape (batch_size, seq_len, vocab_size)
        y_batch (torch.Tensor): shape (batch_size,)
    """
    vocab_size = len(token_to_idx)
    eye = torch.eye(vocab_size)

    X_list = []
    y_list = []

    for _ in range(batch_size):
        sequence = generate_sequence(T, D=len(token_to_idx)//2)

        X_tokens = sequence[:-1]
        y_token = sequence[-1]

        # Vérifier que les tokens existent bien
        assert all(token in token_to_idx for token in X_tokens + [y_token]), "Token inconnu dans vocabulaire."

        X_onehots = torch.stack([eye[token_to_idx[token]] for token in X_tokens])
        y_label = token_to_idx[y_token]

        X_list.append(X_onehots)
        y_list.append(y_label)

    X_batch = torch.stack(X_list)
    y_batch = torch.tensor(y_list)

    return X_batch, y_batch


In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionOnlyBlock(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.Wq = nn.Linear(d_model, d_model, bias=False)
        self.Wk = nn.Linear(d_model, d_model, bias=False)
        self.Wv = nn.Linear(d_model, d_model, bias=False)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)

        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)

        attn_output = torch.matmul(attn_weights, V)
        return self.norm(attn_output + x)

class AttentionOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_layers=2, max_seq_len=60):
        super().__init__()
        self.embedding_proj = nn.Linear(vocab_size, d_model, bias=False)  # projection one-hot
        self.positional_embedding = nn.Embedding(max_seq_len, d_model)    # positional embeddings learnables
        self.layers = nn.ModuleList([AttentionOnlyBlock(d_model) for _ in range(num_layers)])
        self.classifier = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        
        x = self.embedding_proj(x)  # (batch_size, seq_len, d_model)
        
        # Ajouter l'embedding de position
        positions = torch.arange(seq_len, device=x.device)  # (seq_len,)
        pos_emb = self.positional_embedding(positions)      # (seq_len, d_model)
        pos_emb = pos_emb.unsqueeze(0)                      # (1, seq_len, d_model)
        x = x + pos_emb                                      # broadcasting sur le batch

        for layer in self.layers:
            x = layer(x)
        
        x_last = x[:, -1, :]  # prendre le dernier token pour prédiction
        logits = self.classifier(x_last)
        return logits



In [75]:
D = 4
vocab, token_to_idx, idx_to_token = build_vocab(D)

X, y = generate_batch(batch_size=64, T=5, token_to_idx=token_to_idx)

print(X.shape)  # (64, 9, 8) si T=5 et D=4
print(y.shape)  # (64,)



torch.Size([64, 9, 8])
torch.Size([64])


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        print(f"Vocab size (embedding): {model.classifier.out_features}")
        print(f"Logits shape: {logits.shape}")
        print(f"y_batch max: {y_batch.max().item()}, y_batch min: {y_batch.min().item()}")

        loss = F.cross_entropy(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            preds = torch.argmax(logits, dim=-1)

            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    
    return correct / total


In [78]:
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = F.cross_entropy(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            preds = torch.argmax(logits, dim=-1)

            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    
    return correct / total


In [79]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


vocab_size = 2 * D
model = AttentionOnlyTransformer(vocab_size=vocab_size, d_model=128, num_layers=2, max_seq_len=3*T).to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(100):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    acc = evaluate(model, test_loader, device)

    print(f"Epoch {epoch+1} - Loss: {train_loss:.4f} - Test Accuracy: {acc:.4f}")


../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [41,0,0],

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def show_predictions(model, dataloader, idx_to_token, device, n_examples=5):
    """
    Affiche quelques séquences du test et les prédictions du modèle.

    Args:
        model: ton modèle entraîné
        dataloader: le DataLoader de test
        idx_to_token: dictionnaire index → token (lettre ou chiffre)
        device: 'cuda' ou 'cpu'
        n_examples: nombre d'exemples à afficher
    """
    model.eval()

    X_batch, y_batch = next(iter(dataloader))  # Prendre un batch
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    with torch.no_grad():
        logits = model(X_batch)
        preds = torch.argmax(logits, dim=-1)

    for i in range(n_examples):
        X_seq = X_batch[i]  # (seq_len, vocab_size)
        y_true = y_batch[i].item()
        y_pred = preds[i].item()

        # Reconstruire la séquence (trouver les tokens non one-hot)
        tokens = []
        for vec in X_seq:
            idx = torch.argmax(vec).item()  # retrouver l'index du 1
            token = idx_to_token[idx]
            tokens.append(token)

        print(f"Input sequence : {' '.join(tokens)}")
        print(f"Prediction     : {idx_to_token[y_pred]}")
        print(f"Ground Truth   : {idx_to_token[y_true]}")
        print('-' * 50)


In [None]:
show_predictions(model, test_loader, idx_to_token, device, n_examples=5)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
