In [1]:
import torch
import random
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split


In [2]:
def generate_induction_dataset(n, k=4, seed=None):
    """
    Génère des séquences de la forme [A, 1, B, 2, A, ?] où le modèle doit prédire '1'.
    
    Args:
        n: nombre de séquences
        k: nombre de clés différentes (A, B, ...)
        seed: pour reproductibilité

    Returns:
        X: [n, T, D] - embeddings one-hot
        y: [n] - target class (indice de la valeur à prédire)
        char_to_id, id_to_char: dictionnaires utiles
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Définir l'alphabet : A, B, ..., + valeurs : 1, 2, ...
    letters = [chr(ord('A') + i) for i in range(k)]
    values = [str(i + 1) for i in range(k)]
    special_token = '?'  # à compléter

    tokens = letters + values + [special_token]
    char_to_id = {c: i for i, c in enumerate(tokens)}
    id_to_char = {i: c for c, i in char_to_id.items()}
    D = len(tokens)
    eye = np.eye(D)

    X = []
    y = []

    for _ in range(n):
        keys = random.sample(letters, k)
        random.shuffle(values)
        mapping = list(zip(keys, values))  # e.g. A->3, B->1

        # On choisit une des paires à répéter
        target_key, target_value = random.choice(mapping)

        # Séquence = [A, 3, B, 1, ...] + [A, ?]
        sequence = []
        for k_, v_ in mapping:
            sequence.extend([k_, v_])
        sequence.extend([target_key, special_token])

        x_seq = [eye[char_to_id[tok]] for tok in sequence]
        X.append(x_seq)
        y.append(char_to_id[target_value])  # prédire la valeur (classe)

    return (
        torch.tensor(X, dtype=torch.float32),  # [n, T, D]
        torch.tensor(y, dtype=torch.long),     # [n]
        char_to_id,
        id_to_char
    )


In [77]:
def print_induction_sequences(X, y, id_to_char):
    """
    Affiche les séquences d'induction et la cible à prédire.
    
    Args:
        X: [n, T, D] - embeddings one-hot
        y: [n] - target ids (valeurs à prédire)
        id_to_char: dictionnaire id -> caractère
    """
    for i in range(len(X)):
        seq_tensor = X[i]  # [T, D]
        seq_ids = seq_tensor.argmax(dim=-1).tolist()
        seq_chars = [id_to_char[idx] for idx in seq_ids]
        target_char = id_to_char[y[i].item()]
        print(f"📜 Sequence {i+1}: {' '.join(seq_chars)}")
        print(f"🎯 Target: {target_char}")
        print("-" * 40)


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class AttentionOnlyBlock(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.attn_weights = None  # pour debug/visualisation

    def forward(self, x):
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(Q.size(-1))
        attn_weights = F.softmax(attn_scores, dim=-1)
        self.attn_weights = attn_weights.detach().cpu()  # pour debug
        out = torch.matmul(attn_weights, V)
        return self.norm(x + out)

class InductionTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, num_layers=2, num_classes=10):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.blocks = nn.ModuleList([AttentionOnlyBlock(d_model) for _ in range(num_layers)])
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):  # x: [B, T, D]
        x = self.embedding(x)
        x = self.pos_enc(x)
        for block in self.blocks:
            x = block(x)
        return self.classifier(x)  # [B, T, C] : logits par position


In [9]:
D = 5
T = 2*D+2

In [4]:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class AttentionOnlyBlock(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.attn_weights = None

    def forward(self, x):
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        attn = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(Q.size(-1))
        weights = F.softmax(attn, dim=-1)
        self.attn_weights = weights.detach().cpu()
        x = x + torch.matmul(weights, V)
        return self.norm(x)

class AttentionOnlyClassifier(nn.Module):
    def __init__(self, input_dim, d_model=128, num_layers=2, num_classes=10):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.layers = nn.ModuleList([AttentionOnlyBlock(d_model) for _ in range(num_layers)])
        self.output = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = x[:, -1, :]  # On ne garde que le dernier token
        return self.output(x)


In [5]:
def train_model(model, train_loader, test_loader, epochs=10, lr=1e-3, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                predicted = torch.argmax(preds, dim=1)
                correct += (predicted == yb).sum().item()
                total += yb.size(0)

        acc = correct / total
        print(f"Epoch {epoch}/{epochs} | Train Loss: {total_loss/len(train_loader):.4f} | Val Acc: {acc:.4f}")


In [6]:
from torch.utils.data import TensorDataset, DataLoader

# Hyperparamètres
T = 10
D = 5
n = 30000



In [10]:
X, y, char_to_id, id_to_char = generate_induction_dataset(n, D)

In [11]:
char_to_id

{'A': 0,
 'B': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 '1': 5,
 '2': 6,
 '3': 7,
 '4': 8,
 '5': 9,
 '?': 10}

In [12]:
X_train, X_test,  y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [16]:
input_dim = X_train.shape[-1]
num_classes = input_dim  # car lettres + chiffres sont one-hot encodés dans une seule dimension
model = AttentionOnlyClassifier(input_dim=input_dim, num_classes=num_classes, d_model=128, num_layers=2)

train_model(model, train_loader, test_loader, epochs=20)


Epoch 1/20 | Train Loss: 1.6293 | Val Acc: 0.2027
Epoch 2/20 | Train Loss: 1.6176 | Val Acc: 0.2028


KeyboardInterrupt: 

In [14]:
def show_predictions(model, data_loader, id_to_char, device='cuda' if torch.cuda.is_available() else 'cpu', num_batches=1):
    model.eval()
    model.to(device)
    vocab_size = len(id_to_char)

    with torch.no_grad():
        for i, (xb, yb) in enumerate(data_loader):
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)  # [B, num_classes]
            preds = torch.argmax(logits, dim=-1)  # [B]

            for j in range(xb.size(0)):
                sequence_ids = torch.argmax(xb[j][:, :vocab_size], dim=-1).cpu().numpy()  # [T]
                sequence = [id_to_char[i] for i in sequence_ids]

                print(f"🔡 Input sequence  : {''.join(sequence)}")
                print(f"🎯 Target letter   : {id_to_char[yb[j].item()]}")
                print(f"🔮 Predicted letter: {id_to_char[preds[j].item()]}")
                print("-" * 40)

            if i + 1 >= num_batches:
                break


In [17]:
show_predictions(model, test_loader, id_to_char, num_batches=2)


🔡 Input sequence  : C1E3B2A4D5B?
🎯 Target letter   : 2
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : A1E4D5B2C3D?
🎯 Target letter   : 5
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : D4B2A5C3E1D?
🎯 Target letter   : 4
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : A4D5B3C2E1C?
🎯 Target letter   : 2
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : B4A3E1D5C2A?
🎯 Target letter   : 3
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : E3C2D4B1A5E?
🎯 Target letter   : 3
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : A4D2E1B5C3C?
🎯 Target letter   : 3
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : E5D2B1A3C4C?
🎯 Target letter   : 4
🔮 Predicted letter: 2
----------------------------------------
🔡 Input sequence  : C3A5E2B1D4B?
🎯 Target letter   : 1
🔮

In [None]:
id_to_char

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}