In [28]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (here the project's models/ and training/ packages) are re-imported before
# each cell runs and edits to their .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import torch
from models.tokenformer import TokenformerLayer
from models.transformer import TransformerModel
from training.train import train 
from training.train import train_with_scaling

In [6]:
# Report whether PyTorch can see a CUDA device, how many are visible, and the
# name of device 0 (or "No GPU" when CUDA is unavailable).
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU"

print("CUDA Available:", cuda_ok)
print("CUDA Device Count:", torch.cuda.device_count())
print("CUDA Device Name:", gpu_name)

CUDA Available: True
CUDA Device Count: 1
CUDA Device Name: NVIDIA GeForce RTX 3070


#### Liste des TokenFormers

In [None]:
# Parametric-token count of the smallest TokenFormer variant; every larger
# variant uses a multiple of this value.
token_num_init = 512

# Multiplier applied to token_num_init for each variant, smallest to largest
# (512, 2048, 4096, 8192 and 16384 parametric tokens respectively).
token_num_multipliers = (1, 4, 8, 16, 32)

# One configuration per TokenFormer variant. All variants share the same
# architecture settings and differ only in "token_num". Key order is kept
# stable because the dicts are printed as-is further down.
model_configs = [
    {
        "hidden_dim": 512,
        "num_heads": 1,
        "num_layers": 12,
        "max_seq_len": 32,
        "token_num": token_num_init * multiplier,
        "vocab_size": 10656,
    }
    for multiplier in token_num_multipliers
]

# Build each TokenFormer configuration once and print its trainable parameter count.
# NOTE(review): config["num_layers"] is not forwarded to TokenformerLayer here —
# confirm that this is intended.
for config in model_configs:
    layer_kwargs = {
        "hidden_size": config["hidden_dim"],
        "vocab_size": config["vocab_size"],
        "num_attention_heads": config["num_heads"],
        "max_seq_len": config["max_seq_len"],
        "token_num": config["token_num"],
    }
    model = TokenformerLayer(**layer_kwargs)

    # Add up the element counts of every parameter that requires gradients.
    trainable_params_auto = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_params_auto += param.numel()

    print(f"📊 Modèle : {config}")

    # Uncomment for a per-parameter breakdown:
    # for name, param in model.named_parameters():
    #     print(f"{name}: {param.numel():_} paramètres")

    print(f"🔹 PyTorch Trainable Params : {trainable_params_auto:_}\n")


📊 Modèle : {'hidden_dim': 512, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 512, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 13_560_224

📊 Modèle : {'hidden_dim': 512, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 2048, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 21_424_544

📊 Modèle : {'hidden_dim': 512, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 4096, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 31_910_304

📊 Modèle : {'hidden_dim': 512, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 8192, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 52_881_824

📊 Modèle : {'hidden_dim': 512, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 16384, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 94_824_864



#### Liste des Transformers

In [31]:
# (hidden_dim, num_heads, num_layers) of each baseline Transformer, smallest to largest.
transformer_shapes = [
    (256, 4, 10),
    (372, 6, 8),
    (480, 8, 8),
    (500, 10, 14),
    (612, 12, 18),
]

# One configuration dict per baseline Transformer. The sequence length and
# vocabulary size are the same for every entry; key order is kept stable
# because the dicts are printed as-is further down.
transformer_model_configs = [
    dict(
        hidden_dim=width,
        num_heads=heads,
        num_layers=depth,
        max_seq_len=32,
        vocab_size=10656,
    )
    for width, heads, depth in transformer_shapes
]

# Build each baseline Transformer configuration once and print its trainable
# parameter count.
# The loop must iterate transformer_model_configs: the TokenFormer list
# (model_configs) holds five entries with identical hidden_dim / num_heads /
# num_layers, so iterating it here would report the same Transformer five times.
for config in transformer_model_configs:
    model = TransformerModel(
        vocab_size=config["vocab_size"],
        hidden_dim=config["hidden_dim"],
        num_layers=config["num_layers"],
        num_heads=config["num_heads"],
        max_seq_len=config["max_seq_len"]
    )

    # Count only the parameters that require gradients.
    trainable_params_auto = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"📊 Modèle : {config}")
    print(f"🔹 PyTorch Trainable Params : {trainable_params_auto:_}\n")


📊 Modèle : {'hidden_dim': 256, 'num_heads': 4, 'num_layers': 10, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 13_372_320

📊 Modèle : {'hidden_dim': 372, 'num_heads': 6, 'num_layers': 8, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 21_274_176

📊 Modèle : {'hidden_dim': 480, 'num_heads': 8, 'num_layers': 8, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 32_424_096

📊 Modèle : {'hidden_dim': 500, 'num_heads': 10, 'num_layers': 14, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 52_773_656

📊 Modèle : {'hidden_dim': 612, 'num_heads': 12, 'num_layers': 18, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 94_117_896



# Training Transformers :

In [32]:
# Train every baseline Transformer configuration with the same hyperparameters.
# NOTE(review): in the recorded run the validation loss starts rising after the
# first few epochs for every configuration while the train loss keeps falling —
# consider fewer epochs or early stopping.
for config in transformer_model_configs:
    # Build a throwaway model solely to count its trainable parameters; the
    # count is used below to name the output files.
    model = TransformerModel(
        vocab_size=config["vocab_size"],
        hidden_dim=config["hidden_dim"],
        num_layers=config["num_layers"],
        num_heads=config["num_heads"],
        max_seq_len=config["max_seq_len"]
    )

    # Count only the parameters that require gradients.
    trainable_params_auto = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # train() is given hyperparameters only, never this object, so release it
    # now instead of keeping its weights in memory for the whole training run.
    del model

    # Derive the checkpoint / results file names from the parameter count.
    checkpoint_filename = f"checkpoint_{trainable_params_auto}.pth"
    results_filename = f"results_{trainable_params_auto}.csv"

    print(f"📊 Modèle : {config}")
    print(f"🔹 PyTorch Trainable Params : {trainable_params_auto:_}\n")

    # Launch training for this configuration.
    # NOTE(review): token_num is passed although use_tokenformer=False —
    # presumably ignored for the Transformer path; confirm in training.train.
    train(
        use_tokenformer=False,
        hidden_dim=config["hidden_dim"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        max_seq_len=config["max_seq_len"],
        batch_size=32,
        num_epochs=10,
        learning_rate=0.001,
        token_num=32,
        val_ratio=0.2,
        test_ratio=0.2,
        model_checkpoint_file_name=checkpoint_filename,
        model_results_file_name=results_filename
    )

📊 Modèle : {'hidden_dim': 256, 'num_heads': 4, 'num_layers': 10, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 13_372_320

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 13_372_320
Approximate Computational Cost (FLOPS): 377_487_360
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                              

Epoch 1/10 | Train Loss: 0.6059 | Val Loss: 0.2070 | Val Perplexity: 1.2300 | Time: 40.89s


                                                                              

Epoch 2/10 | Train Loss: 0.1866 | Val Loss: 0.2019 | Val Perplexity: 1.2237 | Time: 40.65s


                                                                              

Epoch 3/10 | Train Loss: 0.1626 | Val Loss: 0.2015 | Val Perplexity: 1.2232 | Time: 40.59s


                                                                              

Epoch 4/10 | Train Loss: 0.1360 | Val Loss: 0.2077 | Val Perplexity: 1.2309 | Time: 40.63s


                                                                               

Epoch 5/10 | Train Loss: 0.1076 | Val Loss: 0.2235 | Val Perplexity: 1.2504 | Time: 40.74s


                                                                               

Epoch 6/10 | Train Loss: 0.0811 | Val Loss: 0.2409 | Val Perplexity: 1.2724 | Time: 40.23s


                                                                               

Epoch 7/10 | Train Loss: 0.0603 | Val Loss: 0.2572 | Val Perplexity: 1.2933 | Time: 40.32s


                                                                               

Epoch 8/10 | Train Loss: 0.0460 | Val Loss: 0.2767 | Val Perplexity: 1.3187 | Time: 40.43s


                                                                               

Epoch 9/10 | Train Loss: 0.0358 | Val Loss: 0.2991 | Val Perplexity: 1.3486 | Time: 40.85s


                                                                                

Epoch 10/10 | Train Loss: 0.0304 | Val Loss: 0.3126 | Val Perplexity: 1.3670 | Time: 40.62s

Final evaluation on test set...
Test Loss: 0.3111 | Test Perplexity: 1.3649
Total Training Time: 405.94s
Final model saved: Saved_Models_Checkpoints\checkpoint_13372320.pth
📊 Modèle : {'hidden_dim': 372, 'num_heads': 6, 'num_layers': 8, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 21_274_176

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 21_274_176
Approximate Computational Cost (FLOPS): 923_369_472
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                              

Epoch 1/10 | Train Loss: 0.5558 | Val Loss: 0.2135 | Val Perplexity: 1.2380 | Time: 43.89s


                                                                              

Epoch 2/10 | Train Loss: 0.1871 | Val Loss: 0.2011 | Val Perplexity: 1.2228 | Time: 43.70s


                                                                              

Epoch 3/10 | Train Loss: 0.1561 | Val Loss: 0.2064 | Val Perplexity: 1.2293 | Time: 43.80s


                                                                               

Epoch 4/10 | Train Loss: 0.1255 | Val Loss: 0.2163 | Val Perplexity: 1.2414 | Time: 43.79s


                                                                               

Epoch 5/10 | Train Loss: 0.0933 | Val Loss: 0.2358 | Val Perplexity: 1.2660 | Time: 43.81s


                                                                               

Epoch 6/10 | Train Loss: 0.0660 | Val Loss: 0.2593 | Val Perplexity: 1.2960 | Time: 43.94s


                                                                               

Epoch 7/10 | Train Loss: 0.0471 | Val Loss: 0.2826 | Val Perplexity: 1.3265 | Time: 43.82s


                                                                               

Epoch 8/10 | Train Loss: 0.0352 | Val Loss: 0.3075 | Val Perplexity: 1.3601 | Time: 43.81s


                                                                               

Epoch 9/10 | Train Loss: 0.0287 | Val Loss: 0.3309 | Val Perplexity: 1.3922 | Time: 43.85s


                                                                                 

Epoch 10/10 | Train Loss: 0.0243 | Val Loss: 0.3476 | Val Perplexity: 1.4157 | Time: 43.79s

Final evaluation on test set...
Test Loss: 0.3443 | Test Perplexity: 1.4110
Total Training Time: 438.19s
Final model saved: Saved_Models_Checkpoints\checkpoint_21274176.pth
📊 Modèle : {'hidden_dim': 480, 'num_heads': 8, 'num_layers': 8, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 32_424_096

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 32_424_096
Approximate Computational Cost (FLOPS): 2_013_265_920
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                              

Epoch 1/10 | Train Loss: 0.5585 | Val Loss: 0.2259 | Val Perplexity: 1.2535 | Time: 56.42s


                                                                              

Epoch 2/10 | Train Loss: 0.2147 | Val Loss: 0.2277 | Val Perplexity: 1.2557 | Time: 56.35s


                                                                              

Epoch 3/10 | Train Loss: 0.1626 | Val Loss: 0.2262 | Val Perplexity: 1.2538 | Time: 56.25s


                                                                               

Epoch 4/10 | Train Loss: 0.1073 | Val Loss: 0.2533 | Val Perplexity: 1.2883 | Time: 56.16s


                                                                               

Epoch 5/10 | Train Loss: 0.0823 | Val Loss: 0.2853 | Val Perplexity: 1.3302 | Time: 56.29s


                                                                               

Epoch 6/10 | Train Loss: 0.0656 | Val Loss: 0.3092 | Val Perplexity: 1.3624 | Time: 56.25s


                                                                               

Epoch 7/10 | Train Loss: 0.0541 | Val Loss: 0.3350 | Val Perplexity: 1.3979 | Time: 56.18s


                                                                               

Epoch 8/10 | Train Loss: 0.0430 | Val Loss: 0.3660 | Val Perplexity: 1.4419 | Time: 56.21s


                                                                               

Epoch 9/10 | Train Loss: 0.0360 | Val Loss: 0.4054 | Val Perplexity: 1.5000 | Time: 56.18s


                                                                                 

Epoch 10/10 | Train Loss: 0.0318 | Val Loss: 0.4291 | Val Perplexity: 1.5358 | Time: 56.15s

Final evaluation on test set...
Test Loss: 0.4311 | Test Perplexity: 1.5390
Total Training Time: 562.43s
Final model saved: Saved_Models_Checkpoints\checkpoint_32424096.pth
📊 Modèle : {'hidden_dim': 500, 'num_heads': 10, 'num_layers': 14, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 52_773_656

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 52_773_656
Approximate Computational Cost (FLOPS): 4_766_720_000
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                              

Epoch 1/10 | Train Loss: 0.6610 | Val Loss: 0.2242 | Val Perplexity: 1.2513 | Time: 93.83s


                                                                              

Epoch 2/10 | Train Loss: 0.2720 | Val Loss: 0.3766 | Val Perplexity: 1.4573 | Time: 93.69s


                                                                              

Epoch 3/10 | Train Loss: 0.3334 | Val Loss: 0.2453 | Val Perplexity: 1.2780 | Time: 93.52s


                                                                              

Epoch 4/10 | Train Loss: 0.1765 | Val Loss: 0.2356 | Val Perplexity: 1.2656 | Time: 93.82s


                                                                               

Epoch 5/10 | Train Loss: 0.1305 | Val Loss: 0.2670 | Val Perplexity: 1.3060 | Time: 93.05s


                                                                               

Epoch 6/10 | Train Loss: 0.1132 | Val Loss: 0.2873 | Val Perplexity: 1.3328 | Time: 93.30s


                                                                               

Epoch 7/10 | Train Loss: 0.0993 | Val Loss: 0.3115 | Val Perplexity: 1.3655 | Time: 93.08s


                                                                               

Epoch 8/10 | Train Loss: 0.0790 | Val Loss: 0.3377 | Val Perplexity: 1.4018 | Time: 93.05s


                                                                               

Epoch 9/10 | Train Loss: 0.0665 | Val Loss: 0.3683 | Val Perplexity: 1.4453 | Time: 93.31s


                                                                                

Epoch 10/10 | Train Loss: 0.0573 | Val Loss: 0.4031 | Val Perplexity: 1.4964 | Time: 93.23s

Final evaluation on test set...
Test Loss: 0.4040 | Test Perplexity: 1.4979
Total Training Time: 933.89s
Final model saved: Saved_Models_Checkpoints\checkpoint_52773656.pth
📊 Modèle : {'hidden_dim': 612, 'num_heads': 12, 'num_layers': 18, 'max_seq_len': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 94_117_896

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 94_117_896
Approximate Computational Cost (FLOPS): 10_896_850_944
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                              

Epoch 1/10 | Train Loss: 0.8115 | Val Loss: 0.2350 | Val Perplexity: 1.2649 | Time: 155.49s


                                                                              

Epoch 2/10 | Train Loss: 1.3584 | Val Loss: 3.3948 | Val Perplexity: 29.8101 | Time: 155.34s


                                                                              

Epoch 3/10 | Train Loss: 1.2077 | Val Loss: 0.4330 | Val Perplexity: 1.5419 | Time: 154.94s


                                                                              

Epoch 4/10 | Train Loss: 0.3379 | Val Loss: 0.3626 | Val Perplexity: 1.4371 | Time: 154.96s


                                                                              

Epoch 5/10 | Train Loss: 0.2537 | Val Loss: 0.3788 | Val Perplexity: 1.4605 | Time: 154.64s


                                                                              

Epoch 6/10 | Train Loss: 0.2256 | Val Loss: 0.4002 | Val Perplexity: 1.4921 | Time: 154.58s


                                                                              

Epoch 7/10 | Train Loss: 0.2026 | Val Loss: 0.4202 | Val Perplexity: 1.5222 | Time: 154.21s


                                                                              

Epoch 8/10 | Train Loss: 0.1760 | Val Loss: 0.4401 | Val Perplexity: 1.5528 | Time: 154.61s


                                                                               

Epoch 9/10 | Train Loss: 0.1568 | Val Loss: 0.4726 | Val Perplexity: 1.6041 | Time: 154.62s


                                                                                

Epoch 10/10 | Train Loss: 0.1342 | Val Loss: 0.5077 | Val Perplexity: 1.6614 | Time: 154.47s

Final evaluation on test set...
Test Loss: 0.5096 | Test Perplexity: 1.6646
Total Training Time: 1547.84s
Final model saved: Saved_Models_Checkpoints\checkpoint_94117896.pth


# Training TokenFormers : 

### Scaling progressif
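On entraîne d'abord TokenFormer avec token_num = 512, puis on augmente progressivement le nombre de tokens paramétriques (512 → 2048 → 4096 → 8192 → 16384), en réentraînant le modèle après chaque agrandissement :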

In [29]:
# Progressive-scaling run: start with token_num_init parametric tokens, then
# grow the model step by step.
# NOTE(review): the recorded run of this cell reports nan for every train,
# validation and test loss; the cause is inside train_with_scaling / the model
# and is not visible from this notebook.
token_num_init = 512

# Tokens added at each scaling step. Per the recorded output the totals go
# 512 -> 2048 -> 4096 -> 8192 -> 16384.
tokens_added_per_step = [token_num_init * factor for factor in (3, 4, 8, 16)]

train_with_scaling(
    file_path='data/pokemon.txt',
    initial_token_num=token_num_init,
    scaling_steps=len(tokens_added_per_step),  # 4 steps, one per list entry
    new_tokens_per_step=tokens_added_per_step,
    hidden_dim=512,
    num_heads=1,
    max_seq_len=32,
    batch_size=32,
    num_epochs=2,
    learning_rate=0.001,
    val_ratio=0.1,
    test_ratio=0.1
)


Training on cuda
Vocab Size: 10656
Initial Token Num: 512
Trainable Parameters: 13_560_224


                                                                                    

Step 0 | Epoch 1/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 27.21s


                                                                                   

Step 0 | Epoch 2/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 26.92s

🔍 Evaluation finale après Scaling Step 0...
Final Evaluation for Step 0 | Test Loss: nan | Test Perplexity: nan
✅ Modèle sauvegardé : Saved_Models_Checkpoints\tokenformer_scaled_step_0.pth
✅ TokenFormer scalé à 2048 tokens.
🔼 Scaling step 1: token_num = 2048


                                                                                   

Step 1 | Epoch 1/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 40.57s


                                                                                   

Step 1 | Epoch 2/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 40.60s

🔍 Evaluation finale après Scaling Step 1...
Final Evaluation for Step 1 | Test Loss: nan | Test Perplexity: nan
✅ Modèle sauvegardé : Saved_Models_Checkpoints\tokenformer_scaled_step_1.pth
✅ TokenFormer scalé à 4096 tokens.
🔼 Scaling step 2: token_num = 4096


                                                                                   

Step 2 | Epoch 1/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 60.90s


                                                                                   

Step 2 | Epoch 2/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 60.71s

🔍 Evaluation finale après Scaling Step 2...
Final Evaluation for Step 2 | Test Loss: nan | Test Perplexity: nan
✅ Modèle sauvegardé : Saved_Models_Checkpoints\tokenformer_scaled_step_2.pth
✅ TokenFormer scalé à 8192 tokens.
🔼 Scaling step 3: token_num = 8192


                                                                                   

Step 3 | Epoch 1/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 99.75s


                                                                                   

Step 3 | Epoch 2/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 99.78s

🔍 Evaluation finale après Scaling Step 3...
Final Evaluation for Step 3 | Test Loss: nan | Test Perplexity: nan
✅ Modèle sauvegardé : Saved_Models_Checkpoints\tokenformer_scaled_step_3.pth
✅ TokenFormer scalé à 16384 tokens.
🔼 Scaling step 4: token_num = 16384


                                                                                   

Step 4 | Epoch 1/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 181.95s


                                                                                   

Step 4 | Epoch 2/2 | Train Loss: nan | Val Loss: nan | Val Perplexity: nan | Time: 178.98s

🔍 Evaluation finale après Scaling Step 4...
Final Evaluation for Step 4 | Test Loss: nan | Test Perplexity: nan


KeyboardInterrupt: 