In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import torch
from models.tokenformer import TokenformerLayer
from training.train import train_with_scaling_ver2,train

In [3]:
# Sanity-check the GPU setup before training: CUDA availability, number of
# visible devices, and the name of device 0 ("No GPU" when CUDA is unavailable).
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)
print("CUDA Device Count:", torch.cuda.device_count())
if cuda_ready:
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "No GPU"
print("CUDA Device Name:", device_name)

CUDA Available: True
CUDA Device Count: 1
CUDA Device Name: NVIDIA GeForce RTX 3070


# TokenFormer with scaling

On va entraîner ces différentes configurations :

In [14]:
# TokenFormer variants to compare. Every entry shares the same hidden size,
# head count, sequence length and vocabulary size; only "token_num" changes,
# doubling from one entry to the next (2 -> 4 -> 8 -> 16 -> 32).
model_configs = [
    {"hidden_dim": 32, "num_heads": 1, "max_seq_len": 32, "token_num": 2, "vocab_size": 10656},
    {"hidden_dim": 32, "num_heads": 1, "max_seq_len": 32, "token_num": 4, "vocab_size": 10656},
    {"hidden_dim": 32, "num_heads": 1, "max_seq_len": 32, "token_num": 8, "vocab_size": 10656},
    {"hidden_dim": 32, "num_heads": 1, "max_seq_len": 32, "token_num": 16, "vocab_size": 10656},
    # Only this entry carries "num_layers". The loop below never reads that key,
    # so it appears in the printed config but is not passed to the model.
    {"hidden_dim": 32, "num_heads": 1, "num_layers": 12, "max_seq_len": 32, "token_num": 32, "vocab_size": 10656},
]

# Instantiate each variant and report how many trainable parameters it has.
for config in model_configs:
    model = TokenformerLayer(
        vocab_size=config["vocab_size"],
        hidden_size=config["hidden_dim"],
        num_attention_heads=config["num_heads"],
        token_num=config["token_num"],
        max_seq_len=config["max_seq_len"],
    )

    # Count only the parameters flagged as requiring gradients.
    trainable_params_auto = sum(
        param.numel() for param in model.parameters() if param.requires_grad
    )

    print(f"📊 Modèle : {config}")

    # Per-parameter breakdown (uncomment to inspect):
    # for name, param in model.named_parameters():
    #     print(f"{name}: {param.numel():_} paramètres")

    print(f"🔹 PyTorch Trainable Params : {trainable_params_auto:_}\n")


📊 Modèle : {'hidden_dim': 32, 'num_heads': 1, 'max_seq_len': 32, 'token_num': 2, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 694_304

📊 Modèle : {'hidden_dim': 32, 'num_heads': 1, 'max_seq_len': 32, 'token_num': 4, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 694_944

📊 Modèle : {'hidden_dim': 32, 'num_heads': 1, 'max_seq_len': 32, 'token_num': 8, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 696_224

📊 Modèle : {'hidden_dim': 32, 'num_heads': 1, 'max_seq_len': 32, 'token_num': 16, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 698_784

📊 Modèle : {'hidden_dim': 32, 'num_heads': 1, 'num_layers': 12, 'max_seq_len': 32, 'token_num': 32, 'vocab_size': 10656}
🔹 PyTorch Trainable Params : 703_904



In [8]:
# Progressive-scaling experiment: training starts with `token_num_init`
# parameter tokens and the token count is increased between training phases
# (the recorded log shows 2 -> 4 -> 8 -> 16 -> 32).
token_num_init = 2

train_with_scaling_ver2(
    # --- model shape ---
    hidden_dim=32,
    num_heads=1,
    max_seq_len=32,
    # --- scaling schedule ---
    initial_token_num=token_num_init,
    scaling_steps=4,
    # NOTE(review): five entries are given for four scaling steps; the last
    # value (32) may be unused -- confirm against train_with_scaling_ver2.
    new_tokens_per_step=[2, 4, 8, 16, 32],
    # --- optimisation ---
    batch_size=32,
    num_epochs=10,  # presumably epochs per scaling step (log shows 10 per step)
    learning_rate=0.001,
    # --- data ---
    subset_size=5000,
    val_ratio=0.1,
    test_ratio=0.1,
    # --- outputs ---
    model_base_name="tokenformer_scaled_3_2",
    results_file_name="scaling_results_2.csv",
)

Training on cuda


                                                                                   

Step 0 | Epoch 1 | Train Loss: 12.5833 | Val Loss: 12.0479 | Test Loss: 12.0457 | Epoch Time: 41.67s


                                                                                   

Step 0 | Epoch 2 | Train Loss: 11.1020 | Val Loss: 9.9436 | Test Loss: 9.9363 | Epoch Time: 42.81s


                                                                                  

Step 0 | Epoch 3 | Train Loss: 8.6607 | Val Loss: 7.5484 | Test Loss: 7.5314 | Epoch Time: 44.49s


                                                                                  

Step 0 | Epoch 4 | Train Loss: 7.0608 | Val Loss: 6.7724 | Test Loss: 6.7459 | Epoch Time: 41.76s


                                                                                  

Step 0 | Epoch 5 | Train Loss: 6.6787 | Val Loss: 6.6289 | Test Loss: 6.5988 | Epoch Time: 43.55s


                                                                                  

Step 0 | Epoch 6 | Train Loss: 6.6038 | Val Loss: 6.5973 | Test Loss: 6.5665 | Epoch Time: 44.00s


                                                                                  

Step 0 | Epoch 7 | Train Loss: 6.5812 | Val Loss: 6.5823 | Test Loss: 6.5536 | Epoch Time: 44.67s


                                                                                  

Step 0 | Epoch 8 | Train Loss: 6.5690 | Val Loss: 6.5740 | Test Loss: 6.5448 | Epoch Time: 42.70s


                                                                                  

Step 0 | Epoch 9 | Train Loss: 6.5613 | Val Loss: 6.5692 | Test Loss: 6.5399 | Epoch Time: 41.18s


                                                                                   

Step 0 | Epoch 10 | Train Loss: 6.5566 | Val Loss: 6.5643 | Test Loss: 6.5356 | Epoch Time: 41.19s
✅ TokenFormer scalé à 4 tokens.
🔼 Scaling step 1: token_num = 4


                                                                                  

Step 1 | Epoch 1 | Train Loss: 6.7736 | Val Loss: 6.6868 | Test Loss: 6.6609 | Epoch Time: 41.21s


                                                                                  

Step 1 | Epoch 2 | Train Loss: 6.6467 | Val Loss: 6.6286 | Test Loss: 6.6005 | Epoch Time: 41.21s


                                                                                  

Step 1 | Epoch 3 | Train Loss: 6.6097 | Val Loss: 6.6063 | Test Loss: 6.5770 | Epoch Time: 42.19s


                                                                                  

Step 1 | Epoch 4 | Train Loss: 6.5916 | Val Loss: 6.5937 | Test Loss: 6.5647 | Epoch Time: 42.38s


                                                                                  

Step 1 | Epoch 5 | Train Loss: 6.5796 | Val Loss: 6.5851 | Test Loss: 6.5561 | Epoch Time: 37.32s


                                                                                  

Step 1 | Epoch 6 | Train Loss: 6.5712 | Val Loss: 6.5792 | Test Loss: 6.5500 | Epoch Time: 37.95s


                                                                                  

Step 1 | Epoch 7 | Train Loss: 6.5657 | Val Loss: 6.5744 | Test Loss: 6.5452 | Epoch Time: 38.08s


                                                                                  

Step 1 | Epoch 8 | Train Loss: 6.5608 | Val Loss: 6.5710 | Test Loss: 6.5415 | Epoch Time: 38.73s


                                                                                  

Step 1 | Epoch 9 | Train Loss: 6.5568 | Val Loss: 6.5681 | Test Loss: 6.5391 | Epoch Time: 39.41s


                                                                                   

Step 1 | Epoch 10 | Train Loss: 6.5541 | Val Loss: 6.5658 | Test Loss: 6.5364 | Epoch Time: 46.30s
✅ TokenFormer scalé à 8 tokens.
🔼 Scaling step 2: token_num = 8


                                                                                  

Step 2 | Epoch 1 | Train Loss: 7.9353 | Val Loss: 7.5958 | Test Loss: 7.5784 | Epoch Time: 44.65s


                                                                                  

Step 2 | Epoch 2 | Train Loss: 7.3666 | Val Loss: 7.1868 | Test Loss: 7.1662 | Epoch Time: 44.58s


                                                                                  

Step 2 | Epoch 3 | Train Loss: 7.0557 | Val Loss: 6.9571 | Test Loss: 6.9338 | Epoch Time: 39.58s


                                                                                  

Step 2 | Epoch 4 | Train Loss: 6.8778 | Val Loss: 6.8238 | Test Loss: 6.7984 | Epoch Time: 38.80s


                                                                                  

Step 2 | Epoch 5 | Train Loss: 6.7724 | Val Loss: 6.7451 | Test Loss: 6.7182 | Epoch Time: 37.80s


                                                                                  

Step 2 | Epoch 6 | Train Loss: 6.7099 | Val Loss: 6.6976 | Test Loss: 6.6703 | Epoch Time: 41.96s


                                                                                  

Step 2 | Epoch 7 | Train Loss: 6.6712 | Val Loss: 6.6677 | Test Loss: 6.6395 | Epoch Time: 39.76s


                                                                                  

Step 2 | Epoch 8 | Train Loss: 6.6457 | Val Loss: 6.6469 | Test Loss: 6.6186 | Epoch Time: 39.46s


                                                                                  

Step 2 | Epoch 9 | Train Loss: 6.6269 | Val Loss: 6.6322 | Test Loss: 6.6035 | Epoch Time: 40.37s


                                                                                   

Step 2 | Epoch 10 | Train Loss: 6.6133 | Val Loss: 6.6209 | Test Loss: 6.5919 | Epoch Time: 39.74s
✅ TokenFormer scalé à 16 tokens.
🔼 Scaling step 3: token_num = 16


                                                                                  

Step 3 | Epoch 1 | Train Loss: 9.2801 | Val Loss: 9.0722 | Test Loss: 9.0593 | Epoch Time: 40.28s


                                                                                  

Step 3 | Epoch 2 | Train Loss: 8.8859 | Val Loss: 8.7299 | Test Loss: 8.7159 | Epoch Time: 39.38s


                                                                                  

Step 3 | Epoch 3 | Train Loss: 8.5758 | Val Loss: 8.4492 | Test Loss: 8.4341 | Epoch Time: 38.79s


                                                                                  

Step 3 | Epoch 4 | Train Loss: 8.3162 | Val Loss: 8.2104 | Test Loss: 8.1944 | Epoch Time: 39.46s


                                                                                  

Step 3 | Epoch 5 | Train Loss: 8.0937 | Val Loss: 8.0043 | Test Loss: 7.9873 | Epoch Time: 40.94s


                                                                                  

Step 3 | Epoch 6 | Train Loss: 7.9003 | Val Loss: 7.8250 | Test Loss: 7.8071 | Epoch Time: 43.59s


                                                                                  

Step 3 | Epoch 7 | Train Loss: 7.7321 | Val Loss: 7.6685 | Test Loss: 7.6498 | Epoch Time: 45.93s


                                                                                  

Step 3 | Epoch 8 | Train Loss: 7.5850 | Val Loss: 7.5318 | Test Loss: 7.5122 | Epoch Time: 40.72s


                                                                                  

Step 3 | Epoch 9 | Train Loss: 7.4565 | Val Loss: 7.4125 | Test Loss: 7.3921 | Epoch Time: 40.84s


                                                                                   

Step 3 | Epoch 10 | Train Loss: 7.3439 | Val Loss: 7.3083 | Test Loss: 7.2871 | Epoch Time: 41.23s
✅ TokenFormer scalé à 32 tokens.
🔼 Scaling step 4: token_num = 32


                                                                                  

Step 4 | Epoch 1 | Train Loss: 7.9488 | Val Loss: 7.8814 | Test Loss: 7.8632 | Epoch Time: 41.28s


                                                                                  

Step 4 | Epoch 2 | Train Loss: 7.7931 | Val Loss: 7.7359 | Test Loss: 7.7171 | Epoch Time: 41.70s


                                                                                  

Step 4 | Epoch 3 | Train Loss: 7.6607 | Val Loss: 7.6074 | Test Loss: 7.5879 | Epoch Time: 40.07s


                                                                                  

Step 4 | Epoch 4 | Train Loss: 7.5343 | Val Loss: 7.4941 | Test Loss: 7.4739 | Epoch Time: 39.93s


                                                                                  

Step 4 | Epoch 5 | Train Loss: 7.4270 | Val Loss: 7.3937 | Test Loss: 7.3731 | Epoch Time: 41.46s


                                                                                  

Step 4 | Epoch 6 | Train Loss: 7.3320 | Val Loss: 7.3050 | Test Loss: 7.2836 | Epoch Time: 42.45s


                                                                                  

Step 4 | Epoch 7 | Train Loss: 7.2552 | Val Loss: 7.2264 | Test Loss: 7.2044 | Epoch Time: 44.12s


                                                                                  

Step 4 | Epoch 8 | Train Loss: 7.1760 | Val Loss: 7.1568 | Test Loss: 7.1343 | Epoch Time: 41.49s


                                                                                  

Step 4 | Epoch 9 | Train Loss: 7.1074 | Val Loss: 7.0951 | Test Loss: 7.0719 | Epoch Time: 43.33s


                                                                                   

Step 4 | Epoch 10 | Train Loss: 7.0488 | Val Loss: 7.0404 | Test Loss: 7.0167 | Epoch Time: 46.20s
Final results saved in: scaling_results_2.csv


# Transformer

In [12]:
# Lancer l'entraînement
train(
    use_tokenformer=False,
    hidden_dim=32,
    num_heads=4,
    num_layers=16,
    max_seq_len=32,
    batch_size=32,
    num_epochs=50,
    learning_rate=0.001,
    token_num=32,
    val_ratio=0.2,
    test_ratio=0.2,
    model_checkpoint_file_name='transformer_ver2_openwebdataset',
    model_results_file_name='transformer_ver2_openwebdataset'
)

Training on cuda
Vocab Size: 10656
Model Type: Transformer
Trainable Parameters: 896_928
Approximate Computational Cost (FLOPS): 16_777_216
Training Samples: 53524, Validation Samples: 17841, Test Samples: 17841
Starting Training...


                                                                             

Epoch 1/50 | Train Loss: 5.8653 | Val Loss: 4.7655 | Val Perplexity: 117.3890 | Time: 60.81s


                                                                              

Epoch 2/50 | Train Loss: 3.0524 | Val Loss: 0.6965 | Val Perplexity: 2.0067 | Time: 59.89s


                                                                              

Epoch 3/50 | Train Loss: 0.4824 | Val Loss: 0.2150 | Val Perplexity: 1.2398 | Time: 61.33s


                                                                              

Epoch 4/50 | Train Loss: 0.2586 | Val Loss: 0.1902 | Val Perplexity: 1.2095 | Time: 60.39s


                                                                              

Epoch 5/50 | Train Loss: 0.2202 | Val Loss: 0.1827 | Val Perplexity: 1.2004 | Time: 60.34s


                                                                              

Epoch 6/50 | Train Loss: 0.2028 | Val Loss: 0.1820 | Val Perplexity: 1.1996 | Time: 58.78s


                                                                              

Epoch 7/50 | Train Loss: 0.1918 | Val Loss: 0.1785 | Val Perplexity: 1.1954 | Time: 59.09s


                                                                              

Epoch 8/50 | Train Loss: 0.1828 | Val Loss: 0.1772 | Val Perplexity: 1.1939 | Time: 59.45s


                                                                              

Epoch 9/50 | Train Loss: 0.1775 | Val Loss: 0.1763 | Val Perplexity: 1.1927 | Time: 59.61s


                                                                               

Epoch 10/50 | Train Loss: 0.1723 | Val Loss: 0.1774 | Val Perplexity: 1.1941 | Time: 60.64s


                                                                               

Epoch 11/50 | Train Loss: 0.1679 | Val Loss: 0.1759 | Val Perplexity: 1.1924 | Time: 59.83s


                                                                               

Epoch 12/50 | Train Loss: 0.1639 | Val Loss: 0.1760 | Val Perplexity: 1.1925 | Time: 59.48s


                                                                               

Epoch 13/50 | Train Loss: 0.1603 | Val Loss: 0.1773 | Val Perplexity: 1.1940 | Time: 59.27s


                                                                    

KeyboardInterrupt: 