# Model Size Benchmarking with Memory Profiling


## Размеры моделей для тестирования:
- **small**: d_model=768, d_ff=3072, num_layers=12, num_heads=12
- **medium**: d_model=1024, d_ff=4096, num_layers=24, num_heads=16  
- **large**: d_model=1280, d_ff=5120, num_layers=36, num_heads=20
- **xl**: d_model=1600, d_ff=6400, num_layers=48, num_heads=25
- **2.7B**: d_model=2560, d_ff=10240, num_layers=32, num_heads=32


In [3]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cs336_systems.benchmarking import benchmarking_script
import warnings
warnings.filterwarnings('ignore')

plt.style.use('default')
sns.set_palette("husl")

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


CUDA available: True
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
GPU Memory: 8.6 GB


In [4]:
model_configs = {
    "small": {
        "d_model": 768,
        "d_ff": 3072,
        "num_layers": 12,
        "num_heads": 1
    },
    "medium": {
        "d_model": 1024,
        "d_ff": 4096,
        "num_layers": 24,
        "num_heads": 1
    },
    # "large": {
    #     "d_model": 1280,
    #     "d_ff": 5120,
    #     "num_layers": 36,
    #     "num_heads": 1
    # },
    # "xl": {
    #     "d_model": 1600,
    #     "d_ff": 6400,
    #     "num_layers": 48,
    #     "num_heads": 25
    # },
    # "2.7B": {
    #     "d_model": 2560,
    #     "d_ff": 10240,
    #     "num_layers": 32,
    #     "num_heads": 32
    # }
}
common_params = {
    "context_length": 128,
    "vocab_size": 10000,
    "batch_size": 2, 
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "warmup_steps": 3,
    "num_steps": 10,
    "rope_theta": 10000.0,
    "profile_memory": True
}

print("Конфигурации моделей загружены:")
for name, config in model_configs.items():
    print(f"{name}: {config}")


Конфигурации моделей загружены:
small: {'d_model': 768, 'd_ff': 3072, 'num_layers': 12, 'num_heads': 1}
medium: {'d_model': 1024, 'd_ff': 4096, 'num_layers': 24, 'num_heads': 1}


In [None]:
warmup_results = {}


for model_name, config in model_configs.items():
    print(f"\n🔥 Тестирование модели: {model_name.upper()}")
    print(f"Параметры: d_model={config['d_model']}, layers={config['num_layers']}, heads={config['num_heads']}")
    print("-" * 60)
    
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        
        results = benchmarking_script(
            **config,
            **common_params
        )
        
        warmup_results[model_name] = results
        
        print(f"✅ Модель {model_name} завершена:")
        print(f"   Forward: {results['forward_time_per_step']*1000:.2f} ms/step")
        print(f"   Backward: {results['backward_time_per_step']*1000:.2f} ms/step")
        print(f"   Параметры: {results['model_params']/1e6:.1f}M")
        
        if 'memory_info' in results and results['memory_info']:
            mem_info = results['memory_info']
            print(f"   GPU Memory: {mem_info['max_memory_allocated']:.2f} GB")
        
    except Exception as e:
        print(f"❌ Ошибка при тестировании {model_name}: {str(e)}")
        continue




🔥 Тестирование модели: SMALL
Параметры: d_model=768, layers=12, heads=1
------------------------------------------------------------
✅ Модель small завершена:
   Forward: 23.85 ms/step
   Backward: 32.27 ms/step
   Параметры: 120.9M
   GPU Memory: 1.11 GB

🔥 Тестирование модели: MEDIUM
Параметры: d_model=1024, layers=24, heads=1
------------------------------------------------------------
✅ Модель medium завершена:
   Forward: 52.53 ms/step
   Backward: 74.20 ms/step
   Параметры: 412.9M
   GPU Memory: 3.47 GB


: 

In [5]:
print('Тестирование с torch.compile')
for model_name, config in model_configs.items():
    print(f"\n🔥 Тестирование модели: {model_name.upper()}")
    print(f"Параметры: d_model={config['d_model']}, layers={config['num_layers']}, heads={config['num_heads']}")
    print("-" * 60)
    
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        
        results = benchmarking_script(
            **config,
            **common_params,
            torch_compile=True
        )
        
        warmup_results[model_name] = results
        
        print(f"✅ Модель {model_name} завершена:")
        print(f"   Forward: {results['forward_time_per_step']*1000:.2f} ms/step")
        print(f"   Backward: {results['backward_time_per_step']*1000:.2f} ms/step")
        print(f"   Параметры: {results['model_params']/1e6:.1f}M")
        
        if 'memory_info' in results and results['memory_info']:
            mem_info = results['memory_info']
            print(f"   GPU Memory: {mem_info['max_memory_allocated']:.2f} GB")
        
    except Exception as e:
        print(f"❌ Ошибка при тестировании {model_name}: {str(e)}")
        continue

Тестирование с torch.compile

🔥 Тестирование модели: SMALL
Параметры: d_model=768, layers=12, heads=1
------------------------------------------------------------


I0817 11:22:59.071715 335433 .venv/lib/python3.11/site-packages/torch/_inductor/config.py:613] compile_threads set to 1 via env


: 

: 

: 

## Анализ результатов
Создадим таблицы и графики для сравнения производительности.
