### 1. GPU环境验证脚本 (PyTorch)

In [0]:
# GPU环境基础验证
import subprocess
import sys

# 检查NVIDIA驱动
print("=== NVIDIA驱动信息 ===")
!nvidia-smi

# 检查CUDA
print("\n=== CUDA版本 ===")
!nvcc --version

# 检查Python环境
print("\n=== Python包检查 ===")
import pkg_resources
for package in ['tensorflow', 'torch', 'pandas', 'numpy']:
    try:
        dist = pkg_resources.get_distribution(package)
        print(f"{package}: {dist.version}")
    except:
        print(f"{package}: 未安装")


# 检查GPU是否被PyTorch识别
print("\n=== PyTorch GPU检测 ===")
try:
    import torch
    print(f"PyTorch版本: {torch.__version__}")
    print(f"CUDA可用: {torch.cuda.is_available()}")
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"GPU数量: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
except ImportError:
    print("PyTorch未安装")

=== NVIDIA驱动信息 ===
Thu Nov  6 06:33:11 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           Off | 00000001:00:00.0 Off |                  Off |
| N/A   28C    P0              35W / 250W |   3260MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                      

### 2. GPU性能基准测试脚本

In [0]:
# GPU性能基准测试
import time
import numpy as np

def gpu_benchmark():
    """运行简单的GPU性能测试"""
    
    # TensorFlow矩阵运算测试
    print("=== TensorFlow GPU性能测试 ===")
    try:
        import tensorflow as tf
        
        # 创建大矩阵
        size = 10000
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        
        # GPU矩阵乘法
        start_time = time.time()
        c = tf.matmul(a, b)
        # 强制计算
        _ = c.numpy()
        tf_time = time.time() - start_time
        print(f"TensorFlow 10000x10000矩阵乘法: {tf_time:.2f}秒")
        
    except Exception as e:
        print(f"TensorFlow测试失败: {e}")
    
    # PyTorch性能测试
    print("\n=== PyTorch GPU性能测试 ===")
    try:
        import torch
        
        # 移动到GPU
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        size = 10000
        
        a = torch.randn(size, size, device=device)
        b = torch.randn(size, size, device=device)
        
        # 预热GPU
        for _ in range(10):
            _ = torch.mm(a, b)
        
        torch.cuda.synchronize()  # 等待GPU完成
        
        # 正式测试
        start_time = time.time()
        for _ in range(10):
            c = torch.mm(a, b)
        torch.cuda.synchronize()
        torch_time = (time.time() - start_time) / 10
        
        print(f"PyTorch 10000x10000矩阵乘法(平均): {torch_time:.2f}秒")
        
        # 显存使用测试
        print(f"GPU显存使用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        
    except Exception as e:
        print(f"PyTorch测试失败: {e}")

# 运行基准测试
gpu_benchmark()

=== TensorFlow GPU性能测试 ===
TensorFlow 10000x10000矩阵乘法: 5.42秒

=== PyTorch GPU性能测试 ===
PyTorch 10000x10000矩阵乘法(平均): 0.15秒
GPU显存使用: 1.13 GB


In [0]:
# 深度学习模型训练验证 - PyTorch版本
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time

class SimpleCNN(nn.Module):
    """简单的CNN模型用于验证"""
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((4, 4))
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

def training_demo():
    """训练演示"""
    print("=== PyTorch深度学习训练演示 ===")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"训练设备: {device}")
    
    # 创建模拟数据 (CIFAR-10尺寸)
    batch_size = 128
    num_samples = 10000
    num_epochs = 100
    
    # 生成随机训练数据
    print("生成模拟数据...")
    x_train = torch.randn(num_samples, 3, 32, 32)
    y_train = torch.randint(0, 10, (num_samples,))
    
    dataset = TensorDataset(x_train, y_train)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # 创建模型
    model = SimpleCNN(num_classes=10)
    model = model.to(device)
    
    # 损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")
    print(f"开始训练 ({num_epochs}个epochs)...")
    
    # 训练循环
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        epoch_start = time.time()
        
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            # 前向传播
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        
        epoch_time = time.time() - epoch_start
        accuracy = 100. * correct / total
        
        print(f'Epoch [{epoch+1}/{num_epochs}] - '
              f'Loss: {running_loss/len(dataloader):.4f} - '
              f'Acc: {accuracy:.2f}% - '
              f'Time: {epoch_time:.2f}s - '
              f'GPU Mem: {torch.cuda.memory_allocated()/1024**3:.2f}GB')
    
    print("\n=== 训练完成 ===")
    print(f"峰值显存使用: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")

# 运行训练演示
training_demo()

=== PyTorch深度学习训练演示 ===
训练设备: cuda
生成模拟数据...
模型参数量: 2,473,610
开始训练 (100个epochs)...
Epoch [1/100] - Loss: 2.3040 - Acc: 10.18% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [2/100] - Loss: 2.3030 - Acc: 9.72% - Time: 0.51s - GPU Mem: 0.07GB
Epoch [3/100] - Loss: 2.3022 - Acc: 10.05% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [4/100] - Loss: 2.3022 - Acc: 10.27% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [5/100] - Loss: 2.3022 - Acc: 10.46% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [6/100] - Loss: 2.3024 - Acc: 10.28% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [7/100] - Loss: 2.3026 - Acc: 10.45% - Time: 0.53s - GPU Mem: 0.07GB
Epoch [8/100] - Loss: 2.3024 - Acc: 10.02% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [9/100] - Loss: 2.3023 - Acc: 10.46% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [10/100] - Loss: 2.3023 - Acc: 10.46% - Time: 0.54s - GPU Mem: 0.07GB
Epoch [11/100] - Loss: 2.3024 - Acc: 10.12% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [12/100] - Loss: 2.3023 - Acc: 10.45% - Time: 0.52s - GPU Mem: 0.07GB
Epoch [13/100] 

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset, DistributedSampler
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
import numpy as np
import time
import os
from datetime import datetime

# 创建必要的目录
def setup_directories():
    """创建模型保存目录"""
    base_path = "/mnt/gpu_test"
    directories = [
        base_path,
        f"{base_path}/models",
        f"{base_path}/checkpoints",
        f"{base_path}/logs"
    ]
    
    for directory in directories:
        try:
            os.makedirs(directory, exist_ok=True)
            print(f"确保目录存在: {directory}")
        except Exception as e:
            print(f"创建目录 {directory} 时出错: {e}")
    
    return base_path

class AdvancedCNN(nn.Module):
    """改进的CNN模型，使用更现代的结构"""
    def __init__(self, num_classes=10, dropout_rate=0.3):
        super(AdvancedCNN, self).__init__()
        
        # 特征提取层 - 使用更深的网络
        self.features = nn.Sequential(
            # 第一组卷积层
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(dropout_rate/2),
            
            # 第二组卷积层
            nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(dropout_rate),
            
            # 第三组卷积层
            nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((4, 4))
        )
        
        # 分类器
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(256 * 4 * 4, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(512, num_classes)
        )
        
        # 权重初始化
        self._initialize_weights()
    
    def _initialize_weights(self):
        """更好的权重初始化"""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
    
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

class CIFAR10Dataset(Dataset):
    """CIFAR-10数据集包装器，支持数据增强"""
    def __init__(self, train=True, augment=True):
        self.train = train
        self.augment = augment
        
        # 定义数据变换
        if self.train and self.augment:
            self.transform = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
            ])
        else:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
            ])
        
        # 在Databricks中，您可能需要从DBFS加载数据
        # 这里使用torchvision的CIFAR-10作为示例
        self.dataset = datasets.CIFAR10(
            root='/tmp/cifar10', 
            train=self.train, 
            download=True, 
            transform=self.transform
        )
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        return self.dataset[idx]

class TrainingMonitor:
    """训练监控器"""
    def __init__(self, base_path):
        self.train_losses = []
        self.val_losses = []
        self.train_accs = []
        self.val_accs = []
        self.lr_history = []
        self.start_time = time.time()
        self.base_path = base_path
        
        # 保存训练日志
        self.log_file = f"{base_path}/logs/training_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(self.log_file, 'w') as f:
            f.write("Epoch,Train_Loss,Val_Loss,Train_Acc,Val_Acc,Learning_Rate\n")
    
    def update(self, epoch, train_loss, val_loss, train_acc, val_acc, lr):
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.train_accs.append(train_acc)
        self.val_accs.append(val_acc)
        self.lr_history.append(lr)
        
        # 记录到文件
        with open(self.log_file, 'a') as f:
            f.write(f"{epoch+1},{train_loss:.6f},{val_loss:.6f},{train_acc:.4f},{val_acc:.4f},{lr:.8f}\n")
    
    def print_epoch_stats(self, epoch, total_epochs):
        current_time = time.time()
        elapsed = current_time - self.start_time
        eta = elapsed / (epoch + 1) * (total_epochs - epoch - 1)
        
        print(f'Epoch [{epoch+1:03d}/{total_epochs:03d}] | '
              f'Train Loss: {self.train_losses[-1]:.4f} | '
              f'Val Loss: {self.val_losses[-1]:.4f} | '
              f'Train Acc: {self.train_accs[-1]:.2f}% | '
              f'Val Acc: {self.val_accs[-1]:.2f}% | '
              f'LR: {self.lr_history[-1]:.6f} | '
              f'ETA: {eta/60:.1f}min')

def setup_distributed_training():
    """设置分布式训练 - 修复重复初始化问题"""
    try:
        # 检查是否已经在分布式环境中
        if dist.is_initialized():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            local_rank = rank % torch.cuda.device_count() if torch.cuda.is_available() else 0
            print(f"分布式训练已初始化 - Rank: {rank}, World Size: {world_size}, Local Rank: {local_rank}")
            return True, rank, world_size, local_rank
        
        # 检查环境变量
        if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
            rank = int(os.environ['RANK'])
            world_size = int(os.environ['WORLD_SIZE'])
            local_rank = int(os.environ.get('LOCAL_RANK', rank % torch.cuda.device_count()))
            
            # 初始化分布式训练
            dist.init_process_group(backend='nccl', init_method='env://')
            torch.cuda.set_device(local_rank)
            
            print(f"成功初始化分布式训练 - Rank: {rank}, World Size: {world_size}, Local Rank: {local_rank}")
            return True, rank, world_size, local_rank
        
        # 单机训练
        print("使用单机训练模式")
        return False, 0, 1, 0
        
    except Exception as e:
        print(f"分布式训练设置失败: {e}，使用单机模式")
        return False, 0, 1, 0

def cleanup_distributed():
    """清理分布式训练资源"""
    try:
        if dist.is_initialized():
            dist.destroy_process_group()
            print("分布式训练资源已清理")
    except Exception as e:
        print(f"清理分布式训练资源时出错: {e}")

def train_epoch(model, dataloader, criterion, optimizer, device, scheduler_step=None):
    """训练一个epoch"""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        # 前向传播
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # 统计
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        
        # 每50个batch打印一次进度
        if batch_idx % 50 == 0:
            print(f'  Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}')
        
        # 动态学习率调整（如果使用step调度器）
        if scheduler_step is not None:
            scheduler_step.step()
    
    accuracy = 100. * correct / total
    avg_loss = running_loss / len(dataloader)
    
    return avg_loss, accuracy

def validate(model, dataloader, criterion, device):
    """验证模型"""
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    accuracy = 100. * correct / total
    avg_loss = running_loss / len(dataloader)
    
    return avg_loss, accuracy

def save_checkpoint(state, filename, base_path):
    """保存检查点，确保目录存在"""
    full_path = f"{base_path}/{filename}"
    directory = os.path.dirname(full_path)
    os.makedirs(directory, exist_ok=True)
    torch.save(state, full_path)
    print(f"模型已保存到: {full_path}")

def advanced_training_demo():
    """高级训练演示 - 专为Databricks GPU集群设计"""
    print("=== Databricks高级深度学习训练演示 ===")
    
    # 设置目录
    base_path = setup_directories()
    
    # 设置分布式训练
    is_distributed, rank, world_size, local_rank = setup_distributed_training()
    device = torch.device(f'cuda:{local_rank}' if torch.cuda.is_available() else 'cpu')
    
    print(f"训练设备: {device}")
    
    # 超参数配置 - 根据GPU数量调整
    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
    effective_batch_size = 256 * num_gpus  # 根据GPU数量缩放batch size
    
    config = {
        'batch_size': effective_batch_size,
        'num_epochs': 50,  # 减少epoch数用于演示
        'learning_rate': 0.1 * num_gpus,  # 根据GPU数量调整学习率
        'weight_decay': 1e-4,
        'momentum': 0.9,
        'dropout_rate': 0.3,
        'num_workers': 4,
        'pin_memory': True,
        'num_gpus': num_gpus
    }
    
    # 只在主进程中显示配置信息
    if not is_distributed or rank == 0:
        print(f"检测到 {num_gpus} 个GPU")
        print(f"有效batch size: {config['batch_size']}")
        print(f"学习率: {config['learning_rate']}")
    
    # 数据加载
    if not is_distributed or rank == 0:
        print("准备数据加载器...")
    
    train_dataset = CIFAR10Dataset(train=True, augment=True)
    val_dataset = CIFAR10Dataset(train=False, augment=False)
    
    if is_distributed:
        train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
        val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False)
    else:
        train_sampler = None
        val_sampler = None
    
    train_loader = DataLoader(
        train_dataset, 
        batch_size=config['batch_size'] // world_size if is_distributed else config['batch_size'],
        shuffle=(train_sampler is None),
        sampler=train_sampler,
        num_workers=config['num_workers'],
        pin_memory=config['pin_memory']
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'] // world_size if is_distributed else config['batch_size'],
        shuffle=False,
        sampler=val_sampler,
        num_workers=config['num_workers'],
        pin_memory=config['pin_memory']
    )
    
    # 创建模型
    if not is_distributed or rank == 0:
        print("初始化模型...")
    
    model = AdvancedCNN(num_classes=10, dropout_rate=config['dropout_rate'])
    model = model.to(device)
    
    # 分布式数据并行
    if is_distributed:
        model = DDP(model, device_ids=[local_rank])
    
    # 损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(), 
        lr=config['learning_rate'],
        momentum=config['momentum'],
        weight_decay=config['weight_decay']
    )
    
    # 学习率调度器
    scheduler_cosine = CosineAnnealingLR(optimizer, T_max=config['num_epochs'])
    scheduler_reduce = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
    
    # 训练监控（只在主进程）
    monitor = TrainingMonitor(base_path) if (not is_distributed or rank == 0) else None
    
    # 打印配置信息
    if not is_distributed or rank == 0:
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"模型总参数量: {total_params:,}")
        print(f"可训练参数量: {trainable_params:,}")
        print(f"训练样本数: {len(train_dataset):,}")
        print(f"验证样本数: {len(val_dataset):,}")
        print(f"开始训练 ({config['num_epochs']}个epochs)...")
        print("-" * 100)
    
    best_val_acc = 0.0
    
    # 训练循环
    for epoch in range(config['num_epochs']):
        if is_distributed:
            train_sampler.set_epoch(epoch)
        
        # 训练
        train_loss, train_acc = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        # 验证
        val_loss, val_acc = validate(model, val_loader, criterion, device)
        
        # 学习率调度
        current_lr = optimizer.param_groups[0]['lr']
        scheduler_cosine.step()
        scheduler_reduce.step(val_loss)
        
        # 只在主进程中更新监控器和保存模型
        if not is_distributed or rank == 0:
            if monitor is not None:
                monitor.update(epoch, train_loss, val_loss, train_acc, val_acc, current_lr)
                monitor.print_epoch_stats(epoch, config['num_epochs'])
            
            # 保存最佳模型
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                # 获取模型状态（如果是DDP，需要获取module）
                model_state_dict = model.module.state_dict() if is_distributed else model.state_dict()
                
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model_state_dict,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_acc': val_acc,
                    'config': config
                }
                save_checkpoint(checkpoint, 'models/best_model.pth', base_path)
                print(f"*** 新的最佳模型保存! 验证准确率: {val_acc:.2f}% ***")
            
            # 每10个epoch保存一次检查点
            if (epoch + 1) % 10 == 0:
                model_state_dict = model.module.state_dict() if is_distributed else model.state_dict()
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model_state_dict,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_cosine': scheduler_cosine.state_dict(),
                    'scheduler_reduce': scheduler_reduce.state_dict(),
                    'val_acc': val_acc,
                    'config': config
                }
                if monitor is not None:
                    checkpoint['training_history'] = {
                        'train_losses': monitor.train_losses,
                        'val_losses': monitor.val_losses,
                        'train_accs': monitor.train_accs,
                        'val_accs': monitor.val_accs
                    }
                save_checkpoint(checkpoint, f'checkpoints/checkpoint_epoch_{epoch+1}.pth', base_path)
    
    # 清理
    cleanup_distributed()
    
    # 最终统计（只在主进程）
    if not is_distributed or rank == 0:
        print("\n" + "="*80)
        print("=== 训练完成 ===")
        print(f"最佳验证准确率: {best_val_acc:.2f}%")
        
        # GPU内存使用统计
        if torch.cuda.is_available():
            print(f"峰值显存使用: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
        
        # 保存最终模型
        model_state_dict = model.module.state_dict() if is_distributed else model.state_dict()
        final_checkpoint = {
            'model_state_dict': model_state_dict,
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': best_val_acc,
            'config': config
        }
        if monitor is not None:
            final_checkpoint['training_history'] = {
                'train_losses': monitor.train_losses,
                'val_losses': monitor.val_losses,
                'train_accs': monitor.train_accs,
                'val_accs': monitor.val_accs
            }
        save_checkpoint(final_checkpoint, 'models/final_model.pth', base_path)
        
        # 保存训练统计图
        if monitor is not None:
            try:
                import matplotlib.pyplot as plt
                
                plt.figure(figsize=(15, 5))
                
                plt.subplot(1, 3, 1)
                plt.plot(monitor.train_losses, label='Train Loss')
                plt.plot(monitor.val_losses, label='Val Loss')
                plt.xlabel('Epoch')
                plt.ylabel('Loss')
                plt.legend()
                plt.title('Training and Validation Loss')
                
                plt.subplot(1, 3, 2)
                plt.plot(monitor.train_accs, label='Train Acc')
                plt.plot(monitor.val_accs, label='Val Acc')
                plt.xlabel('Epoch')
                plt.ylabel('Accuracy (%)')
                plt.legend()
                plt.title('Training and Validation Accuracy')
                
                plt.subplot(1, 3, 3)
                plt.plot(monitor.lr_history)
                plt.xlabel('Epoch')
                plt.ylabel('Learning Rate')
                plt.title('Learning Rate Schedule')
                
                plt.tight_layout()
                plt.savefig(f'{base_path}/logs/training_stats.png', dpi=300, bbox_inches='tight')
                plt.close()
                
                print(f"训练统计图已保存到: {base_path}/logs/training_stats.png")
            except ImportError:
                print("Matplotlib未安装，跳过绘图")
        
        print(f"所有文件保存在: {base_path}")

# 运行训练演示
if __name__ == "__main__":
    try:
        advanced_training_demo()
    except Exception as e:
        print(f"训练过程中出现错误: {e}")
        import traceback
        traceback.print_exc()

=== Databricks高级深度学习训练演示 ===
确保目录存在: /mnt/gpu_test
确保目录存在: /mnt/gpu_test/models
确保目录存在: /mnt/gpu_test/checkpoints
确保目录存在: /mnt/gpu_test/logs
分布式训练已初始化 - Rank: 0, World Size: 1, Local Rank: 0
训练设备: cuda:0
检测到 1 个GPU
有效batch size: 256
学习率: 0.1
准备数据加载器...
初始化模型...
模型总参数量: 3,250,122
可训练参数量: 3,250,122
训练样本数: 50,000
验证样本数: 10,000
开始训练 (50个epochs)...
----------------------------------------------------------------------------------------------------




  Batch [0/196], Loss: 2.3107
  Batch [50/196], Loss: 1.8993
  Batch [100/196], Loss: 1.7875
  Batch [150/196], Loss: 1.5688
Epoch [001/050] | Train Loss: 1.8461 | Val Loss: 1.4535 | Train Acc: 35.27% | Val Acc: 47.38% | LR: 0.100000 | ETA: 9.3min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 47.38% ***
  Batch [0/196], Loss: 1.5420
  Batch [50/196], Loss: 1.3715
  Batch [100/196], Loss: 1.3884
  Batch [150/196], Loss: 1.3731
Epoch [002/050] | Train Loss: 1.3935 | Val Loss: 1.0289 | Train Acc: 50.12% | Val Acc: 63.13% | LR: 0.099901 | ETA: 9.1min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 63.13% ***
  Batch [0/196], Loss: 1.2240
  Batch [50/196], Loss: 1.1372
  Batch [100/196], Loss: 1.1599
  Batch [150/196], Loss: 1.1919
Epoch [003/050] | Train Loss: 1.1492 | Val Loss: 0.8458 | Train Acc: 59.14% | Val Acc: 70.17% | LR: 0.099606 | ETA: 9.0min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 70.17% ***
  Batch [0/196], Loss: 1.055

### 手动autoscale 到2个gpu后但效率没有提高

In [0]:
#!/usr/bin/env python3
import torch

def get_gpu_count():
    """获取GPU数量"""
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"检测到 {gpu_count} 个GPU:")
        
        for i in range(gpu_count):
            gpu_name = torch.cuda.get_device_name(i)
            memory = torch.cuda.get_device_properties(i).total_memory / 1024**3  # GB
            print(f"  GPU {i}: {gpu_name} ({memory:.1f} GB)")
        
        return gpu_count
    else:
        print("未检测到GPU")
        return 0

if __name__ == "__main__":
    count = get_gpu_count()
    print(f"\n总GPU数量: {count}")

检测到 2 个GPU:
  GPU 0: Tesla V100-PCIE-16GB (15.8 GB)
  GPU 1: Tesla V100-PCIE-16GB (15.8 GB)

总GPU数量: 2


In [0]:

if __name__ == "__main__":
    try:
        advanced_training_demo()
    except Exception as e:
        print(f"训练过程中出现错误: {e}")
        import traceback
        traceback.print_exc()

=== Databricks高级深度学习训练演示 ===
确保目录存在: /mnt/gpu_test
确保目录存在: /mnt/gpu_test/models
确保目录存在: /mnt/gpu_test/checkpoints
确保目录存在: /mnt/gpu_test/logs
成功初始化分布式训练 - Rank: 0, World Size: 1, Local Rank: 0
训练设备: cuda:0
检测到 2 个GPU
有效batch size: 512
学习率: 0.2
准备数据加载器...


  0%|          | 0.00/170M [00:00<?, ?B/s]  0%|          | 32.8k/170M [00:00<34:12, 83.1kB/s]  0%|          | 98.3k/170M [00:00<15:17, 186kB/s]   0%|          | 229k/170M [00:00<07:51, 361kB/s]   0%|          | 459k/170M [00:01<04:37, 612kB/s]  1%|          | 918k/170M [00:01<02:26, 1.16MB/s]  1%|          | 1.41M/170M [00:01<01:47, 1.57MB/s]  1%|          | 1.93M/170M [00:01<01:28, 1.90MB/s]  1%|▏         | 2.46M/170M [00:01<01:23, 2.00MB/s]  2%|▏         | 3.05M/170M [00:02<01:13, 2.27MB/s]  2%|▏         | 3.64M/170M [00:02<01:07, 2.48MB/s]  3%|▎         | 4.29M/170M [00:02<01:01, 2.69MB/s]  3%|▎         | 4.95M/170M [00:02<01:01, 2.69MB/s]  3%|▎         | 5.64M/170M [00:02<00:56, 2.90MB/s]  4%|▎         | 6.39M/170M [00:03<00:52, 3.14MB/s]  4%|▍         | 7.14M/170M [00:03<00:49, 3.33MB/s]  5%|▍         | 7.96M/170M [00:03<00:47, 3.41MB/s]  5%|▌         | 8.81M/170M [00:03<00:44, 3.65MB/s]  6%|▌         | 9.70M/170M [00:03<00:41, 3.88MB/s]  6%|▌         | 10.6M/1

初始化模型...
模型总参数量: 3,250,122
可训练参数量: 3,250,122
训练样本数: 50,000
验证样本数: 10,000
开始训练 (50个epochs)...
----------------------------------------------------------------------------------------------------




  Batch [0/98], Loss: 2.3208
  Batch [50/98], Loss: 1.7476
Epoch [001/050] | Train Loss: 1.8286 | Val Loss: 1.4777 | Train Acc: 34.78% | Val Acc: 47.51% | LR: 0.200000 | ETA: 8.8min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 47.51% ***
  Batch [0/98], Loss: 1.5597
  Batch [50/98], Loss: 1.4537
Epoch [002/050] | Train Loss: 1.4509 | Val Loss: 1.1329 | Train Acc: 47.89% | Val Acc: 59.02% | LR: 0.199803 | ETA: 8.7min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 59.02% ***
  Batch [0/98], Loss: 1.3762
  Batch [50/98], Loss: 1.1641
Epoch [003/050] | Train Loss: 1.2135 | Val Loss: 0.9829 | Train Acc: 56.68% | Val Acc: 65.19% | LR: 0.199211 | ETA: 8.6min
模型已保存到: /mnt/gpu_test/models/best_model.pth
*** 新的最佳模型保存! 验证准确率: 65.19% ***
  Batch [0/98], Loss: 1.1200
  Batch [50/98], Loss: 1.0747
Epoch [004/050] | Train Loss: 1.0432 | Val Loss: 0.8253 | Train Acc: 62.92% | Val Acc: 71.29% | LR: 0.198229 | ETA: 8.4min
模型已保存到: /mnt/gpu_test/models/best_model.pth


