In [1]:
# Run on GPU when True; later cells read this when choosing CUDA devices and the torch device.
use_gpu = True
# False -> sequential train/validation split; True -> torch random_split.
# NOTE(review): "ramdon" is a typo for "random"; the name is kept because a later cell reads it.
use_ramdon_split = False
# NOTE(review): looks intended to toggle nn.DataParallel, but no visible cell reads it — confirm.
use_dataparallel = True

In [2]:
# Detailed GPU diagnostics
import torch
import os
import subprocess
import sys

# Put the parent directory on the import path so the utils module can be imported
sys.path.insert(0, '..')

print("=== 详细GPU诊断 ===")
print(f"PyTorch版本: {torch.__version__}")


def _report_cuda_devices():
    """Print the device count and, per device, its name, memory, capability and SM count."""
    n_devices = torch.cuda.device_count()
    print(f"CUDA设备数量: {n_devices}")

    if n_devices <= 0:
        print("没有检测到CUDA设备")
        return

    print(f"当前CUDA设备: {torch.cuda.current_device()}")
    for dev_idx in range(n_devices):
        print(f"GPU {dev_idx}: {torch.cuda.get_device_name(dev_idx)}")
        dev_props = torch.cuda.get_device_properties(dev_idx)
        print(f"  总内存: {dev_props.total_memory / 1024**3:.1f} GB")
        print(f"  计算能力: {dev_props.major}.{dev_props.minor}")
        print(f"  多处理器数量: {dev_props.multi_processor_count}")


# Probe CUDA defensively: any failure is reported instead of aborting the cell
print("正在检查CUDA可用性...")
try:
    cuda_available = torch.cuda.is_available()
    print(f"CUDA可用: {cuda_available}")

    if not cuda_available:
        print("CUDA不可用")
    else:
        try:
            _report_cuda_devices()
        except Exception as e:
            print(f"获取CUDA设备信息时出错: {e}")

except Exception as e:
    print(f"检查CUDA时出错: {e}")
    for hint in ("可能的原因:",
                 "1. PyTorch安装有问题",
                 "2. CUDA驱动未安装",
                 "3. PyTorch版本与CUDA不兼容"):
        print(hint)

# Show the device mask from the environment, if any
visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '未设置')
print(f"\nCUDA_VISIBLE_DEVICES: {visible_devices}")

# Cross-check with the nvidia-smi command line tool
print("\n=== 使用nvidia-smi检查系统GPU ===")
try:
    smi = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10)
    if smi.returncode != 0:
        print(f"nvidia-smi 错误: {smi.stderr}")
    else:
        print("nvidia-smi 输出:")
        print(smi.stdout)
except Exception as e:
    print(f"nvidia-smi 执行失败: {e}")
    for hint in ("可能的原因:",
                 "1. nvidia-smi 未安装",
                 "2. NVIDIA驱动未安装",
                 "3. 没有NVIDIA GPU"):
        print(hint)

print("==================")


=== 详细GPU诊断 ===
PyTorch版本: 2.7.1+cu118
正在检查CUDA可用性...
CUDA可用: True
CUDA设备数量: 1
当前CUDA设备: 0
GPU 0: NVIDIA GeForce RTX 2080
  总内存: 8.0 GB
  计算能力: 7.5
  多处理器数量: 46

CUDA_VISIBLE_DEVICES: 未设置

=== 使用nvidia-smi检查系统GPU ===
nvidia-smi 输出:
Tue Oct  7 13:39:21 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 471.41       Driver Version: 471.41       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:2D:00.0 Off |                  N/A |
| 41%   46C    P8     6W / 225W |   2553MiB /  8192MiB |     21%      Default |
|                               |                      |                  N/A |
+---------------

In [3]:
# PyTorch installation check
print("=== PyTorch安装检查 ===")
try:
    import torch
    print(f"✓ PyTorch已安装，版本: {torch.__version__}")

    # Is CUDA usable from this process?
    if torch.cuda.is_available():
        print("✓ PyTorch支持CUDA")
    else:
        print("✗ PyTorch不支持CUDA或CUDA不可用")

    # Report the CUDA version of this PyTorch build.
    # torch.version.cuda exists but is None on CPU-only builds, so test the value
    # rather than the attribute's presence.
    if hasattr(torch, 'version'):
        print("✓ torch.version 模块存在")
        cuda_build = getattr(torch.version, 'cuda', None)
        if cuda_build is not None:
            print(f"✓ CUDA版本: {cuda_build}")
        else:
            print("✗ torch.version.cuda 不存在或为None（可能是CPU版本的PyTorch）")
    else:
        print("ℹ torch.version 模块不存在")

except ImportError:
    print("✗ PyTorch未安装")
except Exception as e:
    print(f"✗ PyTorch检查失败: {e}")

print("==================")

# Exercise the project's GPU helper functions
print("=== 测试GPU工具函数 ===")
try:
    import sys
    # Make the parent directory importable; skip if already present so that
    # re-running the cell does not keep growing sys.path.
    if '..' not in sys.path:
        sys.path.insert(0, '..')

    from utils.gpu_tools import query_gpu, select_gpu

    # Query GPU information
    gpu_info = query_gpu()
    print("query_gpu() 返回结果:")
    for i, line in enumerate(gpu_info):
        print(f"  GPU {i}: {line.strip()}")

    # Pick the GPU(s) to use
    selected_gpus = select_gpu(gpu_info)
    print(f"select_gpu() 返回结果: {selected_gpus}")

    if selected_gpus:
        print(f"将使用GPU: {selected_gpus}")
    else:
        print("没有选择到可用的GPU")

except Exception as e:
    print(f"GPU工具函数测试失败: {e}")
    import traceback
    traceback.print_exc()

print("==================")


=== PyTorch安装检查 ===
✓ PyTorch已安装，版本: 2.7.1+cu118
✓ PyTorch支持CUDA
✓ torch.version 模块存在
✓ CUDA版本: 11.8
=== 测试GPU工具函数 ===
query_gpu() 返回结果:
  GPU 0: 0, NVIDIA GeForce RTX 2080, 5638 MiB
select_gpu() 返回结果: [0]
将使用GPU: [0]


In [4]:
# GPU status check: availability, device count, and per-device name and memory
print("=== GPU状态检查 ===")
cuda_ok = torch.cuda.is_available()
print(f"CUDA可用: {cuda_ok}")
if not cuda_ok:
    print("没有可用的CUDA设备")
else:
    gpu_total = torch.cuda.device_count()
    print(f"GPU数量: {gpu_total}")
    for gpu_idx in range(gpu_total):
        print(f"GPU {gpu_idx}: {torch.cuda.get_device_name(gpu_idx)}")
        gpu_mem_gb = torch.cuda.get_device_properties(gpu_idx).total_memory / 1024**3
        print(f"  内存: {gpu_mem_gb:.1f} GB")
print("==================")


=== GPU状态检查 ===
CUDA可用: True
GPU数量: 1
GPU 0: NVIDIA GeForce RTX 2080
  内存: 8.0 GB


In [5]:
import os
import sys
# Put the parent directory first on the import path (the `utils` package is imported below).
sys.path.insert(0, '..')

if use_gpu:
    from utils.gpu_tools import *
    # Restrict this process to the GPU ids returned by select_gpu(query_gpu()).
    # NOTE(review): CUDA_VISIBLE_DEVICES normally only takes effect if set before CUDA is
    # initialised; earlier cells already call torch.cuda.* — confirm this still applies.
    # NOTE(review): an empty selection would set the variable to "" — confirm that is intended.
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([ str(obj) for obj in select_gpu(query_gpu())])

import time
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split



# Seed torch's global RNG so runs are repeatable.
torch.manual_seed(42)

# Image dimensions in pixels, looked up by the keys 5 / 20 / 60; the 20 entries are the
# ones used to reshape the "20d" image files when loading data.
# NOTE(review): the keys presumably denote the chart window length in days — confirm.
IMAGE_WIDTH = {5: 15, 20: 60, 60: 180}
IMAGE_HEIGHT = {5: 32, 20: 64, 60: 96}  

## load data

Here we use the 1993–2000 samples (`np.arange(1993, 2001)` stops before 2001) as the training and validation data; the remaining years are held out for testing.

In [None]:
# Years that make up the training + validation pool.
# np.arange excludes the stop value, so this is 1993..2000 inclusive.
year_list = np.arange(1993, 2001, 1)
data_dir = "../monthly_20d"

# Collect per-year pieces under their own names so `images` / `label_df` only ever
# refer to the final concatenated array / frame.
yearly_images = []
yearly_labels = []
for year in year_list:
    image_path = os.path.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_images.dat")
    label_path = os.path.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather")

    # Raw uint8 pixels, mapped read-only and viewed as (n_samples, height, width).
    year_images = np.memmap(image_path, dtype=np.uint8, mode='r').reshape(
        (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
    year_labels = pd.read_feather(label_path)

    # reshape(-1, ...) accepts any sample count, so a truncated or mismatched file
    # would otherwise silently misalign images and labels.
    if len(year_images) != len(year_labels):
        raise ValueError(
            f"{year}: {len(year_images)} images but {len(year_labels)} label rows")

    yearly_images.append(year_images)
    yearly_labels.append(year_labels)

images = np.concatenate(yearly_images)
label_df = pd.concat(yearly_labels)

print(images.shape)
print(label_df.shape)

(793019, 64, 60)
(793019, 8)


## build dataset

In [7]:
class MyDataset(Dataset):
    """In-memory dataset that pairs an array of images with an array of labels.

    The images are kept in their source dtype and converted to float32 only when an
    item is fetched. Converting the whole array up front (as ``torch.Tensor(img)``
    does) multiplies the resident memory of uint8 data by four.

    Args:
        img: array-like of images, indexed along the first axis.
        label: array-like of labels with one entry per image.
    """

    def __init__(self, img, label):
        # np.array() makes a private, writable copy, so read-only inputs such as
        # np.memmap(mode='r') can be wrapped by torch without a warning.
        self.img = torch.from_numpy(np.array(img))
        # Labels are small; store them as float32 exactly as torch.Tensor(label) did.
        self.label = torch.tensor(np.asarray(label), dtype=torch.float32)
        self.len = len(img)

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``, both as float32 tensors."""
        return self.img[idx].float(), self.label[idx]

Split method: a sequential (non-random) split is recommended.

In [8]:
# Fraction of samples that go to training; the remainder is used for validation.
train_val_ratio = 0.7
# Binary target: True where Ret_20d is positive.
# NOTE(review): rows with a missing Ret_20d compare as False and become class 0 — confirm.
binary_labels = (label_df.Ret_20d > 0).values

if not use_ramdon_split:
    # Sequential split: the first `train_val_ratio` of the rows train, the rest validate.
    split_idx = int(images.shape[0] * train_val_ratio)
    train_dataset = MyDataset(images[:split_idx], binary_labels[:split_idx])
    val_dataset = MyDataset(images[split_idx:], binary_labels[split_idx:])
else:
    # Random split with a fixed generator seed so the partition is repeatable.
    dataset = MyDataset(images, binary_labels)
    n_train = int(dataset.len * train_val_ratio)
    train_dataset, val_dataset = random_split(
        dataset,
        [n_train, dataset.len - n_train],
        generator=torch.Generator().manual_seed(42))
    del dataset

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=256, shuffle=False, pin_memory=True)

## models

In [9]:
def init_weights(m):
    """Initialise one module in place; intended for use with ``net.apply``.

    ``nn.Linear``: Xavier-uniform weight and, when a bias exists, zero bias.
    ``nn.Conv2d``: Xavier-uniform weight; the bias is left untouched.
    Any other module type is ignored.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Linear(bias=False) has m.bias set to None; skip it instead of raising.
        if m.bias is not None:
            m.bias.data.fill_(0.)
    elif isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)

In [10]:
from models import baseline

# Use CUDA only when it was requested AND is usable; otherwise .to('cuda') below would
# raise before any later fallback logic gets a chance to run.
if use_gpu and not torch.cuda.is_available():
    print("警告: CUDA不可用，将使用CPU")
device = 'cuda' if use_gpu and torch.cuda.is_available() else 'cpu'
export_onnx = True
net = baseline.Net().to(device)
net.apply(init_weights)

if export_onnx:
    import torch.onnx
    # Dummy single-sample input (batch, channel, height, width) sized from the 20-key
    # entries of the image tables instead of repeating the literals 64 and 60.
    x = torch.randn([1, 1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]]).to(device)
    torch.onnx.export(net,                        # model to export
                      x,                          # example input used for the export
                      "../cnn_baseline.onnx",     # output path
                      export_params=False,        # do not store parameter values in the file
                      opset_version=10,           # ONNX opset to target
                      do_constant_folding=False,  # skip the constant-folding optimisation
                      input_names=['input_images'],   # name of the graph input
                      output_names=['output_prob'],   # name of the graph output
                      dynamic_axes={'input_images': {0: 'batch_size'},   # axis 0 is variable
                                    'output_prob': {0: 'batch_size'}})


### Profiling

In [11]:
# List every named parameter with its shape, then report the total element count.
param_table = [(pname, tensor.size(), tensor.numel())
               for pname, tensor in net.named_parameters()]
for pname, pshape, pcount in param_table:
    print(pname, ':', pshape)
count = sum(pcount for pname, pshape, pcount in param_table)
print('total_parameters : {}'.format(count))

layer1.0.weight : torch.Size([64, 1, 5, 3])
layer1.0.bias : torch.Size([64])
layer1.1.weight : torch.Size([64])
layer1.1.bias : torch.Size([64])
layer2.0.weight : torch.Size([128, 64, 5, 3])
layer2.0.bias : torch.Size([128])
layer2.1.weight : torch.Size([128])
layer2.1.bias : torch.Size([128])
layer3.0.weight : torch.Size([256, 128, 5, 3])
layer3.0.bias : torch.Size([256])
layer3.1.weight : torch.Size([256])
layer3.1.bias : torch.Size([256])
fc1.1.weight : torch.Size([2, 46080])
fc1.1.bias : torch.Size([2])
total_parameters : 708866


In [12]:
from thop import profile as thop_profile

# The profiling input is one full training batch, so the operation count printed below
# is for that whole batch rather than for a single image.
profile_batch = next(iter(train_dataloader))[0].to(device)
flops, params = thop_profile(net, inputs=(profile_batch,))
print(f'FLOPs = {flops/1000**3}G')
print(f'Params = {params/1000**2}M')

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register count_relu() for <class 'torch.nn.modules.activation.LeakyReLU'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register count_softmax() for <class 'torch.nn.modules.activation.Softmax'>.
FLOPs = 36.21961728G
Params = 0.708866M


In [13]:
from torch.profiler import profile, record_function, ProfilerActivity

inputs = next(iter(train_dataloader))[0].to(device)

# Record CUDA activity only when the model actually runs on a CUDA device.
activities = [ProfilerActivity.CPU]
on_cuda = torch.cuda.is_available() and str(device).startswith('cuda')
if on_cuda:
    activities.append(ProfilerActivity.CUDA)

# Profile a true inference pass: eval mode and no autograd, so the forward pass does not
# run train-mode layers (dropout, batch-norm statistic updates) or build a graph.
# The previous train/eval mode is restored afterwards.
was_training = net.training
net.eval()
try:
    with torch.no_grad(), profile(activities=activities, record_shapes=True) as prof:
        with record_function("model_inference"):
            net(inputs)
finally:
    net.train(was_training)

prof.export_chrome_trace("../trace.json")
sort_key = "cuda_time_total" if on_cuda else "cpu_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us     146.711ms       906.39%     146.711ms     146.711ms             1  
                                        model_inference        22.20%      46.764ms        94.69%     199.437ms     199.437ms       0.000us         0.00%      16.186ms      16.186ms             1  
         

## train

In [None]:
def train_loop(dataloader, net, loss_fn, optimizer):
    """Run one training pass over ``dataloader`` and return the running mean loss.

    Each batch is moved to the global ``device``, pushed through ``net``, and used for
    one optimiser step. The running loss is a sample-weighted mean of the per-batch
    losses and is shown on the progress bar.

    Args:
        dataloader: iterable yielding ``(X, y)`` batches.
        net: model to train; switched to train mode.
        loss_fn: callable ``loss_fn(prediction, target)``; targets are cast with ``.long()``.
        optimizer: optimiser bound to ``net``'s parameters.

    Returns:
        The sample-weighted mean loss over all batches (0.0 if there are none).
    """
    net.train()
    seen = 0
    running_loss = 0.0

    with tqdm(dataloader) as progress:
        for X, y in progress:
            X, y = X.to(device), y.to(device)
            batch_size = len(X)

            loss = loss_fn(net(X), y.long())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Fold this batch into the mean, weighting by the number of samples.
            running_loss = (batch_size * loss.item() + running_loss * seen) / (batch_size + seen)
            seen += batch_size
            progress.set_postfix({'running_loss': running_loss})

    return running_loss

In [15]:
def val_loop(dataloader, net, loss_fn):
    """Evaluate ``net`` on ``dataloader`` and return the mean loss per sample.

    Runs in eval mode with gradients disabled. The result is the sample-weighted mean
    of the per-batch losses, computed with the same running-average update as
    ``train_loop`` so that training and validation losses are comparable.

    Args:
        dataloader: iterable yielding ``(X, y)`` batches.
        net: model to evaluate; switched to eval mode.
        loss_fn: callable ``loss_fn(prediction, target)``; targets are cast with ``.long()``.

    Returns:
        The sample-weighted mean loss over all batches (0.0 if there are none).
    """
    running_loss = 0.0
    current = 0
    net.eval()

    with torch.no_grad():
        with tqdm(dataloader) as t:
            for batch, (X, y) in enumerate(t):
                X = X.to(device)
                y = y.to(device)
                y_pred = net(X)
                loss = loss_fn(y_pred, y.long())

                # New batch weighted by its size, previous mean weighted by the samples
                # already seen. (Previously the loss was also added beforehand and the
                # two weights were swapped, which did not yield a mean.)
                n_batch = len(X)
                running_loss = (n_batch * loss.item() + running_loss * current) / (n_batch + current)
                current += n_batch
                t.set_postfix({'running_loss': running_loss})

    return running_loss

In [16]:
# net = torch.load('/home/clidg/proj_2/pt/baseline_epoch_10_train_0.6865865240322523_eval_0.686580_.pt')

In [17]:
# Check GPU availability and optionally wrap the model in DataParallel.
# use_gpu / use_dataparallel come from the configuration cell; forcing use_gpu to True
# here would silently override that configuration.
available_gpus = 0
if use_gpu:
    if not torch.cuda.is_available():
        print("警告: CUDA不可用，将使用CPU")
        use_gpu = False
    else:
        # Number of GPUs visible to this process
        available_gpus = torch.cuda.device_count()
        print(f"检测到 {available_gpus} 个GPU")
        if available_gpus == 0:
            print("警告: 没有可用的GPU，将使用CPU")
            use_gpu = False

if not use_gpu:
    device = 'cpu'

# Unwrap first so that re-running this cell never nests DataParallel wrappers.
if isinstance(net, nn.DataParallel):
    net = net.module
net = net.to(device)

if use_gpu:
    if available_gpus > 1 and use_dataparallel:
        print(f"使用DataParallel，GPU数量: {available_gpus}")
        net = nn.DataParallel(net)
    elif available_gpus > 1:
        print("use_dataparallel=False，不使用DataParallel")
    else:
        print("只有一个GPU可用，不使用DataParallel")

检测到 1 个GPU
只有一个GPU可用，不使用DataParallel


In [18]:
from torch.utils.tensorboard import SummaryWriter

# Objective and optimiser for the model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-5)

# Bookkeeping read by the training cell: first epoch index, early-stopping patience
# (in epochs), and the epoch / value of the lowest validation loss so far.
start_epoch = 0
early_stopping_epoch = 5
last_min_ind = -1
min_val_loss = 1e9

# TensorBoard writer using its default log directory.
tb = SummaryWriter()

In [None]:
# One checkpoint directory per run, named by the start timestamp.
start_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
# Build the path portably (a literal '..\\pt' only works on Windows) and create missing
# parent directories as well.
checkpoint_dir = os.path.join('..', 'pt', start_time)
os.makedirs(checkpoint_dir, exist_ok=True)

epochs = 100
for t in range(start_epoch, epochs):
    print(f"Epoch {t}\n-------------------------------")
    # Short pause kept from the original; presumably lets the header print before the
    # progress bar is drawn.
    time.sleep(0.2)
    train_loss = train_loop(train_dataloader, net, loss_fn, optimizer)
    val_loss = val_loop(val_dataloader, net, loss_fn)

    # The losses are single numbers per epoch, so log them as scalars, not histograms.
    tb.add_scalar("train_loss", train_loss, t)
    tb.add_scalar("val_loss", val_loss, t)

    # Save the whole model object after every epoch; the losses are part of the file name.
    checkpoint_name = 'baseline_epoch_{}_train_{:5f}_val_{:5f}.pt'.format(t, train_loss, val_loss)
    torch.save(net, os.path.join(checkpoint_dir, checkpoint_name))

    # Early stopping: remember the best epoch and stop once validation loss has not
    # improved for `early_stopping_epoch` epochs.
    if val_loss < min_val_loss:
        last_min_ind = t
        min_val_loss = val_loss
    elif t - last_min_ind >= early_stopping_epoch:
        break

# Make sure buffered TensorBoard events reach disk.
tb.flush()

print('Done!')
print('Best epoch: {}, val_loss: {}'.format(last_min_ind, min_val_loss))

Epoch 0
-------------------------------


100%|██████████| 4337/4337 [8:27:41<00:00,  7.02s/it, running_loss=0.774]       
  0%|          | 0/930 [00:00<?, ?it/s]