In [None]:
# This must be set before torch is imported, otherwise it has no effect on device discovery
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1,3" # after this, only physical GPUs 1 and 3 are visible to this process

In [2]:
import torch
from torchinfo import summary
from torch import nn
from torch import optim
import torchvision
import math

In [3]:
print(torch.cuda.device_count()) # success: only the GPUs left visible by CUDA_VISIBLE_DEVICES are counted

2


**一、估计YOLO_V1的显存**

In [2]:
# Index 1 refers to the *visible* device list, not the physical GPU id.
# NOTE(review): with CUDA_VISIBLE_DEVICES = "1,3" this is presumably physical GPU 3 — confirm.
device = torch.device("cuda:1")

In [3]:
def get_yolov1(num_classes = 20,num_bboxes = 2):
    """Build the YOLOv1 network as one flat ``nn.Sequential``.

    The backbone is a stack of Conv2d+ReLU pairs interleaved with 2x2
    max-pooling, followed by Flatten and two fully connected layers, the
    last of which is passed through a Sigmoid.

    The first Linear layer expects ``7 * 7 * 1024`` features. The network
    downsamples by a factor of 64 overall (two stride-2 convolutions and
    four 2x2 poolings), so this matches a 448x448 input.

    Args:
        num_classes: number of classes predicted for each of the 7x7 cells.
        num_bboxes: number of boxes per cell; each contributes 5 values.

    Returns:
        nn.Sequential whose output has shape
        ``(N, 7 * 7 * (num_bboxes * 5 + num_classes))``.
    """
    def conv_relu(c_in,c_out,k,stride = 1,padding = 0):
        # One convolution followed by its ReLU activation.
        conv = nn.Conv2d(c_in,c_out,kernel_size = k,stride = stride,padding = padding)
        return [conv,nn.ReLU()]

    def halve():
        # MaxPool2d with k = 2, s = 2 halves the spatial resolution.
        return [nn.MaxPool2d(2,2)]

    modules = []

    # Stem
    modules += conv_relu(3,64,7,stride = 2,padding = 3)
    modules += halve()
    modules += conv_relu(64,192,3,padding = 1)
    modules += halve()

    # 1x1 bottlenecks alternating with 3x3 convolutions
    modules += conv_relu(192,128,1)
    modules += conv_relu(128,256,3,padding = 1)
    modules += conv_relu(256,256,1)
    modules += conv_relu(256,512,3,padding = 1)
    modules += halve()

    # Four repeated (1x1 reduce, 3x3 expand) pairs at 512 channels
    for _ in range(4):
        modules += conv_relu(512,256,1)
        modules += conv_relu(256,512,3,padding = 1)
    modules += conv_relu(512,512,1)
    modules += conv_relu(512,1024,3,padding = 1)
    modules += halve()

    # Two repeated (1x1 reduce, 3x3 expand) pairs at 1024 channels
    for _ in range(2):
        modules += conv_relu(1024,512,1)
        modules += conv_relu(512,1024,3,padding = 1)

    # Final 3x3 convolutions; the second one has stride 2
    modules += conv_relu(1024,1024,3,padding = 1)
    modules += conv_relu(1024,1024,3,stride = 2,padding = 1)
    modules += conv_relu(1024,1024,3,padding = 1)
    modules += conv_relu(1024,1024,3,padding = 1)

    # Detection head
    out_features = 7 * 7 * (num_bboxes * 5 + num_classes)
    modules += [
        nn.Flatten(),
        nn.Linear(7 * 7 * 1024,4096),
        nn.ReLU(),
        nn.Linear(4096,out_features),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*modules)

class Yolov1(nn.Module):
    """YOLOv1 wrapper around the sequential network built by ``get_yolov1``.

    Attributes are kept as ``B`` (boxes per cell) and ``C`` (number of
    classes); the network itself is stored in ``layer``.
    """

    def __init__(self,num_classes = 20,num_bboxes = 2):
        """Build the network and apply the custom weight initialisation.

        Args:
            num_classes: stored as ``self.C``.
            num_bboxes: stored as ``self.B``.
        """
        super().__init__()
        self.B = num_bboxes
        self.C = num_classes
        self.layer = get_yolov1(self.C,self.B)
        self._init_weights()

    def _init_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear submodule in place.

        * Conv2d: weights ~ N(0, sqrt(2 / (kh * kw * out_channels))), bias 0.
        * BatchNorm2d: weight 1, bias 0.
        * Linear: weights ~ N(0, 0.01), bias 0.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                k_h, k_w = module.kernel_size
                fan_out = k_h * k_w * module.out_channels
                nn.init.normal_(module.weight, 0, math.sqrt(2. / fan_out))
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def forward(self,X):
        """Run the network and reshape the flat prediction.

        Args:
            X: input batch passed straight to ``self.layer``.

        Returns:
            Tensor of shape ``(N, B * 5 + C, 7, 7)`` where ``N`` is the
            leading dimension of the network output.
        """
        flat = self.layer(X)
        per_cell = self.B * 5 + self.C
        return flat.reshape(flat.shape[0],per_cell,7,7)

In [4]:
# Instantiate YOLOv1 with its defaults (num_classes = 20, num_bboxes = 2)
model = Yolov1()

In [5]:
# Print the weight dtype of every convolutional / fully connected layer, in network order
for sub_layer in model.layer:
    if isinstance(sub_layer,(nn.Conv2d,nn.Linear)):
        print(sub_layer.weight.dtype)

torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32


In [6]:
# Summarise the model on the CPU using a random batch of 64 RGB images of size 448x448
model.cpu()
batch_size = 64
H = W = 448
# NOTE(review): `input` shadows the Python builtin; the name is kept because later cells may use it
input = torch.rand(batch_size,3,H,W)
print(summary(model,input_data = input))

Layer (type:depth-idx)                   Output Shape              Param #
Yolov1                                   [64, 30, 7, 7]            --
├─Sequential: 1-1                        [64, 1470]                --
│    └─Conv2d: 2-1                       [64, 64, 224, 224]        9,472
│    └─ReLU: 2-2                         [64, 64, 224, 224]        --
│    └─MaxPool2d: 2-3                    [64, 64, 112, 112]        --
│    └─Conv2d: 2-4                       [64, 192, 112, 112]       110,784
│    └─ReLU: 2-5                         [64, 192, 112, 112]       --
│    └─MaxPool2d: 2-6                    [64, 192, 56, 56]         --
│    └─Conv2d: 2-7                       [64, 128, 56, 56]         24,704
│    └─ReLU: 2-8                         [64, 128, 56, 56]         --
│    └─Conv2d: 2-9                       [64, 256, 56, 56]         295,168
│    └─ReLU: 2-10                        [64, 256, 56, 56]         --
│    └─Conv2d: 2-11                      [64, 256, 56, 56]         6

In [53]:
# Move the model to `device`; as the last expression of the cell, the returned module is also displayed
model.to(device)

Yolov1(
  (layer): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))
    (7): ReLU()
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (11): ReLU()
    (12): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (16): ReLU()
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU()
    (19): Conv2d

**二、估计Qwen2.5VL-3B的显存**

In [4]:
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoTokenizer,
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
)

In [5]:
# Load the tokenizer and the multimodal processor for Qwen2.5-VL-3B-Instruct.
# NOTE(review): `model_dir` is a relative path, presumably a local checkpoint directory — confirm.
model_dir = "pretrained/Qwen2.5-VL-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_dir,use_fast = True)
processor = AutoProcessor.from_pretrained(model_dir,use_fast = True)
# Backward-compatible alias: the variable was originally misspelled (three "s"),
# so any cell that still refers to `processsor` keeps working.
processsor = processor

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [6]:

# Load the Qwen2.5-VL checkpoint from `model_dir` with bfloat16 weights and the
# FlashAttention-2 attention implementation, placing the whole model on device index 0.
# NOTE(review): this rebinds `model`, which an earlier cell bound to the Yolov1 instance.
# NOTE(review): index 0 is relative to CUDA_VISIBLE_DEVICES, so presumably physical GPU 1 — confirm.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype = torch.bfloat16,
    attn_implementation="flash_attention_2",
    # device_map = "auto"   (alternative placement, left disabled)
    device_map = 0,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]