### **yolov1论文**

In [1]:
import torch
from torch import nn
from torchinfo import summary
import math

**一、搭建模型**

**backbone采用VGG架构**

In [2]:
def get_yolov1(num_classes = 20,num_bboxes = 2):
    """获取yolov1o模型"""
    return nn.Sequential(
        nn.Conv2d(3,64,kernel_size = 7,stride = 2,padding = 3),nn.LeakyReLU(),
        nn.MaxPool2d(2,2),                    # k = 2,s = 2的MaxPool2d层使图像分辨率减半
        nn.Conv2d(64,192,kernel_size = 3,padding = 1),nn.LeakyReLU(),
        nn.MaxPool2d(2,2),
        nn.Conv2d(192,128,1),nn.LeakyReLU(),
        nn.Conv2d(128,256,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(256,256,1),nn.LeakyReLU(),
        nn.Conv2d(256,512,3,padding = 1),nn.LeakyReLU(),
        nn.MaxPool2d(2,2),
        nn.Conv2d(512,256,1),nn.LeakyReLU(),
        nn.Conv2d(256,512,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(512,256,1),nn.LeakyReLU(),
        nn.Conv2d(256,512,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(512,256,1),nn.LeakyReLU(),
        nn.Conv2d(256,512,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(512,256,1),nn.LeakyReLU(),
        nn.Conv2d(256,512,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(512,512,1),nn.LeakyReLU(),
        nn.Conv2d(512,1024,3,padding = 1),nn.LeakyReLU(),
        nn.MaxPool2d(2,2),
        nn.Conv2d(1024,512,1),nn.LeakyReLU(),
        nn.Conv2d(512,1024,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(1024,512,1),nn.LeakyReLU(),
        nn.Conv2d(512,1024,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(1024,1024,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(1024,1024,3,stride = 2,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(1024,1024,3,padding = 1),nn.LeakyReLU(),
        nn.Conv2d(1024,1024,3,padding = 1),nn.LeakyReLU(),
        nn.Flatten(),nn.Linear(7 * 7 * 1024,4096),
        nn.Linear(4096,7 * 7 * (num_bboxes * 5 + num_classes))
    )

class yolov1(nn.Module):
    def __init__(self,num_classes = 20,num_bboxes = 2):
        super().__init__()
        self.B = num_bboxes
        self.C = num_classes
        self.layer = get_yolov1(self.C,self.B)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
        
    def forward(self,X):
        X = self.layer(X)
        X = X.reshape(X.shape[0],self.B * 5 + 
                      self.C,7,7)
        return X

In [3]:
net = yolov1()

In [4]:
input = torch.rand((1,3,448,448)) # 通道优先
y_pred = net(input).reshape(1,30,7,7)
# y_pred = net(input)
print(y_pred.shape)

torch.Size([1, 30, 7, 7])


In [5]:
print(summary(net,input_data = input))

Layer (type:depth-idx)                   Output Shape              Param #
yolov1                                   [1, 30, 7, 7]             --
├─Sequential: 1-1                        [1, 1470]                 --
│    └─Conv2d: 2-1                       [1, 64, 224, 224]         9,472
│    └─LeakyReLU: 2-2                    [1, 64, 224, 224]         --
│    └─MaxPool2d: 2-3                    [1, 64, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 192, 112, 112]        110,784
│    └─LeakyReLU: 2-5                    [1, 192, 112, 112]        --
│    └─MaxPool2d: 2-6                    [1, 192, 56, 56]          --
│    └─Conv2d: 2-7                       [1, 128, 56, 56]          24,704
│    └─LeakyReLU: 2-8                    [1, 128, 56, 56]          --
│    └─Conv2d: 2-9                       [1, 256, 56, 56]          295,168
│    └─LeakyReLU: 2-10                   [1, 256, 56, 56]          --
│    └─Conv2d: 2-11                      [1, 256, 56, 56]          6

**二、定义损失函数**

In [6]:
import torch.nn.functional as F
from torch.autograd import Variable

class Detect_Loss(nn.Module):

    def __init__(self, feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):

        super(Detect_Loss, self).__init__()

        self.S = feature_size
        self.B = num_bboxes
        self.C = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj


    def compute_iou(self, bbox1, bbox2):

        N = bbox1.size(0)
        M = bbox2.size(0)

        lt = torch.max(
        bbox1[:, :2].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, :2].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
            )

        rb = torch.min(
        bbox1[:, 2:].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
            )

        wh = rb - lt
        wh[wh < 0] = 0
        inter = wh[:, :, 0] * wh[:, :, 1] # [N, M]

        area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1]) # [N, ]
        area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1]) # [M, ]
        area1 = area1.unsqueeze(1).expand_as(inter) # [N, ] -> [N, 1] -> [N, M]
        area2 = area2.unsqueeze(0).expand_as(inter) # [M, ] -> [1, M] -> [N, M]

        union = area1 + area2 - inter # [N, M, 2]
        iou = inter / union # [N, M, 2]

        return iou

    def forward(self, pred_tensor, target_tensor):

        S, B, C = self.S, self.B, self.C
        N = 5 * B + C

        batch_size = pred_tensor.size(0)
        coord_mask = target_tensor[:, :, :, 4] > 0
        noobj_mask = target_tensor[:, :, :, 4] == 0

        coord_mask = coord_mask.unsqueeze(-1).expand_as(target_tensor)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target_tensor)

        coord_pred = pred_tensor[coord_mask].view(-1, N)

        bbox_pred = coord_pred[:, :5 * B].contiguous().view(-1,5)
        class_pred = coord_pred[:, 5 * B:]

        coord_target = target_tensor[coord_mask].view(-1,N)

        bbox_target = coord_target[:, :5 * B].contiguous().view(-1, 5)
        class_target = coord_target[:, 5 * B:]

        noobj_pred = pred_tensor[noobj_mask].view(-1,N)

        noobj_target = target_tensor[noobj_mask].view(-1,N)

        noobj_conf_mask = torch.cuda.BoolTensor(noobj_pred.size()).fill_(0)
        for b in range(B):
            noobj_conf_mask[:, 4 + b * 5] = 1
        noobj_pred_conf = noobj_pred[noobj_conf_mask]
        noobj_target_conf = noobj_target[noobj_conf_mask]
        loss_noobj = F.mse_loss(noobj_pred_conf, noobj_target_conf, reduction='sum')

        coord_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(0)
        coord_not_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(1)
        bbox_target_iou = torch.zeros(bbox_target.size()).cuda()

        for i in range(0, bbox_target.size(0), B):
            pred = bbox_pred[i:i + B]
            pred_xyxy = Variable(torch.FloatTensor(pred.size()))

            pred_xyxy[:, :2] = pred[:, :2] / float(S) - 0.5 * pred[:, 2:4]
            pred_xyxy[:, 2:4] = pred[:, :2] / float(S) + 0.5 * pred[:, 2:4]

            target = bbox_target[i].view(-1, 5)
            target_xyxy = Variable(torch.FloatTensor(target.size()))

            target_xyxy[:, :2] = target[:, :2] / float(S) - 0.5 * target[:, 2:4]
            target_xyxy[:, 2:4] = target[:, :2] / float(S) + 0.5 * target[:, 2:4]

            iou = self.compute_iou(pred_xyxy[:, :4], target_xyxy[:, :4])
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()

            coord_response_mask[i + max_index] = 1
            coord_not_response_mask[i+max_index] = 0

            bbox_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = (max_iou).data.cuda()
        bbox_target_iou = Variable(bbox_target_iou).cuda()

        bbox_pred_response = bbox_pred[coord_response_mask].view(-1, 5)
        bbox_target_response = bbox_target[coord_response_mask].view(-1,5)
        target_iou = bbox_target_iou[coord_response_mask].view(-1,5)
        loss_xy = F.mse_loss(bbox_pred_response[:, :2], bbox_target_response[:, :2], reduction='sum')
        loss_wh = F.mse_loss(torch.sqrt(bbox_pred_response[:, 2:4]), torch.sqrt(bbox_target_response[:, 2:4]),reduction='sum')
        loss_obj = F.mse_loss(bbox_pred_response[:, 4], target_iou[:, 4], reduction='sum')

        loss_class = F.mse_loss(class_pred, class_target, reduction='sum')

        loss = self.lambda_coord * (loss_xy + loss_wh) + loss_obj + self.lambda_noobj * loss_noobj + loss_class
        loss = loss / float(batch_size)

        return loss_xy

In [11]:
class YoloLoss(nn.Module):
    """定义一个为yolov1的损失函数"""

    def __init__(self,feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):
        super().__init__()
        self.S = feature_size
        self.B = num_bboxes
        self.C = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def compute_iou(self,bbox1,bbox2):
        """"
        计算两组边界框之间的交并比(IoU)
        
        参数:
        - bbox1: 形状为 [N, 4] 的边界框 (xmin, ymin, xmax, ymax)
        - bbox2: 形状为 [M, 4] 的边界框 (xmin, ymin, xmax, ymax)
        
        返回:
        - iou: 形状为 [N, M] 的 IoU 矩阵
        """


    def forward(self,pred,target):
        """
        计算 YOLOv1 损失
        
        参数:
        - pred_tensor: 模型预测的输出张量，形状为 [batch_size, S, S, B*5 + C]
        - target_tensor: 目标标签张量，形状与 pred_tensor 相同
        
        返回:
        - loss: 计算得到的损失值
        """
        # target/pred = (N,C,H,W) -> (N,H,W,C)
        target = target.permute(0,2,3,1)
        pred = pred.permute(0,2,3,1)

        # 设置临时参数，减少重复self引用
        S = self.S
        B = self.B
        C = self.C
        grid_size = 1.0 / S # 归一化的网格大小

        coord_mask = target[:,:,:,4] > 0
        noobj_mask = target[:,:,:,4] == 0
        coord_mask = coord_mask.expand_as(target)
        return None

In [12]:
loss1 = YoloLoss()
loss2 = Detect_Loss()

In [13]:
target = torch.randn((1,7,7,30)).to('cuda')
y_pred = torch.randn((1,7,7,30)).to('cuda')

In [14]:
loss_xy1 = loss1(y_pred,target)
loss_xy2 = loss2(y_pred,target)

RuntimeError: The expanded size of the tensor (7) must match the existing size (30) at non-singleton dimension 3.  Target sizes: [1, 7, 30, 7].  Tensor sizes: [1, 7, 30]

In [52]:
print(loss_xy1)
print(loss_xy2)

tensor(213.9551, device='cuda:0')
tensor(100.8497, device='cuda:0')


In [59]:
# batch_size = y_pred.size(0)
coord_mask = target[:, :, :, 4] > 0
noobj_mask = target[:, :, :, 4] == 0
coord_mask = coord_mask.unsqueeze(-1).expand_as(target)
noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target)
# 提取有目标位置的预测值和目标值
print(y_pred.shape)
print(y_pred[coord_mask].shape)
coord_pred = y_pred[coord_mask].view(-1, 20)
coord_target = target[coord_mask].view(-1, 20)
print(coord_pred.shape,coord_target.shape)

torch.Size([1, 7, 7, 30])
torch.Size([780])
torch.Size([39, 20]) torch.Size([39, 20])


In [58]:
bbox_pred = coord_pred[:, :5 * 2].contiguous().view(-1, 5)
print(bbox_pred.shape)

torch.Size([78, 5])


In [60]:
x = torch.rand((1,50,50,30))
mask = torch.randn((1,50,50,30)) > 0
y = x[mask]
print(y.shape)

torch.Size([37363])


In [62]:
x = torch.rand((2,2))
print(x.max(0))

torch.return_types.max(
values=tensor([0.9819, 0.9927]),
indices=tensor([1, 0]))
