In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
class Detect_Loss(nn.Module):
    def __init__(self, feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):
        """
        初始化 YOLOv1 损失函数
        
        参数:
        - feature_size: 特征图尺寸 (默认7x7)
        - num_bboxes: 每个网格预测的边界框数量 (默认2)
        - num_classes: 类别数量 (默认20)
        - lambda_coord: 坐标损失的权重系数 (默认5.0)
        - lambda_noobj: 无目标置信度损失的权重系数 (默认0.5)
        """
        super(Detect_Loss, self).__init__()
        
        self.S = feature_size       # 特征图大小 (SxS 网格)
        self.B = num_bboxes         # 每个网格预测的边界框数量
        self.C = num_classes        # 类别数量
        self.lambda_coord = lambda_coord  # 坐标损失权重
        self.lambda_noobj = lambda_noobj  # 无目标置信度损失权重

    def compute_iou(self, bbox1, bbox2):
        """
        计算两组边界框之间的交并比(IoU)
        
        参数:
        - bbox1: 形状为 [N, 4] 的边界框 (xmin, ymin, xmax, ymax)
        - bbox2: 形状为 [M, 4] 的边界框 (xmin, ymin, xmax, ymax)
        
        返回:
        - iou: 形状为 [N, M] 的 IoU 矩阵
        """
        # 获取边界框数量
        N = bbox1.size(0)
        M = bbox2.size(0)
        
        # 计算交集的左上角坐标 (left-top)
        lt = torch.max(
            bbox1[:, :2].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, :2].unsqueeze(0).expand(N, M, 2)   # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        
        # 计算交集的右下角坐标 (right-bottom)
        rb = torch.min(
            bbox1[:, 2:].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)    # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        
        # 计算交集的宽高
        wh = rb - lt
        wh[wh < 0] = 0  # 处理无重叠的情况
        inter = wh[:, :, 0] * wh[:, :, 1]  # 交集面积 [N, M]
        
        # 计算两个边界框各自的面积
        area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1])  # [N, ]
        area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1])  # [M, ]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N, ] -> [N, 1] -> [N, M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M, ] -> [1, M] -> [N, M]
        # print(area1)
        # print(area2)
        
        # 计算并集面积
        union = area1 + area2 - inter  # [N, M]

        # print(inter)
        # print(union)
        
        # 计算 IoU
        iou = inter / union  # [N, M]
        
        return iou

    def forward(self, pred_tensor, target_tensor):
        """
        计算 YOLOv1 损失
        
        参数:
        - pred_tensor: 模型预测的输出张量，形状为 [batch_size, S, S, B*5 + C]
        - target_tensor: 目标标签张量，形状与 pred_tensor 相同
        
        返回:
        - loss: 计算得到的损失值
        """
        # 获取参数
        S, B, C = self.S, self.B, self.C
        N = 5 * B + C  # 每个网格的预测值总数
        
        batch_size = pred_tensor.size(0)
        
        # 创建目标存在和不存在的位置掩码
        # target_tensor[:, :, :, 4] 是第一个边界框的置信度
        coord_mask = target_tensor[:, :, :, 4] > 0   # 有目标的网格位置
        noobj_mask = target_tensor[:, :, :, 4] == 0  # 无目标的网格位置
        
        # 扩展掩码维度以匹配目标张量
        coord_mask = coord_mask.unsqueeze(-1).expand_as(target_tensor)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target_tensor)
        
        # 提取有目标位置的预测值和目标值
        coord_pred = pred_tensor[coord_mask].view(-1, N)
        coord_target = target_tensor[coord_mask].view(-1, N)
        
        # 分割边界框预测和类别预测
        bbox_pred = coord_pred[:, :5 * B].contiguous().view(-1, 5)
        class_pred = coord_pred[:, 5 * B:]
        
        # 分割边界框目标和类别目标
        bbox_target = coord_target[:, :5 * B].contiguous().view(-1, 5)
        class_target = coord_target[:, 5 * B:]
        
        # 提取无目标位置的预测值和目标值
        noobj_pred = pred_tensor[noobj_mask].view(-1, N)
        noobj_target = target_tensor[noobj_mask].view(-1, N)
        
        # 处理无目标位置的置信度损失
        # 创建掩码以选择所有边界框的置信度 (位置 4, 9, ...)
        noobj_conf_mask = torch.BoolTensor(noobj_pred.size()).fill_(0)
        for b in range(B):
            noobj_conf_mask[:, 4 + b * 5] = 1  # 设置每个边界框的置信度位置
            
        # 提取无目标位置的置信度预测值和目标值
        noobj_pred_conf = noobj_pred[noobj_conf_mask]
        noobj_target_conf = noobj_target[noobj_conf_mask]
        
        # 计算无目标的置信度损失
        loss_noobj = F.mse_loss(noobj_pred_conf, noobj_target_conf, reduction='sum')
        
        # 初始化响应掩码
        coord_response_mask = torch.BoolTensor(bbox_target.size()).fill_(0)
        coord_not_response_mask = torch.BoolTensor(bbox_target.size()).fill_(1)
        bbox_target_iou = torch.zeros(bbox_target.size())
        
        # 遍历每个有目标的网格
        for i in range(0, bbox_target.size(0), B):
            # 获取当前网格的 B 个预测边界框
            pred = bbox_pred[i:i + B]
            
            # 将预测边界框转换为 (xmin, ymin, xmax, ymax) 格式
            pred_xyxy = Variable(torch.FloatTensor(pred.size()))
            pred_xyxy[:, :2] = pred[:, :2] / float(S) - 0.5 * pred[:, 2:4]  # 左上角
            pred_xyxy[:, 2:4] = pred[:, :2] / float(S) + 0.5 * pred[:, 2:4]  # 右下角
            
            # 获取当前网格的目标边界框
            target = bbox_target[i].view(-1, 5)
            
            # 将目标边界框转换为 (xmin, ymin, xmax, ymax) 格式
            target_xyxy = Variable(torch.FloatTensor(target.size()))
            target_xyxy[:, :2] = target[:, :2] / float(S) - 0.5 * target[:, 2:4]  # 左上角
            target_xyxy[:, 2:4] = target[:, :2] / float(S) + 0.5 * target[:, 2:4]  # 右下角
            
            # 计算预测边界框与目标边界框的 IoU
            iou = self.compute_iou(pred_xyxy[:, :4], target_xyxy[:, :4])
            max_iou, max_index = iou.max(0)  # 找出最大 IoU 及其索引
            max_index = max_index.data
            
            # 标记负责预测目标的边界框
            coord_response_mask[i + max_index] = 1
            coord_not_response_mask[i + max_index] = 0
            
            # 将最大 IoU 作为置信度目标
            bbox_target_iou[i + max_index, torch.LongTensor([4])] = max_iou.data
        
        bbox_target_iou = Variable(bbox_target_iou)
        
        # 提取负责预测目标的边界框
        bbox_pred_response = bbox_pred[coord_response_mask].view(-1, 5)
        bbox_target_response = bbox_target[coord_response_mask].view(-1, 5)
        target_iou = bbox_target_iou[coord_response_mask].view(-1, 5)
        
        # 计算坐标损失 (中心点坐标)
        loss_xy = F.mse_loss(bbox_pred_response[:, :2], bbox_target_response[:, :2], reduction='sum')
        
        # 计算宽高损失 (使用平方根以平衡大小目标)
        loss_wh = F.mse_loss(
            torch.sqrt(bbox_pred_response[:, 2:4]), 
            torch.sqrt(bbox_target_response[:, 2:4]),
            reduction='sum'
        )
        
        # 计算有目标的置信度损失
        loss_obj = F.mse_loss(bbox_pred_response[:, 4], target_iou[:, 4], reduction='sum')
        
        # 计算类别损失
        loss_class = F.mse_loss(class_pred, class_target, reduction='sum')

        print(loss_xy,loss_wh,loss_class,loss_obj,loss_noobj)
        
        # 计算总损失
        loss = (
            self.lambda_coord * (loss_xy + loss_wh) +  # 坐标损失
            loss_obj +                                # 有目标的置信度损失
            self.lambda_noobj * loss_noobj +           # 无目标的置信度损失
            loss_class                                # 类别损失
        )

        print(target_iou[:,4])
        
        # 平均损失
        loss = loss / float(batch_size)
        
        return loss

In [3]:
class YoloLoss(nn.Module):
    """定义一个为yolov1的损失函数"""

    def __init__(self,feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):
        super().__init__()
        self.S = feature_size
        self.B = num_bboxes
        self.C = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def compute_iou(self,bbox1,bbox2):
        """"
        计算两组边界框之间的交并比(IoU)
        
        参数:
        - bbox1: 形状为 [N, 4] 的边界框 (xmin, ymin, xmax, ymax)
        - bbox2: 形状为 [M, 4] 的边界框 (xmin, ymin, xmax, ymax)
        
        返回:
        - iou: 形状为 [N, M] 的 IoU 矩阵
        """
        # 获取边界框数量
        N = bbox1.size(0)
        M = bbox2.size(0)
        
        # 计算交集的左上角坐标 (left-top)
        lt = torch.max(
            bbox1[:, :2].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, :2].unsqueeze(0).expand(N, M, 2)   # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        
        # 计算交集的右下角坐标 (right-bottom)
        rb = torch.min(
            bbox1[:, 2:].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)    # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        
        # 计算交集的宽高
        wh = rb - lt
        wh[wh < 0] = 0  # 处理无重叠的情况
        inter = wh[:, :, 0] * wh[:, :, 1]  # 交集面积 [N, M]
        
        # 计算两个边界框各自的面积
        area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1])  # [N, ]
        area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1])  # [M, ]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N, ] -> [N, 1] -> [N, M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M, ] -> [1, M] -> [N, M]
        
        # 计算并集面积
        union = area1 + area2 - inter  # [N, M]
        
        # 计算 IoU
        iou = inter / union  # [N, M]
        
        return iou


    def forward(self,pred:torch.Tensor,target:torch.Tensor):
        """
        计算 YOLOv1 损失
        
        参数:
        - pred_tensor: 模型预测的输出张量，形状为 [batch_size, S, S, B*5 + C]
        - target_tensor: 目标标签张量，形状与 pred_tensor 相同
        
        返回:
        - loss: 计算得到的损失值
        """
        # target/pred = (N,C,H,W) -> (N,H,W,C)
        target = target.permute(0,2,3,1)
        pred = pred.permute(0,2,3,1)
        batch_size = pred.shape[0]

        # 设置临时参数，减少重复self引用
        S = self.S
        B = self.B
        C = self.C
        grid_size = 1.0 / S # 归一化的网格大小

        # 设置有目标的mask和没目标的mask
        coord_mask = target[:,:,:,4] > 0
        noobj_mask = target[:,:,:,4] == 0
        coord_mask = coord_mask.unsqueeze(-1).expand_as(target)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target)

        # 提取有目标的pred和没目标的pred
        coord_pred = pred[coord_mask].reshape(-1, 5 * B + C)
        noobj_pred = pred[noobj_mask].reshape(-1, 5 * B + C)

        # 提取有目标的target和没目标的target
        coord_target = target[coord_mask].reshape(-1, 5 * B + C)
        noobj_target = target[noobj_mask].reshape(-1, 5 * B + C)

        # 提取bbox与class
        bbox_pred = coord_pred[:,:5 * B].reshape(-1, 5)
        class_pred = coord_pred[:,5 * B:]
        bbox_target = coord_target[:,:5 * B].reshape(-1, 5)
        class_target = coord_target[:,5 * B:]

        # 处理无目标位置的置信度损失
        noobj_conf_mask = torch.BoolTensor(noobj_pred.shape).fill_(0)
        for b in range(B):
            noobj_conf_mask[:,4 + b * 5] = 1 # 设置提取出置信度的位置
        noobj_conf_pred = noobj_pred[noobj_conf_mask]
        noonj_conf_target = noobj_target[noobj_conf_mask]

        # 计算noobj_loss_conf
        loss_conf_noobj = F.mse_loss(noobj_conf_pred,noonj_conf_target,reduction = 'sum')

        # 初始化响应掩码
        coord_response_mask = torch.BoolTensor(bbox_target.size()).fill_(0) # 响应初始化为0
        coord_not_response_mask = torch.BoolTensor(bbox_target.size()).fill_(1) # 非响应初始化为1
        bbox_target_iou = torch.zeros(bbox_target.size())

        # 遍历每个目标网格
        for i in range(0,bbox_pred.shape[0],B):
            # 获取当前网格的 B 个预测边界框
            pred = bbox_pred[i:i + B] 

            # 将预测边界框转换为 (xmin, ymin, xmax, ymax) 格式
            pred_xyxy = torch.zeros((pred.shape[0],4))
            pred_xyxy[:,:2] = pred[:,:2] * grid_size - pred[:,2:4] * 0.5    # 左上
            pred_xyxy[:,2:4] = pred[:,:2] * grid_size + pred[:,2:4] * 0.5   # 右下

            # 同样处理出目标坐标
            target = bbox_target[i].reshape(-1,5)
            target_xyxy = torch.zeros((target.shape[0],4))
            target_xyxy[:,:2] = target[:,:2] * grid_size - target[:,2:4] * 0.5    # 左上
            target_xyxy[:,2:4] = target[:,:2] * grid_size + target[:,2:4] * 0.5   # 右下

            # 计算iou
            iou = self.compute_iou(pred_xyxy,target_xyxy)
            max_iou, max_index = iou.max(0)  # 找出最大 IoU 及其索引

            # 标记负责预测目标的边界框
            coord_response_mask[i + max_index] = 1
            coord_not_response_mask[i + max_index] = 0

            # 将最大 IoU 作为置信度目标
            bbox_target_iou[i + max_index,4] = max_iou.data

        # 提取负责预测目标的边界框
        bbox_pred_response = bbox_pred[coord_response_mask].reshape(-1, 5)
        bbox_target_response = bbox_target[coord_response_mask].reshape(-1, 5)
        target_iou = bbox_target_iou[coord_response_mask].reshape(-1, 5)

        # 计算坐标损失 (中心点坐标)
        loss_xy = F.mse_loss(bbox_pred_response[:, :2], bbox_target_response[:, :2], reduction='sum')
        
        # 计算宽高损失
        loss_wh = F.mse_loss(
            torch.sqrt(bbox_pred_response[:,2:4]), 
            torch.sqrt(bbox_target_response[:,2:4]), 
            reduction = 'sum'
        )

        # 计算目标置信度损失
        loss_conf = F.mse_loss(bbox_pred_response[:,4], target_iou[:,4], reduction = 'sum')

        # 计算类别预测损失
        loss_class = F.mse_loss(class_pred,class_target,reduction = 'sum')

        print(loss_xy,loss_wh,loss_class,loss_conf,loss_conf_noobj)
        print(target_iou[:,4])

        loss = (self.lambda_coord * (loss_xy + loss_wh) + 
                self.lambda_noobj * loss_conf_noobj +
                loss_conf + loss_class)
        
        return loss / batch_size # 平均损失

In [4]:
loss = YoloLoss()
loss_re = Detect_Loss()

In [5]:
x = torch.zeros((2,2))
mask = torch.BoolTensor([[1,1],[0,0]])
x[mask] = 1
print(x)

tensor([[1., 1.],
        [0., 0.]])


In [6]:
pre = torch.randn(1,30,7,7)  # 随机预测张量 [batch, S, S, B*5+C]
tar = torch.randn(1,30,7,7)  # 随机目标张量
mask1 = pre > 0
mask2 = tar > 0
pre[~mask1] = 0
tar[~mask2] = 0

In [7]:
out1 = loss(pre,tar)

tensor(29.3235) tensor(21.7338) tensor(355.4330) tensor(nan) tensor(17.5007)
tensor([   nan, 0.2357,    nan,    nan,    nan,    nan,    nan,    nan,    nan,
           nan, 0.0000,    nan,    nan, 0.0000,    nan,    nan,    nan,    nan,
           nan,    nan, 0.0000,    nan,    nan,    nan, 0.0567,    nan, 0.0286,
        0.0376])


In [8]:
out2 = loss_re(pre.permute(0,2,3,1),tar.permute(0,2,3,1))

tensor(29.3235) tensor(21.7338) tensor(355.4330) tensor(nan) tensor(17.5007)
tensor([   nan, 0.2357,    nan,    nan,    nan,    nan,    nan,    nan,    nan,
           nan, 0.0000,    nan,    nan, 0.0000,    nan,    nan,    nan,    nan,
           nan,    nan, 0.0000,    nan,    nan,    nan, 0.0567,    nan, 0.0286,
        0.0376])


In [9]:
print(out1)
print(out2)

tensor(nan)
tensor(nan)
