Try to keep the code here, and then export it into a .py file.

Do not try to modify the .py file directly until this notebook is gone from each branch.

Remember to clear output

In [24]:
# needed for model

import torch
import torch.nn as nn

from torch.autograd import Variable

In [18]:
# needed for formulating final predictions

from torchvision.ops import nms


In [None]:
# yolo v3

# Utils and calculating loss

Load presaved model

In [None]:
# works by side effects
def load_pretask_weight_from_model(model, presaved_encoder):
    """Copy a pretrained encoder's weights into ``model.encoder`` and freeze them.

    The model is modified in place: its encoder receives the state dict of
    ``presaved_encoder`` and every encoder parameter gets ``requires_grad``
    switched off.

    Args:
        model: object exposing an ``encoder`` sub-module.
        presaved_encoder: module whose ``state_dict()`` is loaded into
            ``model.encoder``.

    Returns:
        The same ``model`` object that was passed in.
    """
    encoder = model.encoder
    pretrained_weights = presaved_encoder.state_dict()
    encoder.load_state_dict(pretrained_weights)

    # Freeze the encoder: no gradients are tracked for the copied weights.
    for weight in encoder.parameters():
        weight.requires_grad_(False)

    return model

In [None]:
# Use this to initialize our model with encoder weights from an existing pretask encoder that is already in memory
def initialize_model_for_training(presaved_encoder, *model_args, **model_kwargs):
    """Build a ``KobeModel`` whose encoder starts from an in-memory pretask encoder.

    Args:
        presaved_encoder: already-constructed encoder whose weights are copied
            into the new model's encoder (and frozen) by
            ``load_pretask_weight_from_model``.
        *model_args: positional arguments forwarded to ``KobeModel``.
        **model_kwargs: keyword arguments forwarded to ``KobeModel``
            (its constructor takes ``num_classes``, ``yolo_dim`` and
            ``rm_dim``).

    Returns:
        The newly built model with the pretask weights loaded.
    """
    # KobeModel's constructor has required arguments, so they have to be
    # passed through; calling KobeModel() bare raises TypeError.
    model = KobeModel(*model_args, **model_kwargs)
    load_pretask_weight_from_model(model, presaved_encoder)

    return model

In [None]:
# Use this to initialize our model with encoder weights loaded from a file
def initialize_model_for_training_file(presaved_encoder_file, *model_args, **model_kwargs):
    """Build a model whose encoder starts from weights stored in a file.

    Args:
        presaved_encoder_file: path (or file-like object) accepted by
            ``torch.load`` that holds a ``PreTaskEncoder`` state dict.
        *model_args: extra positional arguments forwarded unchanged to
            ``initialize_model_for_training``.
        **model_kwargs: extra keyword arguments forwarded unchanged to
            ``initialize_model_for_training``.

    Returns:
        Whatever ``initialize_model_for_training`` returns for the loaded
        encoder.
    """
    presaved_encoder = PreTaskEncoder()

    # NOTE: torch.load unpickles the file -- only load checkpoints that come
    # from a trusted source.
    # map_location="cpu" lets a checkpoint saved on a GPU be read on a
    # CPU-only machine; load_state_dict copies the values into the encoder's
    # own parameters afterwards.
    state_dict = torch.load(presaved_encoder_file, map_location="cpu")
    presaved_encoder.load_state_dict(state_dict)
    presaved_encoder.eval()

    return initialize_model_for_training(presaved_encoder, *model_args, **model_kwargs)

Converting predictions to the format for competition

Helper functions to calculate bounding boxes and such

In [3]:
# taken from https://github.com/eriklindernoren/PyTorch-YOLOv3/blob/master/utils/utils.py

def bbox_wh_iou(wh1, wh2):
    """IoU computed from box sizes only.

    The overlap is ``min(w) * min(h)``, i.e. the two boxes are compared as if
    they shared the same corner; positions play no role.

    Args:
        wh1: a single (width, height) pair, indexed as ``wh1[0]``/``wh1[1]``.
        wh2: tensor of (width, height) rows; it is transposed so that row 0
            holds the widths and row 1 the heights.

    Returns:
        Tensor with one IoU value per row of ``wh2``.
    """
    anchor_w, anchor_h = wh1[0], wh1[1]
    sizes = wh2.t()
    box_w, box_h = sizes[0], sizes[1]

    overlap = torch.min(anchor_w, box_w) * torch.min(anchor_h, box_h)
    # The tiny epsilon keeps the denominator away from zero.
    combined = (anchor_w * anchor_h + 1e-16) + box_w * box_h - overlap
    return overlap / combined


def bbox_iou(box1, box2, x1y1x2y2=True):
    """Return the IoU between rows of ``box1`` and rows of ``box2``.

    Args:
        box1: 2-D tensor whose first four columns describe the boxes.
        box2: 2-D tensor laid out like ``box1``; the rows are combined
            element-wise (with broadcasting) against ``box1``.
        x1y1x2y2: when True the columns are corner coordinates
            (x1, y1, x2, y2); when False they are
            (center x, center y, width, height) and are converted first.

    Returns:
        Tensor of IoU values. Widths and heights are measured as
        ``max - min + 1``, so touching boxes still overlap by one unit.
    """
    if x1y1x2y2:
        # Columns already hold the corner coordinates.
        left1, top1, right1, bottom1 = (box1[:, k] for k in range(4))
        left2, top2, right2, bottom2 = (box2[:, k] for k in range(4))
    else:
        # Expand center/size boxes into their corner coordinates.
        left1, right1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        top1, bottom1 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        left2, right2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        top2, bottom2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    # Extent of the intersection rectangle, clamped to zero when disjoint.
    overlap_w = torch.clamp(
        torch.min(right1, right2) - torch.max(left1, left2) + 1, min=0
    )
    overlap_h = torch.clamp(
        torch.min(bottom1, bottom2) - torch.max(top1, top2) + 1, min=0
    )
    intersection = overlap_w * overlap_h

    area1 = (right1 - left1 + 1) * (bottom1 - top1 + 1)
    area2 = (right2 - left2 + 1) * (bottom2 - top2 + 1)

    # Epsilon guards against a zero-sized union.
    return intersection / (area1 + area2 - intersection + 1e-16)


def _xywh_to_xyxy(boxes):
    """Convert (center x, center y, width, height) boxes to (x1, y1, x2, y2)."""
    center_x, center_y = boxes[..., 0], boxes[..., 1]
    half_w = boxes[..., 2] / 2
    half_h = boxes[..., 3] / 2
    return torch.stack(
        (center_x - half_w, center_y - half_h, center_x + half_w, center_y + half_h),
        dim=-1,
    )


def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """Filter detections by confidence, then merge overlapping ones per class.

    Args:
        prediction: tensor indexed as ``[image, box, channel]`` where channels
            0-3 are (center x, center y, width, height), channel 4 is the
            object confidence and channels 5+ are per-class scores.
        conf_thres: boxes whose object confidence is below this are dropped.
        nms_thres: boxes of the same class whose IoU with the current best box
            exceeds this are merged into it.

    Returns:
        A list with one entry per image: ``None`` when nothing survived the
        confidence filter, otherwise a tensor whose rows are
        ``(x1, y1, x2, y2, object_conf, class_score, class_pred)``.
    """
    # Work on a copy so the caller's tensor keeps its center/size layout;
    # converting in place would corrupt it for any later use (or a second call).
    prediction = prediction.clone()
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = _xywh_to_xyxy(prediction[..., :4])

    output = [None] * len(prediction)
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Rank by object confidence times best class confidence
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
        image_pred = image_pred[(-score).argsort()]
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
        detections = torch.cat(
            (image_pred[:, :5], class_confs.float(), class_preds.float()), 1
        )
        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            large_overlap = (
                bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
            )
            label_match = detections[0, -1] == detections[:, -1]
            # Boxes with large IoU and matching label (always includes row 0 itself)
            invalid = large_overlap & label_match
            weights = detections[invalid, 4:5]
            # Merge the overlapping boxes into row 0, weighted by confidence
            detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
            keep_boxes += [detections[0]]
            detections = detections[~invalid]
        if keep_boxes:
            output[image_i] = torch.stack(keep_boxes)

    return output


def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    """Build the per-cell masks and regression/class targets used by the YOLO loss.

    Args:
        pred_boxes: predicted boxes; dims 0, 1 and 2 are read as batch size,
            number of anchors and grid size. Indexing ``[b, a, gj, gi]`` must
            yield 4-column (center x, center y, width, height) boxes, since
            they are passed to ``bbox_iou`` with ``x1y1x2y2=False``.
        pred_cls: class predictions; the last dim is the number of classes.
        target: 2-D tensor. Columns 0-1 are cast to long and used as
            (batch index, class label); columns 2-5 are multiplied by the grid
            size and used as (x, y, w, h) -- presumably normalised to [0, 1],
            TODO confirm with the data pipeline.
        anchors: tensor of (width, height) anchor rows; compared against the
            grid-scaled target sizes, so presumably already in grid units.
        ignore_thres: anchors whose size-IoU with a target exceeds this value
            are removed from the no-object mask for that target's cell.

    Returns:
        Tuple ``(iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th,
        tx1, tx2, tx3, tx4, ty1, ty2, ty3, ty4, tcls, tconf)``. The eight
        ``tx1..ty4`` tensors are allocated as zeros and returned untouched;
        nothing in this function fills them.
    """

    # Pick tensor constructors that match the device type of the predictions
    ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor

    nB = pred_boxes.size(0)  # batch size
    nA = pred_boxes.size(1)  # number of anchors
    nC = pred_cls.size(-1)  # number of classes
    nG = pred_boxes.size(2)  # grid size

    # Output tensors
    obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)
    noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)
    class_mask = FloatTensor(nB, nA, nG, nG).fill_(0)
    iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0)
    
    tx = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty = FloatTensor(nB, nA, nG, nG).fill_(0)
    tw = FloatTensor(nB, nA, nG, nG).fill_(0)
    th = FloatTensor(nB, nA, nG, nG).fill_(0)
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)
    
    ### predict additional coordinates for the center within the anchor box
    ##### THIS IS OUR ADDITION
    ##### RANGE OF THESE VALUES TO BE DETERMINED
    # NOTE(review): these eight tensors stay all-zero; no target values are
    # written into them anywhere below.
    tx1 = FloatTensor(nB, nA, nG, nG).fill_(0)
    tx2 = FloatTensor(nB, nA, nG, nG).fill_(0)
    tx3 = FloatTensor(nB, nA, nG, nG).fill_(0)
    tx4 = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty1 = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty2 = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty3 = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty4 = FloatTensor(nB, nA, nG, nG).fill_(0)

    # Scale the target boxes (columns 2-5) by the grid size
    target_boxes = target[:, 2:6] * nG
    gxy = target_boxes[:, :2]  # box centers
    gwh = target_boxes[:, 2:]  # box sizes

    
    
    # Get anchors with best iou (size-only IoU of every anchor against every target)
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])
    # NOTE(review): best_ious is never read afterwards; only best_n is used.
    best_ious, best_n = ious.max(0)
    # Separate target values
    b, target_labels = target[:, :2].long().t()
    gx, gy = gxy.t()
    gw, gh = gwh.t()
    # Integer cell indices: gi from the x coordinate, gj from the y coordinate
    gi, gj = gxy.long().t()
    # Set masks: the best anchor in the target's cell is responsible for it
    obj_mask[b, best_n, gj, gi] = 1
    noobj_mask[b, best_n, gj, gi] = 0

    # Set noobj mask to zero where iou exceeds ignore threshold
    for i, anchor_ious in enumerate(ious.t()):
        noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0

    # Coordinates: fractional offset of the center inside its cell
    tx[b, best_n, gj, gi] = gx - gx.floor()
    ty[b, best_n, gj, gi] = gy - gy.floor()
    # Width and height: log ratio to the chosen anchor (epsilon avoids log(0))
    tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
    th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
    # One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1
    # Compute label correctness and iou at best anchor
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
    iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)

    # Confidence target is 1 exactly where an object was assigned
    tconf = obj_mask.float()
    return (iou_scores, class_mask, obj_mask, noobj_mask, 
            tx, ty, tw, th, 
            tx1, tx2, tx3, tx4, 
            ty1, ty2, ty3, ty4, 
            tcls, tconf)

Losses

In [None]:
def YoloLoss(pred_boxes, pred_conf, pred_cls, targets, scaled_anchors, ignore_thres,
             obj_scale=1, noobj_scale=100):
    """Compute the YOLO detection loss from decoded predictions.

    Args:
        pred_boxes: decoded boxes indexed ``[batch, anchor, gj, gi, 4]`` as
            (center x, center y, width, height) in grid units, i.e. the cell
            index has been added to x/y and the anchor size multiplied into
            w/h (the layout ``YoloDecoder.forward`` produces).
        pred_conf: object confidences in [0, 1], indexed ``[batch, anchor, gj, gi]``.
        pred_cls: class probabilities in [0, 1]; the last dim is the class axis.
        targets: ground-truth rows as expected by ``build_targets``.
        scaled_anchors: tensor of (width, height) anchors in grid units.
        ignore_thres: IoU threshold forwarded to ``build_targets``.
        obj_scale: weight of the confidence loss on cells that hold an object.
        noobj_scale: weight of the confidence loss on cells without an object.

    Returns:
        Scalar tensor: sum of the x, y, w, h, confidence and class losses.

    Note:
        The raw regression outputs are recovered from ``pred_boxes``. If
        ``pred_boxes`` is detached from the graph (``YoloDecoder.forward``
        assembles it from ``.data``), the four coordinate terms contribute no
        gradient; only the confidence and class terms do.
    """
    mse_loss = nn.MSELoss()
    bce_loss = nn.BCELoss()

    # The multi-line unpacking has to be parenthesised to be valid Python.
    # The tx1..ty4 extra-coordinate targets, iou_scores and class_mask are
    # received here but not yet part of the loss.
    (
        iou_scores, class_mask, obj_mask, noobj_mask,
        tx, ty, tw, th,
        tx1, tx2, tx3, tx4,
        ty1, ty2, ty3, ty4,
        tcls, tconf,
    ) = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=scaled_anchors,
        ignore_thres=ignore_thres,
    )

    # Index with boolean masks; uint8 mask indexing is deprecated in PyTorch.
    obj_mask = obj_mask.bool()
    noobj_mask = noobj_mask.bool()

    # Undo the decoding so predictions are comparable with tx/ty/tw/th:
    # subtract the cell index from the centers, and take the log-ratio of the
    # sizes to their anchors (same formula build_targets uses for tw/th).
    grid_size = pred_boxes.size(2)
    cells = torch.arange(grid_size, dtype=pred_boxes.dtype, device=pred_boxes.device)
    xc = pred_boxes[..., 0] - cells.view(1, 1, 1, grid_size)
    yc = pred_boxes[..., 1] - cells.view(1, 1, grid_size, 1)
    anchor_w = scaled_anchors[:, 0].view(1, -1, 1, 1)
    anchor_h = scaled_anchors[:, 1].view(1, -1, 1, 1)
    w = torch.log(pred_boxes[..., 2] / anchor_w + 1e-16)
    h = torch.log(pred_boxes[..., 3] / anchor_h + 1e-16)

    loss_x = mse_loss(xc[obj_mask], tx[obj_mask])
    loss_y = mse_loss(yc[obj_mask], ty[obj_mask])
    loss_w = mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = mse_loss(h[obj_mask], th[obj_mask])

    loss_conf_obj = bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = obj_scale * loss_conf_obj + noobj_scale * loss_conf_noobj
    loss_cls = bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    return total_loss

In [14]:
def RoadMapLoss(pred_rm, target_rm):
    """Binary cross-entropy between a predicted road map and its target.

    Args:
        pred_rm: predicted probabilities (values in [0, 1]).
        target_rm: target tensor with the same shape as ``pred_rm``.

    Returns:
        Scalar loss tensor (``nn.BCELoss`` with its default settings).
    """
    criterion = nn.BCELoss()
    loss = criterion(pred_rm, target_rm)
    return loss

In [2]:
def total_joint_loss(yolo_loss, rm_loss, lambd):
    """Combine the two task losses as ``yolo_loss + lambd * rm_loss``.

    Args:
        yolo_loss: detection loss.
        rm_loss: road-map loss.
        lambd: weight applied to the road-map loss.

    Returns:
        The weighted sum of both losses.
    """
    weighted_rm = lambd * rm_loss
    return yolo_loss + weighted_rm

Train Loop and test loops
Not necessarily using data loader.

Assuming targets are already pre-processed.

In [None]:
def train(data, targets_bb, targets_rm, kobe_model, kobe_optimizer, n_epochs, lambd=0.5,
          batch_size=32, device=None):
    """Train the joint model with shuffled mini-batches (no DataLoader).

    Args:
        data: input tensor; dim 0 is the sample axis.
        targets_bb: bounding-box targets, indexed along dim 0 with the same
            sample indices as ``data``. NOTE(review): ``build_targets`` reads
            column 0 of its target as an index into the batch -- confirm the
            pre-processing produces rows that are valid after this slicing.
        targets_rm: road-map targets, indexed along dim 0 like ``data``.
        kobe_model: model returning ``(yolo_outputs, roadmap_output)`` and
            exposing ``yolo_decoder.scaled_anchors`` / ``yolo_decoder.ignore_thres``.
        kobe_optimizer: optimizer over the model's trainable parameters.
        n_epochs: number of passes over the data.
        lambd: weight of the road-map loss in the joint loss.
        batch_size: number of samples per mini-batch.
        device: device the batches are moved to; defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    if device is None:
        first_param = next(kobe_model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    num_samples = data.shape[0]
    # Ceiling division so a final, smaller batch is not dropped
    num_batches = (num_samples + batch_size - 1) // batch_size

    for epoch in range(n_epochs):
        kobe_model.train()
        # Fresh shuffle of the sample order every epoch
        permutations = torch.randperm(num_samples)
        for i in range(num_batches):
            batch_ind = permutations[i * batch_size : (i + 1) * batch_size]
            batch_images = data[batch_ind, :]
            batch_targets_bb = targets_bb[batch_ind, :]
            batch_rms = targets_rm[batch_ind, :]

            kobe_optimizer.zero_grad()

            # Plain tensors suffice; the Variable wrapper is a deprecated no-op.
            imgs = batch_images.to(device)
            targets = batch_targets_bb.to(device)
            # Road-map targets must live on the same device as the predictions
            rm_targets = batch_rms.to(device)

            outputs_yolo, outputs_rm = kobe_model(imgs)

            yolo_loss = YoloLoss(outputs_yolo[0], outputs_yolo[1], outputs_yolo[2], targets,
                                 kobe_model.yolo_decoder.scaled_anchors,
                                 kobe_model.yolo_decoder.ignore_thres)

            rm_loss = RoadMapLoss(outputs_rm, rm_targets)

            loss = total_joint_loss(yolo_loss, rm_loss, lambd)
            loss.backward()

            kobe_optimizer.step()
        

# Model Part

Pre-defined anchors. Should honestly come from KMeans on detection boxes but let's see how this does before going complex

In [None]:
# Pre-defined anchor boxes, one (width, height) pair per entry.
# NOTE(review): the original remark here says the code only uses the last
# three anchors -- confirm against the decoder before relying on all nine.
anchors = [
    (10, 13), (16, 30), (33, 23),
    (30, 61), (62, 45), (59, 119),
    (116, 90), (156, 198), (373, 326),
]

Our YoloLayer for task of object localization

Ignoring orientation

In [None]:
class PreTaskEncoder(nn.Module):
    """Encoder stub: registers no layers and defines no ``forward`` yet."""

    def __init__(self):
        super().__init__()
    

In [None]:
class YoloDecoder(nn.Module):
    """YOLO-style detection head that decodes a feature map into boxes.

    The input feature map carries, per anchor, ``num_classes + 13`` channels:
    channels 0-3 are (x, y, w, h), channel 4 is the object confidence,
    channels 5-12 are the eight extra corner coordinates (x1..x4, y1..y4;
    reserved, not decoded yet) and channels 13+ are the class scores.
    NOTE(review): the position of the eight extra channels is a choice made
    here to keep x/y/w/h/conf at indices 0-4 -- confirm the intended layout.
    """

    # Channels per anchor that are not class scores: 4 box + 1 conf + 8 corners
    _NON_CLASS_CHANNELS = 13

    def __init__(self, anchors, num_classes, img_dim=416):
        """Store the head configuration.

        Args:
            anchors: sequence of (width, height) anchor sizes in image pixels.
            num_classes: number of object classes.
            img_dim: side length of the image the grid is mapped onto.
        """
        super().__init__()

        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5

        self.obj_scale = 1
        self.noobj_scale = 100
        self.img_dim = img_dim
        self.grid_size = 16
        # Device holding the cached grid offsets; None means "not computed
        # yet", which forces a computation on the first forward pass even
        # when the input grid already equals the initial grid_size.
        self._offsets_device = None

        # takes in dense output from encoder or shared decoder and puts into an
        # image of dim img_dim (placeholder: no layers yet, unused in forward)
        self.m = nn.Sequential()

    def compute_grid_offsets(self, grid_size, cuda=True, device=None):
        """Cache the cell offsets and grid-scaled anchors for ``grid_size``.

        Args:
            grid_size: number of cells along each side of the grid.
            cuda: place the cached tensors on the default CUDA device when
                True, on the CPU otherwise (ignored if ``device`` is given).
            device: explicit device for the cached tensors.
        """
        if device is None:
            device = torch.device("cuda" if cuda else "cpu")

        self.grid_size = grid_size
        g = grid_size
        # Size of one grid cell in image pixels
        self.stride = self.img_dim / g

        # grid_x[..., j, i] == i and grid_y[..., j, i] == j
        columns = torch.arange(g, dtype=torch.float32, device=device).repeat(g, 1)
        self.grid_x = columns.view(1, 1, g, g)
        self.grid_y = columns.t().contiguous().view(1, 1, g, g)

        # Anchors expressed in grid cells instead of pixels
        self.scaled_anchors = torch.tensor(
            [(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors],
            dtype=torch.float32,
            device=device,
        )
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
        self._offsets_device = self.grid_x.device

    def forward(self, x, img_dim=None):
        """Decode a feature map into boxes, confidences and class scores.

        Args:
            x: tensor of shape
                ``(batch, num_anchors * (num_classes + 13), grid, grid)``.
            img_dim: optional new image side length; when given and different
                from the stored one, the stride and anchors are recomputed.

        Returns:
            Tuple ``(pred_boxes, pred_conf, pred_cls, output)``:
            ``pred_boxes`` is ``(batch, anchors, grid, grid, 4)`` in grid units
            and detached from the graph; ``pred_conf`` is
            ``(batch, anchors, grid, grid)``; ``pred_cls`` is
            ``(batch, anchors, grid, grid, num_classes)``; ``output`` is
            ``(batch, anchors * grid * grid, 5 + num_classes)`` with the boxes
            scaled to image pixels.
        """
        if img_dim is not None and img_dim != self.img_dim:
            self.img_dim = img_dim
            # stride and scaled anchors depend on img_dim -> recompute below
            self._offsets_device = None

        num_samples = x.size(0)
        grid_size = x.size(2)

        prediction = (
            x.view(
                num_samples,
                self.num_anchors,
                self.num_classes + self._NON_CLASS_CHANNELS,
                grid_size,
                grid_size,
            )
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        # Get outputs
        center_x = torch.sigmoid(prediction[..., 0])  # Center x within the cell
        center_y = torch.sigmoid(prediction[..., 1])  # Center y within the cell
        raw_w = prediction[..., 2]  # Width (log-space, relative to anchor)
        raw_h = prediction[..., 3]  # Height (log-space, relative to anchor)
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
        # Skip the eight reserved corner channels (5..12) so that exactly
        # num_classes channels remain; slicing from 5 would make the final
        # view/cat below inconsistent.
        pred_cls = torch.sigmoid(prediction[..., self._NON_CLASS_CHANNELS:])  # Cls pred.

        # Recompute offsets when the grid size or the device changed, or when
        # they have never been computed
        if grid_size != self.grid_size or self._offsets_device != x.device:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda, device=x.device)

        # Add offset and scale with anchors (detached, as before)
        pred_boxes = torch.stack(
            (
                center_x.detach() + self.grid_x,
                center_y.detach() + self.grid_y,
                torch.exp(raw_w.detach()) * self.anchor_w,
                torch.exp(raw_h.detach()) * self.anchor_h,
            ),
            dim=-1,
        )

        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        return pred_boxes, pred_conf, pred_cls, output


Our model that does both tasks

In [None]:
class KobeModel(nn.Module):
    """Joint model: one encoder and shared decoder feeding two task heads.

    The same decoded tensor is passed to a YOLO detection head and to a
    road-map head. Several parts are still placeholders (see the review notes
    in the body).
    """
    
    def __init__(self, num_classes, yolo_dim, rm_dim):
        """Create the encoder, the shared decoder and both task heads.

        Args:
            num_classes: number of object classes, passed to ``YoloDecoder``.
            yolo_dim: passed to ``YoloDecoder`` as ``img_dim``.
            rm_dim: passed to ``RmDecoder``.
        """
        super(KobeModel, self).__init__()
        
        # Encoder; its weights may be replaced by pretask weights after construction
        
        self.encoder = PreTaskEncoder()
        
        
        # NOTE(review): empty Sequential -- currently passes its input through unchanged
        self.shared_decoder = nn.Sequential(
            
        )
        
        # Uses the module-level ``anchors`` list
        self.yolo_decoder = YoloDecoder(anchors, num_classes, img_dim=yolo_dim)
        
        # NOTE(review): RmDecoder is not defined in the visible part of this file
        self.rm_decoder = RmDecoder(rm_dim)
        
    def forward(self, x):
        """Run the encoder and shared decoder, then both heads on the same tensor.

        Returns:
            Tuple ``(output_1, output_2)``: the YOLO head's output and the
            road-map head's output.
        """
        x = self.encoder(x)
        
        #convert from dense representation from encoder into an image
        # NOTE(review): placeholder -- ``...`` is a literal Ellipsis, not a
        # shape, and the result of view() is discarded; the target shape still
        # has to be filled in and assigned back to x.
        x.view(...)
        
        x = self.shared_decoder(x)
        
        # output_1[0] corresponds to the bounding boxes
        # output_1[1] corresponds to confidences
        # output_1[2] corresponds to the class labels
        # output_1[3] is them spread out
        output_1 = self.yolo_decoder(x)
        # roadmap decoder
        output_2 = self.rm_decoder(x)
        
        return output_1, output_2