In [None]:
# Generate a 4096-bit RSA key pair (ssh-keygen may prompt for a file location and passphrase).
!ssh-keygen -t rsa -b 4096
# Append GitHub's RSA host key to known_hosts.
!ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
# Print the public key; presumably it has to be registered on GitHub before cloning over SSH — TODO confirm.
!cat /root/.ssh/id_rsa.pub

In [None]:
# Test the SSH connection to GitHub.
!ssh -T git@github.com
# Global git identity for this session.
!git config --global user.email "justin.deschenauxy@epfl.com"
!git config --global user.name "Justin-Collab"
# Clone the training repository and move its contents into the working directory.
# Note: the `*` glob does not match dot-files, so hidden files stay in colab_unet_train/.
!git clone git@github.com:deschena/colab_unet_train.git
!mv colab_unet_train/* .
# Mount Google Drive at /content/gdrive; train_net writes its checkpoints below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')
# List the GPUs visible to this runtime.
!nvidia-smi -L

In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torch.utils.data import DataLoader, Dataset
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import os, sys, io, random
from PIL import Image
from collections import OrderedDict

from datasets.AugmDataset import AugmDataset
from datasets.ToLabelDataset import ToLabelDataset
from models.Unet import Unet
from models.DenseUnet import DenseUnet
from utils import *
%matplotlib inline

# Model Selection

In [11]:
# Device that models and batches are moved to.
device = "cuda"
# Root directory of the augmented datasets (train_net redefines the same value locally).
root_path = "datasets/augmented_dataset/"
# Dataset names for the "simple" runs.
train_name_simple = "train_clean/"
valid_name_simple = "valid_clean/"

# Dataset names for the runs that, judging by the names, also include the Massachusetts data — TODO confirm.
# NOTE(review): unlike the *_simple names these have no trailing slash — verify how AugmDataset builds its paths.
train_name_massa = "train_base_n_massach"
valid_name_massa = "valid_base_n_massach"

In [4]:
def train_net(net, train_name, valid_name, seed=999, max_epoch=50, net_name="DEFAULT", patience=5, verbose=True, batch_size=4):
    """Train ``net`` and checkpoint the weights with the lowest validation loss.

    Seeds torch, ``random`` and numpy, builds the training and validation
    loaders from ``AugmDataset``, and optimises with Adam on
    ``nn.BCEWithLogitsLoss``. ``ReduceLROnPlateau`` halves the learning rate
    when the validation value has not improved for ``patience`` epochs.

    Args:
        net: Model to train. Its output is fed to ``BCEWithLogitsLoss``, so it
            must produce raw logits (no final sigmoid).
        train_name: Dataset name passed to ``AugmDataset`` for training.
        valid_name: Dataset name passed to ``AugmDataset`` for validation.
        seed: Seed applied to torch, ``random`` and numpy.
        max_epoch: Number of epochs to run.
        net_name: Base name of the checkpoint file written to Google Drive.
        patience: Patience of the learning-rate scheduler, in epochs.
        verbose: If True, the scheduler reports learning-rate changes and
            progress is printed every 10 epochs.
        batch_size: Training batch size; validation uses twice this value.

    Returns:
        ``(training_loss, validation_loss)``: two lists with one entry per
        epoch. ``training_loss`` holds the loss of the last training batch of
        the epoch as a Python float; ``validation_loss`` holds the values
        returned by ``validation_perf``.
    """
    torch.random.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    root_path = "datasets/augmented_dataset/"

    # The final sigmoid activation is folded into the loss: BCEWithLogitsLoss is
    # numerically more stable than a separate sigmoid followed by BCELoss because
    # it relies on the log-sum-exp trick.
    criterion = nn.BCEWithLogitsLoss()

    train_set = AugmDataset(root_dir=root_path, name=train_name)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=batch_size)

    validation_set = AugmDataset(root_dir=root_path, name=valid_name)
    validation_loader = DataLoader(validation_set, batch_size=2*batch_size, shuffle=False, num_workers=2*batch_size)

    # Send to GPU, prepare optimizer and learning rate scheduler
    net.to(device)
    optimizer = optim.Adam(net.parameters())
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=patience, verbose=verbose)

    validation_loss = []
    training_loss = []
    loss = -1
    # Lowest validation loss seen so far; start at +inf so the first epoch is always saved.
    best_current_loss = float("inf")

    for epoch in range(max_epoch):
        net.train()
        for batch_train, batch_gt in train_loader:

            # Send data to gpu
            batch_train = batch_train.to(device)
            batch_gt = batch_gt.to(device)

            # Clear accumulated gradients & compute prediction
            optimizer.zero_grad()
            output = net(batch_train)
            # Compute loss, gradient & update parameters
            loss = criterion(output, batch_gt)
            loss.backward()
            optimizer.step()

        # After each epoch, compute & save loss on training and validation sets
        v_perf = validation_perf(net, validation_loader)
        validation_loss.append(v_perf)
        # Store a plain float: keeping the tensor would retain its autograd graph
        # on the GPU and cannot be serialised by np.save afterwards.
        training_loss.append(float(loss))
        # Check if scheduler must decrease learning rate
        scheduler.step(v_perf)
        # The validation value is a loss (the scheduler runs in its default "min"
        # mode), so lower is better: only overwrite the checkpoint on improvement.
        if v_perf < best_current_loss:
            # Save best net
            torch.save(net.state_dict(), f"/content/gdrive/My Drive/ML files/model_selection/{net_name}.pth")
            best_current_loss = v_perf
        if verbose and epoch % 10 == 0:
            print(f"{epoch} epochs elapsed")

    return training_loss, validation_loss

## Train the models
**Models considered**:
1. Standard Unet
2. Dense Attention Unet (pixel attention)

These models are the ones that yielded the best results in the previous phase. Therefore, we train them both with 80% of the labelled data as training samples. We also train each of them once with the Massachusetts dataset included. Since we want the highest possible score, we do not keep a test set; we evaluate the results directly on AIcrowd.

**Important note**: The final sigmoid layer is deactivated during training because it is included in the loss; combining the two yields a numerically more stable function by leveraging the "log-sum-exp" trick. When the model is in eval mode, or when activation_output is True, the final sigmoid layer is applied.

In [None]:
%%time
# Standard U-Net on the "simple" dataset; the final activation is disabled
# because train_net's BCEWithLogitsLoss already applies the sigmoid.
net1 = Unet(activation_output=False)
net1_tr, net1_val = train_net(net1, train_name_simple, valid_name_simple, net_name="unet_simple", seed=4432)
# Persist the per-epoch loss curves next to the checkpoint.
np.save("/content/gdrive/My Drive/ML files/model_selection/unet_tr_loss", net1_tr)
np.save("/content/gdrive/My Drive/ML files/model_selection/unet_val_loss", net1_val)

In [None]:
%%time
# Standard U-Net on the "massa" dataset names; the final activation is disabled
# because train_net's BCEWithLogitsLoss already applies the sigmoid.
net2 = Unet(activation_output=False)
net2_tr, net2_val = train_net(net2, train_name_massa, valid_name_massa, net_name="unet_massach", seed=98893)
# Persist the per-epoch loss curves next to the checkpoint.
np.save("/content/gdrive/My Drive/ML files/model_selection/unet_massach_tr_loss", net2_tr)
np.save("/content/gdrive/My Drive/ML files/model_selection/unet_massach_val_loss", net2_val)

In [None]:
%%time
# Dense U-Net with grid attention on the "simple" dataset; the final activation
# is disabled because train_net's BCEWithLogitsLoss already applies the sigmoid.
net3 = DenseUnet(down_config=(4, 8, 16, 32), bottom=64, up_channels=(256, 128, 64, 32), activation_output=False, attention="grid")
net3_tr, net3_val = train_net(net3, train_name_simple, valid_name_simple, net_name="dense_unet_simple", seed=123123)
# Persist the per-epoch loss curves next to the checkpoint.
np.save("/content/gdrive/My Drive/ML files/model_selection/densenet_tr_loss", net3_tr)
# NOTE(review): "densetnet" looks like a typo for "densenet"; the name is kept
# unchanged in case other code already reads this file — confirm before renaming.
np.save("/content/gdrive/My Drive/ML files/model_selection/densetnet_val_loss", net3_val)

In [None]:
%%time
# Dense U-Net with grid attention on the "massa" dataset names; the final activation
# is disabled because train_net's BCEWithLogitsLoss already applies the sigmoid.
net4 = DenseUnet(down_config=(4, 8, 16, 32), bottom=64, up_channels=(256, 128, 64, 32), activation_output=False, attention="grid")
net4_tr, net4_val = train_net(net4, train_name_massa, valid_name_massa, net_name="dense_unet_massach", seed=34422)
# Persist the per-epoch loss curves next to the checkpoint.
np.save("/content/gdrive/My Drive/ML files/model_selection/densenet_massach_tr_loss", net4_tr)
np.save("/content/gdrive/My Drive/ML files/model_selection/densenet_massach_val_loss", net4_val)

### Evaluating the performance of each loss
After training those 4 U-nets, we created a submission for each of them in order to assess their performance on AIcrowd. There is no test set this time, as we want the score to be as high as possible for the leaderboard!

In [8]:
def predict_larger_image(img, model, excess = 0, model_input = 256):
    """Predict a full-size map by averaging predictions over overlapping square tiles.

    The image is covered with ``model_input`` x ``model_input`` tiles whose
    top-left corners are evenly spaced along each axis. ``model`` is applied to
    every tile and, where tiles overlap, the predictions are averaged.

    Args:
        img: Tensor indexed as (batch size = 1, channels, height, width).
        model: Callable applied to each tile; after ``squeeze()`` its output
            must have shape ``(model_input, model_input)``.
        excess: Extra tiles to add per axis on top of the minimum cover.
        model_input: Side length of the square tiles fed to ``model``.

    Returns:
        A numpy array of shape ``(height, width)`` with the averaged predictions.

    Raises:
        ValueError: If the image is smaller than ``model_input`` along either
            spatial axis, so that no full tile fits.
    """
    # img shape: (batch size = 1, channels, height, width)
    n_rows = img.shape[2]
    n_cols = img.shape[3]
    if n_rows < model_input or n_cols < model_input:
        raise ValueError(
            f"Wrong input size: image is {n_rows}x{n_cols}, smaller than the {model_input}x{model_input} model input"
        )

    n_tiles_v = n_rows // model_input + 1 + excess  # Number of tiles on vertical axis
    n_tiles_h = n_cols // model_input + 1 + excess  # Number of tiles on horizontal axis
    # Evenly spaced top-left corners; each axis uses its own extent so that
    # non-square images are tiled correctly.
    row_starts = np.round(np.linspace(0, n_rows - model_input, n_tiles_v)).astype(int)
    col_starts = np.round(np.linspace(0, n_cols - model_input, n_tiles_h)).astype(int)

    result = np.zeros((n_rows, n_cols))
    mask = np.zeros((n_rows, n_cols))  # number of tiles covering each pixel
    with torch.no_grad():
        for v in row_starts:
            v = int(v)  # plain Python ints for tensor slicing
            for h in col_starts:
                h = int(h)
                sub_image = img[:, :, v: v + model_input, h: h + model_input]
                # Predict
                sub_pred = model(sub_image).squeeze().to("cpu").numpy()
                # Add to total
                result[v: v + model_input, h: h + model_input] += sub_pred
                mask[v: v + model_input, h: h + model_input] += 1
    result /= mask
    return result

In [17]:
from mask_to_submission import *
def create_submission(submission_filename, temp_dirname, model):
    """Predict the 50 test images, save binary masks as PNGs and build the submission file.

    Args:
        submission_filename: Path handed to ``masks_to_submission`` as the output file.
        temp_dirname: Directory receiving the ``prediction_XXX.png`` masks;
            created if missing. A trailing slash is optional.
        model: Model passed to ``predict_larger_image``; its outputs are
            thresholded at 0.5, so it should return values in [0, 1].
    """
    test_dataset = ToLabelDataset()
    # makedirs also handles nested paths and an already existing directory.
    os.makedirs(temp_dirname, exist_ok=True)

    image_filenames = []
    for i in range(50):
        img = test_dataset[i]
        img = img.to(device).reshape(1, 3, 608, 608)
        pred = predict_larger_image(img, model)
        # threshold: strictly above 0.5 becomes foreground
        binary_mask = pred > 0.5
        # Images are labelled from 1 to 50, instead of the standard 0 to 49
        image_id = i + 1
        # os.path.join keeps the file inside temp_dirname even without a trailing slash.
        image_filename = os.path.join(temp_dirname, f"prediction_{image_id:03d}.png")
        Image.fromarray((binary_mask * 255).astype(np.uint8)).save(image_filename)
        image_filenames.append(image_filename)

    masks_to_submission(submission_filename, *image_filenames)

In [5]:
def load_net_params(net, name):
    """Load saved weights into ``net`` and return it in eval mode on the GPU.

    Args:
        net: Model instance whose architecture matches the saved state dict.
        name: Checkpoint base name; the file read is
            ``experiments/best_arch/<name>.pth``.

    Returns:
        The same ``net`` object, with weights loaded, switched to eval mode
        and moved to ``"cuda"``.
    """
    checkpoint_file = "".join(("experiments/best_arch/", name, ".pth"))
    net.load_state_dict(torch.load(checkpoint_file))
    # Inference configuration: eval mode first, then move to the GPU.
    net.eval()
    net.to("cuda")
    return net

In [None]:
# At inference the sigmoid is no longer folded into the loss, so the network is
# rebuilt with its final activation enabled before the trained weights are loaded.
net1 = load_net_params(Unet(activation_output=True), "unet_simple")
create_submission("unet_simple.csv", "unet_simple_preds/", net1)
# Drop the reference so the model's memory can be reclaimed before the next model is loaded.
del net1

In [20]:
# At inference the sigmoid is no longer folded into the loss, so the network is
# rebuilt with its final activation enabled before the trained weights are loaded.
net2 = load_net_params(Unet(activation_output=True), "unet_massach")
create_submission("unet_massach.csv", "unet_massach_preds/", net2)
# Drop the reference so the model's memory can be reclaimed before the next model is loaded.
del net2

In [22]:
# Same architecture as in training, but with the final activation enabled for inference.
net3 = load_net_params(
    DenseUnet(
        down_config=(4, 8, 16, 32),
        bottom=64,
        up_channels=(256, 128, 64, 32),
        activation_output=True,
        attention="grid",
    ),
    "dense_unet_simple",
)
create_submission("dense_unet_simple.csv", "denseunet_simple_preds/", net3)
# Drop the reference so the model's memory can be reclaimed before the next model is loaded.
del net3

In [23]:
# Same architecture as in training, but with the final activation enabled for inference.
net4 = load_net_params(
    DenseUnet(
        down_config=(4, 8, 16, 32),
        bottom=64,
        up_channels=(256, 128, 64, 32),
        activation_output=True,
        attention="grid",
    ),
    "dense_unet_massach",
)
create_submission("dense_unet_massach.csv", "denseunet_massach_preds/", net4)
# Drop the reference so the model's memory can be reclaimed.
del net4