In [1]:
# import os
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# path = "/content/drive/My Drive/segmentation-pytorch/dataset/"
# os.chdir(path)
# os.listdir(path)

# 0. parameters

In [3]:
import torch.utils.data as data
import torch
import numpy as np
import h5py
# import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
# import torch.utils.data as D
# import torchvision
# from torchvision import transforms as T

import cv2 as cv
from PIL import Image
import time

In [4]:
# --- Runtime ---------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Reproducibility -------------------------------------------------------
random_seed = 42
torch.manual_seed(random_seed)

# --- Training hyper-parameters --------------------------------------------
n_epochs = 3  # training epochs
batch_size = 4
learning_rate = 2e-4
weight_decay = 5e-4
log_interval = 10
val_percent = 0.1  # training set : validation set = 9:1

# --- Model -----------------------------------------------------------------
class_num = 34
bn_momentum = 0.1  # momentum of the BatchNorm layers
cate_weight = [1 / class_num] * class_num  # per-class weights for the loss function

# --- Paths -----------------------------------------------------------------
dir_pre_train_weights = "vgg16_bn-6c64b313.pth"  # path of the pre-trained encoder weights
dir_weights = "./weights"
dir_checkpoint = './checkpoints'


# 1. Implement a data loader class to handle the downloaded data. (5 points)
For more information on the dataset, please refer to the CityScapes dataset.

In [5]:
# 'rgb' stores the raw images, while 'seg' stores segmentation maps
class DataFromH5File(data.Dataset):
    """Dataset that reads (image, label) pairs from an HDF5 file.

    The file must contain the entries ``'rgb'`` (raw images), ``'seg'``
    (segmentation maps) and ``'color_codes'``.

    Args:
        filepath: Path of the HDF5 file to open (read-only).
        image_size: Target size passed to ``cv.resize`` for the RGB image.
            OpenCV interprets it as ``(width, height)``. Defaults to
            ``(224, 224)``.

    NOTE(review): only the RGB image is resized; the label is returned at the
    resolution it is stored with. Confirm that the loss is fed predictions and
    labels of matching spatial size.
    """

    def __init__(self, filepath, image_size=(224, 224)):
        # Samples are read lazily in __getitem__, so the file handle has to
        # stay open for as long as the dataset is used; see close().
        h5File = h5py.File(filepath, 'r')
        self._file = h5File
        self.color_codes = h5File['color_codes']
        self.rgb = h5File['rgb']
        self.seg = h5File['seg']
        self.image_size = tuple(image_size)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx`` as float tensors."""
        label = torch.from_numpy(self.seg[idx]).float()
        image = torch.from_numpy(cv.resize(self.rgb[idx], self.image_size)).float()
        image = image / 255.0  # scale the input by 1/255
        # Move the last (channel) axis to the front: (H, W, C) -> (C, H, W),
        # the channels-first layout expected by the convolution layers.
        image = image.permute(2, 0, 1)
        return image, label

    def __len__(self):
        """Return the number of samples (first axis of ``rgb``)."""
        assert self.rgb.shape[0] == self.seg.shape[0], "Wrong data length"  # sanity check
        return self.rgb.shape[0]

    def close(self):
        """Close the underlying HDF5 file; the dataset is unusable afterwards."""
        self._file.close()

In [6]:
# load training data from lab2_train_data.h5
dataset = DataFromH5File("lab2_train_data.h5")
n_val = int(len(dataset) * val_percent)
n_train = len(dataset) - n_val

# split train&val
# A dedicated generator seeded with `random_seed` makes the split identical
# every time this cell runs, regardless of how much of the global RNG stream
# earlier cells have consumed.
train, val = data.random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(random_seed),
)
# Pinned host memory only matters when batches are copied to a CUDA device,
# so it is requested only in that case.
train_loader = data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True, pin_memory=(device == 'cuda'))
val_loader = data.DataLoader(dataset=val, batch_size=batch_size, shuffle=False, pin_memory=(device == 'cuda')) # drop_last=True

# load testing data from lab2_test_data.h5
testset = DataFromH5File("lab2_test_data.h5")
test_loader = data.DataLoader(dataset=testset, batch_size=batch_size, shuffle=False, pin_memory=(device == 'cuda'))

In [7]:
# Smoke-test the data loader: inspect the value range of the first batch only.
# Zipping with range(1) stops after a single batch, and does nothing at all
# when the loader is empty.
for step, (x, y) in zip(range(1), train_loader):
    print(x.min(), x.max())
    print(y.min(), y.max())
    print(step)

# Number of batches per split: 670 75 125 when batch_size==4
print(len(train_loader), len(val_loader), len(test_loader))

tensor(0.) tensor(1.)
tensor(1.) tensor(33.)
0
670 75 125


# 2. Define the model. Provide a schematic of your architecture depicting its overall structure and the relevant parameters. (20 points)

In [8]:
# Encoder
class Encoder(nn.Module):
    """Convolutional encoder made of five stages with 2, 2, 3, 3 and 3 layers.

    Each layer is Conv2d(3x3, stride 1, padding 1) -> BatchNorm2d -> ReLU.
    The BatchNorm momentum is read from the module-level ``bn_momentum`` when
    the encoder is constructed.

    Args:
        input_channels: Number of channels of the input image.
    """

    @staticmethod
    def _stage(widths):
        """Build one stage from consecutive channel widths.

        ``widths = [a, b, c]`` yields the layers a->b and b->c, returned as a
        flat ``nn.Sequential`` (Conv2d, BatchNorm2d, ReLU, Conv2d, ...).
        """
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Conv2d(n_in, n_out, kernel_size=3, stride=1, padding=1))
            layers.append(nn.BatchNorm2d(n_out, momentum=bn_momentum))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def __init__(self, input_channels):
        super().__init__()

        self.enco1 = self._stage([input_channels, 64, 64])
        self.enco2 = self._stage([64, 128, 128])
        self.enco3 = self._stage([128, 256, 256, 256])
        self.enco4 = self._stage([256, 512, 512, 512])
        self.enco5 = self._stage([512, 512, 512, 512])

    def forward(self, x):
        """Run the five stages, max-pooling (2x2, stride 2) after each one.

        Returns:
            A tuple ``(features, indices)`` where ``indices`` is a list with
            the max-pooling indices of stage 1 to stage 5, in that order.
        """
        pool_indices = []
        for stage in (self.enco1, self.enco2, self.enco3, self.enco4, self.enco5):
            x = stage(x)
            # return_indices=True keeps the location of every maximum.
            x, stage_indices = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
            pool_indices.append(stage_indices)

        return x, pool_indices


# Encoder + decoder
class SegNet(nn.Module):
    """Encoder/decoder segmentation network.

    The decoder mirrors the encoder: before each decoder stage the feature map
    is up-sampled with ``max_unpool2d`` using the pooling indices recorded by
    the matching encoder stage. The last layer is a plain 3x3 convolution that
    outputs ``output_channels`` maps (no BatchNorm / ReLU after it).

    Args:
        input_channels: Number of channels of the input image.
        output_channels: Number of output maps (one per class).

    NOTE(review): ``max_unpool2d`` is called without ``output_size``, so the
    input height and width presumably have to be multiples of 32 (five 2x2
    poolings) for the unpooled sizes to match the indices; confirm.
    """

    @staticmethod
    def _conv_bn_relu(n_in, n_out):
        """Return the layer triple [Conv2d(3x3, padding 1), BatchNorm2d, ReLU]."""
        return [
            nn.Conv2d(n_in, n_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(n_out, momentum=bn_momentum),
            nn.ReLU(),
        ]

    def __init__(self, input_channels, output_channels):
        super(SegNet, self).__init__()

        # Taken before any sub-module is registered, so this starts out as an
        # empty mapping; load_weights() fills it with the encoder tensors.
        self.weights_new = self.state_dict()
        self.encoder = Encoder(input_channels)

        cbr = self._conv_bn_relu
        self.deco1 = nn.Sequential(*cbr(512, 512), *cbr(512, 512), *cbr(512, 512))
        self.deco2 = nn.Sequential(*cbr(512, 512), *cbr(512, 512), *cbr(512, 256))
        self.deco3 = nn.Sequential(*cbr(256, 256), *cbr(256, 256), *cbr(256, 128))
        self.deco4 = nn.Sequential(*cbr(128, 128), *cbr(128, 64))
        self.deco5 = nn.Sequential(
            *cbr(64, 64),
            nn.Conv2d(64, output_channels, kernel_size=3, stride=1, padding=1),
        )

    def forward(self, x):
        """Encode ``x``, then decode it back using the stored pooling indices."""
        x, pool_indices = self.encoder(x)

        decoder_stages = (self.deco1, self.deco2, self.deco3, self.deco4, self.deco5)
        # The deepest encoder stage is undone first, hence the reversed indices.
        for indices, stage in zip(reversed(pool_indices), decoder_stages):
            x = F.max_unpool2d(x, indices, kernel_size=2, stride=2)
            x = stage(x)

        return x

    def load_weights(self, weights_path):
        """Copy pre-trained VGG-16 (BatchNorm) weights into the encoder.

        Tensors are matched to the encoder entries purely by position, so the
        checkpoint must list its convolution / BatchNorm tensors in the same
        order as ``self.encoder.state_dict()``. Entries of the fully connected
        ``classifier.*`` layers and ``num_batches_tracked`` counters are
        ignored on both sides; without that, a checkpoint that stores the
        counters would shift every following tensor by one position.

        Args:
            weights_path: Path of the checkpoint file.

        Raises:
            RuntimeError: Raised by ``load_state_dict`` when a tensor is
                missing or has the wrong shape.
        """
        # NOTE: torch.load unpickles the file; only load checkpoints obtained
        # from a trusted source. map_location keeps loading independent of the
        # device the checkpoint was saved from.
        weights = torch.load(weights_path, map_location="cpu")

        target_names = [
            key for key in self.encoder.state_dict()
            if "num_batches_tracked" not in key
        ]
        source_tensors = [
            tensor for key, tensor in weights.items()
            if not key.startswith("classifier.") and "num_batches_tracked" not in key
        ]

        for name, tensor in zip(target_names, source_tensors):
            self.weights_new[name] = tensor

        self.encoder.load_state_dict(self.weights_new)

In [10]:
from torchsummary import summary

In [13]:
model = SegNet(input_channels=3, output_channels=class_num) # RGB images so the input_channels=3
model = model.to(device)
# Create the probe batch directly on `device` so the cell also runs without a GPU.
x = torch.ones([batch_size, 3, 224, 224], device=device) # input shape
# This is only a shape check: no_grad() avoids keeping the activations needed
# for back-propagation, which lowers the peak memory of the forward pass.
with torch.no_grad():
    y = model(x)
print(y.shape)
summary(model, input_size=(3, 224, 224))

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 2.00 GiB total capacity; 1.08 GiB already allocated; 0 bytes free; 1.11 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# 3. Define the loss function and optimizer. (10 points)

In [None]:
import torch.optim as optim

In [None]:
# Build the network that the training cells operate on. The notebook defines
# no `VGG16_LargeFOV`; the model implemented above is SegNet.
network = SegNet(input_channels=3, output_channels=class_num).to(device)
optimizer = optim.Adam(network.parameters(), lr=learning_rate, weight_decay=weight_decay)
# cross entropy loss
criterion = nn.CrossEntropyLoss()

In [None]:
# Summarise `network` with torchsummary for an input of size (3, 128, 256).
# NOTE(review): DataFromH5File resizes images to 224x224, so this input size
# differs from what the data loader delivers; confirm which one is intended.
from torchsummary import summary
summary(network, input_size=(3, 128, 256))

In [None]:
# Display the device string ('cuda' or 'cpu') chosen in the parameters cell.
device

# 4. Train the network. (5 points)

In [None]:
def train(SegNet, momentum=0.9):
    """Train a SegNet model with SGD and save its weights afterwards.

    Uses the notebook-level configuration: ``device``, ``dir_pre_train_weights``,
    ``train_loader``, ``learning_rate``, ``cate_weight``, ``n_epochs``,
    ``log_interval`` and ``dir_weights``.

    Args:
        SegNet: Model instance to train. It must provide ``load_weights(path)``,
            which is called with ``dir_pre_train_weights`` before training.
        momentum: Momentum of the SGD optimizer.

    NOTE(review): a later cell defines ``train(epoch)`` under the same name;
    once that cell has run, it replaces this function.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    SegNet = SegNet.to(device)
    SegNet.load_weights(dir_pre_train_weights)

    optimizer = torch.optim.SGD(SegNet.parameters(), lr=learning_rate, momentum=momentum)

    class_weights = torch.tensor(cate_weight, dtype=torch.float32, device=device)
    loss_func = nn.CrossEntropyLoss(weight=class_weights)

    SegNet.train()
    for epoch in range(n_epochs):
        for step, (b_x, b_y) in enumerate(train_loader):
            b_x = b_x.to(device)
            b_y = b_y.to(device)
            output = SegNet(b_x)
            # Shape the labels like the prediction, (batch, height, width).
            # The batch size is taken from the tensor itself so that a smaller
            # final batch works as well.
            b_y = b_y.view(b_y.size(0), output.size(2), output.size(3))
            loss = loss_func(output, b_y.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % log_interval == 0:
                print("Epoch:{0} || Step:{1} || Loss:{2}".format(epoch, step, format(loss.item(), ".4f")))

    # Join the directory and file name explicitly; plain string concatenation
    # would silently produce a wrong path when the directory has no trailing
    # separator.
    os.makedirs(dir_weights, exist_ok=True)
    torch.save(SegNet.state_dict(), os.path.join(dir_weights, "SegNet_weights" + str(time.time()) + ".pth"))


In [None]:
# Bookkeeping for the loss curves. `train_losses` and `train_counter` are
# appended to by train(epoch); nothing visible in this notebook writes to
# `test_losses` yet.
train_losses = []
train_counter = []
test_losses = []
# One entry per epoch boundary (0 .. n_epochs), expressed as the number of
# training samples seen up to that point.
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]

In [None]:
def train(epoch):
    """Train ``network`` for one pass over ``train_loader``.

    Relies on the notebook-level ``network``, ``optimizer``, ``criterion``,
    ``train_loader``, ``device``, ``batch_size`` and ``log_interval``. Every
    ``log_interval`` batches the current loss is appended to ``train_losses``,
    the number of samples seen so far to ``train_counter``, and the model and
    optimizer states are written to ``./model.pth`` and ``./optimizer.pth``.

    Args:
        epoch: 1-based epoch number, used for the progress text and for the
            sample counter.
    """
    network.train()
    pbar = tqdm(train_loader)
    samples_in_previous_epochs = (epoch - 1) * len(train_loader.dataset)
    for batch_idx, (images, target) in enumerate(pbar):
        # DataFromH5File already returns channels-first images, so the batch
        # is (batch, channel, height, width) and must not be permuted again.
        images = images.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = network(images)
        target = target.squeeze().long()
        # squeeze() also removes the batch axis when the batch holds a single
        # sample; restore it so the target stays (batch, height, width).
        if target.dim() == output.dim() - 2:
            target = target.unsqueeze(0)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(f"Epoch: {epoch} | Loss: {loss.item():.4f}")
        if batch_idx % log_interval == 0:
            train_losses.append(loss.item())
            train_counter.append((batch_idx * batch_size) + samples_in_previous_epochs)
            # save the parameters
            torch.save(network.state_dict(), './model.pth')
            torch.save(optimizer.state_dict(), './optimizer.pth')

In [None]:
# Run train() once per epoch; epochs are numbered from 1 to n_epochs.
for epoch in range(1, n_epochs + 1):
    train(epoch)

# 5. Test the resulting network on examples from an independent test set. Implement and present: (40 points)
a. Predictions for (μ, aleatoric, epistemic) .            
b. Visualizations for (μ, aleatoric, epistemic) on 5 different input examples.         
c. Comment briefly on how the model’s performance could be improved.          
d. Please save your code and results for submission.

# References
[1] https://blog.csdn.net/shwan_ma/article/details/100012808         
[2] https://blog.csdn.net/oYeZhou/article/details/112270908