# Scene Segmentation from 3D-Point Clouds Data
In this tutorial, we explore semantic segmentation of indoor scenes from the S3DIS dataset using PointNet.

All the theoretical intuitions behind PointNet and the S3DIS dataset are available in the [Introduction Notebook](0-Introduction.ipynb).

In [2]:
### Import the required modules.
import os
import torch
import time
import numpy as np
from tqdm import tqdm
from dataloaders.S3DISDataLoader import S3DISDataset
from utilities.activation import inplace_relu    ### Saves memory
from utilities.data_description import seg_label_to_cat  
from utilities.data_manipulation import rotate_point_cloud_z



In the following, we define the parameters for our model to train.

In [3]:
class Args:
    """Hyperparameters and run settings for this notebook."""

    # --- hardware / bookkeeping ---
    gpu = '0'                      # value written to CUDA_VISIBLE_DEVICES
    log_dir = 'runs'

    # --- model and data ---
    model = 'pointnet_sem_seg'
    npoint = 4096                  # read below as NUM_POINT when building the datasets
    num_point = 1024               # NOTE(review): not read anywhere in the visible cells -- confirm it is still needed
    test_area = 5                  # passed to S3DISDataset as test_area
    batch_size = 16

    # --- optimisation schedule ---
    optimizer = 'Adam'
    epoch = 32                     # upper bound of the training loop
    learning_rate = 0.001
    decay_rate = 1e-4              # used as Adam's weight_decay
    lr_decay = 0.7                 # learning-rate multiplier applied every step_size epochs
    step_size = 10


args = Args()

In the following block, we define the test function. We will use this function inside our training loop to validate and see the performance of our model.

In [4]:
def test(model, loader):
    """Evaluate ``model`` on ``loader`` and return the mean per-class IoU.

    Prints the mean loss, overall point accuracy, mean class accuracy and a
    per-class IoU table. Besides its arguments the function reads the
    notebook-level names ``NUM_CLASSES``, ``criterion``, ``weights``,
    ``global_epoch`` and ``seg_label_to_cat``.

    Args:
        model: network called as ``model(points)``; it must return a
            ``(seg_pred, trans_feat)`` pair. It is switched to eval mode here.
        loader: sized iterable yielding ``(points, target)`` tensor batches.

    Returns:
        The IoU averaged over the ``NUM_CLASSES`` classes.
    """
    num_batches = len(loader)
    total_correct = 0
    total_seen = 0
    loss_sum = 0.0
    labelweights = np.zeros(NUM_CLASSES)
    total_seen_class = np.zeros(NUM_CLASSES)
    total_correct_class = np.zeros(NUM_CLASSES)
    total_iou_deno_class = np.zeros(NUM_CLASSES)
    classifier = model.eval()

    print('---- EPOCH %03d EVALUATION ----' % (global_epoch + 1))
    # Evaluation never needs gradients; disabling them here keeps the function
    # safe to call even when the caller forgets to wrap it in no_grad().
    with torch.no_grad():
        for points, target in tqdm(loader, total=num_batches, smoothing=0.9):
            points = points.float().cuda()
            target = target.long().cuda()
            points = points.transpose(2, 1)

            seg_pred, trans_feat = classifier(points)
            pred_val = seg_pred.contiguous().cpu().data.numpy()
            seg_pred = seg_pred.contiguous().view(-1, NUM_CLASSES)

            batch_label = target.cpu().data.numpy()
            target = target.view(-1, 1)[:, 0]
            loss = criterion(seg_pred, target, trans_feat, weights)
            # .item() yields a plain float instead of accumulating a CUDA tensor.
            loss_sum += loss.item()
            pred_val = np.argmax(pred_val, 2)
            total_correct += np.sum(pred_val == batch_label)
            # Count the points actually seen rather than assuming every batch
            # holds BATCH_SIZE * NUM_POINT of them.
            total_seen += batch_label.size
            tmp, _ = np.histogram(batch_label, range(NUM_CLASSES + 1))
            labelweights += tmp

            for l in range(NUM_CLASSES):
                label_is_l = batch_label == l
                pred_is_l = pred_val == l
                total_seen_class[l] += np.sum(label_is_l)
                total_correct_class[l] += np.sum(pred_is_l & label_is_l)
                total_iou_deno_class[l] += np.sum(pred_is_l | label_is_l)

    labelweights = labelweights.astype(np.float32) / np.sum(labelweights.astype(np.float32))
    # np.float was removed in NumPy 1.24; the accumulators are float64 arrays.
    mIoU = np.mean(total_correct_class / (total_iou_deno_class + 1e-6))
    print('eval mean loss: %f' % (loss_sum / float(num_batches)))
    print('eval point avg class IoU: %f' % (mIoU))
    print('eval point accuracy: %f' % (total_correct / float(total_seen)))
    print('eval point avg class acc: %f' % (
        np.mean(total_correct_class / (total_seen_class + 1e-6))))

    iou_per_class_str = '------- IoU --------\n'
    for l in range(NUM_CLASSES):
        class_name = seg_label_to_cat[l]
        union = total_iou_deno_class[l]
        # A class that is neither predicted nor present has no defined IoU;
        # report nan explicitly instead of dividing by zero.
        class_iou = total_correct_class[l] / union if union > 0 else float('nan')
        # labelweights is indexed by class id (histogram bin l <-> class l).
        iou_per_class_str += 'class %s weight: %.3f, IoU: %.3f \n' % (
            class_name + ' ' * (14 - len(class_name)), labelweights[l], class_iou)

    print(iou_per_class_str)
    print('Eval mean loss: %f' % (loss_sum / num_batches))
    print('Eval accuracy: %f' % (total_correct / float(total_seen)))

    return mIoU


Select the NVIDIA GPU to use for training the network. On a machine with several GPUs, set `args.gpu` to the index of the GPU you want to use.

In [5]:
# Restrict the GPUs visible to this process to the one named in args.gpu.
# NOTE(review): presumably this must run before the first CUDA call in the
# session to take effect -- confirm when re-running cells out of order.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

In the following, we will load the S3DIS dataset from disk. To download the dataset, follow the download and preparation instructions given in the [Introduction Notebook](0-Introduction.ipynb). The same notebook also explains the dataset and its classes, and helps to visualize the scenes.

In [6]:
root = '../data/stanford_indoor3d/'
NUM_CLASSES = 13
NUM_POINT = args.npoint
BATCH_SIZE = args.batch_size

# Settings shared by the train and test splits; only `split` differs.
dataset_kwargs = dict(
    data_root=root,
    num_point=NUM_POINT,
    test_area=args.test_area,
    block_size=1.0,
    sample_rate=1.0,
    transform=None,
)

print("start loading training data ...")
TRAIN_DATASET = S3DISDataset(split='train', **dataset_kwargs)
print("start loading test data ...")
TEST_DATASET = S3DISDataset(split='test', **dataset_kwargs)

# Settings shared by both loaders. drop_last=True discards a trailing
# incomplete batch, so every batch holds exactly BATCH_SIZE samples.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=10,
    pin_memory=True,
    drop_last=True,
)

trainDataLoader = torch.utils.data.DataLoader(
    TRAIN_DATASET,
    shuffle=True,
    # Seed NumPy differently in every worker (worker id + current time).
    worker_init_fn=lambda x: np.random.seed(x + int(time.time())),
    **loader_kwargs,
)
testDataLoader = torch.utils.data.DataLoader(
    TEST_DATASET,
    shuffle=False,
    **loader_kwargs,
)

# Label weights exposed by the training set, moved to the GPU for the loss.
weights = torch.Tensor(TRAIN_DATASET.labelweights).cuda()

start loading training data ...


100%|██████████| 204/204 [00:15<00:00, 13.31it/s]


47576 samples in train set.
start loading test data ...


100%|██████████| 67/67 [00:06<00:00, 10.63it/s]


18822 samples in test set.


In the following we define the PointNet segmentation model. The insights of PointNet and their different parts are discussed in [Introduction Notebook](0-Introduction.ipynb).

In [8]:
### Build the PointNet segmentation network and its loss, both on the GPU.
from models.pointnet_sem_seg import get_model, get_loss

classifier = get_model(NUM_CLASSES)
classifier = classifier.cuda()

criterion = get_loss()
criterion = criterion.cuda()

# Apply the in-place ReLU helper to every sub-module. This stays the last
# expression of the cell so the notebook echoes the resulting architecture.
classifier.apply(inplace_relu)

get_model(
  (feat): PointNetEncoder(
    (stn): STN3d(
      (conv1): Conv1d(9, 64, kernel_size=(1,), stride=(1,))
      (conv2): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
      (conv3): Conv1d(128, 1024, kernel_size=(1,), stride=(1,))
      (fc1): Linear(in_features=1024, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=256, bias=True)
      (fc3): Linear(in_features=256, out_features=9, bias=True)
      (relu): ReLU(inplace=True)
      (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv1): Conv1d(9, 64, kernel_size=(1,), stride=

In the following, we initialize the optimizer for our model.

In [None]:
# Adam is used when requested by name; any other value falls back to SGD.
if args.optimizer != 'Adam':
    optimizer = torch.optim.SGD(
        classifier.parameters(),
        lr=args.learning_rate,
        momentum=0.9,
    )
else:
    optimizer = torch.optim.Adam(
        classifier.parameters(),
        lr=args.learning_rate,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=args.decay_rate,
    )

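In the following, we define a helper that adjusts the momentum of the batch-normalization layers. In optimization, momentum helps reduce the noise in the gradient update term; in a batch-normalization layer, the `momentum` attribute plays a related smoothing role, controlling how strongly each new batch updates the layer's running mean and variance. Batch normalization reduces the coupling between the parameters of earlier layers and those of later layers, so it helps stabilize the inputs fed to each layer. Both momentum and batch normalization help the network converge faster.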

In [10]:
def bn_momentum_adjust(m, momentum):
    """Overwrite ``m.momentum`` when ``m`` is a 1-D or 2-D batch-norm layer.

    Any other module is left untouched, which makes the function suitable
    for use with ``Module.apply``. Returns ``None``.
    """
    batch_norm_types = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d)
    if not isinstance(m, batch_norm_types):
        return
    m.momentum = momentum

In the following, we will check if there exists a pre-trained weight. If it exists, then we load the weights and resume the training process. If not, then we start the training from the beginning. 

In [11]:
# Resume from the checkpoint written by the training loop, if there is one.
# The path matches the one the training loop saves to
# ('weights/best_model_segmentation.pth'); the previous 'save_weights/...'
# location was never written, so resuming could not work.
checkpoint_path = 'weights/best_model_segmentation.pth'
try:
    checkpoint = torch.load(checkpoint_path)
    start_epoch = checkpoint['epoch']
    classifier.load_state_dict(checkpoint['model_state_dict'])
    print('Use pretrain model')
except FileNotFoundError:
    # Expected on a first run: nothing has been saved yet.
    print('No existing model, starting training from scratch...')
    start_epoch = 0
except Exception as err:
    # A checkpoint exists but could not be used (corrupt file, missing key,
    # mismatched architecture, ...). Keep the best-effort fallback, but say
    # why instead of hiding the problem. Unlike a bare `except:`, this does
    # not intercept KeyboardInterrupt or SystemExit.
    print('Could not load checkpoint %s (%s: %s)' % (checkpoint_path, type(err).__name__, err))
    print('No existing model, starting training from scratch...')
    start_epoch = 0

No existing model, starting training from scratch...


In the following, we initialize the training parameters to measure the performance of the model. Following the literature, we calculate the mIoU (mean intersection over union) to evaluate the model.

In [13]:
## Schedule constants and progress trackers for the training loop.

# Trackers updated by the training loop.
best_iou = 0        # highest evaluation mIoU seen so far
global_epoch = 0    # epochs completed in this session

# Batch-norm momentum schedule: start value, decay factor, and the number of
# epochs between decays.
MOMENTUM_ORIGINAL = 0.1
MOMENTUM_DECCAY = 0.5
MOMENTUM_DECCAY_STEP = args.step_size

# Floor below which the decayed learning rate is not allowed to fall.
LEARNING_RATE_CLIP = 1e-5

In [14]:
# Main training loop: one pass over the training set per epoch, followed by an
# evaluation on the test split; the model with the best mIoU is checkpointed.
for epoch in range(start_epoch, args.epoch):
    # Train on chopped scenes
    print('**** Epoch %d (%d/%s) ****' % (global_epoch + 1, epoch + 1, args.epoch))

    # Step-decay the learning rate, never letting it drop below the clip value.
    lr = max(args.learning_rate * (args.lr_decay ** (epoch // args.step_size)), LEARNING_RATE_CLIP)
    print('Learning rate:%f' % lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Step-decay the batch-norm momentum with a floor of 0.01.
    momentum = MOMENTUM_ORIGINAL * (MOMENTUM_DECCAY ** (epoch // MOMENTUM_DECCAY_STEP))
    if momentum < 0.01:
        momentum = 0.01
    print('BN momentum updated to: %f' % momentum)
    classifier = classifier.apply(lambda x: bn_momentum_adjust(x, momentum))

    num_batches = len(trainDataLoader)
    total_correct = 0
    total_seen = 0
    loss_sum = 0.0
    classifier = classifier.train()

    for i, (points, target) in tqdm(enumerate(trainDataLoader), total=len(trainDataLoader), smoothing=0.9):
        optimizer.zero_grad()

        # Augmentation: rotate the first three channels of every point with
        # rotate_point_cloud_z before moving the batch to the GPU.
        points = points.data.numpy()
        points[:, :, :3] = rotate_point_cloud_z(points[:, :, :3])
        points = torch.Tensor(points)
        points, target = points.float().cuda(), target.long().cuda()
        points = points.transpose(2, 1)

        seg_pred, trans_feat = classifier(points)
        seg_pred = seg_pred.contiguous().view(-1, NUM_CLASSES)

        batch_label = target.view(-1, 1)[:, 0].cpu().data.numpy()
        target = target.view(-1, 1)[:, 0]
        loss = criterion(seg_pred, target, trans_feat, weights)
        loss.backward()
        optimizer.step()

        pred_choice = seg_pred.cpu().data.max(1)[1].numpy()
        correct = np.sum(pred_choice == batch_label)
        total_correct += correct
        # Count the labels actually processed instead of assuming a fixed
        # BATCH_SIZE * NUM_POINT per batch.
        total_seen += batch_label.size
        # Accumulate a Python float: adding the loss tensor itself would keep
        # autograd history alive across the whole epoch.
        loss_sum += loss.item()
    print('Training mean loss: %f' % (loss_sum / num_batches))
    print('Training accuracy: %f' % (total_correct / float(total_seen)))

    with torch.no_grad():
        mIoU = test(classifier.eval(), testDataLoader)

        if mIoU >= best_iou:
            best_iou = mIoU
            print('Save model...')
            savepath = 'weights' + '/best_model_segmentation.pth'
            # torch.save does not create missing directories.
            os.makedirs(os.path.dirname(savepath), exist_ok=True)
            print('Saving at %s' % savepath)
            state = {
                'epoch': epoch,
                'class_avg_iou': mIoU,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(state, savepath)
            print('Saving model....')
        print('Best mIoU: %f' % best_iou)
    global_epoch += 1



**** Epoch 1 (1/32) ****
Learning rate:0.001000
BN momentum updated to: 0.100000


100%|██████████| 2973/2973 [12:03<00:00,  4.11it/s]

Training mean loss: 1.112368
Training accuracy: 0.674392
---- EPOCH 001 EVALUATION ----



 43%|████▎     | 501/1176 [01:24<01:03, 10.67it/s]