In [None]:
import os
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import albumentations as A
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
import keras
import tensorflow as tf
from tensorflow.keras.models import load_model

In [None]:
!pip install efficientnet
import efficientnet.tfkeras

In [None]:
!pip install git+https://github.com/qubvel/segmentation_models
import segmentation_models as sm

In [None]:
# Root folder of the Cityscapes subset; the trailing slash is required because
# the sub-folder names are appended to it directly.
DATA_DIR = "../input/cityscapes-dataset-1000/dataset_square1024_1000items/dataset_square1024_1000items/"
# Folder with the input images.
IMAGES_DIR = f"{DATA_DIR}images"
# Folder with the masks (Dataset expects one sub-folder per class inside it).
MASKS_DIR = f"{DATA_DIR}masks"

In [None]:
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted to make the seeded split below reproducible across machines.
# NOTE(review): sorting may change which files land in train/val compared with
# earlier runs of this notebook — relevant when continuing from a checkpoint
# that was trained on the previous split.
all_ids = sorted(os.listdir(IMAGES_DIR))
ids_train, ids_val = train_test_split(all_ids, test_size = 0.15, random_state = 1)

# Sanity checks: the two parts cover every file and do not overlap.
assert set(all_ids) == set(ids_train + ids_val)
assert not set(ids_train) & set(ids_val)

# Dataloader and utility functions

In [None]:
# helper function for data visualization
def visualize(**images):
    """
    Plot the given images side by side in a single row.

    Args:
        **images: keyword arguments mapping a name to an image accepted by
            plt.imshow; the name (underscores replaced by spaces, title-cased)
            is used as the subplot title.
    """
    total = len(images)
    plt.figure(figsize=(16, 5))
    for position, (label, picture) in enumerate(images.items(), start=1):
        plt.subplot(1, total, position)
        # hide axis ticks, they carry no information for images
        plt.xticks([])
        plt.yticks([])
        plt.title(label.replace('_', ' ').title())
        plt.imshow(picture)
    plt.show()
    
# helper function for data visualization    
def denormalize(x):
    """
    Scale image to range 0..1 for correct plot.

    The 2nd and 98th percentiles of `x` are mapped to 0 and 1 respectively and
    everything outside that range is clipped.

    Args:
        x: numpy array with image values
    Returns:
        float array of the same shape as `x` with values in [0, 1]
    """
    low, high = np.percentile(x, [2, 98])
    span = high - low
    if span == 0:
        # Degenerate case (e.g. a constant image): dividing by zero would yield
        # NaN/inf. Return the limit of the clipped scaling instead: 1 above the
        # common percentile value, 0 elsewhere.
        return (x > low).astype(float)
    return ((x - low) / span).clip(0, 1)
    
class Dataset():

    """
    Read images and per-class masks, apply augmentation and preprocessing transformations.

    Args:
        images_dir: path to images folder
        masks_dir: path to segmentation masks folder; it must contain one
            sub-folder per name in CLASSES, each holding mask files named like
            the corresponding image
        ids: list of image/mask names to use in this dataset, if not given all the files in images_dir would be used
        augmentation: data transformation pipeline, called as augmentation(image=..., mask=...)
        preprocessing: data preprocessing, called as preprocessing(image=...)
    """

    CLASSES = ['ground', 'road', 'sidewalk', 'building', 'fence',
              'traffic light', 'traffic sign', 'vegetation', 'sky',
              'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle']

    def __init__(self, images_dir, masks_dir, ids = None, augmentation = None, preprocessing = None):
        # Identity check on purpose: `ids != None` is evaluated element-wise for
        # numpy arrays and then fails inside `if` with an ambiguous truth value.
        self.ids = ids if ids is not None else os.listdir(images_dir)

        self.images_paths = [os.path.join(images_dir, path) for path in self.ids]
        self.masks_paths_by_class = {cls: [os.path.join(masks_dir, cls, path) for path in self.ids] for cls in self.CLASSES}

        self.augmentation = augmentation
        self.preprocessing = preprocessing

    @staticmethod
    def _read_image(path):
        """Read an image with OpenCV, raising FileNotFoundError instead of returning None."""
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            raise FileNotFoundError(f"Could not read image file: {path}")
        return image

    def __getitem__(self, i):
        """
        Load the i-th sample.

        Returns:
            image: RGB image (after augmentation/preprocessing, if configured)
            mask: float array whose last axis holds one channel per class in
                CLASSES followed by a background channel
        """
        image = cv2.cvtColor(self._read_image(self.images_paths[i]), cv2.COLOR_BGR2RGB)

        # One boolean mask per class: a pixel belongs to the class when all
        # three colour channels of the class mask file equal 255 (white)
        masks = [
            (self._read_image(self.masks_paths_by_class[cls][i])[:, :, 0:3] == [255,255,255]).all(2)
            for cls in self.CLASSES
        ]

        # Combine one-class masks to multi-class mask where each pixel is represented by one-hot vector
        mask = np.stack(masks, axis = -1).astype("float")

        # add background if mask is not binary
        if mask.shape[-1] != 1:
            # NOTE(review): assumes class masks do not overlap; overlapping
            # masks would make the background value negative — confirm.
            background = 1 - mask.sum(axis=-1, keepdims=True)
            mask = np.concatenate((mask, background), axis=-1)

        # augmentation (applied jointly to image and mask)
        if self.augmentation is not None:
            sample = self.augmentation(image = image, mask = mask)
            image, mask = sample["image"], sample["mask"]

        # preprocessing (applied to the image only)
        if self.preprocessing is not None:
            image = self.preprocessing(image = image)["image"]

        return image, mask

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.ids)

    
class Dataloader(keras.utils.Sequence):

    """
    Wrapper for dataset instance which samples batches of images

    Args:
        dataset: Dataset class instance for image loading and preprocessing
        batch_size: the number of image-mask pairs sampled in each batch
        shuffle: if shuffle the dataset after each epoch
    """

    def __init__(self, dataset, batch_size = 1, shuffle = False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Order in which dataset items are visited; permuted by on_epoch_end
        self.indexes = np.arange(len(dataset))

        # shuffle once up-front so the first epoch is randomized too
        self.on_epoch_end()

    def __getitem__(self, i):
        """Return the i-th batch as a tuple (images, masks) of stacked arrays."""
        start = self.batch_size * i
        stop = self.batch_size * (i + 1)

        # Look items up through self.indexes: indexing the dataset with the raw
        # position would ignore the permutation and make `shuffle` a no-op.
        data = [self.dataset[self.indexes[j]] for j in range(start, stop)]

        # Two np.ndarrays: stacked images and stacked masks. A tuple is
        # returned (rather than a list) because Keras documents batches as
        # tuples; indexing the result with [0] / [1] works as before.
        batch = tuple(np.stack(samples, axis = 0) for samples in zip(*data))

        return batch

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        """Re-shuffle the visiting order when shuffling is enabled."""
        if self.shuffle:
            self.indexes = np.random.permutation(self.indexes)

In [None]:
# NOTE(review): the earlier comments on this function referred to aerial
# imagery ("mosreg" / google maps), while this notebook reads a Cityscapes
# dataset — confirm that random flips are the intended augmentation here.
# The training dataset in this notebook is built with augmentation = None.
def get_training_augmentation():
    """
    Build the training augmentation pipeline.

    Return: albumentations.Compose containing a single A.Flip applied with
        probability 0.75
    """
    return A.Compose([A.Flip(p=0.75)])

# Image preprocessing (normalization) pipeline
def get_preprocessing(preprocessing_fn = None):
    """
    Wrap a normalization function into an albumentations pipeline.

    Arguments:
        preprocessing_fn (callable): data normalization function applied to
            the image (can be specific for each pretrained neural network)
    Return:
        transform: albumentations.Compose with a single A.Lambda step

    """
    image_step = A.Lambda(image=preprocessing_fn)
    return A.Compose([image_step])


## Visualization of dataset

In [None]:
def visualize_image_with_bin_masks(image, mask):
    """
    Plot an image next to the first four binary class masks.

    The subplot titles follow the channel order produced by Dataset
    (Dataset.CLASSES): ground, road, sidewalk, building. The previous titles
    (cloud/shadow/snow) did not match any class of this dataset.

    Args:
        image: image accepted by plt.imshow
        mask: array of shape (H, W, C) with at least four channels
    """
    visualize(image = image,
              ground = mask[:, :, 0],
              road = mask[:, :, 1],
              sidewalk = mask[:, :, 2],
              building = mask[:, :, 3])
    

# Training Setup

In [None]:
# Training hyper-parameters
LR = 1e-4                       # learning rate passed to the optimizer
EPOCHS = 20                     # number of epochs for this training run
BATCH_SIZE = 1                  # image-mask pairs per training batch
INPUT_SHAPE = (1024, 1024, 3)   # model input shape, also used to check batch shapes

In [None]:
# Encoder backbone of the segmentation network
BACKBONE = "efficientnetb5"
# Class names; same list and order as Dataset.CLASSES — keep the two in sync
CLASSES = ['ground', 'road', 'sidewalk', 'building', 'fence',
              'traffic light', 'traffic sign', 'vegetation', 'sky',
              'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle']

# Input normalization function matching the chosen backbone
preprocess_input = sm.get_preprocessing(BACKBONE)

# define network parameters
# multiclass case: one output channel per class plus one for the background
# channel that Dataset appends to every mask
n_classes = 1 if len(CLASSES) == 1 else (len(CLASSES) + 1)  # case for binary and multiclass segmentation
activation = 'sigmoid' if n_classes == 1 else 'softmax'

# create the U-Net model with an ImageNet-initialized encoder
# NOTE(review): an earlier comment said the encoder weights are frozen, but no
# freeze option is passed here — confirm whether freezing is intended.
model = sm.Unet(BACKBONE, encoder_weights="imagenet", input_shape = INPUT_SHAPE, classes = n_classes, activation = activation)

In [None]:
# Load previously saved weights to continue training from that checkpoint.
# NOTE(review): the file name suggests 20 epochs of prior training — confirm.
model.load_weights("../input/cityscapes-models/cityscapes_1000_20ep.h5")

In [None]:
# Datasets for the train / validation ids; both apply the backbone-specific
# preprocessing, and no augmentation is used for training (augmentation = None).
train_dataset = Dataset(IMAGES_DIR, MASKS_DIR, ids = ids_train, augmentation = None, preprocessing = get_preprocessing(preprocess_input))
val_dataset = Dataset(IMAGES_DIR, MASKS_DIR, ids = ids_val, preprocessing = get_preprocessing(preprocess_input))

# Batch loaders: shuffled batches for training, single unshuffled samples for validation
train_dataloader = Dataloader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_dataloader = Dataloader(dataset = val_dataset, batch_size = 1, shuffle = False)

# check shapes for errors
# (each check loads the first training batch from disk)
assert train_dataloader[0][0].shape == (BATCH_SIZE, INPUT_SHAPE[0], INPUT_SHAPE[1], INPUT_SHAPE[2])
assert train_dataloader[0][1].shape == (BATCH_SIZE, INPUT_SHAPE[0], INPUT_SHAPE[1], n_classes)

# Training

In [None]:
# define callbacks for learning rate scheduling and best checkpoints saving
callbacks = [
#     keras.callbacks.ModelCheckpoint('./clean_mosaic_unet_b4_40ep_dataset_1_best.hdf5', save_best_only=True, mode = "min"),
    keras.callbacks.ReduceLROnPlateau(verbose = 1),
]

# define optimizer
optim = keras.optimizers.Adam(LR)
total_loss = sm.losses.binary_focal_dice_loss  if n_classes == 1 else sm.losses.categorical_focal_dice_loss 
metrics = [sm.metrics.IOUScore(), sm.metrics.FScore()]

# compile keras model with defined optimozer, loss and metrics
model.compile(optim, total_loss, metrics)

In [None]:
# Train for EPOCHS epochs, validating on the validation loader after each epoch.
# NOTE(review): fit_generator is deprecated in recent Keras/TensorFlow releases
# in favour of fit — verify against the installed version before changing.
history = model.fit_generator(
    train_dataloader, 
    steps_per_epoch=len(train_dataloader), 
    epochs=EPOCHS, 
    callbacks=callbacks,
    validation_data=val_dataloader, 
    validation_steps=len(val_dataloader),
)

# Save the full model and, separately, only its weights.
# The "40ep" in the names presumably counts the epochs of the loaded
# checkpoint ("20ep") plus the EPOCHS = 20 trained here — confirm.
model.save("cityscapes_1000_40ep.hdf5")
model.save_weights("cityscapes_1000_40ep.h5")


In [None]:
# Save the current model and its weights under a "35ep" name.
# NOTE(review): the training cell already saves the model as "40ep";
# presumably this cell was used after an interrupted run — confirm or remove.
model.save("cityscapes_1000_35ep.hdf5")
model.save_weights("cityscapes_1000_35ep.h5")

## Video processing

In [None]:
def video_processing(file_path: str, output_name: str, model, colors_to_classes: dict, total_frames: int = None) -> None:
    """
    Open videofile, make segmentation of every frame and generate new video
    where the predicted classes are drawn as filled, semi-transparent regions.

    The result is written to "<output_name>.mp4" with the same frame size and
    frame rate as the input video.

    Args:
        file_path: full path to videofile
        output_name: name of final video (without extension)
        model: the model which would be used for multiclass semantic segmentation
        colors_to_classes: dictionary mapping a class name (from CLASSES) to an
            (r, g, b) color, or to None to skip drawing that class; classes are
            drawn in dictionary order, later ones paint over earlier ones
        total_frames: (optional) maximum number of frames to process; also used
            as the progress-bar total. When None, the whole video is processed.

    Raises:
        IOError: if the input video cannot be opened
        KeyError: if a class with a color is not present in CLASSES
    """
    # Geometry of the model input: the frame is resized to 1024x576 and padded
    # with 224 zero rows above and below, giving the 1024x1024 model input.
    # NOTE(review): this assumes a 16:9 source; other aspect ratios are
    # stretched to 1024x576 before prediction.
    model_frame_size = (1024, 576)  # (width, height), the order cv2.resize expects
    pad_rows = 224
    alpha = 0.5  # Transparency factor of the overlay.

    # Built once instead of once per frame.
    image_preprocessing = get_preprocessing(preprocess_input)
    # Channel index of every class in the model output (same order as CLASSES).
    class_to_index = {class_name: idx for idx, class_name in enumerate(CLASSES)}

    def predict_multiclass_mask(image: np.ndarray) -> np.ndarray:
        """
        Arguments:
            image: an RGB image as numpy.ndarray of shape (H, W, 3)

        Returns:
            mask: numpy.ndarray of shape (H, W) holding, for every pixel, the
                index of the output channel with the highest score
        """
        batch = image_preprocessing(image = np.asarray(image))["image"]

        if len(batch.shape) == 3:
            # input to the model should be of shape: (1, H, W, 3)
            batch = np.expand_dims(batch, 0)

        # Prediction is squeezed from (1, H, W, channels) to (H, W, channels)
        prediction = model.predict(batch).squeeze()

        # index of the channel with max score for every pixel
        return np.argmax(prediction, axis = 2)

    def draw_class_regions(image: np.ndarray, multiclass_mask: np.ndarray) -> np.ndarray:
        """
        Fill the regions of every colored class on `image` (in place).

        Arguments:
            image: an RGB image as numpy.ndarray of shape (H, W, 3)
            multiclass_mask: numpy.ndarray of shape (H, W) with channel indexes

        Returns:
            image: the same array with the class regions painted
        """
        for class_name, color in colors_to_classes.items():
            if color is None:
                # class is intentionally not drawn
                continue

            class_mask = np.where(multiclass_mask == class_to_index[class_name], 255, 0).astype(np.uint8)
            contours, _ = cv2.findContours(class_mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
            # Contours are filled one at a time (thickness -1) on purpose:
            # this keeps the original rendering, in which each contour is
            # filled independently of the others.
            for cnt in contours:
                cv2.drawContours(image, [cnt], -1, color, -1)

        return image

    def frame_processing(frame_bgr: np.ndarray) -> np.ndarray:
        """
        Transform frame to the model's input shape, predict classes, paint
        them and blend the result with the original frame.

        Args:
            frame_bgr: a video frame in BGR channel order, as read by OpenCV
        Returns:
            processed_frame: BGR frame of the same size with the overlay
        """
        frame_height, frame_width = frame_bgr.shape[:2]

        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        downscaled = cv2.resize(frame_rgb, model_frame_size)
        downscaled_and_padded = np.pad(downscaled, ((pad_rows, pad_rows), (0, 0), (0, 0)))

        multiclass_mask = predict_multiclass_mask(downscaled_and_padded)
        draw_class_regions(downscaled_and_padded, multiclass_mask)

        # drop the padding rows and go back to the size of the source frame
        painted = downscaled_and_padded[pad_rows:pad_rows + model_frame_size[1], :, :]
        painted = cv2.resize(painted, (frame_width, frame_height))
        painted_bgr = cv2.cvtColor(painted, cv2.COLOR_RGB2BGR)

        # Blend with the ORIGINAL BGR frame: blending with the RGB-converted
        # copy would swap red and blue in the underlying picture.
        return cv2.addWeighted(painted_bgr, alpha, frame_bgr, 1 - alpha, 0)

    video = cv2.VideoCapture(file_path)
    if not video.isOpened():
        video.release()
        raise IOError(f"Cannot open video file: {file_path}")

    out = cv2.VideoWriter()
    pbar = None

    try:
        width, height = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        # keep the fractional frame rate (e.g. 29.97) so the duration is preserved
        fps = video.get(cv2.CAP_PROP_FPS)

        # Define codec and create VideoWriter
        fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        output_file_name = f"{output_name}.mp4"
        out.open(output_file_name, fourcc, fps, (width, height), True)

        if total_frames is not None:
            pbar = tqdm(total = total_frames)

        processed = 0
        while total_frames is None or processed < total_frames:
            ok, frame = video.read()
            if not ok:
                # end of stream (or read error): no frame to process
                break

            out.write(frame_processing(frame))
            processed += 1

            if pbar is not None:
                pbar.update()
    finally:
        # always free the handles so the output file is finalized
        if pbar is not None:
            pbar.close()
        video.release()
        out.release()

In [None]:
# Overlay color per class name, as (r, g, b); None means the class is not drawn.
# Despite the variable name, the mapping is class name -> color.
# Order matters: video_processing draws the classes in dictionary order, so
# classes listed later are painted over the earlier ones.
colors_to_classes = {
    'ground': None,
    'sky': (0,0,255),
    'road': (255,0,255), 
    'sidewalk': (0,255,255),
    'building': (255,127,0),
    'fence': (128,128,128),
    'vegetation': (0,255,0),
    'car': (255,0,0),
    'traffic light': (255,255,0),
    'traffic sign': (139,0,139),
    'truck': (255,0,0),
    'bus': (255,0,0),
    'train': None,
    'motorcycle': (255,105,180),
    'bicycle': (255,105,180)
}
# Input video, name of the output video (".mp4" is appended by video_processing)
# and the number of frames to process.
file_path = '../input/test-video/test_video_msk_720.mp4'
final_video_name = 'test_msk_full'
total_frames = 12600


In [None]:
# Segment the test video frame by frame and write the overlaid result to "<final_video_name>.mp4".
video_processing(file_path, final_video_name, model, colors_to_classes, total_frames)