# SHIP DETECTION CHALLENGE

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from skimage.measure import label, regionprops

## TOOLS : CUSTOM FUNCTIONS AND CLASSES

In [None]:
class SatelliteImage(object):
    """An RGB image plus an optional mask of the objects it contains.

    Public attributes:
        title: text displayed above the image by :meth:`show`.
        image: pixel array loaded by OpenCV and converted to RGB, or ``None``.
        height, width, shape: taken from the first two axes of ``image``.
        objects_mask_image: ``uint8`` array of shape ``shape + (1,)`` built by
            :meth:`add_objects`, or ``None`` while no objects were added.
    """

    @staticmethod
    def load_file(filePath):
        """Create a ``SatelliteImage`` from the image file at ``filePath``.

        Raises:
            IOError: when OpenCV cannot read the file (``cv2.imread`` signals
                this by returning ``None`` rather than raising).
        """
        matrix = cv2.imread(filePath)
        if matrix is None:
            raise IOError("Unable to read image file: {}".format(filePath))
        # OpenCV loads channels in BGR order; convert for RGB display.
        matrix = cv2.cvtColor(matrix, cv2.COLOR_BGR2RGB)

        instance = SatelliteImage()
        instance.image = matrix
        instance.height = matrix.shape[0]
        instance.width = matrix.shape[1]
        instance.shape = (instance.height, instance.width)

        return instance

    def __init__(self):
        self.title = ""
        self.image = None
        self.width = 0
        self.height = 0
        self.shape = (0, 0)
        self.objects_mask_image = None

    def set_title(self, title):
        """Set the text displayed above the image."""
        self.title = title

    def add_objects(self, rle_masks):
        """Decode run-length encoded masks and store their sum as the object mask.

        Args:
            rle_masks: iterable of encodings; entries that are not ``str``
                (such as missing values) are ignored.
        """
        masks = np.zeros(self.shape, dtype=np.uint8)

        for rle_mask in rle_masks:
            if isinstance(rle_mask, str):
                masks += RDE.decode(rle_mask, self.shape)

        # Stored with a trailing singleton axis: (height, width, 1).
        self.objects_mask_image = np.expand_dims(masks, -1)

    def show(self, with_object_rectangles = False):
        """Draw the image on the current matplotlib axes.

        Args:
            with_object_rectangles: when true, outline every connected region
                of the object mask with a rectangle. Ignored when no objects
                have been added yet.
        """
        plt.axis("off")
        plt.title(self.title)

        if not with_object_rectangles or self.objects_mask_image is None:
            plt.imshow(self.image)
            return

        # regionprops reports 2 * ndim bounding-box values, so label a 2-D view
        # of the mask to get (min_row, min_col, max_row, max_col).
        mask = self.objects_mask_image
        if mask.ndim == 3:
            mask = mask[..., 0]

        copy = self.image.copy()
        for prop in regionprops(label(mask)):
            min_row, min_col, max_row, max_col = prop.bbox
            # OpenCV points are (x, y), i.e. (column, row).
            cv2.rectangle(
                img=copy,
                pt1=(min_col, min_row),
                pt2=(max_col, max_row),
                color=(250, 0, 0),
                thickness=2,
            )
        plt.imshow(copy)

In [None]:
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
class RDE(object):
    """Run-length encoding helpers for binary masks.

    Pixels are numbered column by column (top to bottom, then left to right),
    starting at 1. An encoding is a space-separated list of
    ``start length`` pairs.
    """

    @staticmethod
    def encode(image):
        """Return the run-length encoding of the 2-D binary mask ``image``."""
        pixels = image.T.flatten()
        # Pad with zeros so runs touching either end still produce a transition.
        pixels = np.concatenate([[0], pixels, [0]])
        runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
        # Transitions alternate start/end; turn each end into a run length.
        runs[1::2] -= runs[::2]

        return ' '.join(str(x) for x in runs)

    @staticmethod
    def decode(encoding, shape):
        """Rebuild the binary mask described by ``encoding``.

        Args:
            encoding: string of ``start length`` pairs as produced by
                :meth:`encode` (1-based starts).
            shape: ``(height, width)`` of the mask to build.

        Returns:
            ``uint8`` array of shape ``shape`` holding 1 inside the runs and
            0 elsewhere.
        """
        tokens = encoding.split()
        starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based offsets
        lengths = np.asarray(tokens[1::2], dtype=int)
        ends = starts + lengths

        flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
        for lo, hi in zip(starts, ends):
            flat[lo:hi] = 1

        # Column-major fill mirrors encode() and keeps the requested shape even
        # for non-square masks (reshape(shape).T would return (width, height)).
        return flat.reshape(shape, order="F")

## IMPORTING DATA

In [None]:
# Root folder of the input data, relative to the working directory.
PROJECT_DIR = os.path.join("..", "input")

# Image folders: "train" holds the processed images, "test" the unprocessed ones.
IMAGES_PROCESSED_DIR, IMAGES_UNPROCESSED_DIR = (
    os.path.join(PROJECT_DIR, folder) for folder in ("train", "test")
)

# CSV file listing the ship masks.
SHIP_MASKS_DATASET_FILE = os.path.join(PROJECT_DIR, "train_ship_segmentations.csv")

In [None]:
def create_and_populate_images_dataSet(images_dir):
    """Index the entries of ``images_dir`` in a DataFrame keyed by image id.

    Args:
        images_dir: directory whose entries (as listed by ``os.listdir``) are
            indexed.

    Returns:
        DataFrame indexed by ``id`` (the file name up to its first ``"."``)
        with the columns ``filename`` and ``filePath``.
    """
    images_dataSet = pd.DataFrame(os.listdir(images_dir), columns=["filename"])
    images_dataSet["id"] = images_dataSet["filename"].str.split(".").str[0].astype(str)
    # os.path.join keeps the path valid whatever the platform separator is and
    # whether or not images_dir already ends with one.
    images_dataSet["filePath"] = [
        os.path.join(images_dir, name) for name in images_dataSet["filename"]
    ]
    images_dataSet.set_index(["id"], inplace=True)

    return images_dataSet

In [None]:
# Index the files of both image folders.
IMAGES_PROCESSED_DATASET = create_and_populate_images_dataSet(IMAGES_PROCESSED_DIR)
IMAGES_UNPROCESSED_DATASET = create_and_populate_images_dataSet(IMAGES_UNPROCESSED_DIR)

# One row per listed file, so the row counts are the image counts.
IMAGES_PROCESSED_COUNT = len(IMAGES_PROCESSED_DATASET)
IMAGES_UNPROCESSED_COUNT = len(IMAGES_UNPROCESSED_DATASET)
IMAGES_COUNT = IMAGES_UNPROCESSED_COUNT + IMAGES_PROCESSED_COUNT

# Report the total, then each folder's count followed by its share of the total.
print(
    "{:,d} images | {:,d} processed images ({:.2f}%) | {:,d} unprocessed images ({:.2f}%)".format(
        IMAGES_COUNT,
        *(
            figure
            for count in (IMAGES_PROCESSED_COUNT, IMAGES_UNPROCESSED_COUNT)
            for figure in (count, (count / IMAGES_COUNT) * 100)
        )
    )
)

In [None]:
# Load the masks file; its own header row is replaced by the names below.
SHIP_MASKS_DATASET = pd.read_csv(
    SHIP_MASKS_DATASET_FILE,
    header=0,
    names=["filename", "mask"],
)

# Key every mask by image id (file name up to its first "."), dropping the file name.
SHIP_MASKS_DATASET = (
    SHIP_MASKS_DATASET
    .assign(imageId=SHIP_MASKS_DATASET["filename"].str.split(".").str[0].astype(str))
    .drop(columns="filename")
    .set_index("imageId")
)

# An image has ships when at least one of its rows carries a non-null mask.
IMAGES_WITH_SHIPS_COUNT = SHIP_MASKS_DATASET.groupby("imageId")["mask"].count().gt(0).sum()
IMAGES_WITHOUT_SHIPS_COUNT = IMAGES_PROCESSED_COUNT - IMAGES_WITH_SHIPS_COUNT

print(
    "{:,d} processed images | {:,d} images with ships ({:.2f}%) | {:,d} images without ships ({:.2f}%)".format(
        IMAGES_PROCESSED_COUNT,
        IMAGES_WITH_SHIPS_COUNT,
        (IMAGES_WITH_SHIPS_COUNT / IMAGES_PROCESSED_COUNT) * 100,
        IMAGES_WITHOUT_SHIPS_COUNT,
        (IMAGES_WITHOUT_SHIPS_COUNT / IMAGES_PROCESSED_COUNT) * 100,
    )
)

## DATA ANALYSIS

In [None]:
def create_SatelliteImage_from_sample(index, sample):
    """Load the image of one dataset row, titled and carrying its ship masks.

    Args:
        index: image id of the row; also the key looked up in
            ``SHIP_MASKS_DATASET``.
        sample: row providing ``filePath`` and ``shipsCount``.

    Returns:
        The ``SatelliteImage`` loaded from ``sample["filePath"]``.
    """
    satelliteImage = SatelliteImage.load_file(sample["filePath"])

    title = "#{:s} ({:d} ships)".format(index.upper(), sample["shipsCount"])
    satelliteImage.set_title(title)

    # The lookup yields a single value or several rows; pd.Series makes both iterable.
    rle_masks = pd.Series(SHIP_MASKS_DATASET.loc[index]["mask"])
    satelliteImage.add_objects(rle_masks)

    return satelliteImage

In [None]:
def display_SatelliteImages(satelliteImages, cols = 4, figsize = (5, 5)):
    """Plot the images on a grid, each with its object rectangles drawn.

    Args:
        satelliteImages: sized collection of objects exposing ``show(bool)``.
        cols: number of grid columns.
        figsize: ``(width, height)`` allotted to each grid cell.
    """
    w, h = figsize
    # Ceiling division: a completely filled last row must not add an empty one.
    # At least one row is kept so the figure never gets a zero height.
    rows = max(1, -(-len(satelliteImages) // cols))
    plt.figure(figsize=(cols * w, rows * h))

    # Subplot positions are 1-based.
    for position, satelliteImage in enumerate(satelliteImages, start=1):
        plt.subplot(rows, cols, position)
        satelliteImage.show(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Count the non-null masks of every image and expose the result as "shipsCount".
SHIPS_BY_IMAGE = (
    SHIP_MASKS_DATASET
    .groupby("imageId")
    .count()
    .astype(int)
    .rename(columns={"mask": "shipsCount"})
)

# Keep only the images that appear in both tables, adding their ship count.
IMAGES_PROCESSED_DATASET = IMAGES_PROCESSED_DATASET.join(SHIPS_BY_IMAGE, how="inner")

### DATA ANALYSIS : SHIPS FREQUENCY

In [None]:
# Number of images described by the per-image ship counts.
SHIPS_BY_IMAGE_COUNT = SHIPS_BY_IMAGE.shape[0]

# Mean and standard deviation of the ship count per image.
SHIPS_BY_IMAGE_MEAN = SHIPS_BY_IMAGE["shipsCount"].mean()
SHIPS_BY_IMAGE_STD = SHIPS_BY_IMAGE["shipsCount"].std()

# Mean +/- 3 standard deviations, truncated to integers and clamped to
# [0, largest observed count].
SHIPS_BY_IMAGE_LOWER = int(max(0, SHIPS_BY_IMAGE_MEAN - 3 * SHIPS_BY_IMAGE_STD))
SHIPS_BY_IMAGE_UPPER = int(
    min(SHIPS_BY_IMAGE_MEAN + 3 * SHIPS_BY_IMAGE_STD, SHIPS_BY_IMAGE["shipsCount"].max())
)

# Outliers are the images whose count falls outside [LOWER, UPPER].
SHIPS_BY_IMAGE_OUTLIERS = SHIPS_BY_IMAGE[
    ~SHIPS_BY_IMAGE["shipsCount"].between(SHIPS_BY_IMAGE_LOWER, SHIPS_BY_IMAGE_UPPER)
]
SHIPS_BY_IMAGE_OUTLIERS_COUNT = len(SHIPS_BY_IMAGE_OUTLIERS)

print(
    "{:.2f} ± {:.2f} ships by image ({:.2f}%) | {:,d} outlier images ({:.2f}%)".format(
        SHIPS_BY_IMAGE_MEAN,
        3 * SHIPS_BY_IMAGE_STD,
        ((SHIPS_BY_IMAGE_COUNT - SHIPS_BY_IMAGE_OUTLIERS_COUNT) / SHIPS_BY_IMAGE_COUNT) * 100,
        SHIPS_BY_IMAGE_OUTLIERS_COUNT,
        (SHIPS_BY_IMAGE_OUTLIERS_COUNT / SHIPS_BY_IMAGE_COUNT) * 100,
    )
)

In [None]:
def display_shipsCount_frequency(shipsCounts):
    """Plot how many rows of ``shipsCounts`` share each ``shipsCount`` value.

    Every bar is annotated with its height expressed as a percentage of the
    number of rows.

    Args:
        shipsCounts: DataFrame with a ``shipsCount`` column.
    """
    total = len(shipsCounts)

    plt.figure(figsize=(23, 8))
    ax = sns.countplot(data=shipsCounts, x="shipsCount")

    for bar in ax.patches:
        # Corner points of the bar: [[x0, y0], [x1, y1]].
        corners = bar.get_bbox().get_points()
        bar_top = corners[1, 1]
        ax.annotate(
            "{:.2f}%".format(bar_top / total * 100),
            (corners[:, 0].mean(), bar_top),  # horizontally centred, on top of the bar
            ha="center",
            va="bottom",
        )

    plt.title("Ships Frequency Distribution")
    plt.ylabel("Frequency")
    plt.xlabel("Ships count")
    plt.show()


# Plot the distribution of the per-image ship counts.
display_shipsCount_frequency(SHIPS_BY_IMAGE)

#### Insights :
* Most of the images contain no ship.

### DATA ANALYSIS : IMAGES WITHOUT SHIP

In [None]:
# Draw 32 random images whose ship count is zero and display them.
SAMPLES = IMAGES_PROCESSED_DATASET.loc[IMAGES_PROCESSED_DATASET["shipsCount"].eq(0)].sample(n=32)
SATELLITE_IMAGES = [
    create_SatelliteImage_from_sample(*row)  # row is an (index, sample) pair
    for row in SAMPLES.iterrows()
]
display_SatelliteImages(SATELLITE_IMAGES)

### DATA ANALYSIS : IMAGES WITH SHIPS

In [None]:
# Draw 32 random images that contain ships without being outliers.
# Outliers are the images with MORE than SHIPS_BY_IMAGE_UPPER ships, so the
# upper bound is inclusive here; a strict "<" would leave the images with
# exactly SHIPS_BY_IMAGE_UPPER ships out of every category.
SAMPLES = IMAGES_PROCESSED_DATASET[(IMAGES_PROCESSED_DATASET["shipsCount"] > 0) & (IMAGES_PROCESSED_DATASET["shipsCount"] <= SHIPS_BY_IMAGE_UPPER)].sample(32)
SATELLITE_IMAGES = [create_SatelliteImage_from_sample(index, sample) for index, sample in SAMPLES.iterrows()]
display_SatelliteImages(SATELLITE_IMAGES)

#### Insights :
* Some images are corrupted.
* There are 4 kinds of images: all sea, no sea, shore and sea, clouds and sea.
* The images are not all taken at the same zoom level.

### DATA ANALYSIS : IMAGES WITH MANY SHIPS (OUTLIERS)

In [None]:
# Draw up to 32 random images holding more ships than SHIPS_BY_IMAGE_UPPER.
# Outliers may be scarce; asking sample() for more rows than exist raises a
# ValueError, so the sample size is capped at the number of outlier images.
SAMPLES = IMAGES_PROCESSED_DATASET[IMAGES_PROCESSED_DATASET["shipsCount"] > SHIPS_BY_IMAGE_UPPER]
SAMPLES = SAMPLES.sample(min(32, len(SAMPLES)))
SATELLITE_IMAGES = [create_SatelliteImage_from_sample(index, sample) for index, sample in SAMPLES.iterrows()]
display_SatelliteImages(SATELLITE_IMAGES)

#### Insights :