In [None]:
import os
import random
import collections

import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import skimage
from skimage import exposure, img_as_float
#plt.style.use('dark_background')
#plt.style.use('seaborn-deep')
plt.style.use('Solarize_Light2')
%matplotlib inline

In [None]:
print(plt.style.available)

In [None]:
# Number of colour channels the channel helpers below read from an image's last axis.
IMAGE_DIMENSIONS_NUM = 3
# Input locations, written relative to the working directory that is set just below.
images_dir = '../input/train'
train_images_description_file_path = '../input/train.csv'
# Make sure the process runs from a directory named 'code': if the current
# directory has another name, switch to the sibling directory called 'code'.
full_cwd_path = os.getcwd()
path_prefix, cwd_itself = os.path.split(full_cwd_path)
if cwd_itself != 'code':
    os.chdir(os.path.join(path_prefix, 'code'))
    print(os.getcwd())

In [None]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# lookups and samples taken from this list are stable between runs and machines.
train_images_names = sorted(os.listdir(images_dir))

In [None]:
#len(train_images_names)
print("train_images_names len: {}".format(len(train_images_names)))
#print("train_images_masks_names len: {}".format(len(train_images_masks_names)))
print(train_images_names[:10]) # peek at the first ten file names found in images_dir
#print(train_images_masks_names[:10])
# Load the CSV that describes the train images and show its first rows and its shape.
train_images_description_df = pd.read_csv(train_images_description_file_path)
print(train_images_description_df.head())
print(train_images_description_df.shape)

In [None]:
train_images_description_df.describe()

In [None]:
train_images_description_df.info()

In [None]:
train_images_description_df['Id'].unique().shape

In [None]:
print(train_images_description_df['Id'].unique())

In [None]:
def select_objects(indexes_list, objects_names):
    """Return a tuple with ``objects_names[i]`` for each ``i`` in ``indexes_list``, in that order."""
    picked = []
    for position in indexes_list:
        picked.append(objects_names[position])
    return tuple(picked)

In [None]:
def select_random_indexses_subset(size, subset_size):
    """Draw ``subset_size`` distinct indexes from ``0 .. size - 1`` without replacement.

    Returns the list produced by ``random.sample``; that call raises
    ``ValueError`` when ``subset_size`` is larger than ``size``.
    """
    index_population = range(size)
    return random.sample(index_population, subset_size)

In [None]:
def random_objects_select(objects_names, subset_size):
    """Return a tuple of ``subset_size`` items of ``objects_names`` taken at randomly drawn positions."""
    chosen_positions = select_random_indexses_subset(len(objects_names), subset_size)
    return select_objects(chosen_positions, objects_names)

In [None]:
def select_offset_indexses_subset(size, subset_size, offset):
    """Return a tuple of up to ``subset_size`` consecutive indexes of ``range(size)`` starting at ``offset``.

    Ordinary slice semantics apply, so the window is clipped at ``size``.
    """
    window = range(size)[offset:offset + subset_size]
    return tuple(window)

In [None]:
def offset_objects_select(objects_names, subset_size, offset):
    """Return a tuple of up to ``subset_size`` consecutive items of ``objects_names`` starting at ``offset``."""
    window_positions = select_offset_indexses_subset(
        len(objects_names), subset_size, offset
    )
    return select_objects(window_positions, objects_names)

In [None]:
def show_images_grid(images_list, columns_num, figsize, images_names_list=None):
    """Show images on a grid with ``columns_num`` columns, filled row by row.

    Parameters:
        images_list: sequence of images accepted by ``Axes.imshow``.
        columns_num: number of grid columns.
        figsize: figure size passed to ``plt.subplots``.
        images_names_list: optional sequence of titles; the title at position
            ``k`` is put above the image at position ``k``.

    The number of rows is rounded up, so a trailing partial row is drawn too
    and its unused cells are hidden. Nothing is drawn for an empty
    ``images_list``.
    """
    images_num = len(images_list)
    if images_num == 0:
        # plt.subplots cannot build a grid with zero rows.
        return
    rows_num = -(-images_num // columns_num)  # ceiling division
    # squeeze=False keeps `ax` two-dimensional even for a single row or column.
    fig, ax = plt.subplots(rows_num, columns_num, figsize=figsize, squeeze=False)
    titles_num = len(images_names_list) if images_names_list is not None else 0
    # ravel() walks the axes row by row, matching the order of images_list.
    for position, axis in enumerate(ax.ravel()):
        if position >= images_num:
            axis.axis('off')  # unused cell in the last row
            continue
        if position < titles_num:
            axis.set_title(images_names_list[position])
        axis.imshow(images_list[position])
    plt.tight_layout()
    plt.show()

In [None]:
def draw_images_with_histograms(images_list, columns_num, figsize, nbins):
    """Show each image with its pixel-value histogram and cumulative distribution below it.

    Parameters:
        images_list: sequence of array images.
        columns_num: number of images per grid row.
        figsize: figure size passed to ``plt.subplots``.
        nbins: number of bins of the pixel-value histogram.

    Every image occupies three stacked axes: the image, the histogram of its
    flattened values, and the curve returned by
    ``exposure.cumulative_distribution``. A trailing partial row is drawn as
    well and its unused cells are hidden. Nothing is drawn for an empty
    ``images_list``.
    """
    images_num = len(images_list)
    if images_num == 0:
        # plt.subplots cannot build a grid with zero rows.
        return
    grid_rows = -(-images_num // columns_num)  # ceiling division
    # squeeze=False keeps `ax` two-dimensional even for a single column.
    fig, ax = plt.subplots(3 * grid_rows, columns_num, figsize=figsize, squeeze=False)
    for position in range(grid_rows * columns_num):
        grid_row, column = divmod(position, columns_num)
        image_ax = ax[3 * grid_row, column]
        hist_ax = ax[3 * grid_row + 1, column]
        cdf_ax = ax[3 * grid_row + 2, column]
        if position >= images_num:
            for unused_ax in (image_ax, hist_ax, cdf_ax):
                unused_ax.axis('off')
            continue
        image = images_list[position]
        pixels = image.ravel()
        image_ax.imshow(image)
        image_ax.grid(False)
        # Histogram the pixel values themselves. exposure.histogram returns a
        # (counts, bin_centers) pair, which must not be fed to Axes.hist as data.
        hist_ax.hist(pixels, bins=nbins)
        img_cdf, cdf_bins = exposure.cumulative_distribution(pixels)
        cdf_ax.plot(cdf_bins, img_cdf, color='green')
    plt.show()

In [None]:
def extract_object_pixels_by_mask(image, mask):
    """Return the element-wise product ``image * mask``."""
    masked_pixels = image * mask
    return masked_pixels


# The same operation under the name used by the per-channel helpers.
apply_mask_to_color_channel = extract_object_pixels_by_mask

In [None]:
def extract_not_object_pixels_by_mask(image, mask):
    """Return ``image`` multiplied element-wise by the complement of ``mask``.

    The complement is computed as ``np.ones(mask.shape) - mask``.
    """
    inverted_mask = np.ones(mask.shape) - mask
    return image * inverted_mask


# The same operation under the name used by the per-channel helpers.
apply_negative_mask_to_color_channel = extract_not_object_pixels_by_mask

In [None]:
def get_mask_complement(mask):
    """Return ``np.ones(mask.shape) - mask``, i.e. ones minus the mask, element-wise."""
    all_ones = np.ones(mask.shape)
    return all_ones - mask

In [None]:
def compute_object_pixels_num(mask):
    """Return the sum of all mask values (the count of object pixels for a 0/1 mask)."""
    object_total = mask.sum()
    return object_total

def compute_not_object_pixels_num(mask):
    """Return the number of mask elements minus the sum of the mask values."""
    elements_total = mask.size
    object_total = mask.sum()
    return elements_total - object_total

In [None]:
def compute_whole_image_mean_brightnes(image):
    """Return ``image.mean()``; for a NumPy array that is the average over all of its elements."""
    mean_value = image.mean()
    return mean_value

In [None]:
def compute_color_channel_mean_brightness(color_channel):
    """Return ``color_channel.mean()`` for a single colour channel."""
    channel_mean = color_channel.mean()
    return channel_mean

In [None]:
def compute_color_channels_mean_brightness(color_channels):
    """Return a list with the mean brightness of each channel in ``color_channels``."""
    channel_means = []
    for channel in color_channels:
        channel_means.append(compute_color_channel_mean_brightness(channel))
    return channel_means

In [None]:
def compute_images_list_color_channels_mean_brightness(images_color_channels_list):
    """Return, for every image's channel list, the list of its per-channel mean brightness values."""
    per_image_means = []
    for channels_of_one_image in images_color_channels_list:
        per_image_means.append(
            compute_color_channels_mean_brightness(channels_of_one_image)
        )
    return per_image_means

In [None]:
def divide_color_channels_values(color_channels_values):
    """Split a sequence of per-image triples into three per-channel lists.

    Returns a 3-tuple of lists: item ``k`` of the tuple collects element ``k``
    (``k`` = 0, 1, 2) of every entry in ``color_channels_values``.
    """
    per_channel = []
    for channel_index in (0, 1, 2):
        per_channel.append(
            [entry[channel_index] for entry in color_channels_values]
        )
    return tuple(per_channel)


In [None]:
def compute_object_pixels_mean_brightness(image, mask):
    """Return the mean brightness of the pixels selected by ``mask``.

    The masked image is summed over all channels and divided by the number of
    object pixels times ``IMAGE_DIMENSIONS_NUM``. Returns ``0.0`` when the
    mask selects no pixels.
    """
    object_pixels = apply_mask_to_full_color_image(image, mask)
    # Every object pixel contributes one value per colour channel.
    object_values_num = compute_object_pixels_num(mask) * IMAGE_DIMENSIONS_NUM
    if object_values_num == 0:
        return 0.0
    return object_pixels.sum() / object_values_num

In [None]:
def get_image_color_channels(image):
    """Return a list with the first ``IMAGE_DIMENSIONS_NUM`` slices ``image[:, :, i]``."""
    channels = []
    for channel_index in range(IMAGE_DIMENSIONS_NUM):
        channels.append(image[:, :, channel_index])
    return channels

def collapse_color_channels_to_image(color_components):
    """Stack the first ``IMAGE_DIMENSIONS_NUM`` channels back into one array.

    Each channel is transposed, the transposed channels are stacked along a
    new leading axis, and the whole stack is transposed again, which puts the
    channel index on the last axis.
    """
    transposed_channels = []
    for channel_index in range(IMAGE_DIMENSIONS_NUM):
        transposed_channels.append(color_components[channel_index].T)
    stacked = np.array(transposed_channels)
    return stacked.T

In [None]:
def get_images_color_channels(images):
    """Return the colour-channel list of every image in ``images``."""
    channels_per_image = []
    for single_image in images:
        channels_per_image.append(get_image_color_channels(single_image))
    return channels_per_image

In [None]:
def apply_mask_to_color_channels(color_channels, mask):
    """Return a list with ``mask`` applied to each channel in ``color_channels``."""
    masked_channels = []
    for channel in color_channels:
        masked_channels.append(apply_mask_to_color_channel(channel, mask))
    return masked_channels

In [None]:
def apply_negative_mask_to_color_channels(color_channels, mask):
    """Return a list with the complement of ``mask`` applied to each channel in ``color_channels``."""
    masked_channels = []
    for channel in color_channels:
        masked_channels.append(apply_negative_mask_to_color_channel(channel, mask))
    return masked_channels

In [None]:
def apply_mask_to_full_color_image(image, mask):
    """Apply ``mask`` to every colour channel of ``image`` and reassemble the channels into one image."""
    masked = apply_mask_to_color_channels(get_image_color_channels(image), mask)
    return collapse_color_channels_to_image(masked)

In [None]:
def apply_negative_mask_to_full_color_image(image, mask):
    """Apply the complement of ``mask`` to every colour channel of ``image`` and reassemble the channels."""
    masked = apply_negative_mask_to_color_channels(
        get_image_color_channels(image), mask
    )
    return collapse_color_channels_to_image(masked)

In [None]:
def calculate_mode(sample):
    """Return the most frequent value of ``sample`` together with its count.

    Parameters:
        sample: array exposing ``flatten()``.

    Returns:
        A ``(value, count)`` pair, or ``None`` for an empty sample. When the
        most frequent value is 0 and at least one other value is present, the
        runner-up pair is returned instead.
    """
    # Flatten once; the counter is the only consumer of the flattened copy.
    ranked = collections.Counter(sample.flatten()).most_common()
    if not ranked:
        return None
    # Zero is skipped as the mode unless it is the only value present.
    if len(ranked) > 1 and ranked[0][0] == 0:
        return ranked[1]
    return ranked[0]
compute_color_channel_mode = calculate_mode

In [None]:
def collect_modes_and_frequencies(modes):
    """Sum the frequencies of the first ``(value, frequency)`` pair of every entry in ``modes``.

    Only element 0 of each entry is read. Returns a dict mapping each mode
    value to its accumulated frequency. The input is printed first.
    """
    totals = {}
    print("collect_modes_and_frequencies,  modes:\n", modes)
    for per_channel_modes in modes:
        value, count = per_channel_modes[0]
        try:
            totals[value] += count
        except KeyError:
            totals[value] = count
    return totals

In [None]:
def gather_color_channel_modes_and_frequencies(modes):
    """Sum the frequencies of ``(value, frequency)`` pairs that share the same value.

    Returns a dict mapping each mode value to its accumulated frequency.
    """
    totals = {}
    for value, count in modes:
        try:
            totals[value] += count
        except KeyError:
            totals[value] = count
    return totals

In [None]:
def gather_objects_and_frequencies(objects_list):
    """Return a ``collections.Counter`` with the number of occurrences of each object.

    NOTE(review): a later cell defines a function with this same name that
    returns a plain dict; when the cells run in order that definition
    replaces this one.
    """
    frequencies = collections.Counter()
    frequencies.update(objects_list)
    return frequencies

In [None]:
def gather_objects_and_frequencies(objects_list):
    """Return a plain dict mapping each object of ``objects_list`` to its number of occurrences.

    NOTE(review): this redefines the Counter-based function of the same name
    from the preceding cell.
    """
    occurrences = {}
    for item in objects_list:
        occurrences[item] = occurrences.get(item, 0) + 1
    return occurrences

In [None]:
def sort_gathered_objects(objects):
    """Return the sorted keys of ``objects`` and their values in the same order.

    Returns a ``(keys, values)`` pair of lists.
    """
    ordered_keys = sorted(objects)
    ordered_values = [objects[key] for key in ordered_keys]
    return ordered_keys, ordered_values

In [None]:
def compute_color_channels_modes(color_channels):
    """Return a list with the mode result of each channel in ``color_channels``."""
    channel_modes = []
    for channel in color_channels:
        channel_modes.append(compute_color_channel_mode(channel))
    return channel_modes

In [None]:
def compute_images_color_channels_modes(images_color_channels):
    """Return, for every image's channel list, the list of its per-channel mode results."""
    modes_per_image = []
    for channels_of_one_image in images_color_channels:
        modes_per_image.append(compute_color_channels_modes(channels_of_one_image))
    return modes_per_image

In [None]:
def get_image_pixels(color_channels):
    """Zip the first three entries of ``color_channels`` into a list of 3-tuples.

    Tuple ``k`` holds item ``k`` of each of the three channels.

    NOTE(review): a later cell defines a function with this same name that
    takes a whole image; when the cells run in order that definition
    replaces this one.
    """
    first, second, third = color_channels[0], color_channels[1], color_channels[2]
    return list(zip(first, second, third))

In [None]:
def get_image_pixels(image):
    """Return the pixels of ``image`` as a list of 3-tuples taken from channels 0, 1 and 2.

    Each channel ``image[:, :, k]`` is flattened with ``ravel()`` and the
    three flattened channels are zipped together.

    NOTE(review): this redefines the channel-list based function of the same
    name from the preceding cell.
    """
    flattened_channels = [image[:, :, k].ravel() for k in (0, 1, 2)]
    return list(zip(*flattened_channels))

In [None]:
def compare_color_channels_by_pixels(image):
    """Return the result of ``compare_pixel_color_channels_values`` for every pixel of ``image``.

    NOTE(review): ``compare_pixel_color_channels_values`` is not defined in
    the visible cells; confirm it exists before calling this function.
    """
    comparisons = []
    for pixel in get_image_pixels(image):
        comparisons.append(compare_pixel_color_channels_values(pixel))
    return comparisons

In [None]:
def compare_images_color_channels(images):
    """Return the per-pixel channel comparison list of every image in ``images``."""
    comparisons_per_image = []
    for single_image in images:
        comparisons_per_image.append(compare_color_channels_by_pixels(single_image))
    return comparisons_per_image

In [None]:
def count_image_color_channels(image):
    """Return the number of colour channels of ``image``.

    A two-dimensional array counts as one channel. For a three-dimensional
    array the size of the last axis is returned, so a four-channel image
    yields 4 rather than a hard-coded 3.

    Raises:
        ValueError: if ``image`` is neither two- nor three-dimensional.
    """
    dimensions_num = len(image.shape)
    if dimensions_num == 2:
        return 1
    if dimensions_num == 3:
        return image.shape[2]
    raise ValueError(
        "expected a 2-D or 3-D image, got shape {}".format(image.shape)
    )

In [None]:
# The sample size is a quarter of the CSV row count, while the names themselves
# are drawn from the directory listing.
whales_train_images_num = train_images_description_df.shape[0]
# NOTE(review): `random` is not seeded in the visible cells, so this sample
# differs between runs.
whales_train_images_names_sample = random_objects_select(train_images_names, int(whales_train_images_num / 4))

In [None]:
'''
whales_train_images = [
    mpimg.imread(os.path.join(images_dir, image_name)) for image_name in whales_train_images_names_sample
]
'''

'''
whales_train_images = [
    mpimg.imread(os.path.join(images_dir, image_name)) for image_name in train_images_names
]
'''

In [None]:
# Read every file listed in train_images_names into memory, keyed by file name.
# NOTE(review): this holds the whole train set in RAM at once and assumes each
# directory entry is an image readable by mpimg.imread - confirm.
whales_train_images_dict = {
    image_name: mpimg.imread(os.path.join(images_dir, image_name)) for image_name in train_images_names
}

In [None]:
print(len(whales_train_images_dict))

In [None]:
print("whales_train_images_dict.values()[0].shape: ", tuple(whales_train_images_dict.values())[0].shape)

In [None]:
img = next(iter(whales_train_images_dict.values()))

In [None]:
img

In [None]:
plt.imshow(img)
plt.show()

In [None]:
whales_train_images = tuple(whales_train_images_dict.values())

In [None]:
print("whales_train_images[3471].shape: ", whales_train_images[3471].shape)

In [None]:
whales_train_images[3471]

In [None]:

plt.imshow(whales_train_images[3471])
plt.show()

In [None]:
#whales_images_subsample = whales_train_images[:40]
whales_images_subsample = [whales_train_images_dict[img_name] for img_name in whales_train_images_names_sample[:40]]

In [None]:
show_images_grid(whales_images_subsample, 8, (24, 24), images_names_list=whales_train_images_names_sample[:40])

In [None]:
draw_images_with_histograms(whales_images_subsample[:40], 8, (24, 24), 20)

In [None]:
whales_train_images_mean_brightness = [
    compute_whole_image_mean_brightnes(image) for image in whales_train_images
]

In [None]:
fig = plt.figure(figsize=(24, 24))
ax = fig.add_subplot(111)
ax.hist(whales_train_images_mean_brightness, bins=300)
ax.set_title("Mean brightnes distribution for train images")
#ax.set(title="Mean brightnes distribution for images with chips")
ax.set_xlabel("Mean brightness")
ax.set_ylabel("Num of images")
plt.show()

In [None]:
#random_selected_train_images = random_objects_select(train_images, 9000)

In [None]:
#print(type(random_selected_train_images[0]))
#print(random_selected_train_images[0])

In [None]:
#random_selected_train_images_mean_brightness = [
#    compute_whole_image_mean_brightnes(image) for image in random_selected_train_images
#]

In [None]:
#fig = plt.figure(figsize=(24, 24))
#ax = fig.add_subplot(111)
#ax.hist(random_selected_train_images_mean_brightness, bins=100)
#plt.show()

In [None]:
print(whales_train_images_mean_brightness[0])

In [None]:
print((sum(whales_train_images_mean_brightness)) / len(whales_train_images_mean_brightness))

In [None]:
print(type(whales_train_images[8].shape))

In [None]:
whales_train_images_shapes_counter_dict = {}
for image in whales_train_images:
    if image.shape in whales_train_images_shapes_counter_dict:
        whales_train_images_shapes_counter_dict[image.shape] += 1
    else:
        whales_train_images_shapes_counter_dict[image.shape] = 1

In [None]:
len(whales_train_images_shapes_counter_dict)

In [None]:
for key, value in whales_train_images_shapes_counter_dict.items():
    print(key, value)

In [None]:
whales_train_images_shapes_len_dict = {1: 0, 2: 0, 3: 0, 4: 0}
for key in whales_train_images_shapes_counter_dict.keys():
    whales_train_images_shapes_len_dict[len(key)] += 1

In [None]:
print(whales_train_images_shapes_len_dict)

In [None]:
whales_train_images_color_channels_counter_dict = {1: 0, 3: 0}
for image in whales_train_images:
    if len(image.shape) == 2:
        whales_train_images_color_channels_counter_dict[1] += 1
    elif len(image.shape) == 3 and image.shape[2] == 3:
        whales_train_images_color_channels_counter_dict[3] += 1

In [None]:
print(whales_train_images_color_channels_counter_dict)

In [None]:
whales_train_images_color_channels_counter_dict[1] + whales_train_images_color_channels_counter_dict[3] == len(whales_train_images)

In [None]:
print(type(whales_train_images))
print(len(whales_train_images))

In [None]:
#whales_train_images_color_channels = [get_image_color_channels(image) for image in whales_train_images]

In [None]:
group_by_id_train_images_description_df = train_images_description_df.groupby('Id')

In [None]:
example_of_grouped_objects = list(group_by_id_train_images_description_df)[:3]

In [None]:
for grouped_object in example_of_grouped_objects:
    print("\n*******\ntype(grouped_object): ", type(grouped_object))
    print("\ngrouped_object:\n", grouped_object)
    '''
    print("\ntype(grouped_object[0]): {}, type(grouped_object[1]): {}, type(grouped_object[2]): {} \n\n".format(
            type(grouped_object[0]),
            type(grouped_object[1]),
            type(grouped_object[2])
        )
    )
    '''
    print("\ntype(grouped_object[0]): {}, type(grouped_object[1]): {}\n\n".format(
            type(grouped_object[0]),
            type(grouped_object[1])
        )
    )

In [None]:
count_of_images_with_particular_id = group_by_id_train_images_description_df.count()

In [None]:
type(count_of_images_with_particular_id)

In [None]:
count_of_images_with_particular_id.shape

In [None]:
count_of_images_with_particular_id.head()

In [None]:
sorted_count_of_images_with_particular_id = count_of_images_with_particular_id.sort_values(by="Image")

In [None]:
type(sorted_count_of_images_with_particular_id)

In [None]:
sorted_count_of_images_with_particular_id.tail()

In [None]:
id_with_many_images_df = sorted_count_of_images_with_particular_id[sorted_count_of_images_with_particular_id['Image'] > 1]

In [None]:
id_with_many_images_df.shape

In [None]:
id_with_one_image_df = sorted_count_of_images_with_particular_id[sorted_count_of_images_with_particular_id['Image'] == 1]

In [None]:
id_with_one_image_df.shape

In [None]:
id_with_many_images_df.shape[0] + id_with_one_image_df.shape[0] == count_of_images_with_particular_id.shape[0]

In [None]:
type(group_by_id_train_images_description_df)

In [None]:
type(whales_train_images)

In [None]:
whales_train_images[0]

In [None]:
train_images_description_df_extend = train_images_description_df.copy()

In [None]:
train_images_description_df_extend is train_images_description_df

In [None]:
train_images_description_df_extend == train_images_description_df

In [None]:
train_images_description_df_extend['new_whale_feature'] = 1

In [None]:
train_images_description_df_extend.loc[train_images_description_df_extend['Id'] != 'new_whale', 'new_whale_feature'] = 0 

In [None]:
train_images_description_df_extend[train_images_description_df_extend['new_whale_feature'] == 0].shape[0]

In [None]:
train_images_description_df_extend[train_images_description_df_extend['new_whale_feature'] == 1].shape[0]

In [None]:
train_images_description_df_extend.head()

In [None]:
new_whale_image_names = train_images_description_df_extend.loc[train_images_description_df_extend['new_whale_feature'] == 1, 'Image']

In [None]:
new_whale_image_names.head()

In [None]:
for item in new_whale_image_names.head():
    print(item)

In [None]:
new_whales_train_images_mean_brightness = [
    compute_whole_image_mean_brightnes(image) for image in (
        whales_train_images_dict[img_name] for img_name in new_whale_image_names
    )
]

In [None]:
fig = plt.figure(figsize=(24, 24))
ax = fig.add_subplot(111)
ax.hist(new_whales_train_images_mean_brightness, bins=300)
ax.set_title("Mean brightnes distribution for new whales train images")
#ax.set(title="Mean brightnes distribution for images with chips")
ax.set_xlabel("Mean brightness")
ax.set_ylabel("Num of images")
plt.show()