In [1]:
import yaml
import numpy as np
import pandas as ps
from pathlib import Path

import matplotlib.pyplot as plt
from skimage.io import imshow
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
RANDOM_STATE = np.random.RandomState(seed=2019)
DATA = ps.read_csv(Path('..') / 'data' / 'train.csv')
DATA.head()

Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [3]:
def combine_masks(df):
    masks = [''] * 4
    for idx in df.index:
        masks[df.at[idx, 'ClassId']] = df.at[idx, 'EncodedPixels']
    return ps.Series(masks, [f'cls{i}' for i in range(1, 5)])


DATA['Image'] = DATA['ImageId_ClassId'].apply(lambda img_cls: img_cls.rsplit('_', 1)[0])
DATA['ClassId'] = DATA['ImageId_ClassId'].apply(lambda img_cls: int(img_cls.rsplit('_', 1)[1]) - 1)
# DATA = DATA[DATA['EncodedPixels'].notnull()].reset_index(drop=True)
DATA = DATA.sort_values(['Image', 'ClassId'])
DATA = DATA.groupby('Image').apply(combine_masks).reset_index()

images_folder = Path('..') / 'data' / 'train_images'
DATA['Image'] = DATA['Image'].apply(lambda img_path: (images_folder / img_path).resolve())
DATA['IsAllMissing'] = DATA.apply(lambda row: all(row[f'cls{i}'] != row[f'cls{i}'] for i in range(1, 5)), axis=1)


print(DATA.dtypes)

DATA.head()

Image           object
cls1            object
cls2            object
cls3            object
cls4            object
IsAllMissing      bool
dtype: object


Unnamed: 0,Image,cls1,cls2,cls3,cls4,IsAllMissing
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,,False
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,True
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,True
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,True
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,,False


# Segmentation

## Train & Validation split

In [4]:
def split_dataset(dataset: ps.DataFrame, 
                  validation_pcnt: float = .2,
                  random_state: np.random.RandomState = np.random.RandomState(2019),
                  how: str = 'sample'):
    assert how in {'sample', 'split'}, '`how` should be one of `sample` or `split` '
    
    print(f'Input shapes - {dataset.shape}')
    if how == 'sample':
        train_set = dataset.copy()

        validation_size = int(train_set.shape[0] * validation_pcnt)
        validation_set = dataset.sample(validation_size, random_state=random_state)
    else:
        train_set, validation_set = train_test_split(
            dataset, 
            test_size=validation_pcnt,
            random_state=random_state,
            stratify=dataset['IsAllMissing']
        )
    
    print(f'Train shapes - {train_set.shape}')
    print(f'Validation shapes - {validation_set.shape}')
    return train_set, validation_set

In [5]:
train_set, validation_set = split_dataset(
    DATA[~DATA['IsAllMissing']],
    how='sample'
)

Input shapes - (6666, 6)
Train shapes - (6666, 6)
Validation shapes - (1333, 6)


In [6]:
cols = ['Image'] + [f'cls{i}' for i in range(1, 5)]

train_set = train_set[cols]
validation_set = validation_set[cols]

train_set.head()

Unnamed: 0,Image,cls1,cls2,cls3,cls4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,
5,/home/dmdr/Documents/Code/Python/kaggle/severs...,37607 3 37858 8 38108 14 38359 20 38610 25 388...,,,
6,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,131973 1 132228 4 132483 6 132738 8 132993 11 ...
7,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,229501 11 229741 33 229981 55 230221 77 230468...,


## Dump

In [7]:
csvs_folder = Path('..') / 'data' / 'splits'

In [8]:
train_set.to_csv(csvs_folder / 'train_set.csv', index=False)
validation_set.to_csv(csvs_folder / 'validation_set.csv', index=False)

# Classification

In [9]:
train_set, validation_set = split_dataset(DATA, how='split')

Input shapes - (12568, 6)
Train shapes - (10054, 6)
Validation shapes - (2514, 6)


In [10]:
cols = ['Image', 'IsAllMissing']

train_set = train_set[cols]
train_set['IsAllMissing'] = train_set['IsAllMissing'].astype(int)

validation_set = validation_set[cols]
validation_set['IsAllMissing'] = validation_set['IsAllMissing'].astype(int)

train_set.head()

Unnamed: 0,Image,IsAllMissing
1510,/home/dmdr/Documents/Code/Python/kaggle/severs...,1
5375,/home/dmdr/Documents/Code/Python/kaggle/severs...,0
5047,/home/dmdr/Documents/Code/Python/kaggle/severs...,1
9021,/home/dmdr/Documents/Code/Python/kaggle/severs...,1
12342,/home/dmdr/Documents/Code/Python/kaggle/severs...,0


## Dump

In [11]:
train_set.to_csv(csvs_folder / 'classification_train_set.csv', index=False)
validation_set.to_csv(csvs_folder / 'classification_validation_set.csv', index=False)