In [1]:
import yaml
import numpy as np
import pandas as ps
from pathlib import Path

import matplotlib.pyplot as plt
from skimage.io import imshow
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
RANDOM_STATE = np.random.RandomState(seed=2019)
DATA = ps.read_csv(Path('..') / 'data' / 'train.csv')
DATA.head()

Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [3]:
def combine_masks(df):
    masks = [''] * 4
    for idx in df.index:
        masks[df.at[idx, 'ClassId']] = df.at[idx, 'EncodedPixels']
    return ps.Series(masks, [f'cls{i}' for i in range(1, 5)])


DATA['Image'] = DATA['ImageId_ClassId'].apply(lambda img_cls: img_cls.rsplit('_', 1)[0])
DATA['ClassId'] = DATA['ImageId_ClassId'].apply(lambda img_cls: int(img_cls.rsplit('_', 1)[1]) - 1)
# DATA = DATA[DATA['EncodedPixels'].notnull()].reset_index(drop=True)
DATA = DATA.sort_values(['Image', 'ClassId'])
DATA = DATA.groupby('Image').apply(combine_masks).reset_index()

images_folder = Path('..') / 'data' / 'train_images'
DATA['Image'] = DATA['Image'].apply(lambda img_path: (images_folder / img_path).resolve())
DATA['NumMissing'] = DATA.apply(lambda row: 4 - sum(row[f'cls{i}'] != row[f'cls{i}'] for i in range(1, 5)), axis=1)
DATA['IsAllMissing'] = DATA['NumMissing'].apply(lambda item: item == 0)


print(DATA.dtypes)

DATA.head()

Image           object
cls1            object
cls2            object
cls3            object
cls4            object
NumMissing       int64
IsAllMissing      bool
dtype: object


Unnamed: 0,Image,cls1,cls2,cls3,cls4,NumMissing,IsAllMissing
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,,1,False
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,,1,False


# Segmentation

## Train & Validation split

In [4]:
def split_dataset(dataset: ps.DataFrame, 
                  validation_pcnt: float = .2,
                  random_state: np.random.RandomState = np.random.RandomState(2019),
                  how: str = 'sample',
                  stratify_set = None):
    assert how in {'sample', 'split'}, '`how` should be one of `sample` or `split` '
    
    print(f'Input shapes - {dataset.shape}')
    if how == 'sample':
        train_set = dataset.copy()

        validation_size = int(train_set.shape[0] * validation_pcnt)
        validation_set = dataset.sample(validation_size, random_state=random_state)
    else:
        train_set, validation_set = train_test_split(
            dataset, 
            test_size=validation_pcnt,
            random_state=random_state,
            stratify=stratify_set
        )
    
    print(f'Train shapes - {train_set.shape}')
    print(f'Validation shapes - {validation_set.shape}')
    return train_set, validation_set

In [5]:
train_set, validation_set = split_dataset(
    DATA[~DATA['IsAllMissing']],
    how='sample'
)

Input shapes - (6666, 7)
Train shapes - (6666, 7)
Validation shapes - (1333, 7)


In [6]:
cols = ['Image'] + [f'cls{i}' for i in range(1, 5)]

train_set = train_set[cols]
validation_set = validation_set[cols]

train_set.head()

Unnamed: 0,Image,cls1,cls2,cls3,cls4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,
5,/home/dmdr/Documents/Code/Python/kaggle/severs...,37607 3 37858 8 38108 14 38359 20 38610 25 388...,,,
6,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,131973 1 132228 4 132483 6 132738 8 132993 11 ...
7,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,229501 11 229741 33 229981 55 230221 77 230468...,


In [7]:
split_train_set, split_validation_set = split_dataset(
    DATA[~DATA['IsAllMissing']],
    how='split',
    validation_pcnt=.15
)

Input shapes - (6666, 7)
Train shapes - (5666, 7)
Validation shapes - (1000, 7)


In [8]:
split_train_set = split_train_set[cols]
split_validation_set = split_validation_set[cols]

split_train_set.head()

Unnamed: 0,Image,cls1,cls2,cls3,cls4
11881,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,65073 7 65315 21 65558 34 65800 48 66049 55 66...,278695 8 278943 18 279196 24 279448 31 279701 ...
9305,/home/dmdr/Documents/Code/Python/kaggle/severs...,201729 26 201985 55 202241 62 202497 69 202753...,,,
63,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,102 96 358 96 613 97 869 98 1125 98 1381 98 16...,
7866,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,52225 22 52481 64 52737 107 52993 149 53249 19...,
1586,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,94290 20 94544 24 94798 27 95053 30 95243 5 95...,


In [9]:
raw_train_set, raw_validation_set = split_dataset(
    DATA,
    how='split',
    validation_pcnt=.2,
    stratify_set=DATA['NumMissing']
)

Input shapes - (12568, 7)
Train shapes - (10054, 7)
Validation shapes - (2514, 7)


In [10]:
raw_train_set = raw_train_set[cols]
raw_validation_set = raw_validation_set[cols]

raw_train_set.head()

Unnamed: 0,Image,cls1,cls2,cls3,cls4
9736,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,45021 9 45260 28 45498 48 45737 66 45976 85 46...,
5773,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,
1502,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,224422 6 224670 18 224918 30 225166 42 225414 ...
2669,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,128261 238 128517 238 128773 238 129029 237 12...,132335 5 132590 15 132845 16 132862 2 133093 2...
2490,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,174550 43 174722 127 174894 211 175107 254 175...,


## Dump

In [11]:
csvs_folder = Path('..') / 'data' / 'splits'

In [12]:
train_set.to_csv(csvs_folder / 'train_set.csv', index=False)
validation_set.to_csv(csvs_folder / 'validation_set.csv', index=False)

split_train_set.to_csv(csvs_folder / 'split_train_set.csv', index=False)
split_validation_set.to_csv(csvs_folder / 'split_validation_set.csv', index=False)

raw_train_set.to_csv(csvs_folder / 'raw_train_set.csv', index=False)
raw_validation_set.to_csv(csvs_folder / 'raw_validation_set.csv', index=False)

# Classification

In [13]:
train_set, validation_set = split_dataset(DATA, how='split', stratify_set=DATA['IsAllMissing'])

Input shapes - (12568, 7)
Train shapes - (10054, 7)
Validation shapes - (2514, 7)


In [14]:
cols = ['Image', 'IsAllMissing']

train_set = train_set[cols]
train_set['IsAllMissing'] = train_set['IsAllMissing'].astype(int)

validation_set = validation_set[cols]
validation_set['IsAllMissing'] = validation_set['IsAllMissing'].astype(int)

train_set.head()

Unnamed: 0,Image,IsAllMissing
2805,/home/dmdr/Documents/Code/Python/kaggle/severs...,1
721,/home/dmdr/Documents/Code/Python/kaggle/severs...,0
3858,/home/dmdr/Documents/Code/Python/kaggle/severs...,0
1408,/home/dmdr/Documents/Code/Python/kaggle/severs...,1
2449,/home/dmdr/Documents/Code/Python/kaggle/severs...,1


## Dump

In [15]:
train_set.to_csv(csvs_folder / 'classification_train_set.csv', index=False)
validation_set.to_csv(csvs_folder / 'classification_validation_set.csv', index=False)