In [1]:
import yaml
import numpy as np
import pandas as ps
from pathlib import Path

import matplotlib.pyplot as plt
from skimage.io import imshow
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
RANDOM_STATE = np.random.RandomState(seed=2019)
DATA = ps.read_csv(Path('..') / 'data' / 'train.csv')
DATA.head()

Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [3]:
def combine_masks(df):
    masks = [''] * 4
    for idx in df.index:
        masks[df.at[idx, 'ClassId']] = df.at[idx, 'EncodedPixels']
    return ps.Series(masks, [f'cls{i}' for i in range(1, 5)])


DATA['Image'] = DATA['ImageId_ClassId'].apply(lambda img_cls: img_cls.rsplit('_', 1)[0])
DATA['ClassId'] = DATA['ImageId_ClassId'].apply(lambda img_cls: int(img_cls.rsplit('_', 1)[1]) - 1)
# DATA = DATA[DATA['EncodedPixels'].notnull()].reset_index(drop=True)
DATA = DATA.sort_values(['Image', 'ClassId'])
DATA = DATA.groupby('Image').apply(combine_masks).reset_index()

images_folder = Path('..') / 'data' / 'train_images'
DATA['Image'] = DATA['Image'].apply(lambda img_path: (images_folder / img_path).resolve())
DATA['NumMissing'] = DATA.apply(lambda row: 4 - sum(row[f'cls{i}'] != row[f'cls{i}'] for i in range(1, 5)), axis=1)
DATA['IsAllMissing'] = DATA['NumMissing'].apply(lambda item: item == 0)

for i in range(1, 5):
    DATA[f'channel_{i}'] = DATA[f'cls{i}'].apply(lambda item: item == item)

print(DATA.dtypes)

DATA.head()

Image           object
cls1            object
cls2            object
cls3            object
cls4            object
NumMissing       int64
IsAllMissing      bool
channel_1         bool
channel_2         bool
channel_3         bool
channel_4         bool
dtype: object


Unnamed: 0,Image,cls1,cls2,cls3,cls4,NumMissing,IsAllMissing,channel_1,channel_2,channel_3,channel_4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,,1,False,True,False,False,False
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False,False,False,False
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False,False,False,False
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False,False,False,False
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,,1,False,False,False,True,False


In [4]:
X = DATA[['Image'] + [f'cls{i}' for i in range(1, 5)]]
y = DATA[[f'channel_{i}' for i in range(1, 5)]]

In [5]:
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=2019, shuffle=True)

In [6]:
csvs_folder = Path('..') / 'data' / 'splits'

In [16]:
for idx, (train_index, valid_index) in enumerate(mskf.split(X, y)):
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
    pattern = f'multiclass_fold_{idx + 1}'
    x_train.to_csv(csvs_folder / f'train_{pattern}.csv', index=False)
    x_valid.to_csv(csvs_folder / f'valid_{pattern}.csv', index=False)
    print(f'Saved splits for fold {idx + 1}')

Saved splits for fold 1
Saved splits for fold 2
Saved splits for fold 3
Saved splits for fold 4
Saved splits for fold 5
