In [1]:
import yaml
import numpy as np
import pandas as ps
from pathlib import Path

import matplotlib.pyplot as plt
from skimage.io import imshow
from sklearn.model_selection import train_test_split

import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Seeded legacy NumPy generator, kept at notebook level for later cells.
RANDOM_STATE = np.random.RandomState(2019)

# Raw label table: one row per 'ImageId_ClassId' with its 'EncodedPixels'
# string (empty when no mask is recorded for that image/class pair).
train_csv_path = Path('..', 'data', 'train.csv')
DATA = ps.read_csv(train_csv_path)
DATA.head()

Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [3]:
def combine_masks(df, num_classes=4):
    """Collapse the per-class rows of one image into a single row of masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one image. Must contain a zero-based integer
        ``ClassId`` column and an ``EncodedPixels`` column.
    num_classes : int, default 4
        Number of classes, i.e. the length of the returned Series.

    Returns
    -------
    pandas.Series
        Indexed ``cls1`` .. ``cls<num_classes>``; entry ``cls<k+1>`` holds the
        ``EncodedPixels`` value of the row whose ``ClassId`` is ``k``. Classes
        that have no row in ``df`` are NaN, the same marker a row with an
        empty ``EncodedPixels`` already carries, so a single NaN check
        identifies every missing mask.

    Raises
    ------
    IndexError
        If a ``ClassId`` lies outside ``[0, num_classes)``.
    """
    # NaN (not '') is the placeholder so that absent rows and rows with a
    # null EncodedPixels are indistinguishable to NaN-based "missing" checks.
    masks = [np.nan] * num_classes
    for class_id, pixels in zip(df['ClassId'], df['EncodedPixels']):
        # A negative id would otherwise silently wrap around to the end of
        # the list and overwrite another class's slot.
        if not 0 <= class_id < num_classes:
            raise IndexError(
                f'ClassId {class_id} is outside the valid range [0, {num_classes})'
            )
        masks[class_id] = pixels
    return ps.Series(masks, index=[f'cls{i}' for i in range(1, num_classes + 1)])


# NOTE: this cell rebinds DATA from the raw per-(image, class) table to a
# per-image table, so it cannot be re-run without re-running the loading cell
# first (the 'ImageId_ClassId' column no longer exists afterwards).

# 'ImageId_ClassId' has the form '<image file>_<class number>'. Split once from
# the right so that any underscore inside the file name is preserved.
id_parts = DATA['ImageId_ClassId'].str.rsplit('_', n=1, expand=True)
DATA['Image'] = id_parts[0]
# Class numbers are 1-based in the CSV; combine_masks uses them as 0-based
# list positions.
DATA['ClassId'] = id_parts[1].astype(int) - 1

# Rows with a null EncodedPixels are kept (not filtered out) so that images
# without any mask still produce a row in the grouped result.
DATA = DATA.sort_values(['Image', 'ClassId'])
# Hand combine_masks only the two columns it reads; this also avoids passing
# the grouping column to the applied function, which newer pandas deprecates.
DATA = (
    DATA.groupby('Image')[['ClassId', 'EncodedPixels']]
    .apply(combine_masks)
    .reset_index()
)

images_folder = Path('..') / 'data' / 'train_images'
# NOTE(review): resolve() stores absolute, machine-specific paths in the frame
# (and therefore in anything later written from it).
DATA['Image'] = DATA['Image'].apply(lambda img_path: (images_folder / img_path).resolve())

# A mask counts as present when it is neither NaN nor the empty-string
# placeholder; counting column-wise replaces the row-by-row `x != x` NaN test.
mask_columns = [f'cls{i}' for i in range(1, 5)]
mask_is_present = DATA[mask_columns].notna() & DATA[mask_columns].ne('')
DATA['NumPresented'] = mask_is_present.sum(axis=1)
DATA['IsAllMissing'] = DATA['NumPresented'] == 0
DATA['NeedToPredict'] = DATA['NumPresented'] > 0


print(DATA.dtypes)

DATA.head()

Image            object
cls1             object
cls2             object
cls3             object
cls4             object
NumPresented      int64
IsAllMissing       bool
NeedToPredict      bool
dtype: object


Unnamed: 0,Image,cls1,cls2,cls3,cls4,NumPresented,IsAllMissing,NeedToPredict
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,,1,False,True
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,,0,True,False
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,,1,False,True


In [4]:
from sklearn.model_selection import KFold

In [5]:
# Directory that receives the per-fold CSV files.
csvs_folder = Path('..', 'data', 'splits')

# Keep only the image path and the boolean NeedToPredict flag for the splits.
d = DATA.loc[:, ['Image', 'NeedToPredict']]

# Five shuffled folds with a fixed seed so the partition is reproducible.
folds = KFold(n_splits=5, random_state=2019, shuffle=True)

In [6]:
# Make sure the output directory exists: to_csv does not create missing
# parent directories and would fail on a fresh checkout otherwise.
csvs_folder.mkdir(parents=True, exist_ok=True)

# Write one train/valid CSV pair per fold; file names use 1-based fold numbers.
for fold_num, (train_idx, valid_idx) in enumerate(folds.split(d)):
    # split() yields positional indices, hence iloc.
    train_data, valid_data = d.iloc[train_idx], d.iloc[valid_idx]
    train_data.to_csv(csvs_folder / f'train_detection_fold_{fold_num + 1}.csv', index=False)
    valid_data.to_csv(csvs_folder / f'valid_detection_fold_{fold_num + 1}.csv', index=False)