In [1]:
import yaml
import numpy as np
import pandas as ps
from pathlib import Path

import matplotlib.pyplot as plt
from skimage.io import imshow

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
RANDOM_STATE = np.random.RandomState(seed=2019)
DATA = ps.read_csv(Path('..') / 'data' / 'train.csv')
DATA.head()

Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [3]:
DATA = DATA[DATA['EncodedPixels'].notnull()].reset_index(drop=True)
DATA['Image'] = DATA['ImageId_ClassId'].apply(lambda img_cls: img_cls.rsplit('_', 1)[0])
DATA['ClassId'] = DATA['ImageId_ClassId'].apply(lambda img_cls: int(img_cls.rsplit('_', 1)[1]) - 1)
DATA = DATA.sort_values(['Image', 'ClassId'])

print(DATA.dtypes)

DATA.head()

ImageId_ClassId    object
EncodedPixels      object
Image              object
ClassId             int64
dtype: object


Unnamed: 0,ImageId_ClassId,EncodedPixels,Image,ClassId
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...,0002cc93b.jpg,0
1,0007a71bf.jpg_3,18661 28 18863 82 19091 110 19347 110 19603 11...,0007a71bf.jpg,2
2,000a4bcdd.jpg_1,37607 3 37858 8 38108 14 38359 20 38610 25 388...,000a4bcdd.jpg,0
3,000f6bf48.jpg_4,131973 1 132228 4 132483 6 132738 8 132993 11 ...,000f6bf48.jpg,3
4,0014fce06.jpg_3,229501 11 229741 33 229981 55 230221 77 230468...,0014fce06.jpg,2


In [4]:
def combine_masks(df):
    masks = [''] * 4
    for idx in df.index:
        masks[df.at[idx, 'ClassId']] = df.at[idx, 'EncodedPixels']
    return ps.Series(masks, [f'cls{i}' for i in range(1, 5)])
    

DATA = DATA.groupby('Image').apply(combine_masks).reset_index()

images_folder = Path('..') / 'data' / 'train_images'
DATA['Image'] = DATA['Image'].apply(lambda img_path: (images_folder / img_path).resolve())

print(DATA.shape)
DATA.head()

(6666, 5)


Unnamed: 0,Image,cls1,cls2,cls3,cls4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,37607 3 37858 8 38108 14 38359 20 38610 25 388...,,,
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,229501 11 229741 33 229981 55 230221 77 230468...,


# Train & Validation split

In [5]:
train_set = DATA.copy()

print(train_set.shape)
train_set.head()

(6666, 5)


Unnamed: 0,Image,cls1,cls2,cls3,cls4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,37607 3 37858 8 38108 14 38359 20 38610 25 388...,,,
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,229501 11 229741 33 229981 55 230221 77 230468...,


In [6]:
validation_pcnt = 0.2
validation_set = DATA.sample(int(train_set.shape[0] * validation_pcnt),
                             random_state=RANDOM_STATE)
print(validation_set.shape)
validation_set.head()

(1333, 5)


Unnamed: 0,Image,cls1,cls2,cls3,cls4
5524,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,285887 24 286117 51 286367 59 286620 63 286874...,
2292,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,172782 19 173002 55 173221 92 173441 128 17366...,
3179,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,275937 32 276129 96 276321 160 276513 13280 28...,
600,/home/dmdr/Documents/Code/Python/kaggle/severs...,131870 32 132097 66 132353 71 132609 76 132778...,,,
2076,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,366032 12 366286 15 366539 18 366792 22 367046...,


# Dump csvs

In [7]:
csvs_folder = Path('..') / 'data' / 'splits'


train_set.to_csv(csvs_folder / 'train_set.csv', index=False)
validation_set.to_csv(csvs_folder / 'validation_set.csv', index=False)

In [8]:
test = ps.read_csv(csvs_folder / 'train_set.csv')
print(test.shape)
print(test.dtypes)
test.head()

(6666, 5)
Image    object
cls1     object
cls2     object
cls3     object
cls4     object
dtype: object


Unnamed: 0,Image,cls1,cls2,cls3,cls4
0,/home/dmdr/Documents/Code/Python/kaggle/severs...,29102 12 29346 24 29602 24 29858 24 30114 24 3...,,,
1,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,18661 28 18863 82 19091 110 19347 110 19603 11...,
2,/home/dmdr/Documents/Code/Python/kaggle/severs...,37607 3 37858 8 38108 14 38359 20 38610 25 388...,,,
3,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,/home/dmdr/Documents/Code/Python/kaggle/severs...,,,229501 11 229741 33 229981 55 230221 77 230468...,


In [9]:
test.at[0, 'Image']

'/home/dmdr/Documents/Code/Python/kaggle/severstal_steel_defect_detection/data/train_images/0002cc93b.jpg'

In [10]:
def _rle_str2arr(rle_str: str) -> np.ndarray:
    return np.array(list(map(int, rle_str.split(' '))))
