In [1]:
import yaml
import numpy as np
import pandas as ps
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from pathlib import Path

import albumentations.pytorch
from albumentations.core.serialization import from_dict

from dataset import SteelDataset
from models import UNetResNet
from metrics import MulticlassDiceMetricCallback
from losses import BinaryDiceLoss

from fastprogress import master_bar, progress_bar

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
config_file = Path('.') / 'configs' / 'simple_unet.yaml'

with open(config_file, 'r') as fuck:
    config = yaml.safe_load(fuck)

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
DATA_FOLDER = Path('..') / 'data'
RANDOM_STATE = np.random.RandomState(seed=2019)
NUM_WORKERS = 6
BATCH_SIZE = 16
NUM_CLASSES = 4
validation_pcnt = 0.1

In [3]:
def load_transforms(transforms_dict: dict):
    if transforms_dict is not None:
        return from_dict(transforms_dict)
    return None


train_transforms = load_transforms(config['train']['transformations']['train'])
validation_transforms = load_transforms(config['train']['transformations']['validation'])

In [4]:
train_df = ps.read_csv(config['train']['file'])
train_images_folder = config['train']['folder']

print(f'train images folder - {train_images_folder}')
train_df.head()

train images folder - /home/dmdr/Documents/Code/Python/kaggle/severstal_steel_defect_detection/data/train_images


Unnamed: 0,ImageId_ClassId,EncodedPixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [5]:
train_df = train_df[train_df['EncodedPixels'].notnull()].reset_index(drop=True)
train_df['Image'] = train_df['ImageId_ClassId'].apply(lambda img_cls: img_cls.rsplit('_', 1)[0])
train_df['ClassId'] = train_df['ImageId_ClassId'].apply(lambda img_cls: int(img_cls.rsplit('_', 1)[1]) - 1)
train_df = train_df.sort_values(['Image', 'ClassId'])
# train_df = train_df.drop(columns=['ImageId_ClassId'])

print(train_df.dtypes)

train_df.head()

ImageId_ClassId    object
EncodedPixels      object
Image              object
ClassId             int64
dtype: object


Unnamed: 0,ImageId_ClassId,EncodedPixels,Image,ClassId
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...,0002cc93b.jpg,0
1,0007a71bf.jpg_3,18661 28 18863 82 19091 110 19347 110 19603 11...,0007a71bf.jpg,2
2,000a4bcdd.jpg_1,37607 3 37858 8 38108 14 38359 20 38610 25 388...,000a4bcdd.jpg,0
3,000f6bf48.jpg_4,131973 1 132228 4 132483 6 132738 8 132993 11 ...,000f6bf48.jpg,3
4,0014fce06.jpg_3,229501 11 229741 33 229981 55 230221 77 230468...,0014fce06.jpg,2


In [6]:
def combine_masks(df):
    masks = [[]] * 4
    for idx in df.index:
        masks[df.at[idx, 'ClassId']] = np.array(list(map(int, df.at[idx, 'EncodedPixels'].split())))
    return ps.Series([masks], ['RLEs'])
    

train_set = train_df.groupby('Image').apply(combine_masks).reset_index()

for cls_idx in range(4):
    cls_name = f'IsPresent{cls_idx}'
    train_set[cls_name] = False
    for idx in train_set.index:
        train_set.at[idx, cls_name] = len(train_set.at[idx, 'RLEs'][cls_idx]) > 0


print(train_set.shape)
train_set.head()

(6666, 6)


Unnamed: 0,Image,RLEs,IsPresent0,IsPresent1,IsPresent2,IsPresent3
0,0002cc93b.jpg,"[[29102, 12, 29346, 24, 29602, 24, 29858, 24, ...",True,False,False,False
1,0007a71bf.jpg,"[[], [], [18661, 28, 18863, 82, 19091, 110, 19...",False,False,True,False
2,000a4bcdd.jpg,"[[37607, 3, 37858, 8, 38108, 14, 38359, 20, 38...",True,False,False,False
3,000f6bf48.jpg,"[[], [], [], [131973, 1, 132228, 4, 132483, 6,...",False,False,False,True
4,0014fce06.jpg,"[[], [], [229501, 11, 229741, 33, 229981, 55, ...",False,False,True,False


In [7]:
validation_set = train_set.sample(int(train_set.shape[0] * validation_pcnt),
                                  random_state=RANDOM_STATE)
print(validation_set.shape)
validation_set.head()

(666, 6)


Unnamed: 0,Image,RLEs,IsPresent0,IsPresent1,IsPresent2,IsPresent3
5524,d2c14891a.jpg,"[[], [], [285887, 24, 286117, 51, 286367, 59, ...",False,False,True,False
2292,5a3db6845.jpg,"[[], [], [172782, 19, 173002, 55, 173221, 92, ...",False,False,True,False
3179,7a5fae002.jpg,"[[], [], [275937, 32, 276129, 96, 276321, 160,...",False,False,True,False
600,17c821b3f.jpg,"[[131870, 32, 132097, 66, 132353, 71, 132609, ...",True,False,False,False
2076,5132041da.jpg,"[[], [], [366032, 12, 366286, 15, 366539, 18, ...",False,False,True,False


In [8]:
def get_loaders() -> tuple:
    train_dataset = SteelDataset(
        images=train_set['Image'].values, 
        rles=train_set['RLEs'].values,
        folder=DATA_FOLDER / 'train_images',
        transforms=train_transforms
    )
    validation_dataset = SteelDataset(
        images=validation_set['Image'].values, 
        rles=validation_set['RLEs'].values,
        folder=DATA_FOLDER / 'train_images',
        transforms=validation_transforms
    )
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS
    )

    validation_loader = DataLoader(
        validation_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS
    )
    
    print(f'Number of batches in train loader - {len(train_loader)}')
    print(f'Number of batches in validation loader - {len(validation_loader)}')
    
    return train_loader, validation_loader


train_loader, validation_loader = get_loaders()

Number of batches in train loader - 417
Number of batches in validation loader - 42


In [9]:
from collections import OrderedDict
from catalyst.contrib.models.segmentation import Unet
from losses import MulticlassDiceLoss, CCE, JointLoss


def dice_coef(y_pred, y_true, smooth=1):
    y_pred_f = torch.flatten(torch.sigmoid(y_pred))
    y_true_f = torch.flatten(y_true)
    intersection = torch.sum(y_true_f * y_pred_f)
    dice = (2. * intersection + smooth) / (torch.sum(y_true_f) + torch.sum(y_pred_f) + smooth)
    return dice.item()


def channelvise_dice_coef(y_pred, y_true, smooth=1):
    num_channels = y_true.size(1)
    return np.array([dice_coef(y_pred[:, c, ...], y_true[:, c, ...], smooth) 
                     for c in range(num_channels)])


model = Unet(num_classes=5, in_channels=3, num_channels=64, num_blocks=4).to(DEVICE)

multiclass_dice_loss = MulticlassDiceLoss([1, 2, 3, 4], from_logits=True, weight=None, reduction='elementwise_mean')
cce_loss = CCE(ignore_index=5)
alpha = 0.7
joint_loss = JointLoss(cce_loss, multiclass_dice_loss, alpha, 1 - alpha)

metrics = OrderedDict([
#     ('dice (by channel)', channelvise_dice_coef)   
])

In [10]:
def train(
    model: nn.Module,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    train: DataLoader, 
    valid: DataLoader,
    metrics: OrderedDict = None,
    num_epochs: int = 1):
    
    if metrics is None:
        metrics = OrderedDict()
    
    epochs_range = list(range(num_epochs))
    epochs_bar = master_bar(epochs_range)
    
    for epoch in epochs_bar:
        
        train_loss = .0
        model.train()
        
        for _x, _y in progress_bar(train, parent=epochs_bar):
            _x, _y = _x.to(DEVICE), _y.to(DEVICE)
            
            optimizer.zero_grad()
            _out = model(_x)
            _loss = criterion(_out, _y)
            _loss.backward()
            optimizer.step()
            
            train_loss += _loss.item() / len(train)
            
            epochs_bar.child.comment = f'loss - {_loss.item():.4f}'
        
        valid_loss = .0
        model.eval()
        
        with torch.no_grad():
            _metrics = OrderedDict([(mname, .0) for mname in metrics.keys()])
            
            for _x, _y in progress_bar(valid, parent=epochs_bar):
                _x, _y = _x.to(DEVICE), _y.to(DEVICE)
                _out = model(_x)
                _loss = criterion(_out, _y)
                
                valid_loss += _loss.item() / len(valid)
                
                for mname, mfoo in metrics.items():
                    _metrics[mname] += mfoo(_out, _y) / len(valid)
        
        epochs_bar.write(
            ', '.join([f'train loss - {train_loss:.6f}', f'valid loss - {valid_loss:.6f}'] \
                      + [f'valid {mname} - {mvalue}' for mname, mvalue in _metrics.items()])
        )

In [None]:
train(
    model,
    joint_loss,
    optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5),
    train_loader,
    validation_loader,
    metrics,
    num_epochs=20
)