# Sampling Strategy

In [5]:
import os
import datetime

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from numba import cuda

from src.model import unet_model
from src.dataset import SegmentationDataset
from src.data_pipeline import SegmentationDataPipeline
from src.model_utils import (
    tversky,
    tversky_loss,
    tversky_axis,
    tversky_loss_axis,
)


%load_ext lab_black
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [127]:
# --- configuration: everything a reader might want to tune ---------------
IMG_SHAPE = (256, 1600)
EPOCHS = 10
BATCH_SIZE = 8
ANNOTATIONS_PATH = "../data/train.csv"
TRAIN_IMG_PATH = "../data/train_images/"
# Fraction of each image set (train and test) kept by the stratified subsample.
SAMPLE_FRACTION = 0.25
# Seed shared by both subsampling calls so the drawn sample is reproducible.
RANDOM_STATE = 42
# Name -> callable lookups for the loss functions imported from src.model_utils.
LOSSES = {
    "tversky_loss": tversky_loss,
    "tversky_loss_axis": tversky_loss_axis,
}
# Name -> callable lookups for the metric functions imported from src.model_utils.
METRICS = {
    "tversky": tversky,
    "tversky_axis": tversky_axis,
}


def _stratified_subsample(
    img_ids,
    imgid_to_classid,
    fraction: float = SAMPLE_FRACTION,
    random_state: int = RANDOM_STATE,
) -> list:
    """Draw a class-stratified subsample of image ids.

    The class id of every entry in ``img_ids`` is looked up once, then
    ``train_test_split`` carves off a ``fraction``-sized hold-out that is
    stratified on those class ids. That hold-out *is* the sample; the larger
    remainder is discarded.

    Args:
        img_ids: Image ids to sample from.
        imgid_to_classid: Mapping that can be indexed with a collection of
            image ids and whose result exposes ``.index`` (presumably a
            pandas Series keyed by image id -- confirm against
            ``SegmentationDataset``).
        fraction: Share of ``img_ids`` to keep; forwarded as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The sampled image ids as a list, taken from the index of the
        selected rows.
    """
    class_ids = imgid_to_classid[img_ids]
    # Only the "test" half of the split is wanted: it has the requested size.
    _, sampled = train_test_split(
        class_ids,
        test_size=fraction,
        random_state=random_state,
        shuffle=True,
        stratify=class_ids,
    )
    return list(sampled.index)


# instantiate the dataset
sd = SegmentationDataset(
    label_file=ANNOTATIONS_PATH,
    img_dir_path=TRAIN_IMG_PATH,
    img_shape=IMG_SHAPE,
)

# Use the train/test image ids exposed by the dataset object rather than
# re-splitting with sd.get_train_test_split(test_size=0.2).
train_imgs = sd.train_imgs
test_imgs = sd.test_imgs

# Oversample the train images only; the test ids are left as provided.
train_imgs = sd.oversample_train_set(train_imgs)

# Keep a stratified SAMPLE_FRACTION of each set (same seed for both draws).
train_imgs = _stratified_subsample(train_imgs, sd.imgid_to_classid_mapping)
test_imgs = _stratified_subsample(test_imgs, sd.imgid_to_classid_mapping)

# Build the image / label sequences for the sampled ids.
X_train = sd.get_image_sequence(train_imgs)
y_train = sd.get_label_sequence(train_imgs, label_type="preprocessed")
X_test = sd.get_image_sequence(test_imgs)
y_test = sd.get_label_sequence(test_imgs, label_type="preprocessed")

{-2: 1.1604, -1: 0.4045, 1: 0.7517, 2: 2.2761, 3: 0.4069, 4: 1.0004}
Old Class Distribution: 
 -2     384
-1    5312
 1     692
 2     176
 3    4283
 4     464
dtype: int64
New Class Distribution: 
 -2     384
-1    5312
 1    5312
 2    5312
 3    5312
 4    5312
dtype: int64
