<a href="https://colab.research.google.com/github/emely3h/Geospatial_ML/blob/feature%2Fphysics-jaccard-index/data_exploration/physics_jaccard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Physics Jaccard Index

Calculate the physics jaccard index which will be the main success metric.

# Small Subset(256x256: Training)

### 0. Get Stats for each image

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! ls
%cd drive/MyDrive/MachineLearning/Geospatial_ML
! ls

drive  sample_data
/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning/Geospatial_ML
architecture.drawio  experiment_1_2.ipynb  prepare_data
data_exploration     experiments	   README.md
evaluation	     models		   requirements.txt


In [None]:
! git checkout feature/physics-jaccard-index
! git pull

Already on 'feature/physics-jaccard-index'
Your branch is up to date with 'origin/feature/physics-jaccard-index'.
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 4 (delta 2), reused 4 (delta 2), pack-reused 0[K
Unpacking objects: 100% (4/4), 676 bytes | 2.00 KiB/s, done.
From https://github.com/emely3h/Geospatial_ML
   7f09b81..06a6ecc  feature/physics-jaccard-index -> origin/feature/physics-jaccard-index
Updating 7f09b81..06a6ecc
Fast-forward
 models/helpers.py | 19 [32m+++++++++++++++++++[m
 1 file changed, 19 insertions(+)


In [3]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from data_exploration.mask_stats import Mask_Stats
from prepare_data.create_mask import create_physical_mask
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from models.unet_model import unet_2d
# from models.helpers import copy_data_to_arrays, save_pickle, jaccard_coef

### 1. Loading + Preparing Training Data

In [4]:
total_tiles = 11121
train_tiles = total_tiles // 100 * 60 +1
test_val_tiles = total_tiles // 100 * 20 +1
data_path = "../data_colab/256_256"

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_tiles, 256, 256, 5), dtype=np.float32)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_tiles, 256, 256), dtype=np.float32)

train_stats = Mask_Stats(train_split_y)
train_stats.print_stats()


Shape: (6661, 256, 256)
Land pixels: 176919986  40.528 %
Valid pixels: 125877821  28.836 %
Invalid pixels: 133737489  30.636 %
Sum: 6661


### 2. Initializing training data

In [29]:
import numpy as np
from typing import Tuple
from skimage.measure import label
from keras.utils import to_categorical


class JaccardIndexCalculator:

    def __init__(self, split_x: np.ndarray, split_y: np.ndarray, tiles: int, chunk_size: int):
        self.split_x = split_x
        self.split_y = split_y
        self.tiles = tiles
        self.chunk_size = chunk_size
        self.num_chunks = tiles // chunk_size
        self.current_chunk_index = 0
        self.jaccard_indices = []

    def __iter__(self):
        return self

    def __next__(self):
        if self.current_chunk_index >= self.num_chunks:
            raise StopIteration

        x_input_chunk, y_mask_chunk = self.initialize_saved_data()
        pred_physical = self.create_physical_mask(x_input_chunk)
        y_one_hot = to_categorical(y_mask_chunk, num_classes=3)
        jaccard_index = self.jaccard_coef(pred_physical[:, :, :, 1], y_one_hot[:, :, :, 1])
        self.jaccard_indices.append(jaccard_index)
        self.current_chunk_index += 1
        return jaccard_index

    def initialize_saved_data(self) -> Tuple[np.ndarray, np.ndarray]:
        start_index = self.current_chunk_index * self.chunk_size
        end_index = start_index + self.chunk_size
        x_input = np.zeros((self.chunk_size, 256, 256, 5), dtype=np.float32)
        np.copyto(x_input, self.split_x[start_index:end_index])
        print("x_input_shape", x_input.shape)
        y_mask = np.zeros((self.chunk_size, 256, 256), dtype=np.float32)
        np.copyto(y_mask, self.split_y[start_index:end_index])
        print("y_mask_shape:",y_mask.shape)
        return x_input, y_mask

    def label_pixels(self, img):
        mask1 = img == 0
        mask2 = (img == 255) | (img == 253)
        img[mask1] = 2
        img[mask2] = 0
        img[~(mask1 | mask2)] = 1
        return img

    def create_physical_mask(self, x_input: np.ndarray) -> np.ndarray:
      wq_channel = x_input[:, :, :, 4]
      labeled = self.label_pixels(wq_channel)
      return to_categorical(labeled, num_classes=3)

    @staticmethod
    def jaccard_coef(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true_f = y_true.flatten()
        y_pred_f = y_pred.flatten()
        intersection = np.sum(y_true_f * y_pred_f)
        return (intersection + 1.0) / (np.sum(y_true_f) + np.sum(y_pred_f) - intersection + 1.0)

    def calculate_mean_jaccard_index(self) -> float:
        for jaccard_index in self:
            pass
        return np.mean(self.jaccard_indices)


In [30]:
jaccard_index_calculator = JaccardIndexCalculator(train_split_x, train_split_y, train_tiles, chunk_size=1000)
mean_jaccard_index = jaccard_index_calculator.calculate_mean_jaccard_index()
print(mean_jaccard_index)

x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
x_input_shape (1000, 256, 256, 5)
y_mask_shape: (1000, 256, 256)
0.8615554758641575


In [None]:
x_input,y_mask = copy_data_to_arrays(train_split_x, train_split_y, train_tiles, chunk_size)

Copying data to arrays...
x_input shape: (6661, 256, 256, 5)
x_input min value: 0.0 x_input max value: 0.0
Data copied to x_input...
x_input shape: (6661, 256, 256, 5)
x_input min value: 0.0 x_input max value: 255.0

Initializing y_mask...
y_mask shape: (6661, 256, 256)
y_mask min value: 0.0 y_mask max value: 0.0
Data copied to y_mask...
y_mask shape: (6661, 256, 256)
y_mask min value: 0.0 y_mask max value: 2.0


### 3. Create Physical mask(training data)

In [None]:
pred_physical = create_physical_mask(x_input, chunk_size)

### 4. Encoding (training data)

In [None]:
y_one_hot = to_categorical(y_mask, num_classes=3)

### 5. Calculate Physics jaccard index(training data)

In [None]:
jaccard = jaccard_coef(y_one_hot, pred_physical)
print(jaccard)

tf.Tensor(0.7695766, shape=(), dtype=float32)


In [None]:
# save_pickle(jaccard, "../data_colab/256_256", "train_physics_jaccard")

Saving training_physics_jaccard.pkl to ../data_colab/256_256...
Saved training_physics_jaccard.pkl to ../data_colab/256_256.


# All dataset(Overlap: 56px)

### 6. Loading all data(256_200)

In [None]:
data_path = "../data_colab/256_200"
train_total_tiles = 11063
val_total_tiles = 3545
test_total_tiles = 3699

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_total_tiles, 256, 256, 5), dtype=np.uint8)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_total_tiles, 256, 256), dtype=np.uint8)
val_split_x = np.memmap(os.path.join(data_path, "val_split_x.npy"), mode="r", shape=(val_total_tiles, 256, 256, 5), dtype=np.uint8)
val_split_y = np.memmap(os.path.join(data_path, "val_split_y.npy"), mode="r", shape=(val_total_tiles, 256, 256), dtype=np.uint8)
test_split_x = np.memmap(os.path.join(data_path, "test_split_x.npy"), mode="r", shape=(test_total_tiles, 256, 256, 5), dtype=np.uint8)
test_split_y = np.memmap(os.path.join(data_path, "test_split_y.npy"), mode="r", shape=(test_total_tiles, 256, 256), dtype=np.uint8)


### 7. Initializing all data(256_200)

In [None]:
print("1: Training Dataset")
train_x_input, train_y_mask = copy_data_to_arrays(train_split_x, train_split_y, train_total_tiles, chunk_size)
print("\n2: Validation Dataset")
val_x_input, val_y_mask = copy_data_to_arrays(val_split_x, val_split_y, val_total_tiles, chunk_size)
print("\n3: Test Dataset")
test_x_input, test_y_mask = copy_data_to_arrays(test_split_x, test_split_y, test_total_tiles, chunk_size)

1: Training Dataset
Copying data to arrays...
x_input shape: (11063, 256, 256, 5)
x_input min value: 0.0 x_input max value: 0.0
Data copied to x_input...
x_input shape: (11063, 256, 256, 5)
x_input min value: 0.0 x_input max value: 255.0

Initializing y_mask...
y_mask shape: (11063, 256, 256)
y_mask min value: 0.0 y_mask max value: 0.0
Data copied to y_mask...
y_mask shape: (11063, 256, 256)
y_mask min value: 0.0 y_mask max value: 2.0

2: Validation Dataset
Copying data to arrays...
x_input shape: (3545, 256, 256, 5)
x_input min value: 0.0 x_input max value: 0.0
Data copied to x_input...
x_input shape: (3545, 256, 256, 5)
x_input min value: 0.0 x_input max value: 255.0

Initializing y_mask...
y_mask shape: (3545, 256, 256)
y_mask min value: 0.0 y_mask max value: 0.0
Data copied to y_mask...
y_mask shape: (3545, 256, 256)
y_mask min value: 0.0 y_mask max value: 2.0

3: Test Dataset
Copying data to arrays...
x_input shape: (3699, 256, 256, 5)
x_input min value: 0.0 x_input max value: 0.0

### 8. Create all Physical masks (256_200)

In [None]:
train_pred_physical = create_physical_mask(train_x_input, chunk_size)
val_pred_physical = create_physical_mask(val_x_input, chunk_size)
test_pred_physical = create_physical_mask(test_x_input, chunk_size)

### 9. Encoding all data (256_200)

In [None]:
train_y_one_hot = to_categorical(train_y_mask, num_classes=3)
val_y_one_hot = to_categorical(val_y_mask, num_classes=3)
test_y_one_hot = to_categorical(test_y_mask, num_classes=3)

### 10. Calculate all Physics jaccard index (256_200)

In [None]:
train_jaccard = jaccard_coef(train_y_one_hot, train_pred_physical)
val_jaccard = jaccard_coef(val_y_one_hot, val_pred_physical)
test_jaccard = jaccard_coef(test_y_one_hot, test_pred_physical)

ResourceExhaustedError: ignored

In [None]:
print("1.Traing dataset")
print(train_jaccard)
print("\n2.Validation dataset")
print(val_jaccard)
print("\n3.Test dataset")
print(test_jaccard)

In [None]:
save_pickle(train_jaccard, "../data_colab/256_200", "train_physics_jaccard")
save_pickle(val_jaccard, "../data_colab/256_200", "val_physics_jaccard")
save_pickle(test_jaccard, "../data_colab/256_200", "test_physics_jaccard")

# All dataset(No overlap)

### 11. Loading all data(256_256)

In [None]:
data_path = "../data_colab/256_256"
train_total_tiles = 11063
val_total_tiles = 3545
test_total_tiles = 3699

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_total_tiles, 256, 256, 5), dtype=np.uint8)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_total_tiles, 256, 256), dtype=np.uint8)
val_split_x = np.memmap(os.path.join(data_path, "val_split_x.npy"), mode="r", shape=(val_total_tiles, 256, 256, 5), dtype=np.uint8)
val_split_y = np.memmap(os.path.join(data_path, "val_split_y.npy"), mode="r", shape=(val_total_tiles, 256, 256), dtype=np.uint8)
test_split_x = np.memmap(os.path.join(data_path, "test_split_x.npy"), mode="r", shape=(test_total_tiles, 256, 256, 5), dtype=np.uint8)
test_split_y = np.memmap(os.path.join(data_path, "test_split_y.npy"), mode="r", shape=(test_total_tiles, 256, 256), dtype=np.uint8)

### 12. Initializing all data(256_256)

In [None]:
print("1: Training Dataset")
train_x_input, train_y_mask = initialize_saved_data(train_split_x, train_split_y, train_total_tiles)
print("\n2: Validation Dataset")
val_x_input, val_y_mask = initialize_saved_data(val_split_x, val_split_y, val_total_tiles)
print("\n3: Test Dataset")
test_x_input, test_y_mask = initialize_saved_data(test_split_x, test_split_y, test_total_tiles)

### 13. Create all Physical masks (256_256)

In [None]:
train_pred_physical = create_physical_mask(train_x_input)
val_pred_physical = create_physical_mask(val_x_input)
test_pred_physical = create_physical_mask(test_x_input)

### 14. Encoding all data (256_256)

In [None]:
train_y_one_hot = to_categorical(train_y_mask, num_classes=3)
val_y_one_hot = to_categorical(val_y_mask, num_classes=3)
test_y_one_hot = to_categorical(test_y_mask, num_classes=3)

### 15. Calculate all Physics jaccard index (256_256)

In [None]:
train_jaccard = jaccard_coef(train_y_one_hot, train_pred_physical)
val_jaccard = jaccard_coef(val_y_one_hot, val_pred_physical)
test_jaccard = jaccard_coef(test_y_one_hot, test_pred_physical)

print("1.Traing dataset")
print(train_jaccard)
print("\n2.Validation dataset")
print(val_jaccard)
print("\n3.Test dataset")
print(test_jaccard)