<a href="https://colab.research.google.com/github/emely3h/Geospatial_ML/blob/feature%2Fjaccard_index/data_exploration/physics_jaccard_emely.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Physics Jaccard Index

Calculate the physics jaccard index which will be the main success metric.

### 0. Prepare Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! ls
%cd drive/MyDrive/MachineLearning/Geospatial_ML
! ls

drive  sample_data
/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning/Geospatial_ML
architecture.drawio  experiment_1_2.ipynb  prepare_data
data_exploration     experiments	   README.md
evaluation	     models		   requirements.txt


In [3]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from data_exploration.mask_stats import Mask_Stats
from prepare_data.create_mask import create_physical_mask
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from datetime import datetime
from typing import Tuple

### 1. Helper functions

In [4]:
def intersection_union(y_true, y_pred):
    y_true_f = keras.backend.flatten(y_true)
    y_pred_f = keras.backend.flatten(y_pred)

    intersection = keras.backend.sum(y_true_f * y_pred_f)
    return (intersection, (keras.backend.sum(y_true_f) + keras.backend.sum(y_pred_f) - intersection))

In [5]:
def jaccard_for_batch(split_x: np.memmap, split_y: np.memmap, batch_idx, batch_size):
  print(f'\nCalculating intersection and union for batch {batch_idx/batch_size}, copy tiles [{batch_idx}:{batch_idx+batch_size}] from mmap')
  x_input = np.zeros((batch_size, 256, 256, 5), dtype=np.float32)

  np.copyto(x_input, split_x[batch_idx:batch_idx+batch_size])
  print(f'x_input shape after copying batch from mmap: {x_input.shape}')

  y_mask = np.zeros((batch_size, 256, 256), dtype=np.float32)

  np.copyto(y_mask, split_y[batch_idx:batch_idx+batch_size])
  print(f'y_mask shape after copying batch from mmap: {y_mask.shape}')

  pred_physical = create_physical_mask(x_input)
  print(f'pred mask shape: {pred_physical.shape}')

  y_one_hot = to_categorical(y_mask, num_classes=3)
  print(f'true mask shape: {y_one_hot.shape}')

  inter_union = intersection_union(y_one_hot, pred_physical)
  print(f'intersection: {inter_union[0]} union: {inter_union[1]} intermediate_jaccard: {(inter_union[0]+1)/(inter_union[1]+1)}')
  return inter_union

In [6]:
def get_mean_jaccard(split_x: np.memmap, split_y: np.memmap, batch_size: int):
  num_tiles = split_x.shape[0]
  batches = int(num_tiles / batch_size)
  rest = num_tiles % batch_size
  print(f'total tiles: {num_tiles} batches: {batches} rest: {rest}')
  intersection = 0
  union = 0
  for batch in range (0, batches):
    inter_union = jaccard_for_batch(split_x, split_y, batch*batch_size, batch_size)
    intersection += inter_union[0]
    union += inter_union[1]

  if rest != 0:
    inter_union = jaccard_for_batch(split_x, split_y, batches*batch_size, rest)
    intersection += inter_union[0]
    union += inter_union[1]

  mean_jaccard = (intersection +1.0) / (union + 1.0)
  print(f'\nTotal intersection: {intersection} Total union: {union} mean_jaccard: {mean_jaccard}')
  return [mean_jaccard, intersection, union]



### 2. Jaccard index for overlapping tiles

In [8]:
data_path = "../data_colab/256_200"
train_total_tiles = 11063
val_total_tiles = 3545
test_total_tiles = 3699

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_total_tiles, 256, 256, 5), dtype=np.uint8)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_total_tiles, 256, 256), dtype=np.uint8)
val_split_x = np.memmap(os.path.join(data_path, "val_split_x.npy"), mode="r", shape=(val_total_tiles, 256, 256, 5), dtype=np.uint8)
val_split_y = np.memmap(os.path.join(data_path, "val_split_y.npy"), mode="r", shape=(val_total_tiles, 256, 256), dtype=np.uint8)
test_split_x = np.memmap(os.path.join(data_path, "test_split_x.npy"), mode="r", shape=(test_total_tiles, 256, 256, 5), dtype=np.uint8)
test_split_y = np.memmap(os.path.join(data_path, "test_split_y.npy"), mode="r", shape=(test_total_tiles, 256, 256), dtype=np.uint8)

train_stats = Mask_Stats(train_split_y)
train_stats.print_stats()
print()
val_stats = Mask_Stats(val_split_y)
val_stats.print_stats()
print()
test_stats = Mask_Stats(test_split_y)
test_stats.print_stats()

Shape: (11063, 256, 256)
Land pixels: 326666615  45.056 %
Valid pixels: 231026701  31.865 %
Invalid pixels: 167331452  23.079 %
Sum: 11063

Shape: (3545, 256, 256)
Land pixels: 100682317  43.337 %
Valid pixels: 76811432  33.062 %
Invalid pixels: 54831371  23.601 %
Sum: 3545

Shape: (3699, 256, 256)
Land pixels: 112712687  46.495 %
Valid pixels: 71301683  29.413 %
Invalid pixels: 58403294  24.092 %
Sum: 3699


In [9]:
start = datetime.now()
print("Calculate Intersection and Union for training set \n")
mean_train_jaccard = get_mean_jaccard(train_split_x, train_split_y, 1000)
print("\n\nCalculate Intersection and Union for validation set \n")
mean_val_jaccard = get_mean_jaccard(val_split_x, val_split_y, 1000)
print("\n\nCalculate Intersection and Union for testing set \n")
mean_test_jaccard = get_mean_jaccard(test_split_x, test_split_y, 1000)
end = datetime.now()
print(f'time needed: {end - start}')

Calculate Intersection and Union for training set 

total tiles: 11063 batches: 11 rest: 63

Calculating intersection and union for batch 0.0, copy tiles [0:1000] from mmap
x_input shape after copying batch from mmap: (1000, 256, 256, 5)
y_mask shape after copying batch from mmap: (1000, 256, 256)
pred mask shape: (1000, 256, 256, 3)
true mask shape: (1000, 256, 256, 3)
intersection: 63511032.0 union: 67560960.0 intermediate_jaccard: 0.9400551915168762

Calculating intersection and union for batch 1.0, copy tiles [1000:2000] from mmap
x_input shape after copying batch from mmap: (1000, 256, 256, 5)
y_mask shape after copying batch from mmap: (1000, 256, 256)
pred mask shape: (1000, 256, 256, 3)
true mask shape: (1000, 256, 256, 3)
intersection: 63649104.0 union: 67422896.0 intermediate_jaccard: 0.9440280199050903

Calculating intersection and union for batch 2.0, copy tiles [2000:3000] from mmap
x_input shape after copying batch from mmap: (1000, 256, 256, 5)
y_mask shape after copying

In [10]:
total_intersection = mean_train_jaccard[1] + mean_val_jaccard[1] + mean_test_jaccard[1]
total_union = mean_train_jaccard[2] + mean_val_jaccard[2] + mean_test_jaccard[2]

mean_jaccard = (total_intersection +1) / (total_union +1)
print(mean_jaccard)

tf.Tensor(0.91157514, shape=(), dtype=float32)


### 3. Jaccard index for non-overlapping tiles

In [11]:
total_tiles = 11121
train_tiles = total_tiles // 100 * 60 +1
test_val_tiles = total_tiles // 100 * 20 +1
data_path = "../data_colab/256_256"

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_tiles, 256, 256, 5), dtype=np.float32)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_tiles, 256, 256), dtype=np.float32)
val_split_x = np.memmap(os.path.join(data_path, "val_split_x.npy"), mode="r", shape=(test_val_tiles, 256, 256, 5), dtype=np.float32)
val_split_y = np.memmap(os.path.join(data_path, "val_split_y.npy"), mode="r", shape=(test_val_tiles, 256, 256), dtype=np.float32)
test_split_x = np.memmap(os.path.join(data_path, "test_split_x.npy"), mode="r", shape=(test_val_tiles, 256, 256, 5), dtype=np.float32)
test_split_y = np.memmap(os.path.join(data_path, "test_split_y.npy"), mode="r", shape=(test_val_tiles, 256, 256), dtype=np.float32)

train_stats = Mask_Stats(train_split_y)
train_stats.print_stats()
print()
val_stats = Mask_Stats(val_split_y)
val_stats.print_stats()
print()
test_stats = Mask_Stats(test_split_y)
test_stats.print_stats()

Shape: (6661, 256, 256)
Land pixels: 176919986  40.528 %
Valid pixels: 125877821  28.836 %
Invalid pixels: 133737489  30.636 %
Sum: 6661

Shape: (2221, 256, 256)
Land pixels: 59840780  41.112 %
Valid pixels: 41275933  28.358 %
Invalid pixels: 44438743  30.530 %
Sum: 2221

Shape: (2221, 256, 256)
Land pixels: 59010175  40.541 %
Valid pixels: 42215169  29.003 %
Invalid pixels: 44330112  30.456 %
Sum: 2221


In [None]:
start = datetime.now()
print("Calculate Intersection and Union for training set \n")
mean_train_jaccard = get_mean_jaccard(train_split_x, train_split_y, 1000)
print("\n\nCalculate Intersection and Union for validation set \n")
mean_val_jaccard = get_mean_jaccard(val_split_x, val_split_y, 1000)
print("\n\nCalculate Intersection and Union for testing set \n")
mean_test_jaccard = get_mean_jaccard(test_split_x, test_split_y, 1000)
end = datetime.now()
print(f'time needed: {end - start}')

Calculate Intersection and Union for training set 

total tiles: 6661 batches: 6 rest: 661

Calculating intersection and union for batch 0.0, copy tiles [0:1000] from mmap


In [None]:
total_intersection = mean_train_jaccard[1] + mean_val_jaccard[1] + mean_test_jaccard[1]
total_union = mean_train_jaccard[2] + mean_val_jaccard[2] + mean_test_jaccard[2]

mean_jaccard = (total_intersection +1) / (total_union +1)
print(mean_jaccard)

### Notes

In [8]:
mean_jaccard_2 = get_mean_jaccard(6661)

total tiles: 6661 batches: 1 rest: 0

Calculating jaccard index for batch 0, copy tiles [0:6661] from mmap

x_input after copying batch from mmap

x_input shape: (6661, 256, 256, 5)
x_input max: 255.0
x_input min: 0.0

y_mask after copying batch from mmap

y_mask shape: (6661, 256, 256)
y_mask max: 2.0
y_mask min: 0.0
pred mask shape: (6661, 256, 256, 3)
true mask shape: (6661, 256, 256, 3)
jaccard index: 0.7695764899253845


when executed in batches (batch size ~ 500) no GPU, no additional System RAM needed

Time to execute: 2 min


In [None]:
# todo: over entire dataset, not only training and both for non-overlapping and overlapping => should be almost equal
# todo: calculate index separately for each class

In [18]:
class JaccardIndexCalculator:

    def __init__(self, split_x: np.memmap, split_y: np.memmap, tiles: int, chunk_size: int):
        self.split_x = split_x
        self.split_y = split_y
        self.tiles = tiles
        self.chunk_size = chunk_size
        self.num_chunks = int(np.ceil(tiles/chunk_size))
        self.current_chunk_index = 0
        self.intersection = 0
        self.union = 0

    def __iter__(self):
        return self

    def __next__(self):
       if self.current_chunk_index >= self.num_chunks:
            raise StopIteration
       print(f"Calculating intersection and union for chunk {self.current_chunk_index}")
       x_input_chunk, y_mask_chunk = self.copy_data_to_array()
       pred_physical = create_physical_mask(x_input_chunk)
       y_one_hot = to_categorical(y_mask_chunk, num_classes=3)
       inter_union = self.intersection_union(pred_physical[:, :, :, 1], y_one_hot[:, :, :, 1])
       
       print(f"Intersection:{inter_union[0]}, union: {inter_union[1]} \n")
       
       
       self.intersection += inter_union[0]
       self.union += inter_union[1]
  
       self.current_chunk_index += 1
       return inter_union


    def copy_data_to_array(self) -> Tuple[np.ndarray, np.ndarray]:
      start_index = self.current_chunk_index * self.chunk_size
      end_index = start_index + self.chunk_size
      if end_index > self.tiles:
          end_index = self.tiles
      chunk_size = end_index - start_index
      x_input = np.zeros((chunk_size, 256, 256, 5), dtype=np.float32)
      np.copyto(x_input, self.split_x[start_index:end_index])
      y_mask = np.zeros((chunk_size, 256, 256), dtype=np.float32)
      np.copyto(y_mask, self.split_y[start_index:end_index])
      print(f"Copyed from mmap [{start_index}:{end_index}]")
      return x_input, y_mask
    
    def intersection_union(self, y_true, y_pred):
      y_true_f = keras.backend.flatten(y_true)
      y_pred_f = keras.backend.flatten(y_pred)

      intersection = keras.backend.sum(y_true_f * y_pred_f)
      return (intersection, (keras.backend.sum(y_true_f) + keras.backend.sum(y_pred_f) - intersection))
    
    def mean_jaccard(self):
      for chunk in self:
        pass
      return [(self.intersection +1) / (self.union + 1), self.intersection, self.union]

test = JaccardIndexCalculator(train_split_x, train_split_y, train_tiles, 500)
start = datetime.now()
mean_jaccard = test.mean_jaccard()
end = datetime.now()
print(f'time needed: {end - start}')
print(mean_jaccard)