<a href="https://colab.research.google.com/github/emely3h/Geospatial_ML/blob/feature%2Fjaccard_index/data_exploration/physics_jaccard_emely.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Physical Jaccard Index

Calculate the physical jaccard index which will be the main success metric.

### 0. Get Stats for each image

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! ls
%cd drive/MyDrive/MachineLearning/Geospatial_ML
! ls

drive  sample_data
/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning/Geospatial_ML
architecture.drawio  experiment_1_2.ipynb  prepare_data
data_exploration     experiments	   README.md
evaluation	     models		   requirements.txt


In [3]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from data_exploration.mask_stats import Mask_Stats
from prepare_data.create_mask import create_physical_mask
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

In [4]:
total_tiles = 11121
train_tiles = total_tiles // 100 * 60 +1
test_val_tiles = total_tiles // 100 * 20 +1
data_path = "../data_colab/256_256"

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_tiles, 256, 256, 5), dtype=np.float32)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_tiles, 256, 256), dtype=np.float32) # use int?

train_stats = Mask_Stats(train_split_y)
train_stats.print_stats()


Shape: (6661, 256, 256)
Land pixels: 176919986  40.528 %
Valid pixels: 125877821  28.836 %
Invalid pixels: 133737489  30.636 %
Sum: 6661


In [5]:
def jaccard_coef(y_true, y_pred):
    y_true_f = keras.backend.flatten(y_true)
    y_pred_f = keras.backend.flatten(y_pred)

    intersection = keras.backend.sum(y_true_f * y_pred_f)
    return (intersection + 1.0) / (
            keras.backend.sum(y_true_f) + keras.backend.sum(y_pred_f) - intersection + 1.0
    ) 

In [6]:
def jaccard_for_batch(batch_idx, batch_size):
  print(f'\nCalculating jaccard index for batch {batch_idx}, copy tiles [{batch_idx}:{batch_idx+batch_size}] from mmap')
  x_input = np.zeros((batch_size, 256, 256, 5), dtype=np.float32)

  np.copyto(x_input, train_split_x[batch_idx:batch_idx+batch_size])
  print('\nx_input after copying batch from mmap\n')
  print(f'x_input shape: {x_input.shape}')
  print(f'x_input max: {np.max(x_input)}')
  print(f'x_input min: {np.min(x_input)}')

  y_mask = np.zeros((batch_size, 256, 256), dtype=np.float32)

  np.copyto(y_mask, train_split_y[batch_idx:batch_idx+batch_size])
  print('\ny_mask after copying batch from mmap\n')
  print(f'y_mask shape: {y_mask.shape}')
  print(f'y_mask max: {np.max(y_mask)}')
  print(f'y_mask min: {np.min(y_mask)}')

  pred_physical = create_physical_mask(x_input)
  print(f'pred mask shape: {pred_physical.shape}')

  y_one_hot = to_categorical(y_mask, num_classes=3)
  print(f'true mask shape: {y_one_hot.shape}')

  jaccard = jaccard_coef(y_one_hot, pred_physical)
  print(f'jaccard index: {jaccard}')
  return jaccard

In [7]:
def get_mean_jaccard(batch_size):
  batches = int(train_tiles / batch_size)
  rest = train_tiles % batch_size
  print(f'total tiles: {train_tiles} batches: {batches} rest: {rest}')
  mean_jaccard = 0
  for batch in range (0, batches):
    mean_jaccard += jaccard_for_batch(batch*batch_size, batch_size)
  if rest != 0:
    mean_jaccard += jaccard_for_batch(batches*batch_size, rest)
    rest_percentage = 100/batch_size*rest/100
    return mean_jaccard/ (batches + rest_percentage)
  else:
    return mean_jaccard / batches



In [9]:
mean_jaccard = get_mean_jaccard(500)

total tiles: 6661 batches: 13 rest: 161

Calculating jaccard index for batch 0, copy tiles [0:500] from mmap

x_input after copying batch from mmap

x_input shape: (500, 256, 256, 5)
x_input max: 255.0
x_input min: 0.0

y_mask after copying batch from mmap

y_mask shape: (500, 256, 256)
y_mask max: 2.0
y_mask min: 0.0
pred mask shape: (500, 256, 256, 3)
true mask shape: (500, 256, 256, 3)
jaccard index: 0.8502607941627502

Calculating jaccard index for batch 500, copy tiles [500:1000] from mmap

x_input after copying batch from mmap

x_input shape: (500, 256, 256, 5)
x_input max: 255.0
x_input min: 0.0

y_mask after copying batch from mmap

y_mask shape: (500, 256, 256)
y_mask max: 2.0
y_mask min: 0.0
pred mask shape: (500, 256, 256, 3)
true mask shape: (500, 256, 256, 3)
jaccard index: 0.9117470383644104

Calculating jaccard index for batch 1000, copy tiles [1000:1500] from mmap

x_input after copying batch from mmap

x_input shape: (500, 256, 256, 5)
x_input max: 255.0
x_input min: 0

In [10]:
print(mean_jaccard)

tf.Tensor(0.82751197, shape=(), dtype=float32)


In [8]:
mean_jaccard_2 = get_mean_jaccard(6661)

total tiles: 6661 batches: 1 rest: 0

Calculating jaccard index for batch 0, copy tiles [0:6661] from mmap

x_input after copying batch from mmap

x_input shape: (6661, 256, 256, 5)
x_input max: 255.0
x_input min: 0.0

y_mask after copying batch from mmap

y_mask shape: (6661, 256, 256)
y_mask max: 2.0
y_mask min: 0.0
pred mask shape: (6661, 256, 256, 3)
true mask shape: (6661, 256, 256, 3)
jaccard index: 0.7695764899253845


513,5 compute units
standard gpu, RAM


1,96 compute units/ hour

In [None]:
# todo: over entire dataset, not only training and both for non-overlapping and overlapping => should be equal

In [None]:


data_path = "../data_colab/256_200"
train_total_tiles = 11063
val_total_tiles = 3545
test_total_tiles = 3699

train_split_x = np.memmap(os.path.join(data_path, "train_split_x.npy"), mode="r", shape=(train_total_tiles, 256, 256, 5), dtype=np.uint8)
train_split_y = np.memmap(os.path.join(data_path, "train_split_y.npy"), mode="r", shape=(train_total_tiles, 256, 256), dtype=np.uint8)
val_split_x = np.memmap(os.path.join(data_path, "val_split_x.npy"), mode="r", shape=(val_total_tiles, 256, 256, 5), dtype=np.uint8)
val_split_y = np.memmap(os.path.join(data_path, "val_split_y.npy"), mode="r", shape=(val_total_tiles, 256, 256), dtype=np.uint8)
test_split_x = np.memmap(os.path.join(data_path, "test_split_x.npy"), mode="r", shape=(test_total_tiles, 256, 256, 5), dtype=np.uint8)
test_split_y = np.memmap(os.path.join(data_path, "test_split_y.npy"), mode="r", shape=(test_total_tiles, 256, 256), dtype=np.uint8)
