# Data Exploration

The goal of this notebook is to compare the original images before they are split into tiles. When training the model on the entire dataset the dataset is split into training data, validation data and test data. When evaluating the model based on calculated metrics it is important to use data that the model has never seen before. As experiment two shows that it is beneficial if the tiles in which the images are split overlap it is crucial that tiles of the same original images don't appear in e.g. training data and test data. Therefore it is necessary to analyze the images in order to come up with a good training-, validation- and test split of the dataset.

The entire dataset consists of 17 images taken on the following dates:
- 2022_12_12
- 2022_12_02
- 2022_10_23
- 2022_10_13
- 2022_09_18
- 2022_09_13
- 2022_09_08
- 2022_09_03
- 2022_08_24
- 2022_08_14
- 2022_08_09
- 2022_08_04
- 2022_07_30
- 2022_07_25
- 2022_07_15
- 2022_07_10
- 2022_06_20

All images have been split with a tile size of 256 and a step size of 200.

The goal is to explore the following parameters:
- amount of tiles per image
- amount of pixels per class per image

### 0. Get Stats for each image

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! ls
%cd drive/MyDrive/MachineLearning/Geospatial_ML
! ls

drive  sample_data
/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning/Geospatial_ML
architecture.drawio  evaluation  notebooks     README.md
Copy_of_unet.ipynb   models	 prepare_data  requirements.txt


In [3]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [4]:
def check_shapes(x_input, y_mask):
  if not x_input.shape[0] == y_mask.shape[0]:
    raise TypeError('amount of tiles different in input and mask array.')
  if not (x_input.shape[1] == 256 and x_input.shape[2] == 256):
    raise TypeError('tile size of input array does not match 256')
  if not (x_input.shape[3] == 5):
    raise TypeError('input array does not have 5 channels')
  if not (y_mask.shape[1] == 256 and y_mask.shape[2] == 256):
    raise TypeError('tile size of mask array does not match 256')

In [5]:
def num_of_pixels_per_class(y_mask, label):
  flatten = np.reshape(y_mask, (-1,))
  pixel_match = (flatten == label)
  pix_per_class = np.count_nonzero(pixel_match)
  return pix_per_class


In [6]:
def data_exploration(x_input, y_mask, file_name):

  check_shapes(x_input, y_mask)

  num_tiles = x_input.shape[0]
  num_pixels = x_input.shape[0] * x_input.shape[1] * x_input.shape[2]

  num_land_pix = num_of_pixels_per_class(y_mask, 2)
  num_valid_pix = num_of_pixels_per_class(y_mask, 0)
  num_invalid_pix = num_of_pixels_per_class(y_mask, 1)

  if not num_pixels == (num_land_pix + num_valid_pix + num_invalid_pix):
    raise TypeError('pixels per class summed up is not equal to num_pixels.')

  percenetage_land = 100/ num_pixels * num_land_pix
  percenetage_valid = 100/ num_pixels * num_valid_pix
  percenetage_invalid = 100/ num_pixels * num_invalid_pix

  return {
      'file_name': file_name,
      'x_input.shape': x_input.shape,
      'y_mask.shape': y_mask.shape,
      'num_tiles': num_tiles,
      'num_pixels': num_pixels,
      'num_land_pix': num_land_pix,
      'num_valid_pix': num_valid_pix,
      'num_invalid_pix': num_invalid_pix,
      'percenetage_land': percenetage_land,
      'percenetage_valid': percenetage_valid,
      'percenetage_invalid': percenetage_invalid
  }
  

In [7]:
data_directory = "../data_colab/256_200"

all_stats = []

for file_name in os.listdir(data_directory):
  tiles_path = os.path.join(data_directory, file_name)

  y_mask  = np.load(tiles_path)['y_mask']
  x_input = np.load(tiles_path)['x_input']

  print(file_name)
  print(y_mask.shape)
  print(x_input.shape)

  stats = data_exploration(x_input, y_mask, file_name)
  all_stats.append(stats)


(889, 256, 256)
(889, 256, 256, 5)
(864, 256, 256)
(864, 256, 256, 5)
(1174, 256, 256)
(1174, 256, 256, 5)
(1251, 256, 256)
(1251, 256, 256, 5)
(1164, 256, 256)
(1164, 256, 256, 5)
(1258, 256, 256)
(1258, 256, 256, 5)
(1319, 256, 256)
(1319, 256, 256, 5)
(1323, 256, 256)
(1323, 256, 256, 5)
(1183, 256, 256)
(1183, 256, 256, 5)
(1179, 256, 256)
(1179, 256, 256, 5)
(1306, 256, 256)
(1306, 256, 256, 5)
(1196, 256, 256)
(1196, 256, 256, 5)
(957, 256, 256)
(957, 256, 256, 5)
(927, 256, 256)
(927, 256, 256, 5)
(1142, 256, 256)
(1142, 256, 256, 5)
(1175, 256, 256)
(1175, 256, 256, 5)
(1181, 256, 256)
(1181, 256, 256, 5)


### 1. Display stats

In [9]:
df = pd.DataFrame(all_stats)
df

Unnamed: 0,file_name,x_input.shape,y_mask.shape,num_tiles,num_pixels,num_land_pix,num_valid_pix,num_invalid_pix,percenetage_land,percenetage_valid,percenetage_invalid
0,2022_10_13.npz,"(889, 256, 256, 5)","(889, 256, 256)",889,58261504,25291432,23177868,9792204,43.410194,39.782475,16.807331
1,2022_07_15.npz,"(864, 256, 256, 5)","(864, 256, 256)",864,56623104,22234514,28697670,5690920,39.267565,50.681909,10.050526
2,2022_09_18.npz,"(1174, 256, 256, 5)","(1174, 256, 256)",1174,76939264,33676340,6146271,37116653,43.770031,7.988471,48.241497
3,2022_06_20.npz,"(1251, 256, 256, 5)","(1251, 256, 256)",1251,81985536,38419876,8891328,34674332,46.861773,10.844996,42.293231
4,2022_10_23.npz,"(1164, 256, 256, 5)","(1164, 256, 256)",1164,76283904,34711353,33409321,8163230,45.502853,43.796029,10.701117
5,2022_07_25.npz,"(1258, 256, 256, 5)","(1258, 256, 256)",1258,82444288,38571454,5478224,38394610,46.784871,6.644759,46.57037
6,2022_08_04.npz,"(1319, 256, 256, 5)","(1319, 256, 256)",1319,86441984,42183449,11617999,32640536,48.799723,13.440227,37.76005
7,2022_07_10.npz,"(1323, 256, 256, 5)","(1323, 256, 256)",1323,86704128,37265773,23166510,26271845,42.980391,26.719039,30.30057
8,2022_07_30.npz,"(1183, 256, 256, 5)","(1183, 256, 256)",1183,77529088,34286477,2614816,40627795,44.224017,3.37269,52.403293
9,2022_08_14.npz,"(1179, 256, 256, 5)","(1179, 256, 256)",1179,77266944,33801595,6528640,36936709,43.746515,8.449461,47.804025


In [12]:
# sort by num tiles
df_num_tiles = df[['file_name', 'num_tiles', 'percenetage_valid', 'percenetage_invalid', 'percenetage_land']]
df_num_tiles_sorted = df_num_tiles.sort_values(by='num_tiles')
df_num_tiles_sorted


Unnamed: 0,file_name,num_tiles,percenetage_valid,percenetage_invalid,percenetage_land
1,2022_07_15.npz,864,50.681909,10.050526,39.267565
0,2022_10_13.npz,889,39.782475,16.807331,43.410194
13,2022_09_08.npz,927,24.559434,31.447306,43.99326
12,2022_12_12.npz,957,28.253614,21.371911,50.374475
14,2022_12_02.npz,1142,35.459047,17.023361,47.517591
4,2022_10_23.npz,1164,43.796029,10.701117,45.502853
2,2022_09_18.npz,1174,7.988471,48.241497,43.770031
15,2022_09_13.npz,1175,10.073545,46.242989,43.683466
9,2022_08_14.npz,1179,8.449461,47.804025,43.746515
16,2022_08_09.npz,1181,4.649956,51.351912,43.998132


In [16]:
# sort by ratio invalid valid

df_ratio = df[['file_name', 'percenetage_valid', 'percenetage_invalid', 'percenetage_land', 'num_tiles']]
df_ratio_sorted = df_ratio.sort_values(by='percenetage_valid')
df_ratio_sorted

Unnamed: 0,file_name,percenetage_valid,percenetage_invalid,percenetage_land,num_tiles
8,2022_07_30.npz,3.37269,52.403293,44.224017,1183
16,2022_08_09.npz,4.649956,51.351912,43.998132,1181
5,2022_07_25.npz,6.644759,46.57037,46.784871,1258
2,2022_09_18.npz,7.988471,48.241497,43.770031,1174
9,2022_08_14.npz,8.449461,47.804025,43.746515,1179
15,2022_09_13.npz,10.073545,46.242989,43.683466,1175
3,2022_06_20.npz,10.844996,42.293231,46.861773,1251
6,2022_08_04.npz,13.440227,37.76005,48.799723,1319
13,2022_09_08.npz,24.559434,31.447306,43.99326,927
7,2022_07_10.npz,26.719039,30.30057,42.980391,1323
