# Data exploration and cleaning

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import time
from IPython.display import clear_output

In [36]:
# Load the processed L2A arrays from disk: x (data), y (labels) and length files,
# one set per split.

# --- training split ---
train_data_og = np.load('../../gcp/train-processed/data_x_l2a_processed.npy')
train_labels_og = np.load('../../gcp/train-processed/data_y_l2a_processed.npy')
train_length = np.load('../../gcp/train-processed/length_l2a_processed.npy')

# --- test split ---
test_data_og = np.load('../../gcp/test-processed/test_x_l2a_processed.npy')
test_labels_og = np.load('../../gcp/test-processed/test_y_l2a_processed.npy')
test_length = np.load('../../gcp/test-processed/test_length_l2a_processed.npy')

In [37]:
# The X data channels are as below:
# [B02,B03,B04,B05,B06,B07, B08, B8A,B11,B12, SLOPE, EVI, MSAVI2, Bare soil index, Sentinel 1 VV, Sentinel 1 VH]
# NOTE(review): the list above names 16 channels, but the last axis of the raw
# array has 17 entries (see the shape displayed by this cell) — confirm which
# channel is missing from the list.
# Presumably the axes are (samples, time steps, height, width, channels) — TODO confirm.

# Display the shape of the raw test data.
test_data_og.shape

(684, 24, 16, 16, 17)

## Exploratory visualization of the different bands

### Cleaning up the data

**Removing erroneous column 14**

In [38]:
# Index 14 on the last axis of the raw test data contains NaN-s.
# Build a boolean NaN mask for that slice and tabulate how many entries are
# NaN (1) versus not NaN (0).
nan = np.isnan(test_data_og[..., 14])
unique_elements, counts_elements = np.unique(nan, return_counts=True)

print("Frequency of unique values of the said array:")
# Row 0: the distinct mask values, row 1: how often each one occurs.
print(np.asarray([unique_elements, counts_elements]))

Frequency of unique values of the said array:
[[      0       1]
 [4201728     768]]


In [39]:
# Column 14 holds NaN-s, so drop it from the last axis (axis 4) of both splits.
# np.delete returns new arrays; the *_og originals are left untouched.
train_data2 = np.delete(train_data_og, obj=14, axis=4)
test_data2 = np.delete(test_data_og, obj=14, axis=4)

In [40]:
# Display the shape of the training data after the column was deleted
# (the last axis should be one shorter than in train_data_og).
train_data2.shape

(4383, 24, 16, 16, 16)

In [41]:
# After the deletion, index 14 of the last axis holds what used to be column 15,
# so this frequency table on its own does not prove that the NaN-s are gone.
nan = np.isnan(test_data2[:, :, :, :, 14])

unique_elements, counts_elements = np.unique(nan, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

# Verify every remaining channel of BOTH splits. A single leftover NaN would
# propagate through np.min / np.max and corrupt the normalization statistics.
assert not np.isnan(train_data2).any(), "train_data2 still contains NaN values"
assert not np.isnan(test_data2).any(), "test_data2 still contains NaN values"

Frequency of unique values of the said array:
[[      0]
 [4202496]]


**Normalize data [x_norm = (x - min) / (max - min)]**

In [42]:
# Pool both splits so that a single set of per-channel statistics is used to
# scale train and test alike.
# NOTE(review): because the statistics include the test split, the training data
# on its own is not guaranteed to reach exactly 0 or 1 in every channel.
data = np.concatenate((train_data2, test_data2), axis=0)

# Per-channel minimum and range (max - min): reduce over every axis except the
# last one, which leaves one value per channel.
minarray = np.min(data, axis=(0, 1, 2, 3))
max_min_array = np.max(data, axis=(0, 1, 2, 3)) - minarray

# x_norm = (x - min) / (max - min). The per-channel vectors broadcast against the
# last axis, so no sample counts or array shapes need to be hard-coded and no
# full-size tiled temporaries are allocated.
# NOTE(review): a channel that is constant over the pooled data would have a
# range of 0 and divide by zero here — confirm this cannot occur.
train_data3 = (train_data2 - minarray) / max_min_array
test_data3 = (test_data2 - minarray) / max_min_array

In [43]:
# Sanity check on the normalized training data: reduce over every axis except
# the last one and test whether each channel's extreme is exactly 0 (min) / 1 (max).
print("Min value of all variables is 0:", (train_data3.min(axis=(0, 1, 2, 3)) == 0).all())
print("Max value of all variables is 1:", (train_data3.max(axis=(0, 1, 2, 3)) == 1).all())

Min value of all variables is 0: False
Max value of all variables is 1: False


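# Open question
Why are two bands not normalizing correctly? A likely cause: the min and max used for scaling are computed over the train and test sets combined, while the check above looks at the training set only. A band therefore reaches exactly 0 (or 1) on the training set only if its overall minimum (or maximum) happens to lie in the training data.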

In [44]:
# Per-channel maximum of the normalized training data (one value per entry of the last axis).
train_data3.max(axis=(0, 1, 2, 3))


array([1.       , 1.       , 1.       , 1.       , 1.       , 1.       ,
       0.9537339, 0.864469 , 1.       , 1.       , 1.       , 1.       ,
       1.       , 1.       , 1.       , 1.       ], dtype=float32)

In [48]:
# Per-channel minimum of the normalized training data (one value per entry of the last axis).
train_data3.min(axis=(0, 1, 2, 3))

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.03773958, 0.01146612,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ], dtype=float32)

**Simplifying to a single time step (index 22 of the time axis)**

In [45]:
# Keep only index 22 along axis 1, which removes that axis from the training array.
train_data4 = train_data3[:, 22]
train_data4.shape

(4383, 16, 16, 16)

In [46]:
# Keep only index 22 along axis 1, which removes that axis from the test array.
test_data4 = test_data3[:, 22]
test_data4.shape

(684, 16, 16, 16)

In [47]:
# Bind the short names to the single-time-step arrays (plain aliases, no copy is made).
train_data, test_data = train_data4, test_data4

In [51]:
# Show both shapes. A bare expression is only rendered when it is the last one
# in a cell, so the two are combined into a single tuple:
# (shape of the test labels, shape of the test lengths).
test_labels_og.shape, test_length.shape

(684,)

### Exporting the cleaned data for use in other notebooks

In [52]:
# Save the normalized arrays that still carry the full time axis (train_data3 /
# test_data3, not the single-time-step slices) so that more advanced network
# structures in other notebooks can use the whole sequence.
np.save(file="../../gcp/train-processed/train_x_l2a_cleaned_norm.npy", arr=train_data3)
np.save(file="../../gcp/test-processed/test_x_l2a_cleaned_norm.npy", arr=test_data3)