# Patch preprocessing

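This notebook contains the code to preprocess every patch file. The steps are the following:
1. Unzip all the tiff files for every patch (one per band).
2. Stack all bands together in order to create a single multi-band image.
3. Write the stacked tiff images into the class folder (`leak/FL` or `nonleak/FL`) that holds the raw patches.
4. Load the stacked images and save the full image set and its labels as `.npy` files in `../data/clean`.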

In [1]:
import os
import zipfile
from glob import glob
import rasterio as rio
import numpy as np
import re
import tifffile
from skimage.io import imread

## Variables

In [2]:
# Common root of the raw patch folders; each class lives in its own sub-folder
RAW_PATCHES_ROOT = "../data/patches_raw_agsal"

# Folders holding the zipped patches of each class
leak_path = f"{RAW_PATCHES_ROOT}/leak/FL"
nonleak_path = f"{RAW_PATCHES_ROOT}/nonleak/FL"

## Unzip files

In [4]:
# Extract every leak patch archive into a folder named after the patch
parent_dir = leak_path

# Everything currently in the folder: the zip archives plus anything left by earlier runs
zips_dirs = os.listdir(parent_dir)

# Patch name without the .zip extension (raw string so \d reaches the regex engine intact)
patch_name_pattern = re.compile(r"S2_i\d+d_[A-Z]+_\d+-\d+-\d+_\d+")

patch_dir_names = []
for entry_name in zips_dirs:
    name_match = patch_name_pattern.search(entry_name)
    path_zip = os.path.join(parent_dir, entry_name)
    # Skip anything that is not a patch archive (already extracted folders, stacked
    # .tif files, ...) instead of crashing on it when the notebook is re-run.
    if name_match is None or not zipfile.is_zipfile(path_zip):
        continue
    patch_dir = name_match.group()
    patch_dir_names.append(patch_dir)
    with zipfile.ZipFile(path_zip) as item:
        item.extractall(os.path.join(parent_dir, patch_dir))

In [17]:
# Extract every non-leak patch archive into a folder named after the patch
parent_dir = nonleak_path

# Everything currently in the folder: the zip archives plus anything left by earlier runs
zips_dirs = os.listdir(parent_dir)

# Patch name without the .zip extension (raw string so \d reaches the regex engine intact)
patch_name_pattern = re.compile(r"S2_i\d+d_[A-Z]+_\d+-\d+-\d+_\d+")

patch_dir_names = []
for entry_name in zips_dirs:
    name_match = patch_name_pattern.search(entry_name)
    path_zip = os.path.join(parent_dir, entry_name)
    # Skip anything that is not a patch archive (already extracted folders, stacked
    # .tif files, ...) instead of crashing on it when the notebook is re-run.
    if name_match is None or not zipfile.is_zipfile(path_zip):
        continue
    patch_dir = name_match.group()
    patch_dir_names.append(patch_dir)
    with zipfile.ZipFile(path_zip) as item:
        item.extractall(os.path.join(parent_dir, patch_dir))

## Stack bands

In [6]:
# Prototype the band stacking on one known leak patch.
# parent_dir is pinned to the leak folder here so this cell does not depend on which
# unzip cell happened to run last.
parent_dir = leak_path
sample_patch_dir = f"{parent_dir}/S2_i1020063d_FL_26-07-2022_0"

patches_bands = glob(f"{sample_patch_dir}/*B*.tif")

def my_digit_sort(my_list):
    """Return, as an int, the first run of digits that follows a 'B' in the path string.

    ``my_list`` is a single path string (not a list). Raises IndexError when the
    pattern finds nothing.
    """
    return list(map(int, re.findall(r'(?<=.B)\d+(?=A?.tif)', my_list)))[0]

# B8 and B8A both yield 8; the path itself breaks the tie so the band order never
# depends on the order in which glob lists the files.
patches_bands.sort(key=lambda band_path: (my_digit_sort(band_path), band_path))
# The two non-"B" layers go last: WVP first, then LST
patches_bands.append(glob(f"{sample_patch_dir}/*WVP*.tif")[0])
patches_bands.append(glob(f"{sample_patch_dir}/*LST*.tif")[0])
patches_bands

['../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B1.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B2.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B3.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B4.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B5.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B6.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B7.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_20220713T172008_T14RKP.B8.tif',
 '../data/patches_raw_agsal/leak/FL/S2_i1020063d_FL_26-07-2022_0\\20220713T170901_202207

In [7]:
# Load band 1 of every file in patches_bands, printing each raster's profile on the way
stacked_patches = []
for band_path in patches_bands:
    with rio.open(band_path, "r") as src:
        stacked_patches.append(src.read(1))
        print(src.profile)

{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 21, 'height': 21, 'count': 1, 'crs': CRS.from_epsg(32613), 'transform': Affine(10.0, 0.0, 904520.0,
       0.0, -10.0, 2813300.0), 'blockxsize': 256, 'blockysize': 256, 'tiled': True, 'compress': 'deflate', 'interleave': 'band'}
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 21, 'height': 21, 'count': 1, 'crs': CRS.from_epsg(32613), 'transform': Affine(10.0, 0.0, 904520.0,
       0.0, -10.0, 2813300.0), 'blockxsize': 256, 'blockysize': 256, 'tiled': True, 'compress': 'deflate', 'interleave': 'band'}
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 21, 'height': 21, 'count': 1, 'crs': CRS.from_epsg(32613), 'transform': Affine(10.0, 0.0, 904520.0,
       0.0, -10.0, 2813300.0), 'blockxsize': 256, 'blockysize': 256, 'tiled': True, 'compress': 'deflate', 'interleave': 'band'}
{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 21, 'height': 21, 'count': 1, 'crs': CRS.from_epsg(32613), 't

In [8]:
# Combine the per-band 2-D arrays into one array, with the band axis first
stacked_patch = np.stack(stacked_patches, axis=0)
stacked_patch.shape

(14, 21, 21)

In [9]:
# Reversing the axis order puts the band axis last
stacked_patch.T.shape

(21, 21, 14)

In [10]:
# Save stacked tif image and read it back to check its shape.
# The base name is the first band's file name up to ".B<number>". Taking the basename
# first keeps this independent of the path separator glob returns on each OS.
first_band_file = os.path.basename(patches_bands[0])
image_name = re.match(r"(.*?)(?=\.B\d)", first_band_file).group()
stacked_path = f"{parent_dir}/" + image_name + ".tif"
tifffile.imwrite(stacked_path, stacked_patch)
stacked_image = tifffile.imread(stacked_path)
stacked_image.shape

(14, 21, 21)

In [11]:
# Display the base name derived for the stacked image
image_name

'20220713T170901_20220713T172008_T14RKP'

### Generalize stacking for all images (leak)

In [15]:
# Settings for the leak run: tag inserted in the output file names, and folder to process
suffix = "_leak"
parent_dir = leak_path

In [13]:
# Sort key used to order the band tif files by band number
def my_digit_sort(my_list):
    """Return, as an int, the first run of digits that follows a 'B' in the path string.

    Despite its name, ``my_list`` is a single path string. The digits may be followed
    by an optional 'A' before the tif extension, so 'B8' and 'B8A' both give 8.
    Raises IndexError when the pattern finds nothing.
    """
    band_numbers = re.findall(r'(?<=.B)\d+(?=A?.tif)', my_list)
    return int(band_numbers[0])

In [14]:
# Get list of all patch directories
image_directories = os.listdir(parent_dir)

# Keep only the extracted patch folders: their names start with "S" and end in a digit,
# which leaves out the .zip archives (raw string so \d is not an invalid str escape)
pattern = re.compile(r"^S.*\d$")
image_directories_nozips = [name for name in image_directories if pattern.match(name)]

# Stack the bands of every patch and write one multi-band tif per patch.
# patch_index is a running counter that keeps the output names unique; enumerate is
# used so the builtin `id` is no longer rebound.
for patch_index, image_dir in enumerate(image_directories_nozips):
    # Leak identifier: the text between the first "i" and the next "d" of the folder name
    id_leak = re.search(r"(?<=i)(.*?)(?=d)", image_dir).group()

    # Get band path for each patch
    patch_glob_root = f"{parent_dir}/" + image_dir
    patches_bands = glob(patch_glob_root + "/*B*.tif")
    # B8 and B8A both yield 8; the path itself breaks the tie so the band order never
    # depends on the order in which glob lists the files.
    patches_bands.sort(key=lambda band_path: (my_digit_sort(band_path), band_path))
    # The two non-"B" layers go last: WVP first, then LST
    patches_bands.append(glob(patch_glob_root + "/*WVP*.tif")[0])
    patches_bands.append(glob(patch_glob_root + "/*LST*.tif")[0])

    # Read band 1 of every file
    stacked_patches = []
    for patch in patches_bands:
        with rio.open(patch, "r") as src:
            stacked_patches.append(src.read(1))

    # Reverse the axis order so the band axis comes last.
    # NOTE(review): np.transpose without axes also swaps the two spatial axes; confirm
    # that consumers of these files expect that orientation before changing it.
    stacked_patch = np.transpose(np.stack(stacked_patches))

    # Crop patch to 20x20 by dropping index 0 along every spatial axis of length 21
    if stacked_patch.shape == (21, 21, 14):
        stacked_patch = stacked_patch[1:21, 1:21, :]
    elif stacked_patch.shape == (20, 21, 14):
        stacked_patch = stacked_patch[:, 1:21, :]
    elif stacked_patch.shape == (21, 20, 14):
        stacked_patch = stacked_patch[1:21, :, :]

    # Base name: the first band's file name up to ".B<number>". Taking the basename
    # first keeps this independent of the path separator glob returns on each OS.
    first_band_file = os.path.basename(patches_bands[0])
    image_name = re.match(r"(.*?)(?=\.B\d)", first_band_file).group()
    # Add id
    image_name = image_name + f"{suffix}" + str(patch_index) + "_idLeak_" + id_leak

    # Write stacked patch directly into parent_dir, next to the patch folders
    tifffile.imwrite(f"{parent_dir}/" + image_name + ".tif", stacked_patch)

### Generalize for non leak images

In [16]:
# Settings for the non-leak run: tag inserted in the output file names, and folder to process
suffix = "_nonleak"
parent_dir = nonleak_path

In [18]:
# Get list of all patch directories
image_directories = os.listdir(parent_dir)

# Keep only the extracted patch folders: their names start with "S" and end in a digit,
# which leaves out the .zip archives (raw string so \d is not an invalid str escape)
pattern = re.compile(r"^S.*\d$")
image_directories_nozips = [name for name in image_directories if pattern.match(name)]

# Stack the bands of every patch and write one multi-band tif per patch.
# patch_index is a running counter that keeps the output names unique; enumerate is
# used so the builtin `id` is no longer rebound.
for patch_index, image_dir in enumerate(image_directories_nozips):
    # Leak identifier: the text between the first "i" and the next "d" of the folder name
    id_leak = re.search(r"(?<=i)(.*?)(?=d)", image_dir).group()

    # Get band path for each patch
    patch_glob_root = f"{parent_dir}/" + image_dir
    patches_bands = glob(patch_glob_root + "/*B*.tif")
    # B8 and B8A both yield 8; the path itself breaks the tie so the band order never
    # depends on the order in which glob lists the files.
    patches_bands.sort(key=lambda band_path: (my_digit_sort(band_path), band_path))
    # The two non-"B" layers go last: WVP first, then LST
    patches_bands.append(glob(patch_glob_root + "/*WVP*.tif")[0])
    patches_bands.append(glob(patch_glob_root + "/*LST*.tif")[0])

    # Read band 1 of every file
    stacked_patches = []
    for patch in patches_bands:
        with rio.open(patch, "r") as src:
            stacked_patches.append(src.read(1))

    # Reverse the axis order so the band axis comes last.
    # NOTE(review): np.transpose without axes also swaps the two spatial axes; confirm
    # that consumers of these files expect that orientation before changing it.
    stacked_patch = np.transpose(np.stack(stacked_patches))

    # Crop patch to 20x20 by dropping index 0 along every spatial axis of length 21
    if stacked_patch.shape == (21, 21, 14):
        stacked_patch = stacked_patch[1:21, 1:21, :]
    elif stacked_patch.shape == (20, 21, 14):
        stacked_patch = stacked_patch[:, 1:21, :]
    elif stacked_patch.shape == (21, 20, 14):
        stacked_patch = stacked_patch[1:21, :, :]

    # Base name: the first band's file name up to ".B<number>". Taking the basename
    # first keeps this independent of the path separator glob returns on each OS.
    first_band_file = os.path.basename(patches_bands[0])
    image_name = re.match(r"(.*?)(?=\.B\d)", first_band_file).group()
    # Add id
    image_name = image_name + f"{suffix}" + str(patch_index) + "_idLeak_" + id_leak

    # Write stacked patch directly into parent_dir, next to the patch folders
    tifffile.imwrite(f"{parent_dir}/" + image_name + ".tif", stacked_patch)

## Create image set

In [19]:
# Collect the .tif files sitting directly in each class folder; leak files come first
files_leak = glob(leak_path + "/*.tif")
files_nonleak = glob(nonleak_path + "/*.tif")
files = [*files_leak, *files_nonleak]

['../data/patches_raw_agsal/leak/FL\\20220713T170901_20220713T172008_T14RKP_leak0_idLeak_1007139.tif',
 '../data/patches_raw_agsal/leak/FL\\20220713T170901_20220713T172008_T14RKP_leak1_idLeak_1013099.tif',
 '../data/patches_raw_agsal/leak/FL\\20220713T170901_20220713T172008_T14RKP_leak2_idLeak_1020055.tif',
 '../data/patches_raw_agsal/leak/FL\\20220713T170901_20220713T172008_T14RKP_leak3_idLeak_1020062.tif',
 '../data/patches_raw_agsal/leak/FL\\20220713T170901_20220713T172008_T14RKP_leak4_idLeak_1020063.tif',
 '../data/patches_raw_agsal/nonleak/FL\\20220812T170901_20220812T171652_T13RHJ_nonleak0_idLeak_1007139.tif',
 '../data/patches_raw_agsal/nonleak/FL\\20220812T170901_20220812T171652_T13RHJ_nonleak1_idLeak_1013099.tif',
 '../data/patches_raw_agsal/nonleak/FL\\20220812T170901_20220812T171652_T13RHJ_nonleak2_idLeak_1020055.tif',
 '../data/patches_raw_agsal/nonleak/FL\\20220812T170901_20220812T171652_T13RHJ_nonleak3_idLeak_1020062.tif',
 '../data/patches_raw_agsal/nonleak/FL\\20220812T

In [20]:
# Load every leak image, keeping only those with the expected (20, 20, 14) shape.
# No augmentation happens here; images of any other shape are silently left out.
X_leaks = [
    image
    for image in map(imread, files_leak)
    if image.shape == (20, 20, 14)
]

print("Data without augmentation: ", np.array(X_leaks).shape)

Data without augmentation:  (5, 20, 20, 14)


In [21]:
# Load every non-leak image, keeping only those with the expected (20, 20, 14) shape.
# No augmentation happens here; images of any other shape are silently left out.
X_nonleaks = [
    image
    for image in map(imread, files_nonleak)
    if image.shape == (20, 20, 14)
]

print("Data without augmentation: ", np.array(X_nonleaks).shape)

Data without augmentation:  (5, 20, 20, 14)


In [22]:
# Join both classes along the first (sample) axis, leak samples first
leak_samples = np.asarray(X_leaks)
nonleak_samples = np.asarray(X_nonleaks)
X = np.vstack((leak_samples, nonleak_samples))
X.shape

(10, 20, 20, 14)

In [24]:
# Labels in the same order as X: 1 for each leak sample, then 0 for each non-leak sample
labels = [1] * len(X_leaks) + [0] * len(X_nonleaks)
# Column vector with one row per sample in X
Y = np.array(labels).reshape(X.shape[0], 1)
Y.shape

(10, 1)

In [27]:
# Display the label column vector
Y

array([[1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [26]:
# Write full sets
# np.save does not create missing folders, so make sure the output folder exists first
clean_dir = "../data/clean"
os.makedirs(clean_dir, exist_ok=True)
np.save(f"{clean_dir}/X_agsal.npy", X)
np.save(f"{clean_dir}/Y_agsal.npy", Y)