# Patch preprocessing

This notebook contains the code to preprocess every patch file. The steps are the following:
1. Unzip the archive of every patch, which contains one tiff file per band.
2. Stack all bands of a patch together to create a single multi-band image.
3. Write the stacked tiff images to the folder `patches_clean`.

In [1]:
import os
import zipfile
from glob import glob
import rasterio as rio
import numpy as np
import re
import tifffile
from skimage.io import imread

## Unzip files

In [2]:
# Get names of all the zips
zips_dirs = os.listdir("../data/patches_raw/leak/")

# Remove .zip extension from the name
patch_dir_names = [re.search("S2_i\d+d_\d+-\d+-\d+_\d+", name).group() for name in zips_dirs]

parent_dir = "../data/patches_raw/leak/"

for dir, patch_dir in zip(zips_dirs, patch_dir_names):
    path_zip = os.path.join(parent_dir, dir)
    if zipfile.is_zipfile(path_zip):
        with zipfile.ZipFile(path_zip) as item:
            item.extractall(os.path.join(parent_dir, patch_dir))

## Stack bands

In [None]:
patches_bands = glob("../data/patches_raw/leak/S2_2020-01-06_3/*B*.tif")

def my_digit_sort(my_list):
   return list(map(int, re.findall(r'(?<=.B)\d+(?=A?.tif)', my_list)))[0]

patches_bands.sort(key = my_digit_sort)
patches_bands.append(glob("../data/patches_raw/leak/S2_2020-01-06_3/*WVP*.tif")[0])
patches_bands.append(glob("../data/patches_raw/leak/S2_2020-01-06_3/*LST*.tif")[0])
patches_bands

In [None]:
stacked_patches = []
for patch in patches_bands:
    with rio.open(patch, "r") as f:
        stacked_patches.append(f.read(1))
        print(f.profile)

In [None]:
# stack bands into an nd array
stacked_patch = np.stack(stacked_patches)
stacked_patch.shape

In [None]:
np.transpose(stacked_patch).shape

In [None]:
# Save stacked tif image
image_name = re.search(r"(?<=\\)(.*?)(?=.B)", patches_bands[0]).group()
tifffile.imwrite("../data/patches_clean/leak/" + image_name + ".tif", stacked_patch)
stacked_image = tifffile.imread("../data/patches_clean/leak/" + image_name + ".tif")
stacked_image.shape

### Generalize stacking for all images (leak)

In [4]:
# Function to order tif files by their band name
def my_digit_sort(my_list):
   return list(map(int, re.findall(r'(?<=.B)\d+(?=A?.tif)', my_list)))[0]

In [6]:
# Get list of all patch directories
image_directories = os.listdir("../data/patches_raw/leak/")

# Remove zip files from list
pattern = re.compile("^S.*\d$")
image_directories_nozips = list(filter(pattern.match, image_directories))

# Get band path for each patch
id = 0
for image_dir in image_directories_nozips:
    id_leak = re.search(r"(?<=i)(.*?)(?=d)", image_dir).group()
    patches_bands = glob("../data/patches_raw/leak/" + image_dir + "/*B*.tif")
    patches_bands.sort(key = my_digit_sort)
    patches_bands.append(glob("../data/patches_raw/leak/" + image_dir + "/*WVP*.tif")[0])
    patches_bands.append(glob("../data/patches_raw/leak/" + image_dir + "/*LST*.tif")[0])

    stacked_patches = []
    for patch in patches_bands:
        with rio.open(patch, "r") as f:
            stacked_patches.append(f.read(1))
    
    stacked_patch = np.transpose(np.stack(stacked_patches))

    # Crop patch
    if stacked_patch.shape == (21, 21, 14):
        stacked_patch = stacked_patch[1:21, 1:21, :]
    if stacked_patch.shape == (20, 21, 14):
        stacked_patch = stacked_patch[:, 1:21, :]
    if stacked_patch.shape == (21, 20, 14):
        stacked_patch = stacked_patch[1:21, :, :]
    
    image_name = re.search(r"(?<=\\)(.*?)(?=.B)", patches_bands[0]).group()
    # Add id
    image_name = image_name + "_leak" + str(id) + "_idLeak_" + id_leak

    # Create stacked image save directory in patches_clean
#    save_path = "../data/patches_clean/leak/" + image_dir

#    if os.path.exists(save_path) == False:
#        os.mkdir(save_path)

    # Write stacked patch
    tifffile.imwrite("../data/patches_clean/leak/" + image_name + ".tif", stacked_patch)
    id += 1 

### Generalize for non leak images

In [7]:
## Unzip

# Get names of all the zips
zips_dirs = os.listdir("../data/patches_raw/non_leak")

# Remove .zip extension from the name
patch_dir_names = [re.search("S2_i\d+d_\d+-\d+-\d+_\d+", name).group() for name in zips_dirs]

parent_dir = "../data/patches_raw/non_leak/"

for dir, patch_dir in zip(zips_dirs, patch_dir_names):
    path_zip = os.path.join(parent_dir, dir)
    if zipfile.is_zipfile(path_zip):
        with zipfile.ZipFile(path_zip) as item:
            item.extractall(os.path.join(parent_dir, patch_dir))

In [8]:
# Get list of all patch directories
image_directories = os.listdir("../data/patches_raw/non_leak")

# Remove zip files from list
pattern = re.compile("^S.*\d$")
image_directories_nozips = list(filter(pattern.match, image_directories))

# Get band path for each patch
id = 0
for image_dir in image_directories_nozips:

    id_leak = re.search(r"(?<=i)(.*?)(?=d)", image_dir).group()
    patches_bands = glob("../data/patches_raw/non_leak/" + image_dir + "/*B*.tif")
    patches_bands.sort(key = my_digit_sort)
    patches_bands.append(glob("../data/patches_raw/non_leak/" + image_dir + "/*WVP*.tif")[0])
    patches_bands.append(glob("../data/patches_raw/non_leak/" + image_dir + "/*LST*.tif")[0])

    stacked_patches = []
    for patch in patches_bands:
        with rio.open(patch, "r") as f:
            stacked_patches.append(f.read(1))
    
    stacked_patch = np.transpose(np.stack(stacked_patches))

    # Crop patch
    if stacked_patch.shape == (21, 21, 14):
        stacked_patch = stacked_patch[1:21, 1:21, :]
    if stacked_patch.shape == (20, 21, 14):
        stacked_patch = stacked_patch[:, 1:21, :]
    if stacked_patch.shape == (21, 20, 14):
        stacked_patch = stacked_patch[1:21, :, :]

    image_name = re.search(r"(?<=\\)(.*?)(?=.B)", patches_bands[0]).group()
    # Add id
    image_name = image_name + "_nonleak" + str(id) + "_idLeak_" + id_leak

    # Create stacked image save directory in patches_clean
    #save_path = "../data/patches_clean/non_leak/" + image_dir
    #if os.path.exists(save_path) == False:
    #    os.mkdir(save_path)

    # Write stacked patch
    tifffile.imwrite("../data/patches_clean/non_leak/" + image_name + ".tif", stacked_patch)
    id += 1