In [1]:
import imageio
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from glob import glob
import numpy as np
import random
import xarray as xr
import yaml

import imageio.v3 as iio

from convml_tt.data.dataset import TRIPLET_TILE_FILENAME_FORMAT, TRIPLET_TILE_IDENTIFIER_FORMAT

### Make a set of tiles

In [2]:
# working folder
folder = "AquaHkmLabSea2022b"
filepath = "/home/eefjg/OneDrive/Leeds/PhD/Data/MODIS/"+folder

#### Make tiles from png images (old)

In [4]:
# image files
filenames = glob(filepath+"/sliced/*.png")

In [5]:
len(filenames)

71

In [6]:
tile_nx = tile_ny = 256
img_tiles = []

for file in filenames:
    img = imageio.imread(file)
    img = img[:,:,:3] # last term gets rid of alpha channel
    ny, nx, _ = img.shape
    for i in range(0, nx-tile_nx, 64):
        for j in range(0, ny-tile_ny, 64):
            img_tile = img[j:j+tile_ny, i:i+tile_nx, :]
            img_tiles.append(img_tile)

len(img_tiles)

  img = imageio.imread(file)


51120

##### Singlet dataset

In [31]:
fp_path_dataset = Path(filepath+"/tiles")
fp_path_dataset.mkdir(exist_ok=True, parents=True)

In [32]:
stride = 5 # What fraction of the tiles to keep?
n = len(img_tiles)
for n, i in enumerate(tqdm(range(n)[::stride], total=len(img_tiles)/stride)):
    fn = TRIPLET_TILE_FILENAME_FORMAT.format(triplet_id=n, tile_type="anchor")
    fp_tile = fp_path_dataset / fn
    img_tile = img_tiles[i]
    imageio.imwrite(uri=fp_tile, im=img_tile, format="png")

100%|██████████| 10224/10224.0 [02:23<00:00, 71.30it/s]


##### Triplet dataset

In [10]:
fp_path_dataset = Path(filepath+"/triplets")
fp_path_dataset.mkdir(exist_ok=True, parents=True)

In [11]:
stride = 5 # What fraction of the tiles to keep?
n = len(img_tiles)-2
for n, i in enumerate(tqdm(range(n)[::stride], total=len(img_tiles)/stride)):
    #anchor
    fn = TRIPLET_TILE_FILENAME_FORMAT.format(triplet_id=n, tile_type="anchor")
    fp_tile = fp_path_dataset / fn
    img_tile = img_tiles[i]
    imageio.imwrite(uri=fp_tile, im=img_tile, format="png")
    #neighbour - overlap half of the tile
    fn = TRIPLET_TILE_FILENAME_FORMAT.format(triplet_id=n, tile_type="neighbor")
    fp_tile = fp_path_dataset / fn
    img_tile = img_tiles[i+2] # not totally satisfactory, tiles from edges may not actually be neighbours, also randomise direction?
    imageio.imwrite(uri=fp_tile, im=img_tile, format="png")
    #distant - sampled from random image (how to ensure not from same image?)
    fn = TRIPLET_TILE_FILENAME_FORMAT.format(triplet_id=n, tile_type="distant")
    fp_tile = fp_path_dataset / fn
    img_tile = img_tiles[random.randint(0, len(img_tiles)-1)]
    imageio.imwrite(uri=fp_tile, im=img_tile, format="png")


100%|██████████| 10224/10224.0 [07:04<00:00, 24.08it/s]


### Make tiles using functions from convml-data
Now including metadata!!

#### Functions

In [3]:
# Functions from convml_data/pipeline/triplets.py
# This divides the scenes into sets for train, study etc, (don't really need it yet)
# N_triplets should be a dictionary e.g. {train: 10, study: 2} - can just use study for now

def split_scene_ids(scene_ids, N_triplets): 
    scene_collections = {}
    
    # split all scene IDs randomly so that for each collection in
    # `N_triplets` the fraction of scenes allocated equals the fraction
    # of triplets in the collection
    N_scenes_total = len(scene_ids)
    N_triplets_total = sum(N_triplets.values())
    scene_ids_shuffled = np.random.permutation(scene_ids)

    def split_list(arr, idx):
        return arr[:idx], arr[idx:]

    for i, (collection_name, N_triplets_collection) in enumerate(
        N_triplets.items()
    ):
        if i <= N_scenes_total - 1:
            f = N_triplets_collection / N_triplets_total
            N_scenes_collection = int(f * N_scenes_total)
        else:
            N_scenes_collection = len(scene_ids_shuffled)

        collection_scene_ids, scene_ids_shuffled = split_list(
            scene_ids_shuffled, N_scenes_collection
        )
        scene_collections[collection_name] = collection_scene_ids

    return scene_collections

In [4]:
# work out which scenes to take the anchor and distant tiles from by 
# picking two random scenes for each tile id.

def tile_scene_splits(scene_ids, scene_ids_by_collection, N_triplets):
    tiles_per_scene = {}

    for scene_id in scene_ids:
        tiles_per_scene[scene_id] = []

    for triplet_collection, n_triplets in N_triplets.items():
        collection_scene_ids = scene_ids_by_collection[triplet_collection]
        for n in range(n_triplets):
            # pick two random scene IDs, ensuring that they are different
            scene_id_anchor, scene_id_distant = np.random.choice(
                collection_scene_ids, size=2, replace=False # replace=False so same scene not picked twice
            )

            scene_ids = [scene_id_anchor, scene_id_distant]

            for scene_id, is_distant in zip(scene_ids, [False, True]):
                tiles_per_scene[scene_id].append(
                    dict(
                        triplet_id=n,
                        is_distant=is_distant,
                        triplet_collection=triplet_collection,
                            )
                        )
    return tiles_per_scene

In [5]:
# Next: work out tile locations
# adapted from convml_data/sampling/triplets.py

def generate_randomly_located_tile(slice_shape, tile_size, rng=None):
    """
    Generate a tile location for a specific `tile_size` that fits inside a slice
    """
    
    margin = tile_size * 0.6
    d_xmin = 0 + margin
    d_xmax = slice_shape[1] - margin
    d_ymin = 0 + margin
    d_ymax = slice_shape[0] - margin

    if rng is None:
        rng = np.random.default_rng(rng)

    x_t = (d_xmin + (d_xmax - d_xmin) * rng.uniform())
    y_t = (d_ymin + (d_ymax - d_ymin) * rng.uniform())

    return x_t, y_t


# generate a tile location randomly offset from another tile location
# i.e. the neighbour tile location

def generate_tile_domain_with_peturbed_location(
    slice_shape, tile_loc, tile_size, distance_size_scaling=0.5, rng=None
):
    """
    Generate a tile location for a specific `tile_size` that fits inside a slice,
    perturbed from a given `tile_loc` location
    distance_size_scaling = distance of neighbour centre from anchor centre in units of tile size
    """

    if rng is None:
        rng = np.random.default_rng(rng)

    theta = 2 * np.pi * rng.uniform()
    r = distance_size_scaling * tile_size # offset distance
    dlx = r * np.cos(theta)
    dly = r * np.sin(theta)

    x_t = (tile_loc[0] + dlx)
    y_t = (tile_loc[1] + dly)

    margin = tile_size * 0.6
    if x_t < margin or x_t > (slice_shape[1] - margin) \
        or y_t < margin or y_t > (slice_shape[0] - margin):
        return generate_tile_domain_with_peturbed_location(
            slice_shape=slice_shape,
            tile_loc=tile_loc,
            tile_size=tile_size,
            distance_size_scaling=distance_size_scaling,
            rng=rng,
        )
    else: 
        return x_t, y_t


def generate_triplet_location(slice_shape, tile_size, neigh_dist_scaling=0.5, rng=None):
    # Do you really need this when you don't sample anchor and distant from the same scene?
    """
    Generate a set of (x,y)-positions (a list of three specifically)
    representing the "anchor", "neighbor" and "distant" tile locations
    """
    anchor_tile_domain = generate_randomly_located_tile(
        slice_shape=slice_shape, tile_size=tile_size, rng=rng
    )

    neighbor_tile_domain = generate_tile_domain_with_peturbed_location(
        slice_shape=slice_shape,
        tile_loc=anchor_tile_domain,
        tile_size=tile_size,
        distance_size_scaling=neigh_dist_scaling,
        rng=rng,
    )
    distant_tile_domain = generate_randomly_located_tile(
        slice_shape=slice_shape, tile_size=tile_size, rng=rng
    )

    return [anchor_tile_domain, neighbor_tile_domain, distant_tile_domain]

In [6]:
# take tile_per_scene yaml, split by scene, then for each scene, generate tile x and y

def generate_tile_locations(tiles_per_scene, slice_shape, tile_size, neigh_dist_scaling=0.5, rng=None):
    ''' write triplet locations to yaml file for each scene
        tiles_per_scene: dictionary of scene IDs and tile IDs
        slice_shape: shape of the slice
        tile_size: size of the tile
        neigh_dist_scaling: distance of neighbour centre from anchor centre in units of tile size
        rng: random number generator
    '''

    scene_tile_locations = {}
    for scene_id, tiles in tiles_per_scene.items():
        tile_locations = []
        for tile in tiles:
            triplet_id = tile["triplet_id"]
            triplet_collection = tile["triplet_collection"]
            is_distant = tile["is_distant"]

            if is_distant:
                x_d, y_d = generate_randomly_located_tile(slice_shape, tile_size, rng=None)
                distant_meta = dict(
                    loc=dict(x_c=x_d, y_c=y_d),
                    tile_type="distant",
                    triplet_id=triplet_id,
                    triplet_collection=triplet_collection,
                )
                tile_locations.append(distant_meta)

            else:
                x_a, y_a = generate_randomly_located_tile(slice_shape, tile_size, rng=None)
                anchor_meta = dict(
                    loc=dict(x_c=x_a, y_c=y_a),
                    tile_type="anchor",
                    triplet_id=triplet_id,
                    triplet_collection=triplet_collection,
                )
                tile_locations.append(anchor_meta)

                x_n, y_n = generate_tile_domain_with_peturbed_location(
                    slice_shape=slice_shape,
                    tile_loc=[x_a, y_a],
                    tile_size=tile_size,
                    distance_size_scaling=neigh_dist_scaling,
                    rng=None,
                )
                neighbor_meta = dict(
                    loc=dict(x_c=float(x_n), y_c=float(y_n)),
                    tile_type="neighbor",
                    triplet_id=triplet_id,
                    triplet_collection=triplet_collection,
                )
                tile_locations.append(neighbor_meta)

        with open(filepath + "/meta/" + f"{scene_id}_tile_locations.yaml", "w") as f:
            yaml.dump(tile_locations, f)
        scene_tile_locations[scene_id] = tile_locations
    return scene_tile_locations


In [13]:
# Next: generate image and yaml file for each tile

def generate_all_tiles(datapath, savepath, scene_tile_locations, tile_size):
    ''' write triplet tiles to png and yaml file for each scene
        scene_tile_locations: dictionary of scene IDs and tile locations
        scene_id: scene ID
        slice_shape: shape of the slice
        tile_size: size of the tile
    '''

    for scene_id, tiles in tqdm(scene_tile_locations.items()):
        
        data = datapath + scene_id + ".nc"
        with xr.open_dataset(data) as scene_ds:
            scene_ds.load()
            
            for tile in tiles:
                triplet_id = tile["triplet_id"]
                triplet_collection = tile["triplet_collection"]
                tile_type = tile["tile_type"]
                x_c = tile["loc"]["x_c"]
                y_c = tile["loc"]["y_c"]
                    
                tile_identifier = TRIPLET_TILE_IDENTIFIER_FORMAT.format(triplet_id=triplet_id, 
                                                                        tile_type=tile_type)
                fn_data = f"{tile_identifier}.nc"
                fn_image = f"{tile_identifier}.png"
                fn_meta = f"{tile_identifier}.yml"
                
                # crop nc file to tile and save

                tile_ds = scene_ds.sel(x=slice(int(x_c-tile_size/2), int(x_c+tile_size/2)), 
                                    y=slice(int(y_c-tile_size/2), int(y_c+tile_size/2)))
                
                tile_ds.to_netcdf(savepath + fn_data)

                # save png of rgb
            
                img_data = iio.imread(datapath + scene_id + ".png")
                img_tile = img_data[slice(int(y_c-tile_size/2), int(y_c+tile_size/2)), 
                            slice(int(x_c-tile_size/2), int(x_c+tile_size/2))]

                iio.imwrite(uri=(savepath + fn_image), image=img_tile, format="png")

                # save yaml file with metadata

                tile_meta = dict(
                triplet_id=triplet_id,
                triplet_collection=triplet_collection,
                tile_type=tile_type,
                loc=dict(x_c=x_c, 
                        y_c=y_c,
                        central_latitude = float(scene_ds.latitude.sel(x=int(x_c), y=int(y_c)).values),    
                        central_longitude = float(scene_ds.longitude.sel(x=int(x_c), y=int(y_c)).values)), 
                        scene_id=scene_id,
                        land_fraction = 1 - float(tile_ds.watermask.mean())
            )
                with open(savepath + fn_meta, "w") as f:
                    yaml.dump(tile_meta, f)


To revisit:  
- Can I parallelise tile generation somehow?
- if need different collections i.e. train/study need to separate them

#### Run code

In [35]:
with open(filepath+"/meta/scene_ids.yaml", "r") as f:
    scene_ids = yaml.safe_load(f)

len(scene_ids.keys())

71

In [36]:
N_triplets = {"study": 10000}
scene_ids_by_collection = split_scene_ids(list(scene_ids.keys()), N_triplets)
tiles_per_scene = tile_scene_splits(scene_ids, scene_ids_by_collection, N_triplets)

# write tiles_per_scene to yaml
with open(filepath+"/meta/tile_scene_splits.yaml", "w") as f:
    yaml.dump(tiles_per_scene, f)

In [10]:
slice_shape = (4060, 1024) # y, x
tile_size = 256

In [38]:

scene_tile_locations = generate_tile_locations(
    tiles_per_scene, slice_shape, tile_size, neigh_dist_scaling=0.5, rng=None)

# write scene_tile_locations to yaml
with open(filepath+"/meta/all_scene_tile_locations.yaml", "w") as f:
    yaml.dump(scene_tile_locations, f)


If scene locations already generated then run from here:

In [15]:
tile_size = 256

with open(filepath+"/meta/all_scene_tile_locations.yaml", "r") as f:
    scene_tile_locations = yaml.safe_load(f)

In [11]:
datapath = filepath + "/sliced/"
savepath = filepath + "/triplets/"

path = Path(savepath)
path.mkdir(exist_ok=True, parents=True)

generate_all_tiles(datapath, savepath, scene_tile_locations, tile_size)

100%|██████████| 71/71 [1:10:31<00:00, 59.59s/it]
