In [None]:
import io
import os
import tarfile
import shutil
import subprocess
import glob
import random
import boto3
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from osgeo import gdal
from radiant_mlhub import Dataset, get_session
from urllib.parse import urlparse
from matplotlib.colors import ListedColormap

#### Access MLHUB API

Connect to the Radiant MLHub API and fetch the LandCoverNet dataset.

In [None]:
# The API key is taken from the MLHUB_API_KEY environment variable, which must
# be set before the session is created. Never hardcode the key in the
# notebook: export it in the environment that launches Jupyter instead.
if not os.environ.get('MLHUB_API_KEY'):
    raise RuntimeError(
        "MLHUB_API_KEY is not set; export your Radiant MLHub API key "
        "before running this notebook."
    )
session = get_session()
# ref_landcovernet_na_v1 is for North America
# ref_landcovernet_af_v1 is for Africa 
# ...
dataset = Dataset.fetch('ref_landcovernet_na_v1')

In [None]:
# show the dataset's descriptive fields, then every collection with its license
for heading, attribute in (('Title', 'title'), ('DOI', 'doi'), ('Citation', 'citation')):
    print(f'{heading}: {getattr(dataset, attribute)}')
print('\nCollection IDs and License:')
for coll in dataset.collections:
    print(f'    {coll.id} - {coll.license}')

Create the main folder in which the dataset will be saved.

In [None]:
# Working directory for everything the notebook downloads and generates.
# The trailing slash matters: later cells build paths by plain concatenation.
WDIR = "dataset/"
# makedirs(exist_ok=True) is idempotent and avoids the check-then-create race
os.makedirs(WDIR, exist_ok=True)

#### Labels: from MLHUB, download landcovernet

Download dataset locally - only labels 

In [None]:
# download the first labels collection as an archive into the working directory;
# the returned path is reused by the extraction and clean-up cells
labels_collection = dataset.collections.labels[0]
labels_tarfile = labels_collection.download(WDIR)

In [None]:
# unpack the downloaded labels archive into the working directory (WDIR)
with tarfile.open(labels_tarfile, mode="r") as archive:
    archive.extractall(path=WDIR)

In [None]:
# the archive has been unpacked, so the downloaded tar file can be deleted
os.unlink(labels_tarfile)

get rid of alpha band, save to tif, translate to numpy array, delete original image

In [None]:
labels_folder = WDIR + "ref_landcovernet_na_v1_labels"
labels_out = WDIR + "labels"

# create labels folder
os.makedirs(labels_out, exist_ok=True)

# loop through the per-chip label folders
for label in os.scandir(labels_folder):
    if not label.is_dir():
        continue
    # skip the `_common` folder (presumably shared metadata, not a chip — verify);
    # matching on the entry name does not depend on how the path was assembled
    if label.name == '_common':
        continue

    dates = pd.read_csv(label.path + "/source_dates.csv")
    chip_id = label.name[-15:]

    # one output array per source date (values of the last CSV column).
    # np.save appends ".npy", so the existence check must include the suffix,
    # otherwise already-converted files are never detected.
    pending = [
        f"{labels_out}/{chip_id}_{date}"
        for date in dates[dates.columns[-1]].values
    ]
    pending = [npy_out for npy_out in pending if not os.path.exists(npy_out + ".npy")]

    if pending:
        # every date of a chip shares the same labels.tif, so band 1 is
        # extracted and read once, then saved under each date's name
        tif_out = f"{labels_out}/{chip_id}.tif"
        # check=True: fail here with a clear error if gdal_translate fails
        subprocess.run(["gdal_translate", "-b", "1",
                        f"{label.path}/labels.tif", tif_out], check=True)
        ds = gdal.Open(tif_out)
        if ds is None:
            raise RuntimeError(f"GDAL could not open {tif_out}")
        im = ds.GetRasterBand(1).ReadAsArray()
        ds = None  # release the GDAL handle before deleting the file
        os.remove(tif_out)
        for npy_out in pending:
            np.save(npy_out, im)

    # delete the processed chip folder (label.path)
    shutil.rmtree(label.path)
# remove old folder labels
shutil.rmtree(labels_folder)

quick labels sanity check

In [None]:
# get a random npy label file, show image
file = random.choice(os.listdir(WDIR + 'labels'))
# os.listdir returns bare file names, so the folder has to be prepended
im = np.load(WDIR + 'labels/' + file)
# NOTE(review): cm.get_cmap is deprecated in recent matplotlib releases — confirm
# the installed version still provides it
c_map = cm.get_cmap('rainbow')
# masked/NaN pixels are drawn in green
c_map.set_bad('green')
b = im.astype('float32').copy()
# class 5 is turned into NaN so that it is rendered with the "bad" colour
b[b==5] = np.nan
plt.imshow(b, cmap=c_map)
plt.show()

#### Landsat images: from MLHUB, download landcovernet images

Download zipped dataset locally - only Landsat images

In [None]:
# download the zipped Landsat collection into the working directory
# (index 2 is assumed to be the Landsat 8 source collection — verify against
# the collection IDs printed above)
landsat_collection = dataset.collections.source_imagery[2]
images_tarfile = landsat_collection.download(output_dir=WDIR)

In [None]:
# extract images into the working directory (WDIR)
# the archive path comes from the download step instead of a hard-coded file
# name, so extraction and the later clean-up always refer to the same file
with tarfile.open(images_tarfile, "r") as tar:
    tar.extractall(path = WDIR)

In [None]:
# the archive has been unpacked, so the downloaded tar file can be deleted
os.unlink(images_tarfile)

In [None]:
# folder created by unpacking the Landsat archive
images_folder = os.path.join(WDIR, "ref_landcovernet_na_v1_source_landsat_8")

get only 6 bands (RGB + NIR + 2 SWIR bands), save to file as a stacked numpy array

In [None]:
# create Landsat images folder
os.makedirs(WDIR + "images", exist_ok=True)

# bands kept in the stack, in this order: 6 out of 7, the coastal band is dropped.
# For Landsat 8 these are blue, green, red, NIR, SWIR16, SWIR22.
band_names = ("B02", "B03", "B04", "B05", "B06", "B07")

# loop through the per-scene image folders
for image in os.scandir(images_folder):   
    if not image.is_dir():
        continue
    if len(glob.glob(image.path + '/*.tif')) == 0:
        print(f'no bands path: {image.path}')
        continue

    scene_id = image.name[-17:]
    npy_out = WDIR + f"images/image_{scene_id}"
    # np.save appends ".npy"; skip scenes that were already converted
    if not os.path.exists(npy_out + '.npy'):

        # merge the selected single-band files into one temporary multi-band GeoTIFF
        tif_out = npy_out + ".tif"
        band_files = [f"{image.path}/{band}.tif" for band in band_names]
        # check=True: fail here with a clear error if gdal_merge.py fails
        subprocess.run(["gdal_merge.py", "-separate", "-o",
                        tif_out,
                        "-of", "GTiff", *band_files], check=True)

        ds = gdal.Open(tif_out)
        if ds is None:
            raise RuntimeError(f"GDAL could not open {tif_out}")
        # one 2-D array per band, stacked along a new first axis
        array_stack = np.stack([ds.GetRasterBand(b).ReadAsArray()
                                for b in range(1, ds.RasterCount + 1)])
        ds = None  # release the GDAL handle before deleting the file

        os.remove(tif_out)
        np.save(npy_out, array_stack)

# remove old folder images
shutil.rmtree(images_folder)

Landsat images sanity check

In [None]:
# pick a random stacked image and display one band of it
file = random.choice(os.listdir(WDIR + 'images'))
stack = np.load(WDIR + 'images/' + file)
im = stack[3]  # index 3 of the stack: the NIR band
plt.imshow(im)
plt.show()

#### Labels and Landsat images: save dataset subset containing only images with 20-85% of pixels labeled as forest

In [None]:
def forest_percentage(label):
    """Return the fraction of pixels in `label` equal to 5 (the forest class).

    `label` is a NumPy array; the result is the count of elements equal to 5
    divided by the total number of elements, i.e. a value between 0 and 1.
    """
    forest_pixels = np.sum(label == 5)
    return forest_pixels / label.size

def acceptable_levels_of_forest(label,up=0.85,low=0.2):
    """Tell whether the forest share of `label` lies within [low, up].

    The share is computed with `forest_percentage`; both bounds are inclusive.
    Returns True when low <= share <= up, otherwise False.
    """
    share = forest_percentage(label)
    return bool(low <= share <= up)

def get_list_ids(prefix="dataset"):
    """Find the ids present in both `<prefix>/labels` and `<prefix>/images`.

    An id is the last 21 characters of a `.npy` file path (extension included).
    Returns a tuple: the sorted, de-duplicated ids common to both folders (as
    produced by `np.intersect1d`) and how many there are.
    """
    id_length = 21

    def _id_suffixes(subfolder):
        # trailing `id_length` characters of every .npy path in the subfolder
        return [path[-id_length:] for path in glob.glob(prefix + '/' + subfolder + '/*.npy')]

    common_ids = np.intersect1d(_id_suffixes('images'), _id_suffixes('labels'))
    return common_ids, len(common_ids)

In [None]:
# Output folders for the filtered subset. makedirs also creates the missing
# parent folder "data_forest_20_85" (os.mkdir would raise FileNotFoundError)
# and exist_ok=True makes the cell safe to re-run.
out_images = "data_forest_20_85/images/"
os.makedirs(out_images, exist_ok=True)

out_labels = "data_forest_20_85/labels/"
os.makedirs(out_labels, exist_ok=True)

In [None]:
# copy every label/image pair whose label passes the forest-share filter
prefix = "dataset"
list_id = get_list_ids(prefix=prefix)[0]
for n in list_id:
    label_name = f'labels_{n}'
    image_name = f'image_{n}'
    label_path = f'{prefix}/labels/{label_name}'
    image_path = f'{prefix}/images/{image_name}'
    label = np.load(label_path)
    if not acceptable_levels_of_forest(label):
        continue
    # keep the pair: same file names, new parent folders
    shutil.copyfile(label_path, out_labels + label_name)
    shutil.copyfile(image_path, out_images + image_name)

#### Labels: move landcovernet into a S3 bucket

In [None]:
# S3 bucket that receives the files
bucket = "landcoverchangedetection"
# local paths of all label arrays
files = list(glob.iglob('dataset/labels/*.npy'))

In [None]:
# save all labels in S3 bucket, then delete the local copy
folder = 'dataset/labels'
# one bucket handle is enough for all uploads
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for file in files:
    # keep only the file name; a fixed slice offset silently breaks whenever
    # the folder name has a different length
    file = os.path.basename(file)
    path = os.path.join(folder, file)
    # the object key mirrors the local relative path
    s3_bucket.Object(path).upload_file(path)
    os.remove(path)

#### Landsat images: move landcovernet into S3 bucket

In [None]:
# S3 bucket that receives the files
bucket = "landcoverchangedetection"
# local paths of all image arrays
files = list(glob.iglob('dataset/images/*.npy'))

In [None]:
# save all images in S3 bucket, then delete the local copy
folder = 'dataset/images'
# one bucket handle is enough for all uploads
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for file in files:
    # keep only the file name; a fixed slice offset silently breaks whenever
    # the folder name has a different length
    file = os.path.basename(file)
    path = os.path.join(folder, file)
    # the object key mirrors the local relative path
    s3_bucket.Object(path).upload_file(path)
    os.remove(path)

#### Get only Labels and Landsat images with forest, save all in a second s3 folder (NOT NEEDED)

In [None]:
# name of the bucket that holds the dataset
bucket = "landcoverchangedetection"
# low-level client, used below for in-memory downloads
client = boto3.client("s3")
# resource-level handles, used below for listing and copying objects
s3 = boto3.resource("s3")
my_bucket = s3.Bucket(bucket)

create the list of labels IDs from the bucket

In [None]:
# ids of all label objects: the key with the 15-character "dataset/labels/"
# prefix removed. The trailing slash in the filter matches what the slice
# assumes, and the loop variable no longer shadows the builtin `object`.
labels_id = [
    obj.key[15:]
    for obj in my_bucket.objects.filter(Prefix='dataset/labels/')
]

get the s3 path of labels containing forest

In [None]:
# keys of all label arrays that contain at least one pixel of class 5
lst = []
for obj in my_bucket.objects.filter(Prefix='dataset/labels/'):
    # ignore anything under the prefix that is not a saved array
    if not obj.key.endswith('.npy'):
        continue
    # download straight into memory; bucket name and key are passed as-is
    # instead of being round-tripped through an s3:// URL
    bytes_ = io.BytesIO()
    client.download_fileobj(Fileobj=bytes_, Bucket=my_bucket.name, Key=obj.key)
    bytes_.seek(0)
    # the label arrays are written with np.save from raster bands, so no
    # pickling is needed; refusing pickles avoids running code embedded in a
    # tampered object
    X_ = np.load(bytes_, allow_pickle=False)
    if 5 in X_:
        lst.append(obj.key)

Copy all the labels containing forest to a separate prefix (`dataset_with_forest/labels/`) in the same bucket.

In [None]:
# The bucket handle is the same for every copy, so it is created once. A
# separate name is used so the `bucket` string set in an earlier cell is not
# overwritten with a Bucket object.
forest_bucket = s3.Bucket('landcoverchangedetection')
for path in lst:
    # last 21 characters of the key: the id shared by a label and its image
    file = path[-21:]
    copy_source = {
        'Bucket': 'landcoverchangedetection',
        'Key': 'dataset/labels/labels_' + file
    }

    # server-side copy within the same bucket; the source object is kept
    forest_bucket.copy(copy_source, 'dataset_with_forest/labels/labels_' + file)
    
    

Copy all Landsat images whose labels contain forest to a separate prefix (`dataset_with_forest/images/`) in the same bucket.

In [None]:
# The bucket handle is the same for every copy, so it is created once. A
# separate name is used so the `bucket` string set in an earlier cell is not
# overwritten with a Bucket object.
forest_bucket = s3.Bucket('landcoverchangedetection')
for path in lst:
    # last 21 characters of the label key: the id shared with its image
    file = path[-21:]
    copy_source = {
        'Bucket': 'landcoverchangedetection',
        'Key': 'dataset/images/image_' + file
    }

    # server-side copy within the same bucket; the source object is kept
    forest_bucket.copy(copy_source, 'dataset_with_forest/images/image_' + file)
    