In [1]:
# Create the training .npy files

import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from multiprocessing import Pool
import pandas as pd

In [2]:
# Root folder of the dataset; every other path in the notebook is derived from it.
path = "datasets/"

# Input folders: Landsat 8 band tiles and the resized deforestation label tiles.
landsat8_train = path + "landsat8_train/train/"
deforest_train = path + "deforestation_train_resized/train/"

# Output folder for the generated .npy training samples.
# NOTE: files from a previous run are not removed; empty this folder manually
# if a clean export is needed.
dst = path + "with_labels/"
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(dst, exist_ok=True)

In [3]:
# Example file name: 'Landsat8_QA_PIXEL_-54.48_-3.33_2013_03_27.tiff'

def get_name_landsat(row, band):
    """Build the Landsat 8 tile file name for one scene and one band.

    Args:
        row: mapping (dict or DataFrame row) with 'lon', 'lat' and a 'date'
            exposing .year, .month and .day.
        band: band identifier inserted after the 'Landsat8_' prefix,
            e.g. "QA_PIXEL" or "SR_B1".

    Returns:
        The file name: longitude before latitude (two decimals each), then the
        date as YYYY_MM_DD with zero-padded month and day, ending in '.tiff'.
    """
    coords = f"{row['lon']:.2f}_{row['lat']:.2f}"
    when = row['date']
    stamp = f"{when.year}_{when.month:02d}_{when.day:02d}"
    return f"Landsat8_{band}_{coords}_{stamp}.tiff"

# One entry per Landsat scene: keep only the QA_PIXEL file of every scene.
# Selecting by prefix does not depend on how many bands each scene has on disk
# nor on QA_PIXEL sorting before the other bands (the previous "first ninth of
# the sorted listing" rule needed both to hold).
qa_prefix = "Landsat8_QA_PIXEL_"
images = np.array(sorted(
    file_name
    for file_name in os.listdir(landsat8_train)
    if file_name.startswith(qa_prefix)
))

def get_dict(name):
    """Parse a tile file name into its date and coordinates.

    The last five '_'-separated fields of the name are read as
    lon, lat, year, month, day; anything after the first '.' of the day field
    (the file extension) is ignored. Anchoring at the end keeps the result
    independent of how many underscores the leading band/prefix part contains,
    so 'Landsat8_QA_PIXEL_-54.48_-3.33_2013_03_27.tiff' and
    'Deforestation_-54.48_-3.35_2016_08_01.tiff' are both accepted.

    Args:
        name: file name (not a path) of a tile.

    Returns:
        dict with keys "date" (pd.Timestamp), "lat" (float) and "lon" (float).

    Raises:
        ValueError: if the name has fewer than five fields or a field is not numeric.
    """
    lon_txt, lat_txt, year_txt, month_txt, day_txt = name.split("_")[-5:]

    lat = float(lat_txt)
    lon = float(lon_txt)

    y = int(year_txt)
    m = int(month_txt)
    d = int(day_txt.split('.')[0])  # drop the extension glued to the day field

    return {"date": pd.Timestamp(y, m, d), "lat": lat, "lon": lon}

# Parsing a file name is a few string operations, so it runs in-process:
# a worker pool only adds start-up and pickling overhead here, and functions
# defined in a notebook cell are only reachable from pool workers when the
# platform forks them.
data = [get_dict(name) for name in images]

# One row per Landsat scene: date, lat, lon
landsat_df = pd.DataFrame(data = data)
landsat_df

Unnamed: 0,date,lat,lon
0,2013-03-27,-3.33,-54.48
1,2013-05-20,-3.33,-54.48
2,2013-06-05,-3.33,-54.48
3,2013-06-21,-3.33,-54.48
4,2013-07-07,-3.33,-54.48
...,...,...,...
233220,2021-08-14,-4.39,-55.20
233221,2021-09-15,-4.39,-55.20
233222,2021-10-01,-4.39,-55.20
233223,2021-10-17,-4.39,-55.20


In [4]:
# Example file name: 'Deforestation_-54.48_-3.35_2016_08_01.tiff'

def get_name_deforestation(row):
    """Build the deforestation label file name for one annotated tile.

    Args:
        row: mapping (dict or DataFrame row) with 'lon', 'lat' and a 'date'
            exposing .year, .month and .day.

    Returns:
        The file name: 'Deforestation_' + longitude + latitude (two decimals
        each) + date as YYYY_MM_DD with zero-padded month and day + '.tiff'.
    """
    coords = f"{row['lon']:.2f}_{row['lat']:.2f}"
    when = row['date']
    stamp = f"{when.year}_{when.month:02d}_{when.day:02d}"
    return f"Deforestation_{coords}_{stamp}.tiff"

# Every file in the deforestation label folder, in name order.
# Note: this rebinds `images`, which previously held the Landsat file names.
images = np.sort(np.array(os.listdir(deforest_train)))

def get_dict(name):
    """Parse a tile file name into its date and coordinates.

    This redefines `get_dict`: from here on the Landsat variant defined in an
    earlier cell is shadowed. To keep the name safe to use for either kind of
    file, the fields are read from the END of the name: the last five
    '_'-separated fields are lon, lat, year, month, day, and anything after the
    first '.' of the day field (the file extension) is ignored. Hence both
    'Deforestation_-54.48_-3.35_2016_08_01.tiff' and
    'Landsat8_QA_PIXEL_-54.48_-3.33_2013_03_27.tiff' are accepted.

    Args:
        name: file name (not a path) of a tile.

    Returns:
        dict with keys "date" (pd.Timestamp), "lat" (float) and "lon" (float).

    Raises:
        ValueError: if the name has fewer than five fields or a field is not numeric.
    """
    lon_txt, lat_txt, year_txt, month_txt, day_txt = name.split("_")[-5:]

    lat = float(lat_txt)
    lon = float(lon_txt)

    y = int(year_txt)
    m = int(month_txt)
    d = int(day_txt.split('.')[0])  # drop the extension glued to the day field

    return {"date": pd.Timestamp(y, m, d), "lat": lat, "lon": lon}

# Parsing a file name is a few string operations, so it runs in-process:
# a worker pool only adds start-up and pickling overhead here, and functions
# defined in a notebook cell are only reachable from pool workers when the
# platform forks them.
data = [get_dict(name) for name in images]

# One row per annotated deforestation tile: date, lat, lon
deforest_df = pd.DataFrame(data = data)
deforest_df

Unnamed: 0,date,lat,lon
0,2016-08-01,-3.35,-54.48
1,2017-07-01,-3.35,-54.48
2,2017-08-01,-3.35,-54.48
3,2018-06-01,-3.35,-54.48
4,2018-08-01,-3.35,-54.48
...,...,...,...
17210,2019-08-01,-4.39,-55.20
17211,2020-06-01,-4.39,-55.20
17212,2020-08-01,-4.39,-55.20
17213,2021-05-01,-4.39,-55.20


In [5]:
# Functions

def load_labels(img):
    """Read a label image and return it as float32 divided by 256.

    Args:
        img: path (or file object) accepted by PIL's Image.open.

    Returns:
        np.ndarray of dtype float32 holding the pixel values / 256.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been copied into the array, instead of leaving it to the GC.
    with Image.open(img) as handle:
        pixels = np.array(handle)
    # NOTE(review): the divisor is 256, so a raw value of 255 maps to
    # 0.99609375 rather than 1.0 — confirm this is the intended scale.
    return pixels.astype(np.float32) / 256

def load_image(img):
    """Read one band image and rescale it as (value - 10000) / 35000 in float32.

    With these constants a raw value of 10000 maps to 0.0 and 45000 maps to 1.0.
    NOTE(review): presumably chosen to bring the band's digital numbers to
    roughly [0, 1] — confirm against the data source.

    Args:
        img: path (or file object) accepted by PIL's Image.open.

    Returns:
        np.ndarray of dtype float32 with the rescaled pixel values.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been copied into the array, instead of leaving it to the GC.
    with Image.open(img) as handle:
        pixels = np.array(handle).astype(np.float32)

    pixels -= 10000
    pixels /= 35000

    return pixels

def load_qa(img):
    """Read a QA band and return a float32 mask of pixels with no low-bit flag set.

    A pixel is 1.0 when its five least-significant bits are all zero and 0.0
    otherwise.
    Bit layout reference kept from the original code:
    https://www.usgs.gov/landsat-missions/landsat-collection-1-level-1-quality-assessment-band
    NOTE(review): the files are named QA_PIXEL — confirm the linked bit
    layout is the one that applies to them.

    Args:
        img: path (or file object) accepted by PIL's Image.open.

    Returns:
        np.ndarray of dtype float32 containing only 0.0 and 1.0.
    """
    with Image.open(img) as handle:
        raw = np.array(handle)
    low_bits = np.bitwise_and(raw, 0b11111)
    unflagged = low_bits == 0
    return unflagged.astype(np.float32)

def get_qa_proportion(lr):
    """Return the fraction of flagged pixels in one scene's QA_PIXEL tile.

    Args:
        lr: an (index, row) pair as produced by DataFrame.iterrows(); only the
            row (element 1) is used, to build the QA_PIXEL file name.

    Returns:
        Mean of (mask == 0) over the load_qa mask, i.e. the share of pixels
        for which load_qa reported at least one flag bit.
    """
    scene = lr[1]
    qa_file = landsat8_train + get_name_landsat(scene, "QA_PIXEL")
    mask = load_qa(qa_file)
    flagged = mask == 0
    return flagged.mean()

def get_image_and_qa(lr):
    """Load one Landsat scene as a single array: seven SR bands plus the QA mask.

    Args:
        lr: row with 'lon', 'lat' and 'date', as consumed by get_name_landsat.

    Returns:
        Array whose last axis holds SR_B1..SR_B7 (each read with load_image),
        followed by the load_qa mask as one extra channel.
    """
    band_ids = ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6", "SR_B7"]
    layers = [
        load_image(landsat8_train + get_name_landsat(lr, band_id))
        for band_id in band_ids
    ]
    stacked = np.stack(layers, axis = -1)

    qa_file = landsat8_train + get_name_landsat(lr, "QA_PIXEL")
    qa_mask = load_qa(qa_file)

    # Give the mask a trailing channel axis so it lines up with the stacked bands.
    return np.concatenate([stacked, qa_mask[:, :, None]], axis = -1)

In [6]:
# Fraction of flagged (not clear) pixels of every scene, computed in parallel.
n_workers = 24
# One chunk per worker, but never 0: the plain floor division yields 0 when
# there are fewer rows than workers, which is not a usable chunk size.
chunksize = max(1, landsat_df.shape[0] // n_workers)
with Pool(n_workers) as p:
    clouds = p.map(get_qa_proportion, landsat_df.iterrows(), chunksize = chunksize)
clouds = np.array(clouds)

# Keep only the scenes with less than 70 % flagged pixels.
keep = clouds < 0.7
landsat_df = landsat_df[keep]

# Share of scenes that survive the filter
keep.mean()

0.4728866973952192

In [7]:
def _load_earlier_scene(scenes, pos, fallback):
    """Return the stacked scene at row `pos` of `scenes`, or `fallback` if unavailable.

    `fallback` is returned when `pos` is negative (there is no earlier scene)
    or when the scene's files cannot be read or stacked.
    """
    if pos < 0:
        # A negative position would make .iloc wrap around to the end of the frame.
        return fallback
    try:
        return get_image_and_qa(scenes.iloc[pos])
    except (OSError, ValueError):
        # Best effort: a missing/unreadable band file or mismatched band shapes
        # must not abort the export of this sample.
        return fallback


# Build and save the training sample of one annotated deforestation image
def save_image(j):
    """Create the .npy training sample for one deforestation label tile.

    The sample concatenates, on the last axis, three Landsat scenes of the same
    location followed by the label: [scene t-2, scene t-1, scene t, label],
    where scene t is the most recent scene dated on or before the label date.
    When an earlier scene does not exist or cannot be loaded, scene t is used
    in its place. Each scene comes from get_image_and_qa (seven SR bands plus
    the QA mask).

    The file is written to `dst` as
    'Labeled_<lat>_<lon>_<YYYY>_<MM>_<DD>_<label mean>.npy' — note that the
    latitude comes first here, unlike the input tile names.

    Args:
        j: an (index, row) pair as produced by deforest_df.iterrows(); the row
            needs 'lat', 'lon' and 'date'.

    Returns:
        The path of the written file, or None when no Landsat scene on or
        before the label date exists for that location (nothing is written).
    """
    j = j[1]

    name = get_name_deforestation(j)
    lab = load_labels(deforest_train + name)[:, :, None]

    # Get all images from the same location, oldest first
    (lat, lon) = (j["lat"], j["lon"])
    ll = landsat_df[(landsat_df["lat"] == lat) & (landsat_df["lon"] == lon)]
    ll = ll.sort_values(by = "date")

    # Remove the future, then renumber so that index labels equal row positions
    ll = ll[ll["date"] <= j["date"]].reset_index(drop = True)

    if ll.empty:
        # Nothing to pair the label with; skip instead of failing the whole map.
        print(f"save_image: no Landsat scene on or before {j['date'].date()} "
              f"at lat={lat}, lon={lon}; skipped {name}")
        return None

    # Get the closest image to the deforestation (smallest non-negative gap)
    time_diff = j["date"] - ll["date"]
    idx = int(time_diff.idxmin())  # label == position thanks to the reset above

    image = get_image_and_qa(ll.iloc[idx])

    # The two scenes right before it, falling back to `image` when missing
    im0 = _load_earlier_scene(ll, idx - 1, image)
    im1 = _load_earlier_scene(ll, idx - 2, image)

    image = np.concatenate([im1, im0, image, lab], axis = -1)

    # save image; the mean of the label is kept in the file name
    mean = lab.mean()
    name = f"Labeled_{j['lat']:.2f}_{j['lon']:.2f}_{j['date'].year}_{j['date'].month:02d}_{j['date'].day:02d}_{mean:.2f}.npy"
    np.save(dst + name, image)
    return dst + name

# TODO - find faster way
# Export every annotated tile in parallel: 16 workers, 500 rows per task.
with Pool(processes = 16) as pool:
    pool.map(save_image, deforest_df.iterrows(), chunksize = 500)

In [8]:
# Scratch check: number of deforestation rows per worker when split 24 ways
len(deforest_df) // 24

717