# Deforestation Dataset

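This notebook prepares the deforestation dataset: it matches each forest target with the most recent earlier satellite tiles, stacks them into input tensors, and writes the train/validation tables.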

In [None]:
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime, timedelta
from tqdm import tqdm
import cv2
import glob

from multiearth_challenge.datasets import segmentation_dataset as sd

from multiearth_challenge.datasets import base_datasets as bd

from multiearth_challenge import tiff_file_tools as tft

from dateutil.relativedelta import relativedelta

%matplotlib inline

In [None]:
# Location of the challenge data; not referenced by any other cell visible in this notebook.
DATA_PATH = 'data/multiearth2023-dataset-final/'
# Directory of pre-processed artefacts (the tile metadata CSVs are read from ./dp below).
DP_PATH = './dp'

In [None]:
# Target table; later cells rely on its date, lat, lon, target_path, rle and img_mean columns.
forest_target = pd.read_csv('forest_target.csv')

In [None]:
# Earliest and latest target dates.
forest_target.date.min()
forest_target.date.max()

In [None]:
# Size of the target table plus its first and last rows.
forest_target.shape
forest_target.head()
forest_target.tail()

In [None]:
# Per-source metadata CSVs; they are concatenated in exactly this order.
metas = [
    'sat_ls8_landsat8_train_meta.csv',
    'sat_s2_sent2_b5-b8_train_meta.csv',
    'sat_s1_sent1_train_meta.csv',
    'sat_s2_sent2_b9-b12_train_meta.csv',
    'sat_s2_sent2_b1-b4_train_meta.csv',
]

# Read every metadata file and stack them into one table (original row index is kept).
meta_frames = []
for meta_name in metas:
    meta_frames.append(pd.read_csv(f"./dp/{meta_name}"))
tile_stats = pd.concat(meta_frames)

In [None]:
# Overall min / mean / max of the per-tile statistics, per (source, band).
stats_by_band = tile_stats.groupby(['source', 'band'])
band_summary = [
    stats_by_band.imin.min(),
    stats_by_band.imean.mean(),
    stats_by_band.imax.max(),
]
pd.concat(band_summary, axis=1).reset_index()

In [None]:
# First rows of the combined tile metadata.
tile_stats.head()

In [None]:
# One row per (source, location, acquisition date); `band` holds the number of band rows for it.
coord_keys = ['source', 'lat', 'lon', 'source_date']
bands_per_tile = tile_stats.groupby(coord_keys)['band'].count()
source_coords = bands_per_tile.reset_index()

# Candidates for training: satellite tiles taken before each target date

In [None]:
# Pair every target with every satellite acquisition at the same location.
paired = forest_target.merge(
    source_coords,
    on=['lat', 'lon'],
    suffixes=['_target', '_source'],
)

# Keep only acquisitions dated strictly before the target (sat images from the past).
from_the_past = paired[paired.date > paired.source_date]
candidates = from_the_past.sort_values(by=['lat', 'lon', 'date', 'source', 'source_date'])

# Rank acquisitions per (target, source): rank 1 is the most recent one before the target.
per_target_source = candidates.groupby(['lat', 'lon', 'date', 'source'])
candidates['source_rank'] = per_target_source.source_date.rank(ascending=False)

In [None]:
# Size and first rows of the candidate table.
candidates.shape
candidates.head()

In [None]:
# Number of distinct values in each candidate column.
candidates.nunique()

In [None]:
# Keep the four most recent acquisitions per (target, source).
# NOTE(review): get_input_tensor only reads ranks 1-3, so rank 4 is carried along unused — confirm intent.
latest = candidates[candidates.source_rank <= 4]


In [None]:
# Non-null row counts per source and rank, to see how often each rank is available.
latest.groupby(['source', 'source_rank']).count()

In [None]:
# Clipping range (min, max) for every band used as a model input, grouped by source.
# Key order matters: get_input_tensor walks these dicts to lay out the channels.
# Deliberately left out: ls8 SR_B1 and ST_B10; s2 B1, B8A and B9.
_LS8_BANDS = ("SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6", "SR_B7")
_S2_BANDS = ("B2", "B3", "B4", "B5", "B6", "B7", "B8", "B11", "B12")

BAND_LIMITS = {
    "ls8": {band: (5000, 45000) for band in _LS8_BANDS},
    "s1": {
        "VH": (-30, 2),
        "VV": (-20, 2),
    },
    "s2": {band: (1000, 10000) for band in _S2_BANDS},
}

def normalize(img):
    """Standardise an image and squash it into the [0, 1] range.

    The image is centred on its mean, shifted up by one standard deviation and
    divided by three standard deviations, so values between ``mean - std`` and
    ``mean + 2 * std`` map linearly onto [0, 1]; everything outside is clipped.

    Args:
        img: numeric array-like; it is never modified.

    Returns:
        A float64 array of the same shape with values in [0, 1]. A constant
        image (zero standard deviation) yields all zeros instead of the NaNs a
        division by zero would produce, mirroring how get_sat_img handles a
        constant tile in its min-max branch.
    """
    values = np.asarray(img, dtype=np.float64)
    centered = values - np.mean(values)
    img_std = np.std(centered)
    if img_std == 0:
        # No contrast to stretch; dividing by 3 * 0 would fill the image with NaN.
        return np.zeros_like(centered)
    return np.clip((centered + img_std) / (img_std * 3.0), 0, 1)

def get_sat_img(source, band, lat, lon, source_date, use_mean_std=False):
    """Load one pre-extracted satellite tile and return it as a uint8 image.

    The tile is read from
    ``./dp/<source>/<band>/<year>/<source>_<band>_<lat>_<lon>_<source_date>.npy``
    where ``<year>`` is the first four characters of ``source_date`` and the
    coordinates are rounded to two decimals.

    Args:
        source: satellite key (a top-level key of BAND_LIMITS).
        band: band name within that source.
        lat: tile latitude.
        lon: tile longitude.
        source_date: acquisition date string starting with the four-digit year.
        use_mean_std: when true, scale with normalize(); otherwise clip to the
            band's BAND_LIMITS range and min-max scale the clipped tile.

    Returns:
        The tile resized to 256x256, multiplied by 255 and cast to uint8
        (assumes the stored tile is a single-band 2-D array — TODO confirm).
    """
    DP_PATH = './dp'
    tile_name = f"{source}_{band}_{round(lat, 2)}_{round(lon, 2)}_{source_date}"
    tile_file = f"{DP_PATH}/{source}/{band}/{source_date[:4]}/{tile_name}.npy"
    tile = np.load(tile_file)

    if use_mean_std:
        tile = normalize(tile)
    else:
        lower, upper = BAND_LIMITS[source][band]
        tile = tile.clip(lower, upper)
        tile_min, tile_max = tile.min(), tile.max()
        if tile_max > tile_min:
            tile = (tile - tile_min) / (tile_max - tile_min)
        else:
            # Constant tile after clipping: nothing to scale, use a blank image.
            tile = np.zeros((256, 256))

    resized = cv2.resize(tile, dsize=(256, 256))
    return (resized * 255).astype(np.uint8)

def get_input_tensor(df):
    """Stack the satellite channels for one target into an (H, W, C) uint8 array.

    Channels are laid out source by source (ls8, s1, s2), then by recency rank
    (1, 2, 3), then by band in BAND_LIMITS order. With the band table in this
    notebook that is 6*3 + 2*3 + 9*3 = 51 channels.

    Args:
        df: rows of the candidate table for a single target; must provide the
            columns source, source_rank, lat, lon and source_date.

    Returns:
        A uint8 array of shape (256, 256, n_channels). Channels whose
        (source, rank) has no row in ``df`` are filled with zeros.
    """
    chs = []
    for source in ['ls8', 's1', 's2']:
        bands = list(BAND_LIMITS[source])
        for source_rank in [1, 2, 3]:
            row = df[(df.source == source) & (df.source_rank == source_rank)]
            if row.empty:
                # uint8 zeros match get_sat_img's output dtype; float zeros would
                # promote the whole stacked tensor to float64.
                chs.extend(np.zeros((256, 256), dtype=np.uint8) for _ in bands)
                continue
            lat = row.lat.values[0]
            lon = row.lon.values[0]
            source_date = row.source_date.values[0]
            chs.extend(get_sat_img(source, band, lat, lon, source_date) for band in bands)
    # (C, H, W) -> (H, W, C)
    x = np.stack(chs)
    return x.transpose(1, 2, 0)

In [None]:
# Sanity-check the tensor layout on one target. `x` is built here explicitly so the
# cell also works on a clean top-to-bottom run (it is otherwise only assigned in the
# export loop further down).
sample_key, sample_df = next(iter(latest.groupby('target_path')))
x = get_input_tensor(sample_df)
x.shape

In [None]:
# Name of this dataset version; it becomes the output sub-directory.
DS_NAME = 'ds0'
# NOTE(review): absolute, user-specific path — the notebook only runs unchanged on this machine,
# and the saved input_path values will contain it.
ROOT_DIR = '/home/gabor/h2o/multi-earth-2023'
DS_DIR = f'{ROOT_DIR}/dp/{DS_NAME}'

# Create the output directory if it does not exist yet.
os.makedirs(DS_DIR, exist_ok=True)

In [None]:
# Build and save one input tensor per target, remembering where each one was written.
rows = []
targets = latest.groupby('target_path')
for target_path, df in tqdm(targets):
    x = get_input_tensor(df)

    # All rows of a group describe the same target, so the first row is representative.
    lat, lon = df.lat.values[0], df.lon.values[0]
    target_date, target_path = df.date.values[0], df.target_path.values[0]

    input_path = f"{DS_DIR}/sats_{round(lat, 2)}_{round(lon, 2)}_{target_date}.npy"
    np.save(input_path, x)
    rows.append([lat, lon, target_date, target_path, input_path])


In [None]:
# Table of saved inputs, one row per target.
train_df = pd.DataFrame(
    rows,
    columns=['lat', 'lon', 'target_date', 'target_path', 'input_path'],
)

# Attach the mask encoding and image mean of each target.
target_cols = forest_target[['target_path', 'rle', 'img_mean']]
train_df = train_df.merge(target_cols, on='target_path')

# A missing RLE becomes an empty string.
train_df['rle'] = train_df['rle'].fillna("")

# Single-class setup: every row gets the same one-element class list and a
# one-element list holding its RLE.
train_df['class_id'] = [['deforestation'] for _ in range(len(train_df))]
train_df['rles'] = [[rle] for rle in train_df['rle']]

In [None]:
# One row per distinct tile location. The fold is drawn per location so that every
# target of a location lands on the same side of the train/validation split.
latlons = train_df[['lat', 'lon']].drop_duplicates()

# Seeded generator so the split is identical on every run of the notebook.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

# rand_cv == 1 marks validation locations (probability 0.2 each), 0 marks training.
latlons['rand_cv'] = 1 * (split_rng.random(len(latlons)) > 0.8)
latlons

In [None]:
# Propagate each location's fold flag (rand_cv) to all of its rows.
train_df = train_df.merge(latlons, on=['lat', 'lon'])

In [None]:
# Rows with a non-null img_mean per fold (0 = train, 1 = validation).
train_df.groupby('rand_cv').img_mean.count()

In [None]:
# !zip deforest_v0.zip deforest_train_v0.pq deforest_valid_v0.pq

In [None]:
# Inspect the final table before it is written out.
train_df

In [None]:
# Write each fold to its own parquet file: rand_cv 0 is training, 1 is validation.
fold_files = (
    (0, 'deforest_train_v0.pq'),
    (1, 'deforest_valid_v0.pq'),
)
for fold, out_file in fold_files:
    fold_rows = train_df[train_df.rand_cv == fold]
    fold_rows.to_parquet(out_file, engine='pyarrow', index=False)

In [None]:
# Non-null count per column of the final table.
train_df.count()

In [None]:
# Show the source target table for reference.
forest_target