## GEO-AI Challenge for Cropland Mapping by ITU

- This noteboook demonstrates the GEE data download and data processing
- The data is downloaded separately for each region using only the samples provided for the competition
- Cloud filtering using image properties were followed by manual inspection/removal of images 



The data preparation workflow is as follows
- Separate provided train and test cvs by region
- Get a bounding geometry for each region to filter bounds of Sentinel-2 search
- Fetch Sentinel-2 Harmonized collection filtered by cloudy cover
- Download time series array per region
- Concatenate time series for each point into master array
- Create spectral indices

## import libraries

In [7]:
import os
import argparse
import json
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import ee
from scipy.interpolate import interp1d
# ee.Authenticate()
ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')

## set working directory

In [2]:
path = '/app/stella/dev/GeoITU/GeoITU_CropMapping'
os.chdir(path)

# import custom functions
from get_geometry import get_bounds
from utils_gee import *
from utils_datapreparation import *

## csv preparation

-  split train and test csv into separate csvs for each region using lat/lon ranges


In [3]:
## read csv containing lat and long field
train_data = pd.read_csv('data/Train.csv')
test_data = pd.read_csv('data/Test.csv')
print('list of columns', train_data.columns.values)
print('shape of train data', train_data.shape)
print('shape of test data', train_data.shape)


# ----------TRAIN
# split rows by region to allow sepearate file acquisition in earth engine
afghan_train_csv = train_data[train_data["Lon"] > 60]
iran_train_csv =   train_data[(train_data["Lon"] >40) & (train_data["Lon"]<60)]
sudan_train_csv =  train_data[train_data["Lat"] < 20]


# ----------TEST
afghan_test_csv = test_data[test_data["Lon"] > 60]
iran_test_csv =   test_data[(test_data["Lon"] >40) & (test_data["Lon"]<60)]
sudan_test_csv =  test_data[test_data["Lat"] < 20]

print('\nshape of afghan train', afghan_train_csv.shape, \
      'test', afghan_test_csv.shape)
print('shape of iran train', iran_train_csv.shape, \
      'test', iran_test_csv.shape)
print('shape of sudan train', sudan_train_csv.shape, \
      'test', sudan_test_csv.shape)


## assert that all seperations are unique in train
assert(iran_train_csv['ID'].values != sudan_train_csv['ID'].values).all()
assert(iran_train_csv['ID'].values != afghan_train_csv['ID'].values).all()


## assert that all seperations are unique in test
assert(iran_test_csv['ID'].values != sudan_test_csv['ID'].values).all()
assert(iran_test_csv['ID'].values != afghan_test_csv['ID'].values).all()

list of columns ['ID' 'Lat' 'Lon' 'Target']
shape of train data (1500, 4)
shape of test data (1500, 4)

shape of afghan train (500, 4) test (500, 3)
shape of iran train (500, 4) test (500, 3)
shape of sudan train (500, 4) test (500, 3)


## save csv per region

In [10]:
## Train
# afghan_train_csv.to_csv('data/Train_afghan.csv', index=False)
# iran_train_csv.to_csv('data/Train_iran.csv', index=False)
# sudan_train_csv.to_csv('data/Train_sudan.csv', index=False)

## Test
# afghan_test_csv.to_csv('data/Test_afghan.csv', index=False)
# iran_test_csv.to_csv('data/Test_iran.csv', index=False)
# sudan_test_csv.to_csv('data/Test_sudan.csv', index=False)

## download data by region

In [11]:
# get bounds for each region as ee object (pre-determined from earthengine)
afghan_geom = get_bounds('afghan')
iran_geom = get_bounds('iran')
sudan_geom = get_bounds('sudan')

#### afghan collection

In [7]:
# for Afghan region, it was observed that although label points 
# were collected in April, crops were still in season in May. 
# the search window was set from March to May

# Train
afghan_collection = prepare_collection(start_date='2022-03-01', end_date='2022-05-31', \
                                       ignore_idx=None, aoi=afghan_geom, cloud_filter=None)

afghan_collection = mosaic_by_date(afghan_collection)
print('number of images for afghan region', afghan_collection.size().getInfo())


## execute data download
# -- train
# generate_array(afghan_collection, afghan_train_csv, os.path.join(path, 'data', 'Train_afghan'))
 
# -- test
# generate_array(afghan_collection, afghan_test_csv, os.path.join(path, 'data', 'Test_afghan'))


number of images for afghan region 18


#### iran collection

In [8]:
# collection index to ignore
iran_col_ignore = ['20', '47', '43',  '42', '39', '35', '34', '33', '31', '30', '29', '28', '26']

iran_collection = prepare_collection(start_date='2019-07-01', end_date='2020-06-30', \
                                     ignore_idx=iran_col_ignore, aoi=iran_geom, cloud_filter=20)

iran_collection = mosaic_by_date(iran_collection)
print('number of images for iran region',iran_collection.size().getInfo())


## execute data download
# -- train
# generate_array(iran_collection, iran_train_csv, os.path.join(path, 'data', 'Train_iran'))

# -- test
# generate_array(iran_collection, iran_test_csv, os.path.join(path, 'data', 'Test_iran'))

number of images for iran region 55


#### sudan collection

In [9]:
# collection index to ignore
sudan_col_ignore = ['45', '43', '50', '48', '49', '35', '2', '0']

sudan_collection = prepare_collection(start_date='2019-07-01', end_date='2020-06-30', \
                                      ignore_idx=sudan_col_ignore, aoi=sudan_geom, cloud_filter=20)

# sudan_collection = mosaic_by_date(sudan_collection)
print('number of images for sudan region',sudan_collection.size().getInfo())


## execute data download
# -- train
# generate_array(sudan_collection, sudan_train_csv, os.path.join(path, 'data', 'Train_sudan'))

# --test
# generate_array(sudan_collection, sudan_test_csv, os.path.join(path, 'data', 'Test_sudan'))

number of images for sudan region 51


## create master array per country (for raw bands and indices)

This process collects all individual arrays in a folder and concatenates them
- for each country, a master array is created for time series, labels, ids
- time series has shape : number of samples x time x channels
- labels has shape : number of samples. not available for Test
- ids has shape : number of samples

In [4]:
list_country = ['afghan', 'sudan', 'iran']
list_mode = ['Train', 'Test']

for country in list_country:
    for mode in list_mode:
        
        npy_path = 'data/{}_{}'.format(mode, country)
        csv_path = 'data/{}.csv'.format(mode)
        
        if mode == 'Train':
            X, y_labels, ids = create_ml_data(npy_path, csv_path)

        if mode == 'Test':
            csv_path = None
            X, ids = create_ml_data(npy_path, csv_path)

        X_indices = compute_indices(X)
        print('for {} {}'.format(country, mode), X.shape, X_indices.shape)

        # save master arraay of raw bands only and indices only
        np.save(os.path.join('data', '{}_{}_raw.npy'.format(country, mode)), X)
        np.save(os.path.join('data', '{}_{}_indices.npy'.format(country, mode)), X_indices)

        # save y labels
        if mode == 'Train':
            np.save(os.path.join('data2', '{}_{}_labels.npy'.format(country, mode)), y_labels)

        # save ids
        np.save(os.path.join('data', '{}_{}_ids.npy'.format(country, mode)), ids)

for afghan Train (500, 18, 10) (500, 18, 15)
for afghan Test (500, 18, 10) (500, 18, 15)
for sudan Train (500, 51, 10) (500, 51, 15)
for sudan Test (500, 51, 10) (500, 51, 15)
for iran Train (500, 55, 10) (500, 55, 15)
for iran Test (500, 55, 10) (500, 55, 15)
