## **Landscape Data Preparation**

### **Import Dependencies**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import ee
import geemap
from tqdm.auto import tqdm
import os
from glob import glob
import json
import logging
import multiprocessing
import requests
import shutil
from retry import retry

import warnings
warnings.filterwarnings('ignore')

# create necessary folders
if os.path.exists('datasets'):
    print("'datasets' folder is already existed.")
else:
    os.makedirs('datasets')
    print("'datasets' folder has been successfully created.")

'datasets' folder is already existed.


### **Initialize a Map Object**

In [2]:
ee.Authenticate()
ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")

Map = geemap.Map(basemap="Esri.WorldImagery")
Map

Map(center=[0, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(childr…

### **Import World Administrative Boundary Layer**

In [3]:
# Import World Administrative Layer
world = ee.FeatureCollection('projects/ee-geonextgis/assets/world_administrative_boundaries')

# Filter the countries belonging to Europe
europe = world.filter(ee.Filter.eq('continent', 'Europe'))
europe_country_names = europe.aggregate_array('name').getInfo()

# Define the countries to remove
countries_to_remove = {'Russian Federation', 'Belarus', 'Ukraine', 'Moldova, Republic of', 'Svalbard and Jan Mayen Islands'}
europe_country_names = list(set(europe_country_names).difference(countries_to_remove))
europe_country_names.append('Turkey')
europe = world.filter(ee.Filter.inList('name', europe_country_names))

style = {'fillColor': '00000000', 'color': 'red', 'width': 1}
Map.addLayer(europe.style(**style), {}, 'Europe')

### **Calculate the Area and Number of Samples per Country**

In [4]:
# Calculate the area for each country
area = europe.map(lambda f: ee.Feature(None, {'country': f.get('name'), 'area': f.area()}))
area = geemap.ee_to_df(area)
area['area'] = (area['area'] // 1000000).astype('int')
area = area[['country', 'area']]

# Calculate the area proportion
area['area_proportion'] = ((area['area'] / area['area'].sum())).round(4)

# Calculate the number of samples per country
total_n_samples = 1e5
area['n_of_smaples'] = (area['area_proportion'] * total_n_samples).astype('int')

# Drop the columns where number of samples is 0
area = area[area['n_of_smaples']>0]
area.sort_values(by='n_of_smaples', ascending=False, inplace=True)

print(area.shape)
area.head()

(41, 4)


Unnamed: 0,country,area,area_proportion,n_of_smaples
0,Turkey,779333,0.1345,13450
32,France,546646,0.0943,9430
30,Spain,505656,0.0873,8730
17,Sweden,443345,0.0765,7650
38,Germany,355924,0.0614,6140


### **Import the CORINE LULC Product for 2018**

In [5]:
# Import the CORINE LULC 2018
corine_2018 = ee.Image('COPERNICUS/CORINE/V20/100m/2018')\
                .select('landcover')
Map.setCenter(16.436, 39.825, 6);
Map.addLayer(corine_2018, {}, 'CORINE Land Cover')

In [6]:
# Extract all the class values from the image
class_values = corine_2018.get('landcover_class_values').getInfo()
class_palette = corine_2018.get('landcover_class_palette').getInfo()

print('Number of classes in the CORINE LULC:', len(class_values))

Number of classes in the CORINE LULC: 44


### **Generate Equal Number of Samples from Each Class**

In [7]:
# Function to generate sample points
def generateSamplePoints(image, 
                         country_name, 
                         band_name, 
                         scale=100, 
                         seed=42, 
                         save=False, 
                         output_folder=None):
    """
    Generate stratified sample points for a given country from a land use/land cover (LULC) image.

    Args:
        image (ee.Image): The Earth Engine image containing LULC data.
        country_name (str): Name of the country for which to generate sample points.
        band_name (str): Name of the band containing the LULC classification.
        scale (int, optional): Scale in meters for sampling. Default is 100.
        seed (int, optional): Random seed for reproducibility. Default is 42.
        save (bool, optional): Whether to save the samples as a shapefile. Default is False.
        output_folder (str, optional): Folder to save the output shapefile if `save` is True. Default is None.

    Returns:
        geopandas.GeoDataFrame: A GeoDataFrame containing the stratified sample points with their respective attributes.

    """
    country_geom = europe.filter(ee.Filter.eq('name', country_name)).first().geometry()
    country_n_samples = area[area['country']==country_name].values[0][-1]

    country_lulc = image.clip(country_geom)

    # Calculate the frequency histogram
    freq_hist = country_lulc.reduceRegion(
        reducer=ee.Reducer.frequencyHistogram(), 
        geometry=country_geom,
        scale=100, 
        bestEffort=True,
        maxPixels=1e10,
        tileScale=8
    )

    # Extract the class values
    country_class_values = [int(i) for i in freq_hist.getInfo()[band_name].keys()]

    # Calculate the sample per class
    sample_per_calss = country_n_samples // len(country_class_values)

    # Generate samples for each class
    lulc_samples = country_lulc.stratifiedSample(
        numPoints=sample_per_calss, 
        classBand=band_name,
        region=country_geom, 
        scale=scale, 
        seed=seed, 
        classValues=country_class_values, 
        classPoints=[sample_per_calss for i in range(len(country_class_values))], 
        dropNulls=True, 
        tileScale=8, 
        geometries=True
    )

    lulc_samples_gdf = geemap.ee_to_gdf(lulc_samples)

    if save!=False:
        if os.path.exists(output_folder)==False:
            os.makedirs(output_folder)

        file_name = f'{country_name}_samples.shp'
        out_file_path = os.path.join(output_folder, file_name)
        lulc_samples_gdf.to_file(out_file_path, driver='ESRI Shapefile')
        print(f'{file_name} saved successfully at {out_file_path}.')

    return lulc_samples_gdf

In [8]:
# # Apply the function over all the countries
# for country in tqdm(europe_country_names):
#     generateSamplePoints(
#         image=corine_2018,
#         country_name=country, 
#         band_name='landcover',
#         scale=100, 
#         seed=42, 
#         save=True, 
#         output_folder='datasets\\shapefiles'
#     )

### **Prepare the Sentinel Image Source**

In [9]:
# Function to remove clouds and snow from Sentine-2 imagery
def maskS2CloudsAndSnow(image, cloud_perc=10, mask_snow=True, snow_perc=10):
    
    cloudProb = image.select("MSK_CLDPRB")
    cloud = cloudProb.lt(cloud_perc)
    snowProb = image.select("MSK_SNWPRB")
    snow = snowProb.lt(snow_perc)
    scl = image.select("SCL")
    cloudShadow = scl.eq(3)
    cirrus = scl.eq(10)
    
    if mask_snow==True:
       mask = cloud.And(snow).And(cloudShadow.neq(1)).And(cirrus.neq(1))
    else:
       mask = cloud.And(cloudShadow.neq(1)).And(cirrus.neq(1))
        
    return image.updateMask(mask)

# Function to perform edge masking on Sentinel-1 images
def edgeMasking(image, min_thresh=30, max_thresh=45):
    
    angle = image.select("angle")
    mask = angle.gt(min_thresh).And(angle.lt(max_thresh))

    return image.updateMask(mask)

In [10]:
# Function to prepare Sentinel 1 and 2 composite
def prepareSentinelComposite(region, 
                             s1_band_names, 
                             s2_band_names,
                             start_date,
                             end_date):

    # Read the Sentinel-2 and Sentinel-1 images from the Earth Engine
    sentinel_2 = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")\
        .filterBounds(region.bounds())\
        .filterDate(start_date, end_date)\
        .filterMetadata("CLOUDY_PIXEL_PERCENTAGE", "less_than", 10)\
        .map(maskS2CloudsAndSnow)\
        .select(s2_band_names)\
        .median()\
        .clip(region.bounds())
    
    sentinel_1 = ee.ImageCollection("COPERNICUS/S1_GRD")\
        .filterBounds(region.bounds())\
        .filterDate(start_date, end_date)\
        .filter(ee.Filter.listContains("transmitterReceiverPolarisation", "VV"))\
        .filter(ee.Filter.listContains("transmitterReceiverPolarisation", "VH"))\
        .filter(ee.Filter.eq('instrumentMode', 'IW'))\
        .filter(ee.Filter.eq("orbitProperties_pass", "DESCENDING"))\
        .map(edgeMasking)\
        .select(s1_band_names)\
        .mean()\
        .clip(region.bounds())
    
    # Combine both the Image
    sentinel_combined = sentinel_2.addBands(sentinel_1)

    return sentinel_combined

### **Prepare LULC Source (CORINE, Dynamic World, ESRI)**

In [11]:
# Function to prepare Sentinel 1 and 2 composite
def prepareLULCSource(region,
                      year,
                      start_date,
                      end_date):
    
    # Prepare the CORINE LULC Dataset
    proj = ee.Projection("EPSG:4326")
    corine = ee.Image('COPERNICUS/CORINE/V20/100m/2018')\
        .select('landcover')\
        .rename('corine')\
        .reproject(proj, None, 100)\
        .clip(region.bounds())

    # Prepare the Dynamic World LULC dataset
    dynamic_world = ee.ImageCollection("GOOGLE/DYNAMICWORLD/V1")\
        .filterBounds(region.bounds())\
        .filterDate(start_date, end_date)\
        .select("label")\
        .mode()\
        .rename('dynamic_world')\
        .clip(region.bounds())
    
    # Prepare the ESRI LULC dataset
    esri = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS")\
        .filterBounds(region.bounds())\
        .filterDate(f'{year}-01-01', f'{year}-12-31')\
        .mosaic()\
        .remap([1,2,4,5,7,8,9,10,11],[1,2,3,4,5,6,7,8,9])\
        .rename('esri')\
        .clip(region.bounds())

    # Combine both the Image
    lulc_combined = corine.addBands(dynamic_world).addBands(esri)

    return lulc_combined

In [12]:
## Example usage
# Define the Sentinel-2 and Sentinel-1 band names
s2_band_names = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B11', 'B12']
s1_band_names = ["VH", "VV"]

# Define the start, end date, and year
start_date = "2017-06-01"
end_date = "2018-05-31"
year = 2018

test_region = europe.filter(ee.Filter.eq('name', 'Germany')).geometry()

test_sentinel_composite = prepareSentinelComposite(
    region=test_region, 
    s1_band_names=s1_band_names, 
    s2_band_names=s2_band_names,
    start_date=start_date,
    end_date=end_date
)

test_lulc_combined = prepareLULCSource(
    region=test_region,
    year=year,
    start_date=start_date,
    end_date=end_date
)

Map.addLayer(test_sentinel_composite, {"min": 0, "max": 4000, "bands": ["B8", "B4", "B3"]}, "Test Sentinel Image")
Map.addLayer(test_lulc_combined, {}, "Test LULC Image")

### **Data Extraction Module**

In [13]:
# Read all the dataframe path
country_samples_path =  glob('D:\GITHUB\landscape-classification\datasets\\'+'shapefiles\\'+'*.shp')
print('The number of shapefiles:', len(country_samples_path))

# Define the Sentinel-2 and Sentinel-1 band names
s2_band_names = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B11', 'B12']
s1_band_names = ["VH", "VV"]

# Define the start, end date, and year
start_date = "2017-06-01"
end_date = "2018-05-31"
year = 2018

The number of shapefiles: 37


In [19]:
for path in sorted(country_samples_path)[:1]:
    country = path.split("/")[-1].split("_")[0]
    print("******************************************************************************************")
    print(country)
    
    # Read the geodataframe
    sample_gdf = gpd.read_file(path)

    # Convert the geodataframe into a json object
    sample_json = json.loads(sample_gdf.to_json())["features"]

    # Set the output directories
    out_country_dir = os.path.join("D:\GITHUB\landscape-classification\datasets\rasters", country)
    out_image_dir = os.path.join(out_country_dir, "Images")
    out_mask_dir = os.path.join(out_country_dir, "Masks")

    for p in [out_country_dir, out_image_dir, out_mask_dir]:
        if os.path.exists(p):
            print(f"{p} directory is already existed.")
        else:
            os.mkdir(p)
            print(f"{p} directory has been successfully created!")

    # Prepare the datasets
    region = europe.filter(ee.Filter.eq('name', country)).geometry()

    sentinel_composite = prepareSentinelComposite(
        region=region,
        s1_band_names=s1_band_names,
        s2_band_names=s2_band_names,
        start_date=start_date,
        end_date=end_date
    )

    lulc_combined = prepareLULCSource(
        region=region,
        year=year,
        start_date=start_date,
        end_date=end_date
    )

    # Set output image parameters
    img_params = {
        "count": len(sample_gdf),
        "buffer": 1270,
        "dimensions": "256x256",  # The dimension of each image chip
        "format": "GEO_TIFF",  # The output image format, can be png, jpg, ZIPPED_GEO_TIFF, GEO_TIFF, NPY
        "processes": 30,  # How many processes to used for parallel processing
    }
    
    # Create a Function for Downloading Image
    @retry(tries=10, delay=1, backoff=2)
    def getResult(id, type, props, geom):

        index = id
        landcover = props["landcover"]
        coords = ee.Geometry.Point(geom["coordinates"])

        region = coords.buffer(img_params["buffer"]).bounds()

        if img_params["format"] in ["png", "jpg"]:
            img_url = sentinel_composite.getThumbURL(
                {
                    "region": region,
                    "dimensions": img_params["dimensions"],
                    "format": img_params["format"],
                }
            )

            mask_url = lulc_combined.getThumbURL(
                {
                    "region": region,
                    "dimensions": img_params["dimensions"],
                    "format": img_params["format"],
                }
            )

        else:
            img_url = sentinel_composite.getDownloadURL(
                {
                    "region": region,
                    "dimensions": img_params["dimensions"],
                    "format": img_params["format"],
                }
            )

            mask_url = lulc_combined.getDownloadURL(
                {
                    "region": region,
                    "dimensions": img_params["dimensions"],
                    "format": img_params["format"],
                }
            )

        if img_params["format"] == "GEO_TIFF":
            ext = "tif"
        else:
            ext = img_params["format"]

        r_img = requests.get(img_url, stream=True)
        if r_img.status_code != 200:
            r_img.raise_for_status()

        r_mask = requests.get(mask_url, stream=True)
        if r_mask.status_code != 200:
            r_mask.raise_for_status()

        basename = str(index).zfill(len(str(img_params["count"])))
        img_filename = f"{out_image_dir}/{basename}_{landcover}_Image.{ext}"
        mask_filename = f"{out_mask_dir}/{basename}_{landcover}_Mask.{ext}"

        with open(img_filename, "wb") as out_file:
            shutil.copyfileobj(r_img.raw, out_file)
        print("Done (Image): ", basename)

        with open(mask_filename, "wb") as out_file:
            shutil.copyfileobj(r_mask.raw, out_file)
        print("Done (Mask): ", basename)

    # Extract the patches
    logging.basicConfig()
    items = [list(i.values()) for i in sample_json]

    pool = multiprocessing.Pool(img_params["processes"])
    pool.starmap(getResult, items)

    pool.close()

    # # Define the folder to zip and output file name
    # folder_to_zip = os.path.join("/content/datasets/", country)
    # output_zip_file = f"{country}.zip"

    # # Zip the folder
    # !zip -r {output_zip_file} {folder_to_zip}

    # files.download(output_zip_file)

******************************************************************************************
D:\GITHUB\landscape-classification\datasets\shapefiles\Albania
D:\GITHUB\landscape-classification\datasets\shapefiles\Albania directory has been successfully created!
D:\GITHUB\landscape-classification\datasets\shapefiles\Albania\Images directory has been successfully created!
D:\GITHUB\landscape-classification\datasets\shapefiles\Albania\Masks directory has been successfully created!


In [18]:
test_gdf = gpd.read_file('D:\GITHUB\landscape-classification\datasets\shapefiles\Germany_samples.shp')

# convert the geodataframe into a json object
test_gdf_json = json.loads(test_gdf.to_json())["features"]
test_gdf_json[0]

{'id': '0',
 'type': 'Feature',
 'properties': {'landcover': 111},
 'geometry': {'type': 'Point',
  'coordinates': [9.252520529643364, 48.78194998258037]}}

In [21]:
# define the prefix
state = 'Germany'
output_country_folder = rf"D:\GITHUB\landscape-classification\datasets\rasters\{state}"
output_image_dir = rf"D:\GITHUB\landscape-classification\datasets\rasters\{state}\Images"
output_mask_dir = rf"D:\GITHUB\landscape-classification\datasets\rasters\{state}\Masks"

if os.path.exists(output_country_folder):
  print("Country directory is already existed.")
else:
  os.mkdir(output_country_folder)
  print("Country directory has been successfully created!")

if os.path.exists(output_image_dir):
  print("Image directory is already existed.")
else:
  os.mkdir(output_image_dir)
  print("Image directory has been successfully created!")

if os.path.exists(output_mask_dir):
  print("Mask directory is already existed.")
else:
  os.mkdir(output_mask_dir)
  print("Mask directory has been successfully created!")

Country directory has been successfully created!
Image directory has been successfully created!
Mask directory has been successfully created!


In [22]:
# Set Parameters
img_params = {
    "count": len(test_gdf),
    "buffer": 1270,
    "dimensions": "256x256",  # The dimension of each image chip
    "format": "GEO_TIFF",  # The output image format, can be png, jpg, ZIPPED_GEO_TIFF, GEO_TIFF, NPY
    "processes": 30,  # How many processes to used for parallel processing
    "out_dir": output_image_dir,  # The output directory. Default to the current working directly
}

mask_params = {
    "count": len(test_gdf),
    "buffer": 1270,
    "dimensions": "256x256",  # The dimension of each image chip
    "format": "GEO_TIFF",  # The output image format, can be png, jpg, ZIPPED_GEO_TIFF, GEO_TIFF, NPY
    "processes": 30,  # How many processes to used for parallel processing
    "out_dir": output_mask_dir,  # The output directory. Default to the current working directly
}

In [23]:
# Create a Function for Downloading Image
@retry(tries=10, delay=1, backoff=2)
def getResult(id, type, props, geom):

    index = id
    landcover = props["landcover"]
    coords = ee.Geometry.Point(geom["coordinates"])

    region = coords.buffer(img_params["buffer"]).bounds()

    if img_params["format"] in ["png", "jpg"]:
        img_url = test_sentinel_composite.getThumbURL(
            {
                "region": region,
                "dimensions": img_params["dimensions"],
                "format": img_params["format"],
            }
        )

        mask_url = test_lulc_combined.getThumbURL(
            {
                "region": region,
                "dimensions": mask_params["dimensions"],
                "format": mask_params["format"],
            }
        )

    else:
        img_url = test_sentinel_composite.getDownloadURL(
            {
                "region": region,
                "dimensions": img_params["dimensions"],
                "format": img_params["format"],
            }
        )

        mask_url = test_lulc_combined.getDownloadURL(
            {
                "region": region,
                "dimensions": mask_params["dimensions"],
                "format": mask_params["format"],
            }
        )

    if img_params["format"] == "GEO_TIFF":
        ext = "tif"
    else:
        ext = img_params["format"]

    r_img = requests.get(img_url, stream=True)
    if r_img.status_code != 200:
        r_img.raise_for_status()

    r_mask = requests.get(mask_url, stream=True)
    if r_mask.status_code != 200:
        r_mask.raise_for_status()

    img_out_dir = os.path.abspath(img_params["out_dir"])
    mask_out_dir = os.path.abspath(mask_params["out_dir"])
    basename = str(index).zfill(len(str(img_params["count"])))
    img_filename = f"{img_out_dir}/{basename}_{landcover}.{ext}"
    mask_filename = f"{mask_out_dir}/{basename}_{landcover}.{ext}"

    with open(img_filename, "wb") as out_file:
        shutil.copyfileobj(r_img.raw, out_file)
    print("Done (Image): ", basename)

    with open(mask_filename, "wb") as out_file:
        shutil.copyfileobj(r_mask.raw, out_file)
    print("Done (Mask): ", basename)

In [None]:
%%time
logging.basicConfig()
items = [list(i.values()) for i in test_gdf_json]

pool = multiprocessing.Pool(img_params["processes"])
pool.starmap(getResult, items)

pool.close()

In [103]:
for path in country_samples_path[:5]:
    country = path.split("\\")[-1].split("_")[0]

    region = europe.filter(ee.Filter.eq('name', country)).geometry()

    sentinel_composite = prepareSentinelComposite(
        region=region, 
        s1_band_names=s1_band_names, 
        s2_band_names=s2_band_names,
        start_date=start_date,
        end_date=end_date
    )

    lulc_combined = prepareLULCSource(
        region=region,
        year=year,
        start_date=start_date,
        end_date=end_date
    )

    output_dir = os.path.join(r"D:\GITHUB\landscape-classification\datasets\rasters", country)
    if os.path.exists(output_dir)==False:
        os.makedirs(output_dir)

    # Read the geodataframe
    country_gdf = gpd.read_file(path)

    # Convert the geodataframe into a json object
    country_json = json.loads(country_gdf.to_json())['features']

    # Set Parameters
    img_params = {
        "count": len(country_gdf),
        "buffer": 1270,
        "dimensions": "256x256",  # The dimension of each image chip
        "format": "GEO_TIFF",  # The output image format, can be png, jpg, ZIPPED_GEO_TIFF, GEO_TIFF, NPY
        "processes": 30,  # How many processes to used for parallel processing
        "out_dir": output_dir   #  Output directory. Default to the current working directly
    }

    # Function for Downloading Image
    @retry(tries=10, delay=1, backoff=2)
    def getResult(id, type, props, geom):

        index = id
        landcover = props["landcover"]
        coords = ee.Geometry.Point(geom["coordinates"])

        region = coords.buffer(img_params['buffer']).bounds()

        if format in ["png", "jpg"]:
            img_url = sentinel_composite.getThumbURL(
                {
                    "region": region,
                    "dimensions": dimensions,
                    "format": format,
                }
            )

            mask_url = lulc_combined.getThumbURL(
                {
                    "region": region,
                    "dimensions": dimensions,
                    "format": format,
                }
            )

        else:
            img_url = sentinel_composite.getDownloadURL(
                {
                    "region": region,
                    "dimensions": img_params['dimensions'],
                    "format": format,
                }
            )

            mask_url = lulc_combined.getDownloadURL(
                {
                    "region": region,
                    "dimensions": img_params['dimensions'],
                    "format": format,
                }
            )

        if format == "GEO_TIFF":
            ext = "tif"
        else:
            ext = format

        r_img = requests.get(img_url, stream=True)
        if r_img.status_code != 200:
            r_img.raise_for_status()

        r_mask = requests.get(mask_url, stream=True)
        if r_mask.status_code != 200:
            r_mask.raise_for_status()

        img_out_dir = os.path.join(output_dir, 'Images')
        mask_out_dir = os.path.join(output_dir, 'Masks')

        if os.path.exists(img_out_dir)==False:
            os.makedirs(img_out_dir)
        if os.path.exists(mask_out_dir)==False:
            os.makedirs(mask_out_dir)

        basename = str(index).zfill(len(str(img_params['count'])))
        img_filename = f"Image_{landcover}_{basename}.{ext}"
        mask_filename = f"Mask_{landcover}_{basename}.{ext}"

        with open(os.path.join(img_out_dir, img_filename), "wb") as out_file:
            shutil.copyfileobj(r_img.raw, out_file)
        print("Done (Image): ", basename)

        with open(os.path.join(mask_out_dir, mask_filename), "wb") as out_file:
            shutil.copyfileobj(r_mask.raw, out_file)
        print("Done (Mask): ", basename)

    logging.basicConfig()
    items = [list(i.values()) for i in country_json]

    pool = multiprocessing.Pool(img_params["processes"])
    pool.starmap(getResult, items)

    pool.close()