## **Import Dependencies**

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import cv2
import rasterio as rio
from rasterio.fill import fillnodata
import os
from glob import glob
from tqdm.auto import tqdm
from landscape_helper import check_image_quality
from pathlib import Path
from scipy.ndimage import distance_transform_edt
import zipfile
import io

import warnings
warnings.filterwarnings("ignore")

# Define the directories
# NOTE(review): both are absolute, machine-specific Windows paths; adjust them before
# running this notebook on another machine.
# data_dir: root folder that the glob patterns in the "Read the Datasets" cell search
#           for zipped patch archives.
# out_dir:  root folder used by the (currently commented-out) export cells when they
#           build destination paths for the saved .npy files.
data_dir = r"D:\HALDER\GITHUB\MSM-Research\Landscape-Classification\datasets"
out_dir = r"D:\HALDER\GITHUB\MSM-Research\Landscape-Classification\datasets_v1"

## **Read the Datasets**

In [6]:
# Collect the zipped patch archives found under data_dir.
# glob() returns matches in a file-system dependent order, so every list is sorted to
# keep anything derived from it (e.g. the country list) reproducible across machines.
seasonal_patches_paths = sorted(glob(rf'{data_dir}\seasonal_patches\*\*.zip'))
topo_patches_paths = sorted(glob(rf'{data_dir}\topo_patches\*.zip'))
label_patches_paths = sorted(glob(rf'{data_dir}\label_patches\*.zip'))

# Print the number of files
print(len(seasonal_patches_paths), len(topo_patches_paths), len(label_patches_paths))

148 37 37


In [7]:
# Extract the country names: every label archive matched by the "*.zip" pattern is named
# after its country, so the file stem is the country name. Path.stem replaces the manual
# split on "\\" and the hard-coded 4-character extension slice.
country_names = [Path(label_path).stem for label_path in label_patches_paths]
print('Number of countries:\n', np.array(country_names))

Number of countries:
 ['Albania' 'Austria' 'Azores Islands' 'Belgium' 'Bosnia & Herzegovina'
 'Bulgaria' 'Croatia' 'Czech Republic' 'Denmark' 'Estonia' 'Faroe Islands'
 'Finland' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland' 'Ireland'
 'Italy' 'Latvia' 'Lithuania' 'Luxembourg' 'Montenegro' 'Netherlands'
 'Norway' 'Poland' 'Portugal' 'Romania' 'Serbia' 'Slovakia' 'Slovenia'
 'Spain' 'Sweden' 'Switzerland'
 'The former Yugoslav Republic of Macedonia' 'Turkey'
 'U.K. of Great Britain and Northern Ireland']


## **Data Processing**

In [46]:
# Function to fill the data gaps
def fill_gaps(image_array, max_search_distance=100, smoothing_iterations=0,
              n_s2_bands=11, n_mask_bands=9):
    """Fill no-data gaps in the leading (Sentinel-2) bands of an image patch.

    A pixel is considered a gap when the sum of the first ``n_mask_bands`` bands
    is exactly 0 at that location. The same gap mask is used for every one of the
    first ``n_s2_bands`` bands, each of which is interpolated independently with
    ``rasterio.fill.fillnodata``. Bands after the first ``n_s2_bands`` are appended
    to the result unchanged.

    Parameters
    ----------
    image_array : numpy.ndarray
        3-D array indexed as ``[bands, height, width]``.
    max_search_distance : int or float, default 100
        Forwarded to ``fillnodata`` (search radius, in pixels, for source values).
    smoothing_iterations : int, default 0
        Forwarded to ``fillnodata`` (number of smoothing passes after filling).
    n_s2_bands : int, default 11
        Number of leading bands that are gap-filled.
    n_mask_bands : int, default 9
        Number of leading bands summed to detect gap pixels.

    Returns
    -------
    numpy.ndarray
        Array with the same band count as ``image_array``: the filled leading
        bands followed by the untouched remaining bands.
    """
    # Extract the Sentinel-2 image array (the bands that get gap-filled)
    s2_image_array = image_array[:n_s2_bands]

    # Calculate the per-pixel sum across the reference bands
    s2_image_array_sum = s2_image_array[:n_mask_bands].sum(axis=0)

    # Build the mask in the fillnodata convention: 0 marks pixels to interpolate,
    # any value > 0 marks valid pixels to interpolate from.
    nodata_mask = np.where(s2_image_array_sum == 0, 0, 1)

    # Apply fillnodata band by band, reusing the same mask for every band
    filled_data = np.empty_like(s2_image_array)
    for band in range(s2_image_array.shape[0]):
        filled_data[band] = fillnodata(
            s2_image_array[band],
            mask=nodata_mask,
            max_search_distance=max_search_distance,
            smoothing_iterations=smoothing_iterations,
        )

    # Concatenate the remaining (non gap-filled) bands behind the filled ones
    filled_image_array = np.concatenate(
        (filled_data, image_array[n_s2_bands:, :, :]), axis=0
    )

    return filled_image_array

In [117]:
# # Store all the image infos in a list of dictionaries
# final_info_list = []

# # Iterate over all the countries
# for country in tqdm(country_names):
#     print('Country:', country)
#     print('*'*50)

#     # Prepare the directories for a single country
#     country_seasonal_paths = sorted([path for path in seasonal_patches_paths if country in path])

#     # Read the file names and process the data
#     for path in country_seasonal_paths:

#         # Get the season info
#         season = path.split("\\")[-2] # Param
#         print('Season:', season)
        
#         out_country_season_path = os.path.join(out_dir, 'seasonal_patches', season, country)

#         if os.path.exists(out_country_season_path):
#             print(f"{out_country_season_path} directory is already existed.")
#         else:
#             os.mkdir(out_country_season_path)
#             print(f"{out_country_season_path} directory has been successfully created!")
        
#         with zipfile.ZipFile(path, 'r') as zip_ref:
#             file_names = zip_ref.namelist()

#             for file_name in tqdm(file_names):
#                 if file_name.lower().endswith(".tif"):  # Only process GeoTIFF images
#                     with zip_ref.open(file_name) as file:
#                         with rio.open(io.BytesIO(file.read())) as src:
#                             image = src.read()  # (shape: [bands, height, width])

#                             # Compute spatial NaN percentage
#                             nan_count = np.where(image[5]==0, 1, 0).flatten().sum() # Considering a single Sentinel-2 band
#                             total_elements = image[5].size
#                             nan_percentage = round((nan_count / total_elements) * 100, 2)  # Param

#                             # Fill the data gaps
#                             filled_array = fill_gaps(image, max_search_distance=100, smoothing_iterations=0)

#                             # Convert into float32
#                             filled_array = filled_array.astype(np.float32)

#                             # print(f"{file_name}: {image.shape}")  # Print shape to confirm bands
                    
#                             # Define the outpath
#                             out_path = os.path.join('\\'.join(path.split('\\')[-3:-1]), country, f'{file_name[:-4]}.npy')
                            
#                             # Prepare the dictionary
#                             image_info_dict = {
#                                 'country': country,
#                                 'season': season,
#                                 'image_id': '_'.join(file_name.split("_")[:2]),
#                                 'nan_perc': nan_percentage,
#                                 'path': out_path
#                             }

#                             final_info_list.append(image_info_dict)

#                             # Save the image in a numpy format
#                             np.save(os.path.join(out_dir, out_path), filled_array)

#     # Convert the info list into a dataframe
#     final_info_list_df = pd.DataFrame(final_info_list)
#     final_info_list_df.to_csv(f'temp\seasonal_patches_info.csv', index=False)


In [162]:
# # Store all the image infos in a list of dictionaries
# final_info_list = []

# # Read the file names and process the data
# for path in tqdm(label_patches_paths):

#     # Get the country name from the archive file name
#     country = path.split('\\')[-1][:-4] # Param
#     print('Country:', country)
        
#     out_country_path = os.path.join(out_dir, 'label_patches', country)

#     if os.path.exists(out_country_path):
#         print(f"{out_country_path} directory is already existed.")
#     else:
#         os.mkdir(out_country_path)
#         print(f"{out_country_path} directory has been successfully created!")
        
#     with zipfile.ZipFile(path, 'r') as zip_ref:
#         file_names = zip_ref.namelist()

#         for file_name in file_names:
#             if file_name.lower().endswith(".tif"):  # Only process GeoTIFF images
#                 with zip_ref.open(file_name) as file:
#                     try:
#                         with rio.open(io.BytesIO(file.read())) as src:
#                             # print(file_name)
#                             image = src.read().astype(np.float32)  # (shape: [bands, height, width])

#                             # Define the outpath
#                             out_path = os.path.join('\\'.join(path.split('\\')[-2:-1]), country, f'{file_name[:-4]}.npy')
                                
#                             # Prepare the dictionary
#                             image_info_dict = {
#                                 'country': country,
#                                 'image_id': '_'.join(file_name.split("_")[:2]),
#                                 'path': out_path
#                             }

#                             final_info_list.append(image_info_dict)

#                             # Save the image in a numpy format
#                             np.save(os.path.join(out_dir, out_path), image)
#                     except:
#                         continue

# # Convert the info list into a dataframe
# final_info_list_df = pd.DataFrame(final_info_list)
# final_info_list_df.to_csv(rf'temp\label_patches_info.csv', index=False)