### Stage 1: Data Preparation

- [ ] Download [OpenEarthMap](https://zenodo.org/records/7223446) dataset using the `zenodo_get` command
- [ ] Organise Metadata for OpenEarthMap dataset
- [ ] Create training, test, and validation splits
- [ ] Export Metadata as a CSV file

In [1]:
from pathlib import Path
from src.utils.common import read_yaml, show_config
from src import PARAMS_FILEPATH
from typing import List
from dataclasses import dataclass

@dataclass(frozen=True)
class DataConfig:
    """Immutable settings for the data-preparation stage.

    Attributes:
        doi: DOI of the Zenodo record to download.
        out_dir: Directory that receives the downloaded/extracted data.
        train_test_val_split: Three split fractions read from the params file.
        metadata_file: Path of the CSV the split metadata is written to.
        random_seed: Seed used for the random train/val/test split.
    """

    doi: str
    out_dir: Path
    train_test_val_split: List[float]
    metadata_file: Path
    random_seed: int

class ConfigManager:
    """Reads the parameters file and hands out typed configuration objects."""

    def __init__(self, config_path = PARAMS_FILEPATH) -> None:
        """Load the parameters found at ``config_path`` with ``read_yaml``."""
        self.config = read_yaml(config_path)

    def get_data_config(self) -> DataConfig:
        """Assemble the ``DataConfig`` for the data-preparation stage.

        Side effect: the configured output directory is created (including
        missing parents) before the config object is built.

        Returns:
            A ``DataConfig`` populated from the loaded parameters.
        """
        params = self.config

        # Make sure the output directory exists before anything uses it.
        Path(params["out_dir"]).mkdir(parents=True, exist_ok=True)

        return DataConfig(
            doi=params["doi"],
            out_dir=Path(params["out_dir"]),
            train_test_val_split=params["train_test_val_split"],
            metadata_file=Path(params["metadata_file"]),
            random_seed=params["random_seed"],
        )

# Build the data-stage config from the default params file (this also creates
# the output directory) and pass it to show_config for inspection.
cfg = ConfigManager().get_data_config()
show_config(cfg)

doi: 10.5281/zenodo.7223446
out_dir: data
train_test_val_split: [0.75, 0.15, 0.1]
metadata_file: data/metadata.csv
random_seed: 37


In [6]:
# Components
import os
import subprocess

class DataComponents:
    """Download, clean up, and split the OpenEarthMap dataset.

    The steps are meant to run in order: ``download_data`` ->
    ``aggregate_data`` -> ``split_data``. ``split_data`` relies on the
    ``images`` / ``masks`` lists that ``aggregate_data`` stores on the instance.
    """

    # Annotation is quoted so the class can be defined even when DataConfig
    # is not (yet) in scope; any object with the same attributes works.
    def __init__(self, cfg: "DataConfig") -> None:
        """Store the data configuration.

        Args:
            cfg: Object exposing ``doi``, ``out_dir``, ``train_test_val_split``,
                ``metadata_file`` and ``random_seed``.
        """
        self.config = cfg

    def _dataset_root(self) -> Path:
        """Return the directory the archive extracts into."""
        return Path(self.config.out_dir).joinpath("OpenEarthMap_wo_xBD")

    def _archive_path(self) -> Path:
        """Return the expected location of the downloaded zip archive."""
        return Path(self.config.out_dir).joinpath("OpenEarthMap.zip")

    def download_data(self, unzip: bool = True, remove_zip: bool = False) -> None:
        """Download the dataset archive from Zenodo and optionally extract it.

        The download is skipped when either the extracted dataset folder or
        the zip archive is already present in ``out_dir``. Extraction is
        decided independently of the download, so an archive left over from
        an earlier run is still unpacked when the dataset folder is missing.

        Args:
            unzip: Extract the archive into ``out_dir`` when the dataset
                folder does not exist yet.
            remove_zip: Delete the archive right after it has been extracted
                by this call.

        Raises:
            FileNotFoundError: If ``zenodo_get`` is not installed, or if the
                archive is missing after the download finished.
        """
        import zipfile

        print(">>>>>>>>>>>> Downloading data from Zenodo <<<<<<<<<<<<")
        out_dir = Path(self.config.out_dir)
        dataset_root = self._dataset_root()
        archive = self._archive_path()

        if dataset_root.exists() or archive.exists():
            print("--> Data already downloaded. Skipping.")
        else:
            # Argument list instead of a shell string: no quoting issues with
            # paths containing spaces and no shell injection via config values.
            subprocess.run(
                ["zenodo_get", "-e", "-d", str(self.config.doi), "-o", str(out_dir)]
            )
            # zenodo_get runs with -e (continue on error), so verify the
            # result explicitly instead of trusting the exit status.
            if not archive.exists():
                raise FileNotFoundError(
                    f"Download finished but {archive} was not found."
                )

        if unzip and archive.exists() and not dataset_root.exists():
            print("--> Unzipping data")
            with zipfile.ZipFile(archive) as zf:
                zf.extractall(out_dir)
            if remove_zip:
                archive.unlink()

    def aggregate_data(self) -> None:
        """Pair images with labels and delete files that have no partner.

        Images and labels are the ``*.tif`` files found under ``images/`` and
        ``labels/`` folders inside the dataset directory; they are paired by
        file stem. The pairs are stored in ``self.images`` and ``self.masks``
        (parallel lists).

        WARNING: this method is destructive - every image without a label and
        every label without an image is removed from disk.
        """
        print(">>>>>>>>>>>> Aggregating data <<<<<<<<<<<<")
        dataset_root = self._dataset_root()
        imgs = sorted(dataset_root.rglob('*/images/*.tif'))
        labels = sorted(dataset_root.rglob('*/labels/*.tif'))

        # Index labels by stem once so pairing is linear instead of quadratic.
        # A list per stem keeps the original semantics when a stem repeats.
        labels_by_stem = {}
        for label in labels:
            labels_by_stem.setdefault(label.stem, []).append(label)

        self.images = []
        self.masks = []
        for img in imgs:
            for label in labels_by_stem.get(img.stem, []):
                self.images.append(img)
                self.masks.append(label)

        print(f"--> Number of images with labels: {len(self.images)}")

        # Delete unpaired files (sets give O(1) membership checks).
        kept_images = set(self.images)
        kept_masks = set(self.masks)
        for img in imgs:
            if img not in kept_images:
                os.remove(img)
        for label in labels:
            if label not in kept_masks:
                os.remove(label)

    def split_data(self) -> None:
        """Randomly split the paired files and write the metadata CSV.

        Builds a table with ``image``, ``mask`` and ``group`` columns, where
        ``group`` is one of ``train`` / ``val`` / ``test``, stores it in
        ``self.metadata`` and saves it to ``metadata_file``.

        Raises:
            RuntimeError: If ``aggregate_data`` has not been run on this
                instance.
            ValueError: If the validation and test fractions sum to zero.
        """
        import pandas as pd
        from sklearn.model_selection import train_test_split

        print(">>>>>>>>>>>> Splitting data into train/val/test sets <<<<<<<<<<<<")

        if not hasattr(self, "images") or not hasattr(self, "masks"):
            raise RuntimeError("aggregate_data() must be called before split_data().")

        meta = pd.DataFrame({"image": self.images, "mask": self.masks})

        # The three fractions are expected to sum to 1, e.g. [0.7, 0.2, 0.1].
        # NOTE(review): the config key is named `train_test_val_split`, but
        # the values are consumed in (train, val, test) order - confirm which
        # order is intended and rename the key or swap the unpacking.
        train_ratio, val_ratio, test_ratio = self.config.train_test_val_split
        holdout_ratio = val_ratio + test_ratio
        if holdout_ratio <= 0:
            raise ValueError("Validation and test fractions must sum to more than 0.")

        seed = self.config.random_seed

        # First split: separate train from the rest.
        train_df, holdout_df = train_test_split(
            meta, train_size=train_ratio, random_state=seed
        )

        # Second split: divide the rest into val and test, rescaling the
        # validation fraction relative to the held-out portion.
        val_df, test_df = train_test_split(
            holdout_df, train_size=val_ratio / holdout_ratio, random_state=seed
        )

        print(f"--> Number of images in train/val/test sets: {len(train_df)}, {len(val_df)}, {len(test_df)}")

        # assign() returns new frames, which avoids SettingWithCopyWarning
        # from writing a column onto the slices train_test_split returns.
        self.metadata = pd.concat(
            [
                train_df.assign(group="train"),
                val_df.assign(group="val"),
                test_df.assign(group="test"),
            ],
            axis=0,
        )

        # Make sure the target folder exists before writing the CSV.
        metadata_file = Path(self.config.metadata_file)
        metadata_file.parent.mkdir(parents=True, exist_ok=True)
        self.metadata.to_csv(metadata_file, index=False)
        print(f"Metadata saved to {self.config.metadata_file}")

In [7]:
# Run the data-preparation steps in order. split_data() reads the
# images/masks lists that aggregate_data() stores on the pipeline object,
# so aggregate_data() must run before it.
pipeline = DataComponents(ConfigManager().get_data_config())
pipeline.download_data(unzip=True, remove_zip=False)
pipeline.aggregate_data()
pipeline.split_data()

>>>>>>>>>>>> Downloading data from Zenodo <<<<<<<<<<<<
----------- Data already downloaded -----------
>>>>>>>>>>>> Aggregating data <<<<<<<<<<<<
Number of images with labels: 2687
>>>>>>>>>>>> Splitting data into train/val/test sets <<<<<<<<<<<<
Number of images in train/val/test sets: 2015, 403, 269
Metadata saved to data/metadata.csv


### Stage 2: Data Preparation

### Stage 3: Model Training

### Stage 4: Model Evaluation