### Download Train Set

In [5]:
SAVE_ON_DRIVE = True

In [6]:
from google.colab import drive
import os
import shutil

drive.mount('/content/drive')

# Set paths for Validation and Test datasets
val_dataset_path = '/content/drive/MyDrive/LoveDA/Val'
test_dataset_path = '/content/drive/MyDrive/LoveDA/Test'


# Function to handle dataset download and extraction
def handle_dataset(dataset_name, zip_url, local_path, drive_path, save_on_drive):
    if not os.path.exists(local_path):
        if os.path.exists(f"{drive_path}.zip"):
            print(f"{dataset_name} dataset available on own drive, unzipping...")
            !unzip -q {drive_path}.zip -d ./
        else:
            print(f"Downloading {dataset_name} dataset...")
            !wget -O {dataset_name}.zip "{zip_url}"
            if save_on_drive:
                print(f"Saving {dataset_name} dataset on drive...")
                !cp {dataset_name}.zip {drive_path}.zip
                print(f"{dataset_name} dataset saved on drive")
            print(f"Unzipping {dataset_name} dataset...")
            !unzip -q {dataset_name}.zip -d ./
    else:
        print(f"{dataset_name} dataset already in local")

# # Handle Validation dataset
# handle_dataset("Validation", "https://zenodo.org/records/5706578/files/Val.zip?download=1", "./Val", "/content/drive/MyDrive/LoveDA/Val", SAVE_VAL_TOO)

# # Handle Test dataset
# handle_dataset("Test", "https://zenodo.org/records/5706578/files/Test.zip?download=1", "./Test", "/content/drive/MyDrive/LoveDA/Test", False)

# Handle Train dataset
handle_dataset("Train", "https://zenodo.org/records/5706578/files/Train.zip?download=1", "./Train", "/content/drive/MyDrive/LoveDA/Train", SAVE_ON_DRIVE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train dataset already in local


### Dirs

In [7]:
from PIL import Image
import os
import torch
from tqdm import tqdm

def pil_loader(path, color_type):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert(color_type)

urban_masks_path = "./Train/Urban/masks_png"
rural_masks_path = "./Train/Rural/masks_png"


### Calculation

In [8]:
import torch
import numpy as np

idx_to_class = {
    0: 'Unclassified',
    1: 'Background',
    2: 'Building',
    3: 'Road',
    4: 'Water',
    5: 'Barren',
    6: 'Forest',
    7: 'Agricultural',
}

class_count = {
    0: 0.0,
    1: 0.0,
    2: 0.0,
    3: 0.0,
    4: 0.0,
    5: 0.0,
    6: 0.0,
    7: 0.0
}

class_ids = [0, 1, 2, 3, 4, 5, 6, 7]

nof_imgs_that_have_unclassified = 0

total = 0
for masks_path in [urban_masks_path, rural_masks_path]:
    print(f"processing {masks_path}")

    loop = tqdm(os.listdir(masks_path))
    for filename in loop:
        loop.set_description(f"Processing {filename} from ") # update desc with filename

        mask = pil_loader(os.path.join(masks_path, filename), "L")

        # Convert the PIL Image to a PyTorch tensor
        mask_tensor = torch.from_numpy(np.array(mask))

        # Count the occurrences of each class using torch.unique
        unique_classes, counts = torch.unique(mask_tensor, return_counts=True)

        if 7 in unique_classes:
            nof_imgs_that_have_unclassified += 1

        # Update the class_count dictionary
        for class_idx, count in zip(unique_classes.tolist(), counts.tolist()):
            if class_idx in class_count:
                class_count[class_idx] += count
                total += count

    class_ratio = {}
    for key in class_count.keys():
        class_ratio[key] = class_count[key] / total

    for key in class_ratio.keys():
      print(f"{idx_to_class[key]}: {class_ratio[key]:.4f}")

    print()

    print("class_ratio = {")  # Start with the opening brace
    for key in class_ratio.keys():
        print(f"    {key}: {class_ratio[key]:.4f},")  # Print each key-value pair with indentation and comma
    print("}")  # End with the closing brace

    print()
    print(f"There are {nof_imgs_that_have_unclassified} images that contain the unclassified class")

processing ./Train/Urban/masks_png


Processing 2294.png from : 100%|██████████| 1156/1156 [00:41<00:00, 27.83it/s]


Unclassified: 0.0482
Background: 0.4612
Building: 0.2018
Road: 0.0883
Water: 0.0355
Barren: 0.0720
Forest: 0.0754
Agricultural: 0.0177

class_ratio = {
    0: 0.0482,
    1: 0.4612,
    2: 0.2018,
    3: 0.0883,
    4: 0.0355,
    5: 0.0720,
    6: 0.0754,
    7: 0.0177,
}

There are 187 images that contain the unclassified class
processing ./Train/Rural/masks_png


Processing 222.png from : 100%|██████████| 1366/1366 [00:49<00:00, 27.56it/s]

Unclassified: 0.0377
Background: 0.3444
Building: 0.1064
Road: 0.0507
Water: 0.0615
Barren: 0.0502
Forest: 0.1547
Agricultural: 0.1944

class_ratio = {
    0: 0.0377,
    1: 0.3444,
    2: 0.1064,
    3: 0.0507,
    4: 0.0615,
    5: 0.0502,
    6: 0.1547,
    7: 0.1944,
}

There are 1192 images that contain the unclassified class





### Output

In [9]:
# processing ./Train/Urban/masks_png
# Processing 2294.png from : 100%|██████████| 1156/1156 [00:41<00:00, 27.83it/s]
# Unclassified: 0.0482
# Background: 0.4612
# Building: 0.2018
# Road: 0.0883
# Water: 0.0355
# Barren: 0.0720
# Forest: 0.0754
# Agricultural: 0.0177

# class_ratio = {
#     0: 0.0482,
#     1: 0.4612,
#     2: 0.2018,
#     3: 0.0883,
#     4: 0.0355,
#     5: 0.0720,
#     6: 0.0754,
#     7: 0.0177,
# }

# There are 187 images that contain the unclassified class
# processing ./Train/Rural/masks_png
# Processing 222.png from : 100%|██████████| 1366/1366 [00:49<00:00, 27.56it/s]Unclassified: 0.0377
# Background: 0.3444
# Building: 0.1064
# Road: 0.0507
# Water: 0.0615
# Barren: 0.0502
# Forest: 0.1547
# Agricultural: 0.1944

# class_ratio = {
#     0: 0.0377,
#     1: 0.3444,
#     2: 0.1064,
#     3: 0.0507,
#     4: 0.0615,
#     5: 0.0502,
#     6: 0.1547,
#     7: 0.1944,
# }

# There are 1192 images that contain the unclassified class


### Pixel Weighting Things

What I would like to do is to weigh the training inversely to the frequency of occurrence of classes in Urban, so that if the model already learned to classify buildings let's say, then it will not focus on that, but on other things like agriculture (which btw it's probably much more prevalent in Rural)

In [11]:
# Urban class ratio
class_ratio = {
    0: 0.0482,
    1: 0.4612,
    2: 0.2018,
    3: 0.0883,
    4: 0.0355,
    5: 0.0720,
    6: 0.0754,
    7: 0.0177,
}

print("urban stats")
class_ratio_inverse = {}
for key, value in class_ratio.items():
    class_ratio_inverse[key] = 1 - value

print(class_ratio_inverse)

class_ratio = {
    0: 0.0377,
    1: 0.3444,
    2: 0.1064,
    3: 0.0507,
    4: 0.0615,
    5: 0.0502,
    6: 0.1547,
    7: 0.1944,
}

print("rural stats")
class_ratio_inverse = {}
for key, value in class_ratio.items():
    class_ratio_inverse[key] = 1 - value

print(class_ratio_inverse)

urban stats
{0: 0.9518, 1: 0.5388, 2: 0.7982, 3: 0.9117, 4: 0.9645, 5: 0.928, 6: 0.9246, 7: 0.9823}
rural stats
{0: 0.9623, 1: 0.6556, 2: 0.8936, 3: 0.9493, 4: 0.9385, 5: 0.9498, 6: 0.8452999999999999, 7: 0.8056}
