### Import libraries

In [None]:
import pathlib
import shutil

import numpy as np
import pandas as pd
from PIL import Image
import cv2
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure created below.
sns.set_theme()
# "talk" plotting context with a serif font family for all figure text.
sns.set_context("talk", font_scale=1.0, rc={"font.family": "serif", "font.serif": ["Computer Modern Serif"]})
# Default figure size (width, height) used by the visualize_* helper functions.
fig_size = (16,8)
import warnings
# NOTE(review): this silences every Python warning for the whole session, which
# can hide pandas/matplotlib deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

### Read list of files

The list contains all images from the dataset (5000 images). Only three annotation columns are assigned to the list:

- **To be deleted** - Obvious errors in annotations. Removing these files gives the "Correct files" set.
- **Unclear** - Unclear boundaries between classes or simplifications. Removing these files as well gives the "Correct clear files" set.
- **Strict delete** - Inconsistencies between images in annotations. Removing these files gives the "Correct clear strict files" set.

In [None]:
# Per-image annotation review table. Later cells read its "Filename",
# "To be deleted", "Unclear" and "Strict delete" columns.
selection_file_path = r"./filter_of_files.csv"
df = pd.read_csv(selection_file_path)
# Bare expression: shows the loaded table as the cell output.
df

### Functions to be used

In [None]:
"""
this functions do binning into annotation classes per image
"""
def labels_binning(root_path:pathlib.Path, list_of_label_paths:list):
    """Count label pixels per occlusion class for every label image.

    Each label image is loaded and its pixel values are tallied. The values
    0, 1, 2 and 3 are counted as "Clear", "Transparent", "Semi_transparent"
    and "Opaque" respectively. Any other pixel value is ignored, and is
    therefore also excluded from the per-image total.

    Args:
        root_path: Directory that contains the label images.
        list_of_label_paths: Iterable of file names relative to ``root_path``.

    Returns:
        DataFrame indexed by "Filename" holding the absolute pixel count of
        each class, "Total_number_of_pixel" (the sum of the four class counts)
        and one "<class>_relative" column per class with that class's
        fraction of the total.
    """
    # Position in this list is the label ID stored in the label image.
    class_names = ["Clear", "Transparent", "Semi_transparent", "Opaque"]

    records = []
    for file in tqdm.tqdm(list_of_label_paths):
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(root_path / file) as label_image:
            lbl = np.array(label_image)
        values, counts = np.unique(lbl, return_counts=True)
        pixels_by_label_id = dict(zip(values.tolist(), counts.tolist()))

        record = {"Filename": file}
        for label_id, class_name in enumerate(class_names):
            # Classes that do not occur in this image are recorded as 0.
            record[class_name] = pixels_by_label_id.get(label_id, 0)
        records.append(record)

    # Build the frame once from all records: growing it with pd.concat inside
    # the loop is quadratic and leaves the count columns with object dtype.
    df_per_image = pd.DataFrame(records, columns=["Filename"] + class_names)
    df_per_image = df_per_image.set_index("Filename")
    df_per_image["Total_number_of_pixel"] = df_per_image[class_names].sum(axis = 1)
    for column in class_names:
        print(f"We are processing occlusion level of {column}")
        df_per_image[f"{column}_relative"] = df_per_image[column]/df_per_image["Total_number_of_pixel"]
    return df_per_image

In [None]:
def visualize_labels_distribution_per_image(df_stats_per_img):
    """Show a grouped histogram of the per-image relative class coverage.

    Args:
        df_stats_per_img: DataFrame with the "Clear_relative",
            "Transparent_relative", "Semi_transparent_relative" and
            "Opaque_relative" columns (as produced by ``labels_binning``).
    """
    # Column to read -> legend entry shown for it.
    legend_by_column = {
        "Clear_relative": "Clear",
        "Transparent_relative": "Transparent",
        "Semi_transparent_relative": "Semi Transparent",
        "Opaque_relative": "Opaque",
    }

    plt.figure(figsize=fig_size)
    coverage_series = [df_stats_per_img[column] for column in legend_by_column]
    plt.hist(coverage_series, label = list(legend_by_column.values()))
    plt.xlabel("Percentage of of image covered in bin")
    plt.ylabel("Frequency")
    plt.title("Distribution of image occlusions types coverage")
    plt.legend()
    plt.show()

In [None]:
def visualize_labels_distribution(df_stats_per_img):
    """Plot the total pixel count per class and print each class's share.

    Args:
        df_stats_per_img: DataFrame with the "Clear", "Transparent",
            "Semi_transparent", "Opaque" and "Total_number_of_pixel" columns
            (as produced by ``labels_binning``).
    """
    class_columns = ["Clear", "Transparent", "Semi_transparent", "Opaque"]
    # Sum once and reuse the result for both the bar chart and the printed shares.
    pixels_per_class = df_stats_per_img[class_columns].sum()

    plt.figure(figsize=fig_size)
    plt.bar(pixels_per_class.keys(), pixels_per_class.values)
    plt.xlabel("Type of occlusion")
    plt.ylabel("Frequency")
    plt.title("Sum  class related pixels")
    # No plt.legend() here: the bars carry no labels, so there is nothing for a
    # legend to show.
    plt.show()
    print(pixels_per_class/df_stats_per_img["Total_number_of_pixel"].sum())

## Copy all files from source to split train and test

In [None]:
# Source directories the copy loops below read from (paths relative to the notebook).
gtLabelsPath_source = pathlib.Path(r"../woodscape_input/gtLabels")
rgbImagesPath_source = pathlib.Path(r"../woodscape_input/rgbImages")
rgbLabelsPath_source = pathlib.Path(r"../woodscape_input/rgbLabels")

In [None]:
# Destination directories for the test split; created by the test copy cell.
gtLabelsPath_test = pathlib.Path(r"../woodscape_preprocessed/test/gtLabels")
rgbImagesPath_test = pathlib.Path(r"../woodscape_preprocessed/test/rgbImages")
rgbLabelsPath_test = pathlib.Path(r"../woodscape_preprocessed/test/rgbLabels")

In [None]:
# Destination directories for the train split; created by the train copy cell.
# The train/val list files are written next to them (in their parent directory).
gtLabelsPath_train = pathlib.Path(r"../woodscape_preprocessed/train/gtLabels")
rgbImagesPath_train = pathlib.Path(r"../woodscape_preprocessed/train/rgbImages")
rgbLabelsPath_train = pathlib.Path(r"../woodscape_preprocessed/train/rgbLabels")

Create a naive split by using 249 files from the beginning and 248 files from the end. Files are sorted by name, which means we are selecting complete traces (7-9 images per trace), so there is no leak between the train and test sets.

In [None]:
# Test images are drawn only from rows whose "Strict delete" flag is not "nok".
selection_df = df[df["Strict delete"]!="nok"]
list_of_files_selected = selection_df["Filename"].to_list()

# Number of files taken from the head and from the tail of the filtered list.
N_TEST_HEAD = 249
N_TEST_TAIL = 248
selection = (list_of_files_selected[:N_TEST_HEAD] + list_of_files_selected[-N_TEST_TAIL:])
print(f"Len of selection for test is {len(selection)}")

# Everything not selected for test goes to train. Sorting the set difference
# gives a reproducible order: a plain set of str iterates in an order that
# depends on the interpreter's hash seed, so the copy loop and the saved
# per-image statistics would otherwise be ordered differently on each run.
all_train = sorted(set(df["Filename"].to_list()) - set(selection))
print(f"Len of selection for train is {len(all_train)}")
print(f"Does test+train equals to total number of file: {(len(all_train)+len(selection))==len(df)}")

### Copy all training images

In [None]:
# (source directory, train directory) pairs, in the order gtLabels, rgbImages, rgbLabels.
train_copy_plan = [
    (gtLabelsPath_source, gtLabelsPath_train),
    (rgbImagesPath_source, rgbImagesPath_train),
    (rgbLabelsPath_source, rgbLabelsPath_train),
]

# Create every destination directory (including parents) before copying.
for source_dir, target_dir in train_copy_plan:
    target_dir.mkdir(exist_ok=True, parents=True)

# Copy the three files that belong to each training image, keeping the file name.
for selected_image_name in tqdm.tqdm(all_train):
    for source_dir, target_dir in train_copy_plan:
        shutil.copy(str(source_dir/selected_image_name), str(target_dir/selected_image_name))

### Calculate stats per bin and visualize

In [None]:
# Count label pixels per class for every copied training label image, then plot
# the per-image coverage histogram and the overall per-class totals.
train_stats = labels_binning(gtLabelsPath_train, all_train)
visualize_labels_distribution_per_image(train_stats)
visualize_labels_distribution(train_stats)

### Copy all test images

In [None]:
# (source directory, test directory) pairs, in the order gtLabels, rgbImages, rgbLabels.
test_copy_plan = [
    (gtLabelsPath_source, gtLabelsPath_test),
    (rgbImagesPath_source, rgbImagesPath_test),
    (rgbLabelsPath_source, rgbLabelsPath_test),
]

# Create every destination directory (including parents) before copying.
for source_dir, target_dir in test_copy_plan:
    target_dir.mkdir(exist_ok=True, parents=True)

# Copy the three files that belong to each test image, keeping the file name.
for selected_image_name in tqdm.tqdm(selection):
    for source_dir, target_dir in test_copy_plan:
        shutil.copy(str(source_dir/selected_image_name), str(target_dir/selected_image_name))

### Calculate stats per bin and visualize

In [None]:
# Count label pixels per class for every copied test label image, then plot
# the per-image coverage histogram and the overall per-class totals.
test_stats = labels_binning(gtLabelsPath_test, selection)
visualize_labels_distribution_per_image(test_stats)
visualize_labels_distribution(test_stats)

#### Save train and test statistics

In [None]:
# Persist the per-image statistics (indexed by file name) in the notebook's
# working directory.
train_stats.to_csv("train_stats.csv")
test_stats.to_csv("test_stats.csv")

In [None]:
# Side-by-side bar charts of the total pixel count per class (in millions)
# for the train and the test split.
fig,axs = plt.subplots(1,2,figsize=(14,6))
colors = ["blue","orange","green", "red"]
classes = ["Clear", "Transparent", "Semi_transparent", "Opaque"]

# Both panels share one layout; only the data and the title differ.
split_panels = [
    (axs[0], train_stats, "Sum of class related pixels for training"),
    (axs[1], test_stats, "Sum of class related pixels for test"),
]
for panel_ax, panel_stats, panel_title in split_panels:
    panel_ax.bar(classes,
            (panel_stats[classes].sum().values)/1000000,color=colors)
    panel_ax.set_ylabel("Pixel frequency in milions")
    # Rotate the existing category tick labels. The same angle is used on both
    # panels (the test panel previously used 24 instead of 25), and the ticks
    # themselves are left to the categorical axis instead of being overridden
    # with set_xticklabels.
    panel_ax.tick_params(axis="x", labelrotation=25)
    panel_ax.set_title(panel_title)

fig.tight_layout()
fig.savefig("train_test_split.pdf")
plt.show()

### Create txt list of files for network training using "Baseline set"

In [None]:
# Baseline set: every training image, without any annotation-quality filter.
all_files = df[df.Filename.isin(all_train)]

# Positional split: the first N_TRAIN_BASELINE rows train, the remainder validate.
N_TRAIN_BASELINE = 3833
train_files = all_files[:N_TRAIN_BASELINE]
val_files = all_files[N_TRAIN_BASELINE:]

for dataset_name, dataset in zip(["train", "val"], [train_files, val_files]):
    filenames = dataset["Filename"].to_list()
    # The context manager closes the list file even if a write fails.
    with open(gtLabelsPath_train.parent/f'{dataset_name}_all_files.txt','w') as list_file:
        for filename in filenames:
            # One "<image path>,<label path>" pair per line; the space before
            # the newline is part of the existing output format and is kept.
            list_file.write(f"/rgbImages/{filename},/gtLabels/{filename} \n")
    print(f"for dataset {dataset_name} there is num of files: {len(filenames)}")

In [None]:
# Class statistics and plots for the baseline train and val lists. Both lists
# read their label images from the train gtLabels directory.
train_all_stats = labels_binning(gtLabelsPath_train, train_files["Filename"].to_list())
visualize_labels_distribution_per_image(train_all_stats)
visualize_labels_distribution(train_all_stats)
val_all_stats = labels_binning(gtLabelsPath_train, val_files["Filename"].to_list())
visualize_labels_distribution_per_image(val_all_stats)
visualize_labels_distribution(val_all_stats)

### Create txt list of files for network training using "Correct files"

In [None]:
# "Correct files": training images whose "To be deleted" flag is empty.
all_files = df[df.Filename.isin(all_train)]
correct_files = all_files[all_files["To be deleted"].isna()]

# Positional split: the first N_TRAIN_CORRECT rows train, the remainder validate.
N_TRAIN_CORRECT = 3148
train_files = correct_files[:N_TRAIN_CORRECT]
val_files = correct_files[N_TRAIN_CORRECT:]

for dataset_name, dataset in zip(["train", "val"], [train_files, val_files]):
    # A separate name is used here so the `correct_files` DataFrame above is
    # not overwritten by a plain list.
    filenames = dataset["Filename"].to_list()
    # The context manager closes the list file even if a write fails.
    with open(gtLabelsPath_train.parent/f'{dataset_name}_correct_files.txt','w') as list_file:
        for filename in filenames:
            # One "<image path>,<label path>" pair per line; the space before
            # the newline is part of the existing output format and is kept.
            list_file.write(f"/rgbImages/{filename},/gtLabels/{filename} \n")
    print(f"for dataset {dataset_name} there is num of files: {len(filenames)}")

In [None]:
# Class statistics and plots for the "Correct files" train and val lists. Both
# lists read their label images from the train gtLabels directory.
train_correct_stats = labels_binning(gtLabelsPath_train, train_files["Filename"].to_list())
visualize_labels_distribution_per_image(train_correct_stats)
visualize_labels_distribution(train_correct_stats)
val_correct_stats = labels_binning(gtLabelsPath_train, val_files["Filename"].to_list())
visualize_labels_distribution_per_image(val_correct_stats)
visualize_labels_distribution(val_correct_stats)

### Create txt list of files for network training using "Correct clear files"

In [None]:
# "Correct clear files": training images with both the "Unclear" and the
# "To be deleted" flags empty.
all_files = df[df.Filename.isin(all_train)]
clear_mask = all_files["Unclear"].isna()
correct_mask = all_files["To be deleted"].isna()
clear_files = all_files[clear_mask & correct_mask]

# Positional split: the first N_TRAIN_CLEAR rows train, the remainder validate.
N_TRAIN_CLEAR = 2629
train_files = clear_files[:N_TRAIN_CLEAR]
val_files = clear_files[N_TRAIN_CLEAR:]


for dataset_name, dataset in zip(["train", "val"], [train_files, val_files]):
    filenames = dataset["Filename"].to_list()
    # The context manager closes the list file even if a write fails.
    with open(gtLabelsPath_train.parent/f'{dataset_name}_correct_clear_files.txt','w') as list_file:
        for filename in filenames:
            # One "<image path>,<label path>" pair per line; the space before
            # the newline is part of the existing output format and is kept.
            list_file.write(f"/rgbImages/{filename},/gtLabels/{filename} \n")
    print(f"for dataset {dataset_name} there is num of files: {len(filenames)}")

In [None]:
# Class statistics and plots for the "Correct clear files" train and val lists.
# Both lists read their label images from the train gtLabels directory.
train_clear_stats = labels_binning(gtLabelsPath_train, train_files["Filename"].to_list())
visualize_labels_distribution_per_image(train_clear_stats)
visualize_labels_distribution(train_clear_stats)
val_clear_stats = labels_binning(gtLabelsPath_train, val_files["Filename"].to_list())
visualize_labels_distribution_per_image(val_clear_stats)
visualize_labels_distribution(val_clear_stats)

### Create txt list of files for network training using "Correct clear strict files"

In [None]:
# "Correct clear strict files": training images whose "Strict delete" flag is empty.
all_files = df[df.Filename.isin(all_train)]
strict_delete_mask = all_files["Strict delete"].isna()
strict_files = all_files[strict_delete_mask]

# Positional split: the first N_TRAIN_STRICT rows train, the remainder validate.
N_TRAIN_STRICT = 1254
train_files = strict_files[:N_TRAIN_STRICT]
val_files = strict_files[N_TRAIN_STRICT:]

for dataset_name, dataset in zip(["train", "val"], [train_files, val_files]):
    filenames = dataset["Filename"].to_list()
    # The context manager closes the list file even if a write fails.
    with open(gtLabelsPath_train.parent/f'{dataset_name}_correct_clear_strict_files.txt','w') as list_file:
        for filename in filenames:
            # One "<image path>,<label path>" pair per line; the space before
            # the newline is part of the existing output format and is kept.
            list_file.write(f"/rgbImages/{filename},/gtLabels/{filename} \n")
    print(f"for dataset {dataset_name} there is num of files: {len(filenames)}")

In [None]:
# Class statistics and plots for the "Correct clear strict files" train and val
# lists. Both lists read their label images from the train gtLabels directory.
train_strict_stats = labels_binning(gtLabelsPath_train, train_files["Filename"].to_list())
visualize_labels_distribution_per_image(train_strict_stats)
visualize_labels_distribution(train_strict_stats)
val_strict_stats = labels_binning(gtLabelsPath_train, val_files["Filename"].to_list())
visualize_labels_distribution_per_image(val_strict_stats)
visualize_labels_distribution(val_strict_stats)

### Create txt list of files for network test using "test all files"

In [None]:
# Rebuild the test selection with the same filter and the same head/tail slice
# sizes (249 and 248) that the train/test split cell uses; the two must stay in sync.
selection_df = df[df["Strict delete"]!="nok"]
test_df = pd.concat([selection_df[:249], selection_df[-248:]])

filenames = test_df["Filename"].to_list()
# The context manager closes the list file even if a write fails.
with open(gtLabelsPath_test.parent/'test_all_files.txt','w') as list_file:
    for filename in filenames:
        # One "<image path>,<label path>" pair per line; the space before the
        # newline is part of the existing output format and is kept.
        list_file.write(f"/rgbImages/{filename},/gtLabels/{filename} \n")
print(f"for dataset test there is num of files: {len(filenames)}")

In [None]:
# Class statistics and plots for the test list, read from the test gtLabels directory.
test_all_stats = labels_binning(gtLabelsPath_test, test_df["Filename"].to_list())
visualize_labels_distribution_per_image(test_all_stats)
visualize_labels_distribution(test_all_stats)

### Show stats of all

In [None]:
# 2x2 overview: for each training-set variant, the total pixel count per class
# (in millions) of the train list with the val list drawn on top of it.
colors = ["blue","orange"]
fig,axs = plt.subplots(2,2,figsize=(14,14))
classes = ["Clear", "Transparent", "Semi_transparent", "Opaque"]

# (axes, train statistics, val statistics, title) for each panel. All four
# panels share one layout, so they are drawn by a single loop.
variant_panels = [
    (axs[0,0], train_all_stats, val_all_stats, "Sum of class related pixels-Baseline set"),
    (axs[0,1], train_correct_stats, val_correct_stats, "Sum of class related pixels-Correct files"),
    (axs[1,0], train_clear_stats, val_clear_stats, "Sum of class related pixels-Correct clear files"),
    (axs[1,1], train_strict_stats, val_strict_stats, "Sum of class related pixels-Correct clear strict files"),
]
for panel_ax, panel_train, panel_val, panel_title in variant_panels:
    panel_ax.bar(classes,
            (panel_train[classes].sum().values)/1000000, label = "train", color = colors[0])
    # Drawn at the same x positions, so the val bars overlay the train bars.
    panel_ax.bar(classes,
            (panel_val[classes].sum().values)/1000000, label = "val", color = colors[1])
    panel_ax.set_ylabel("Frequency in milions")
    # Rotate the existing category tick labels; the ticks themselves are left
    # to the categorical axis instead of being overridden with set_xticklabels.
    panel_ax.tick_params(axis="x", labelrotation=25)
    panel_ax.set_title(panel_title)
    # Fixed y-range so the four panels are directly comparable.
    panel_ax.set_ylim([0,2000])
    panel_ax.legend()

# Every panel already has its own legend, so no figure-level plt.legend() call
# is needed after the loop.
fig.tight_layout()
fig.savefig("split_of_training_and_val_dataset.pdf")
plt.show()