# Vesuvius Data Preparation

In [None]:
# Register /host as a git safe.directory so the git commands below can run against it.
!git config --global --add safe.directory /host

# Print the current branch, any uncommitted changes and the machine name for this run.
!git branch
!git status --short
!hostname

# Imports

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import PIL.Image as Image
from tqdm.notebook import tqdm

# Paths & Settings

In [None]:
# Root directory that is globbed for `<stage>/<fragment_id>/mask.png` files.
COMPETITION_DATA_DIR = Path("/data/")

# Prepare DataFrame

In [None]:
def create_df_from_mask_paths(stage, data_dir=None):
    """Build a DataFrame of source and output paths for every fragment of a stage.

    Fragments are discovered by globbing ``<data_dir>/<stage>/*/mask.png``.
    All sibling paths are derived from path components (directory + file
    name) rather than by substring replacement over the whole path, so
    directory names that happen to contain ``"mask"`` or ``"png"`` are left
    intact.

    Args:
        stage: Name of the stage sub-directory, e.g. ``"train"``.
        data_dir: Root directory to search. Defaults to the module-level
            ``COMPETITION_DATA_DIR`` when omitted.

    Returns:
        A DataFrame with one row per fragment, ordered by mask path, with
        string columns ``mask_png``, ``stage``, ``fragment_id``,
        ``mask_npy``, ``volumes_dir`` and ``volume_npy``. When ``stage`` is
        ``"train"`` it additionally has ``label_png`` and ``label_npy``
        (placed right after ``mask_npy``). The frame is empty, but still has
        all columns, when no fragment is found.
    """
    root = COMPETITION_DATA_DIR if data_dir is None else Path(data_dir)
    with_labels = stage == "train"

    # Fix the column order explicitly so an empty result has the same schema.
    columns = ["mask_png", "stage", "fragment_id", "mask_npy"]
    if with_labels:
        columns += ["label_png", "label_npy"]
    columns += ["volumes_dir", "volume_npy"]

    rows = []
    for mask_png in sorted(root.glob(f"{stage}/*/mask.png")):
        fragment_dir = mask_png.parent
        # Outputs go to the same location with "input" swapped for "working";
        # when the path contains no "input" they land next to the sources.
        output_dir = Path(str(fragment_dir).replace("input", "working"))

        row = {
            "mask_png": str(mask_png),
            "stage": fragment_dir.parent.name,
            "fragment_id": fragment_dir.name,
            "mask_npy": str(output_dir / "mask.npy"),
            "volumes_dir": str(fragment_dir / "surface_volume"),
            "volume_npy": str(output_dir / "volume.npy"),
        }
        if with_labels:
            row["label_png"] = str(fragment_dir / "inklabels.png")
            row["label_npy"] = str(output_dir / "inklabels.npy")
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Collect the source and output paths of every fragment under the "train" stage.
train_df = create_df_from_mask_paths("train")

In [None]:
# Show the resulting DataFrame as the cell output for inspection.
train_df

# Convert Data to NumPy

## Based on https://www.kaggle.com/code/jpposma/vesuvius-challenge-ink-detection-tutorial

In [None]:
def load_image(path):
    """Open the image file at ``path`` with PIL and return the image object."""
    image = Image.open(path)
    return image

def load_label_npy(path):
    """Load a label image and return a boolean array that is True where a pixel is > 0."""
    pixels = np.array(load_image(path))
    return pixels > 0


def load_mask_npy(path):
    """Load a mask image, convert it to PIL mode ``"1"`` and return it as an array."""
    # Mode "1" is PIL's bilevel mode; the resulting array is presumably boolean - confirm.
    bilevel = load_image(path).convert("1")
    mask_array = np.array(bilevel)
    return mask_array


def load_z_slice_npy(path):
    """Load one z-slice image as a float32 array scaled by 1 / 65535."""
    pixels = np.array(load_image(path), dtype=np.float32)
    # 65535 is the largest uint16 value; presumably the slices are 16-bit
    # images, which would map them into [0, 1] - confirm against the data.
    return pixels / 65535.0


def load_volume_npy(volumes_dir):
    """Stack every ``*.tif`` z-slice found in ``volumes_dir`` into one array.

    Slices are loaded with ``load_z_slice_npy`` in sorted path order. The
    first slice determines the height, width and dtype of the output buffer,
    which is allocated once up front and then filled slice by slice.

    Args:
        volumes_dir: Directory (str or Path) containing the ``*.tif`` slices.

    Returns:
        Array of shape ``(num_slices, height, width)`` with the dtype of the
        first loaded slice.

    Raises:
        FileNotFoundError: If ``volumes_dir`` contains no ``*.tif`` files
            (including when the directory itself does not exist).
    """
    z_slices_paths = sorted(Path(volumes_dir).glob("*.tif"))
    if not z_slices_paths:
        # Fail with a clear message instead of an IndexError on the lookup below.
        raise FileNotFoundError(f"No *.tif z-slices found in {volumes_dir}")

    z_first = load_z_slice_npy(z_slices_paths[0])
    z_slices = np.zeros(
        (len(z_slices_paths), z_first.shape[0], z_first.shape[1]), dtype=z_first.dtype
    )
    z_slices[0] = z_first

    tqdm.write(f"z_slices.shape {z_slices.shape}")
    # The first slice is already stored, so continue from index 1.
    remaining = tqdm(z_slices_paths[1:], position=1)
    for i, z_slice_path in enumerate(remaining, start=1):
        z_slices[i] = load_z_slice_npy(z_slice_path)
    return z_slices

In [None]:
def save_data_as_npy(df, train=True):
    """Convert each fragment listed in ``df`` to ``.npy`` files on disk.

    For every row the mask image and the z-slice volume are loaded and saved
    to the row's ``mask_npy`` and ``volume_npy`` paths; the parent directory
    of ``mask_npy`` is created if needed. When ``train`` is true the label
    image at ``label_png`` is also converted and saved to ``label_npy``.

    Args:
        df: DataFrame with columns ``mask_png``, ``volumes_dir``,
            ``mask_npy`` and ``volume_npy`` (plus ``label_png`` and
            ``label_npy`` when ``train`` is true).
        train: Whether to convert the labels as well.
    """
    fragments = tqdm(
        df.itertuples(), total=len(df), desc="Processing fragments", position=0
    )
    for fragment in fragments:
        mask = load_mask_npy(fragment.mask_png)
        volume = load_volume_npy(fragment.volumes_dir)

        output_dir = Path(fragment.mask_npy).parent
        output_dir.mkdir(exist_ok=True, parents=True)
        np.save(fragment.mask_npy, mask)
        np.save(fragment.volume_npy, volume)
        tqdm.write(f"Created {fragment.volume_npy} with shape {volume.shape}")
        # Drop the arrays before the labels are loaded to keep peak memory down.
        del mask
        del volume

        if not train:
            continue

        labels = load_label_npy(fragment.label_png)
        np.save(fragment.label_npy, labels)
        del labels

In [None]:
# Convert all training fragments to .npy files; labels are included because `train` defaults to True.
save_data_as_npy(train_df)

# Fix paths

In [None]:
# Rewrite "working" to "input/vesuvis-data-preparation" in every .npy path
# column before exporting - presumably the location where the saved arrays
# are found once this notebook's output is attached as a dataset (confirm).
for npy_column in ("label_npy", "mask_npy", "volume_npy"):
    train_df[npy_column] = train_df[npy_column].str.replace(
        "working", "input/vesuvis-data-preparation", regex=False
    )

# Export the path table (index included) to the current directory.
train_df.to_csv("data.csv")

In [None]:
# List the contents of the current directory.
!ls