In [1]:
# Here we take care of paths.

from pathlib import Path
import os
print('Starting path:' + os.getcwd())
if os.getcwd()[-18:] == 'VESUVIUS_Challenge':
    pass
else:
    PATH = Path().resolve().parents[0]
    os.chdir(PATH)

# make sure you are in Paragraph_to_Tex folder
print('Current path:' + os.getcwd())

Starting path:/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/jupyter notebooks
Current path:/Users/gregory/PROJECT_ML/VESUVIUS_Challenge


In [2]:
import lab_black
from pathlib import Path

import numpy as np
import pandas as pd
import PIL.Image as Image
from tqdm.auto import tqdm

In [3]:
KAGGLE_DIR = PATH / "kaggle"

INPUT_DIR = KAGGLE_DIR / "input"

COMPETITION_DATA_DIR = INPUT_DIR / "vesuvius-challenge-ink-detection"

DOWNSAMPLING = 1.0
NUM_Z_SLICES = 64

In [4]:
def create_df_from_mask_paths( stage, downsampling):
    mask_paths = sorted(COMPETITION_DATA_DIR.glob(f"{stage}/*/mask.png"))

    df = pd.DataFrame({"mask_png": mask_paths})

    df["mask_png"] = df["mask_png"].astype(str)

    df["stage"] = df["mask_png"].str.split("/").str[-3]
    df["fragment_id"] = df["mask_png"].str.split("/").str[-2]

    df["mask_npy"] = df["mask_png"].str.replace(
        stage, f"{stage}_{downsampling}", regex=False
    )
    df["mask_npy"] = df["mask_npy"].str.replace("input", "working", regex=False)
    df["mask_npy"] = df["mask_npy"].str.replace("png", "npy", regex=False)

    if stage == "train":
        df["label_png"] = df["mask_png"].str.replace("mask", "inklabels", regex=False)
        df["label_npy"] = df["mask_npy"].str.replace("mask", "inklabels", regex=False)

    df["volumes_dir"] = df["mask_png"].str.replace(
        "mask.png", "surface_volume", regex=False
    )
    df["volume_npy"] = df["mask_npy"].str.replace("mask", "volume", regex=False)

    return df

In [5]:
train_df = create_df_from_mask_paths("train", DOWNSAMPLING)

In [6]:
train_df

Unnamed: 0,mask_png,stage,fragment_id,mask_npy,label_png,label_npy,volumes_dir,volume_npy
0,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,train,1,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...
1,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,train,2,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...
2,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,train,3,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...,/Users/gregory/PROJECT_ML/VESUVIUS_Challenge/k...


In [7]:
def load_image(path):
    return Image.open(path)


def resize_image(image, downsampling):
    size = int(image.size[0] * downsampling), int(image.size[1] * downsampling)
    return image.resize(size)


def load_and_resize_image(path, downsampling):
    image = load_image(path)
    return resize_image(image, downsampling)


def load_label_npy(path, downsampling):
    label = load_and_resize_image(path, downsampling)
    return np.array(label) > 0


def load_mask_npy(path, downsampling):
    mask = load_and_resize_image(path, downsampling).convert("1")
    return np.array(mask)


def load_z_slice_npy(path, downsampling):
    z_slice = load_and_resize_image(path, downsampling)
    return np.array(z_slice, dtype=np.float32) / 65535.0


def load_volume_npy(volumes_dir, num_z_slices, downsampling):
    mid = 65 // 2
    start = mid - num_z_slices // 2
    end = mid + num_z_slices // 2

    z_slices_paths = sorted(Path(volumes_dir).glob("*.tif"))[start:end]

    batch_size = num_z_slices // 4
    paths_batches = [
        z_slices_paths[i : i + batch_size]
        for i in range(0, len(z_slices_paths), batch_size)
    ]

    volumes = []
    for paths_batch in tqdm(
        paths_batches, leave=False, desc="Processing batches", position=1
    ):
        z_slices = [
            load_z_slice_npy(path, downsampling)
            for path in tqdm(
                paths_batch, leave=False, desc="Processing paths", position=2
            )
        ]
        volumes.append(np.stack(z_slices, axis=0))
        del z_slices

        # break

    volume = np.concatenate(volumes, axis=0)

    return volume

In [8]:
def save_data_as_npy(df, train=True):
    for row in tqdm(
        df.itertuples(), total=len(df), desc="Processing fragments", position=0
    ):
        
        mask_npy = load_mask_npy(row.mask_png, DOWNSAMPLING)
        volume_npy = load_volume_npy(row.volumes_dir, NUM_Z_SLICES, DOWNSAMPLING)

        Path(row.mask_npy).parent.mkdir(exist_ok=True, parents=True)
        np.save(row.mask_npy, mask_npy)
        np.save(row.volume_npy, volume_npy)

        if train:
            label_npy = load_label_npy(row.label_png, DOWNSAMPLING)
            np.save(row.label_npy, label_npy)

        tqdm.write(f"Created {row.volume_npy} with shape {volume_npy.shape}")

In [None]:
save_data_as_npy(train_df)

Processing fragments:   0%|          | 0/3 [00:00<?, ?it/s]

Processing batches:   0%|          | 0/4 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Created /Users/gregory/PROJECT_ML/VESUVIUS_Challenge/kaggle/working/vesuvius-challenge-ink-detection/train_1.0/1/volume.npy with shape (64, 8181, 6330)




Processing batches:   0%|          | 0/4 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

Processing paths:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
train_df["label_npy"] = train_df["label_npy"].str.replace(
    "working", "input/vesuvis-data-preparation", regex=False
)
train_df["mask_npy"] = train_df["mask_npy"].str.replace(
    "working", "input/vesuvis-data-preparation", regex=False
)
train_df["volume_npy"] = train_df["volume_npy"].str.replace(
    "working", "input/vesuvis-data-preparation", regex=False
)

train_df.to_csv(f"data_{DOWNSAMPLING}.csv")