In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from zipfile import ZipFile
import pathlib
import shutil

# Create the Kaggle API client and authenticate it; the download cell below
# uses this `api` object, so this cell must run first.
# NOTE(review): authenticate() presumably reads credentials from
# ~/.kaggle/kaggle.json or the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables — confirm against the installed kaggle package version.
api = KaggleApi()
api.authenticate()

### Utils

In [None]:
def normalize_dataset_splits(root_path):
    """
    Rename split folders directly under ``root_path`` to canonical names.

    A sub-folder whose name starts (case-insensitively) with ``"train"``,
    ``"test"`` or ``"val"`` is renamed to ``"train"``, ``"test"`` or
    ``"validation"`` respectively. Plain files and folders with any other
    name are left untouched. Only the immediate children of ``root_path``
    are inspected.

    If the canonical target already exists as a different entry (for example
    both ``train`` and ``Training`` are present), the folder is left in place
    and a message is printed, rather than failing mid-way or replacing the
    existing target.

    Args:
        root_path: Directory whose immediate sub-folders are normalized.

    Raises:
        OSError: If ``root_path`` cannot be listed or a rename fails.
    """
    # Order matters: the first matching prefix wins.
    canonical_by_prefix = (
        ("train", "train"),
        ("test", "test"),
        ("val", "validation"),
    )

    # Sort so that, when several folders compete for the same canonical name,
    # the winner does not depend on the OS-specific os.listdir order.
    for folder_name in sorted(os.listdir(root_path)):
        folder_path = os.path.join(root_path, folder_name)
        if not os.path.isdir(folder_path):
            continue

        lower_folder_name = folder_name.lower()
        canonical_name = next(
            (
                canonical
                for prefix, canonical in canonical_by_prefix
                if lower_folder_name.startswith(prefix)
            ),
            None,
        )
        # Nothing to do for unrelated folders or ones already named correctly.
        if canonical_name is None or canonical_name == folder_name:
            continue

        new_folder_path = os.path.join(root_path, canonical_name)

        # On case-insensitive filesystems "Train" and "train" are the same
        # entry; samefile lets that pure case change through while still
        # protecting a genuinely different existing target.
        if os.path.exists(new_folder_path) and not os.path.samefile(
            folder_path, new_folder_path
        ):
            print(f"Skipped {folder_path}: {new_folder_path} already exists")
            continue

        os.rename(folder_path, new_folder_path)
        print(f"Renamed {folder_path} to {new_folder_path}")

def move_files_up(root_path, destination_path, depth=1, moveable_file_extensions=None):
    """
    Flatten a directory tree by moving its files into ``destination_path``.

    Every file found under ``root_path`` (recursively) is moved directly into
    ``destination_path``. After a sub-folder has been processed it is removed;
    ``root_path`` itself (depth 1) is never removed, and neither is any folder
    that is, or contains, ``destination_path``.

    Args:
        root_path: Directory to walk.
        destination_path: Directory that receives the files. Created if it
            does not exist.
        depth: Recursion depth; callers should leave the default of 1.
        moveable_file_extensions: Optional collection of suffixes including
            the leading dot (e.g. ``[".jpg", ".json"]``), compared
            case-sensitively. ``None`` moves every file.

    Warning:
        When a filter is given, non-matching files inside sub-folders are
        deleted together with their sub-folder. Non-matching files directly
        in ``root_path`` are kept.

    Raises:
        shutil.Error: If a file with the same name already exists in
            ``destination_path``. Sub-folders not yet processed are left
            intact in that case.
    """
    # Without an existing directory shutil.move treats the destination as a
    # *file name*, and every moved file would overwrite the previous one.
    os.makedirs(destination_path, exist_ok=True)

    root_abs = os.path.realpath(root_path)
    dest_abs = os.path.realpath(destination_path)
    at_destination = root_abs == dest_abs
    contains_destination = at_destination or dest_abs.startswith(root_abs + os.sep)

    for file_name in os.listdir(root_path):
        file_path = os.path.join(root_path, file_name)
        if os.path.isdir(file_path):
            move_files_up(file_path, destination_path, depth + 1, moveable_file_extensions)
            continue

        # Files already sitting in the destination need no move; attempting
        # one would make shutil.move raise "already exists".
        if at_destination:
            continue

        if moveable_file_extensions is not None:
            file_extension = pathlib.Path(file_path).suffix
            if file_extension not in moveable_file_extensions:
                continue

        # The move must happen whether or not a filter was supplied.
        shutil.move(file_path, destination_path)
        print(f"Moved {file_path} to {destination_path}")

    # Only remove folders below the root, and never one holding the destination.
    if depth > 1 and not contains_destination:
        shutil.rmtree(root_path)

### Downloading the datasets

In [None]:
# Dataset keys for the Kaggle API, written as "<owner>/<dataset-slug>".
# The download cell uses the part after the "/" as the downloaded zip's name.
DATASET_IDS = [
    "pkdarabi/brain-tumor-image-dataset-semantic-segmentation",
]

# Local folder name for each dataset. Entries are paired with DATASET_IDS by
# position (the download cell indexes this list with its loop counter), so
# both lists must have the same length and order.
DATASET_NAMES = [
    'tumor-segmentation-boxes',
]

# Destination folder: "<parent of the current working directory>/datasets".
# The path is resolved from the working directory at the time this cell runs,
# so it changes if the notebook is started from a different directory.
DATASET_FOLDER_PATH = pathlib.Path().absolute().parent / "datasets"


In [None]:

# DATASET_IDS and DATASET_NAMES are paired by position; fail fast on a
# mismatch instead of hitting an IndexError (or silently ignoring entries)
# part-way through the downloads.
if len(DATASET_IDS) != len(DATASET_NAMES):
    raise ValueError(
        f"DATASET_IDS has {len(DATASET_IDS)} entries but "
        f"DATASET_NAMES has {len(DATASET_NAMES)}"
    )

for dataset, dataset_name in zip(DATASET_IDS, DATASET_NAMES):
    dataset_path = DATASET_FOLDER_PATH / dataset_name

    # An existing folder is treated as "already downloaded", which keeps
    # Restart & Run All cheap.
    if os.path.exists(dataset_path):
        print(f"{dataset} already exists. Skipping...")
        continue

    print(f"Downloading {dataset}...")
    try:
        api.dataset_download_files(dataset, path=dataset_path, quiet=False)

        # Unzip the downloaded archive; it is named after the dataset slug.
        zip_file_path = dataset_path / f"{dataset.split('/')[-1]}.zip"
        with ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(dataset_path)

        # Remove the zip file once its contents are extracted
        os.remove(zip_file_path)

        # Rename split folders to train / test / validation
        normalize_dataset_splits(dataset_path)
    except BaseException:
        # The folder did not exist before this iteration. Remove the partial
        # result so the exists-check above does not skip a broken dataset on
        # the next run (this also covers a kernel interrupt), then re-raise.
        shutil.rmtree(dataset_path, ignore_errors=True)
        raise