In [1]:
# Instructions on how to use this script can be found at
# https://github.com/neptune-ai/examples/blob/main/utils/migration_tools/backup_neptune/README.md
# Version string of this backup script (not referenced by the other visible cells).
VERSION = "0.1.1"

In [2]:
# Load variables from a local .env file into the process environment before
# anything else runs; NEPTUNE_PROJECT is read from os.environ further down.
from dotenv import load_dotenv
load_dotenv()


True

In [3]:
# %% Import libraries
import io
import json
import logging
import os
from contextlib import redirect_stdout
from datetime import datetime
from typing import Optional

import neptune
from neptune import management
from tqdm.auto import tqdm

In [4]:
# Field paths that the download loop skips outright instead of trying to fetch.
_UNFETCHABLE_NAMESPACES = [
    "sys/state",
    "source_code/git",
]

# File name (inside each run's folder) that collects all directly fetched values.
_JSON_FILENAME = "simple_metadata.json"

# %% Set up logging
# One log file per session, named after the start time of this cell.
log_filename = datetime.now().strftime("neptune_backup_%Y%m%d%H%M%S.log")
print(f"Logs available at {log_filename}")

# force=True replaces any handlers already attached to the root logger, so
# re-running this cell redirects logging to the newly named file.
logging.basicConfig(
    filename=log_filename,
    filemode="a",
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    force=True,
)

# Only CRITICAL records from neptune's async operation processor reach the log.
logging.getLogger("neptune.internal.operation_processors.async_operation_processor").setLevel(
    logging.CRITICAL
)

logging.info("Backup process started")

# %%

Logs available at neptune_backup_20250417125739.log


In [5]:
# Destination directory for the backup. Leaving the string empty selects the
# current working directory instead.
download_folder = "./bulk_downloads"
download_folder = os.getcwd() if download_folder == "" else download_folder

In [6]:
# Create the destination directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(download_folder, exist_ok=True)
logging.info(f"Downloading to {download_folder}")

# %%

In [7]:
# Whether Artifact fields are downloaded. A real boolean is used because the
# flag is consumed with a plain truthiness check, under which any non-empty
# string (including "n") would count as enabled. Set to False to skip artifacts.
download_artifacts = True
logging.info(f"{download_artifacts=}")

# %%

In [8]:
# Ask the neptune management API for the project list and echo it to both the
# notebook output and the log file.
projects = management.get_project_list()

print(f"Projects found: {projects}")
logging.info(f"Projects found: {projects}")

# %%

Projects found: ['sbuser/example-project-tensorflow-keras', 'sbuser/SX3M']


In [9]:
# Raw project selection from the environment: either the literal "all" or a
# comma-separated list of project names (parsed in the next cell).
# Raises KeyError when NEPTUNE_PROJECT is not set.
selected_projects = os.environ["NEPTUNE_PROJECT"]
logging.info(f"Exporting {selected_projects}")

In [10]:
# Turn the raw selection string into a list of project names.
# The isinstance guard keeps this cell idempotent: once the name has been
# rebound to a list, re-running the cell leaves it untouched instead of failing
# on a missing ``.split``.
if isinstance(selected_projects, str):
    if selected_projects.strip() == "all":
        selected_projects = projects
    else:
        # Strip whitespace around each entry ("a, b" -> ["a", "b"]) and drop
        # empty entries left behind by stray commas.
        selected_projects = [
            name.strip() for name in selected_projects.split(",") if name.strip()
        ]

In [11]:
# %%
def flatten_namespaces(
    dictionary: dict, prefix: Optional[list] = None, result: Optional[list] = None
) -> list:
    """Collect the paths of all non-dict leaves of a nested dictionary.

    Nested keys are joined with "/" (``{"a": {"b": 1}}`` -> ``["a/b"]``).
    Leaves are listed depth-first in the dictionaries' iteration order.

    Args:
        dictionary: The (possibly nested) mapping to walk.
        prefix: Keys leading to ``dictionary``; used by the recursion.
        result: List the paths are appended to; a fresh list when omitted.

    Returns:
        The list holding the collected paths (``result`` itself when one was given).
    """
    path_parts = [] if prefix is None else prefix
    collected = [] if result is None else result

    for key, value in dictionary.items():
        if isinstance(value, dict):
            # Descend, sharing the same output list.
            flatten_namespaces(value, path_parts + [key], collected)
            continue

        joined = "/".join(path_parts)
        # Top-level leaves are stored under their bare key.
        collected.append(f"{joined}/{key}" if joined else key)

    return collected

In [12]:
# %% Start backup
# Banner only: the download itself runs when download_metadata_for_runs() is called below.
print(f"Starting backup. View logs at {log_filename}. Press Ctrl/Cmd + C to cancel at any time")

Starting backup. View logs at neptune_backup_20250417125739.log. Press Ctrl/Cmd + C to cancel at any time


In [13]:
def _artifacts_enabled(flag) -> bool:
    """Interpret the ``download_artifacts`` setting as a real boolean.

    The setting may be a yes/no style string; a plain truthiness test would
    treat every non-empty string (even ``"n"``) as enabled.
    """
    if isinstance(flag, str):
        return flag.strip().lower() in {"y", "yes", "true", "1"}
    return bool(flag)


def _backup_field(run, namespace, run_download_path, single_values, include_artifacts):
    """Back up a single field of ``run``.

    The field type is taken from the first token of the field's ``str()``
    form (e.g. ``"<FloatSeries"``). File-like fields are downloaded below
    ``run_download_path``; everything else is fetched into ``single_values``.
    """
    destination = os.path.join(run_download_path, namespace)
    field = run[namespace]
    field_kind = str(field).split()[0]

    if field_kind == "<Artifact":
        if include_artifacts:
            field.download(destination)

    elif field_kind in ("<FloatSeries", "<StringSeries"):
        # Series are exported as CSV next to their namespace path.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        field.fetch_values().to_csv(f"{destination}.csv", index=False)

    elif field_kind == "<File":
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        ext = field.fetch_extension()
        # Avoid a dangling "." in the file name when no extension is reported.
        field.download(f"{destination}.{ext}" if ext else destination)

    elif field_kind == "<FileSeries":
        field.download(destination)

    elif field_kind == "<FileSet":
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        field.download(f"{destination}.zip")

    else:
        # StringSet and all remaining field types are fetched directly.
        single_values[namespace] = field.fetch()


def _backup_run(run, run_download_path, include_artifacts):
    """Back up every fetchable field of ``run`` into ``run_download_path``."""
    namespaces = flatten_namespaces(run.get_structure())
    single_values = {}

    try:
        for namespace in namespaces:
            if namespace in _UNFETCHABLE_NAMESPACES:
                continue
            try:
                _backup_field(
                    run, namespace, run_download_path, single_values, include_artifacts
                )
            except Exception as e:
                # One broken field must not drop the rest of the run from the
                # backup: log it and carry on with the next namespace.
                logging.error(f"Error while downloading {namespace}\n{e}")
    finally:
        # Written once per run (not once per namespace); the ``finally`` keeps
        # whatever was collected even if the loop is interrupted.
        with open(os.path.join(run_download_path, _JSON_FILENAME), mode="w") as file:
            file.write(
                json.dumps(
                    single_values,
                    indent=4,
                    sort_keys=True,
                    default=str,
                )
            )


def download_metadata_for_runs(run_ids: Optional[list] = None):
    """Download the metadata of runs from every project in ``selected_projects``.

    Each run is stored under ``<download_folder>/<project>/<run_id>/``. Directly
    fetched values end up in ``_JSON_FILENAME``, series as CSV files and
    files / file sets / artifacts as downloads. Reads the notebook globals
    ``selected_projects``, ``download_folder`` and ``download_artifacts``.

    Args:
        run_ids: Run ids to back up. When omitted or empty, all runs listed in
            each project's runs table are backed up. A single id may be passed
            as a plain string. The same ids are used for every selected project.

    Raises:
        Exception: Errors raised while opening a project or a run propagate to
            the caller; errors on individual fields are logged and skipped.
    """
    if run_ids is None:
        requested = []
    elif isinstance(run_ids, str):
        # A bare string would otherwise be iterated character by character.
        requested = [run_ids]
    else:
        requested = list(run_ids)

    include_artifacts = _artifacts_enabled(download_artifacts)

    for project in tqdm(selected_projects, desc="Total progress"):
        project_download_path = os.path.join(download_folder, project)
        os.makedirs(project_download_path, exist_ok=True)
        logging.info(f"Downloading runs from {project} to {project_download_path}")

        # stdout is discarded while talking to neptune (presumably to silence
        # the client's console output); progress is therefore reported via
        # logging, not print.
        with redirect_stdout(io.StringIO()):
            if requested:
                runs = requested
            else:
                # The runs table is only needed when no explicit ids were given.
                with neptune.init_project(project=project, mode="read-only") as _project:
                    runs_table = _project.fetch_runs_table(columns=[]).to_pandas()
                runs = list(runs_table["sys/id"]) if "sys/id" in runs_table else []

            logging.info(f"Running download for {len(runs)} runs: {runs}")

            for run_id in tqdm(runs, desc=project):
                with neptune.init_run(
                    project=project,
                    with_id=run_id,
                    mode="read-only",
                ) as run:
                    run_download_path = os.path.join(project_download_path, run_id)
                    os.makedirs(run_download_path, exist_ok=True)
                    logging.info(f"Downloading {project}/{run_id} to {run_download_path}")
                    _backup_run(run, run_download_path, include_artifacts)

    logging.info("Backup complete!")

# %%

In [14]:
# Back up only the listed runs; call download_metadata_for_runs() without
# arguments to back up every run of the selected projects.
download_metadata_for_runs(run_ids=["SXM-762","SXM-449"])

Total progress:   0%|          | 0/1 [00:00<?, ?it/s]



sbuser/SX3M:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching file...: 0 [00:00, ?/s]

Fetching monitoring/8c08bc31/cpu values: 0 [00:00, ?/s]

Fetching monitoring/8c08bc31/memory values: 0 [00:00, ?/s]

Fetching monitoring/8c08bc31/stderr values: 0 [00:00, ?/s]

Fetching monitoring/8c08bc31/stdout values: 0 [00:00, ?/s]