
# Notebook Overview

This notebook demonstrates how to download, verify, and manage DICOM test data from the OpenSlide project. It includes steps to fetch metadata, download files with integrity checks, and inspect the downloaded dataset. The workflow is designed for use in Databricks environments, following workspace policies for responsible resource usage and data management.

## Requirements
- Serverless Notebook Compute
- Assumes the Unity Catalog (UC) volume `hls_radiology.openslide-cs-cmu-edu.test-data` already exists (catalog `hls_radiology`, schema `openslide-cs-cmu-edu`, volume `test-data`)


In [0]:
%sh
# Fetch the OpenSlide DICOM test-data index (index.yaml) into the UC volume.
# Abort on the first failing command: without this, a failed `cd` would let
# curl write index.yaml into whatever directory the shell started in.
set -e

# Create the target directory on first run so the `cd` below cannot fail
# merely because nothing has been downloaded yet.
mkdir -p /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM
cd /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM
pwd
ls

# -f  : exit non-zero on HTTP errors instead of saving the error page as index.yaml
# -L  : follow redirects
# -sS : hide the progress meter but still print error messages
curl -fsSL -O https://openslide.cs.cmu.edu/download/openslide-testdata/DICOM/index.yaml

In [0]:
# %sh rm -rf /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/*

In [0]:
%sh
# List the current contents of the DICOM test-data directory.
ls /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/

In [0]:
import os
import requests
import tempfile
import yaml
import hashlib

# Local copy of the slide index (fetched by the earlier `%sh` cell) and the
# remote directory that the files listed in it are downloaded from.
path = "/Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/index.yaml"
base_url = "https://openslide.cs.cmu.edu/download/openslide-testdata/DICOM/"
base_path = os.path.dirname(path)

# Bytes per streamed chunk; one progress dot is printed per chunk.
chunk_size = 65536
# (connect, read) timeouts in seconds so a stalled server cannot hang the cell.
request_timeout = (10, 120)


def _download_verified(url, file_name, file_path, expected_hash):
    """Stream `url` to `file_path`, keeping the file only if its SHA-256 matches.

    The payload is written to a temporary file in the destination directory
    and hashed while it streams, so the data is read only once. The temporary
    file is renamed to `file_path` on a hash match and removed in every other
    case (hash mismatch, HTTP/network error, or any other exception).

    Args:
        url: Full URL of the file to download.
        file_name: Name used in progress and result messages.
        file_path: Final destination path of the verified file.
        expected_hash: Expected SHA-256 hex digest taken from the index.

    Returns:
        True if the file was downloaded and verified, False on a hash mismatch.

    Raises:
        requests.exceptions.RequestException: on HTTP or network failures.
    """
    tmp_path = None
    try:
        with requests.get(url, stream=True, timeout=request_timeout) as r:
            r.raise_for_status()
            sha256_hash = hashlib.sha256()
            # Same directory as the destination so the final rename stays on
            # one filesystem.
            with tempfile.NamedTemporaryFile(
                delete=False, dir=os.path.dirname(file_path)
            ) as tmp_f:
                tmp_path = tmp_f.name
                for chunk in r.iter_content(chunk_size=chunk_size):
                    tmp_f.write(chunk)
                    sha256_hash.update(chunk)
                    print(".", end="", flush=True)
        # Terminate the line of progress dots before printing the result.
        print()

        calculated_hash = sha256_hash.hexdigest()
        if calculated_hash != expected_hash:
            print(f"SHA256 hash mismatch for {file_name}: expected {expected_hash}, got {calculated_hash}")
            return False

        print(f"SHA256 hash matches for {file_name}")
        os.rename(tmp_path, file_path)
        tmp_path = None  # ownership moved to file_path; nothing left to clean up
        print(f"File '{file_name}' downloaded successfully.")
        return True
    finally:
        # Never leave a partial or unverified temp file behind in the volume.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)


with open(path) as file:
    # NOTE(review): index.yaml is fetched from the network and FullLoader is
    # more permissive than this plain mapping needs -- consider yaml.safe_load.
    index = yaml.load(file, Loader=yaml.FullLoader) or {}

# An empty index or a missing "slides" key yields no work instead of a crash.
slides = index.get("slides") or {}
if not slides:
    print(f"No slides listed in {path}")

for file_name, slide_info in slides.items():
    expected_hash = (slide_info or {}).get("sha256")
    file_path = os.path.join(base_path, file_name)
    # Files already present are skipped; their hash is not re-verified.
    if os.path.exists(file_path):
        continue
    print(f"Downloading {file_name} to {file_path}")
    try:
        _download_verified(base_url + file_name, file_name, file_path, expected_hash)
    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and carry on with the next slide.
        print(f"Error downloading file: {e}")


In [0]:
%sh
# Long listing, including hidden entries, of the DICOM directory to inspect
# the downloaded files (-H follows symlinks given on the command line).
ls -a -l -H /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM