## Imports

This file should be located in the root directory of `single-cell-data-portal`.

In [None]:
import os
import subprocess

from backend.wmg.data.utils import get_datasets_from_curation_api
from backend.wmg.pipeline.cube_pipeline import load_data_and_create_cube
from backend.wmg.pipeline.integrated_corpus.extract import get_dataset_asset_urls

## Set environment variables
These variables are used for retrieving dataset assets.

In [None]:
# Environment used when retrieving dataset assets; both values target staging.
# NOTE(review): these are set after the `backend` imports in the first cell —
# confirm nothing in `backend` reads them at import time.
%env DEPLOYMENT_STAGE=staging
%env API_URL=https://api.cellxgene.staging.single-cell.czi.technology

## Set parameters

In [None]:
# Stop downloading once this many datasets have been fetched; datasets skipped
# by the size filter do not count towards the total.
NUM_DATASETS = 3

# Folder the datasets are downloaded into (one sub-folder per dataset id).
DATASET_FOLDER_NAME = "pipeline_test_datasets"

# Size limit in GB (bytes / 1e9): a dataset is skipped when any of its H5AD
# assets is larger than this.
MAX_GB_FILESIZE = 0.1

## Create datasets directory

In [None]:
# Create the download folder (and any missing parents) in-process instead of
# shelling out to `mkdir -p`: this is portable, is not subject to shell parsing
# of the folder name, and raises on failure rather than ignoring the exit code.
# An already existing folder is fine (exist_ok=True).
os.makedirs(DATASET_FOLDER_NAME, exist_ok=True)

## Get dataset asset URLs and metadata
The metadata will be used to filter out datasets that are too large to be processed locally on a laptop.

In [None]:
# Mapping of dataset id -> asset URL (consumed via .items() by the download cell).
dataset_urls = get_dataset_asset_urls()

# Metadata records from the curation API, indexed by their 'dataset_id' so the
# download cell can look up each dataset's assets and file sizes.
datasets = get_datasets_from_curation_api()
datasets_by_ids = {record['dataset_id']: record for record in datasets}

print(len(dataset_urls), 'datasets')

## Download the first `NUM_DATASETS` datasets

In [None]:
# Download datasets, in the order of `dataset_urls`, until NUM_DATASETS of them
# have been fetched. Each one is saved as
# <DATASET_FOLDER_NAME>/<dataset_id>/local.h5ad. Datasets with any H5AD asset
# larger than MAX_GB_FILESIZE are skipped and do not count towards the total.
#
# Raises:
#   KeyError: a dataset id in `dataset_urls` has no entry in `datasets_by_ids`.
#   subprocess.CalledProcessError: curl exited with a non-zero status.
#   FileNotFoundError: curl is not installed / not on PATH.
num_datasets_downloaded = 0
for dataset_id, dataset_url in dataset_urls.items():
    # Check the limit before doing any work so that NUM_DATASETS <= 0
    # downloads nothing instead of downloading every dataset.
    if num_datasets_downloaded >= NUM_DATASETS:
        break

    print(dataset_id)

    dataset = datasets_by_ids[dataset_id]

    # File sizes are in bytes; dividing by 1e9 converts them to GB.
    too_big = any(
        asset['filetype'] == 'H5AD' and asset['filesize'] / 1e9 > MAX_GB_FILESIZE
        for asset in dataset['assets']
    )
    if too_big:
        print(f"Dataset bigger than {MAX_GB_FILESIZE}GB, skipping")
        continue

    dataset_dir = os.path.join(DATASET_FOLDER_NAME, dataset_id)
    os.makedirs(dataset_dir, exist_ok=True)

    # Pass arguments as a list (no shell) so characters such as '&' or '?' in
    # the URL are not interpreted by a shell. `--fail` makes curl exit non-zero
    # on HTTP errors instead of saving the error page as the .h5ad file,
    # `--location` follows redirects, and check=True surfaces any failure
    # rather than counting a broken download as a success.
    subprocess.run(
        [
            "curl",
            "--fail",
            "--location",
            "-o",
            os.path.join(dataset_dir, "local.h5ad"),
            dataset_url,
        ],
        check=True,
    )
    num_datasets_downloaded += 1

## Run the pipeline on the downloaded datasets
If you are running on macOS, you might not be able to install `pygraphviz` with `pip`. If you're using conda, you can install it with `conda install -c conda-forge pygraphviz`.

At the end, the pipeline will attempt to upload the cube to the WMG S3 bucket. This upload will fail, which is expected. Load the snapshot locally to explore its contents and assess whether the pipeline was successful. To learn how to load the snapshot locally, take a look at `example_dev_notebooks/local_endpoint_runner.ipynb`.

In [None]:
# Run the cube pipeline against the folder populated by the download cell.
# NOTE(review): extract_data=False presumably skips the pipeline's own dataset
# download, and validate_cube=False presumably skips cube validation — confirm
# against load_data_and_create_cube.
corpus_path, stats = load_data_and_create_cube(
    DATASET_FOLDER_NAME,
    extract_data=False,
    validate_cube=False,
)