## Imports

This notebook should be located in the root directory of the `single-cell-data-portal` repository so that the `backend` imports below resolve.

In [3]:
import os
import subprocess

from backend.wmg.pipeline.cube_pipeline import load_data_and_create_cube
from backend.wmg.pipeline.integrated_corpus.extract import get_dataset_asset_urls
from backend.wmg.data.utils import get_datasets_from_curation_api



## Set environment variables
These variables are used for retrieving dataset assets.

In [4]:
# Point the notebook at the staging deployment; these variables are used for
# retrieving dataset assets. Keep comments on their own lines: anything placed
# after the value on a %env line would become part of the value.
%env DEPLOYMENT_STAGE=staging
%env API_URL=https://api.cellxgene.staging.single-cell.czi.technology

env: DEPLOYMENT_STAGE=staging
env: API_URL=https://api.cellxgene.staging.single-cell.czi.technology


## Set parameters

In [6]:
# Stop once this many datasets have been downloaded.
NUM_DATASETS = 3

# Folder the downloaded datasets are written into (one sub-folder per dataset id).
DATASET_FOLDER_NAME = "pipeline_test_datasets"

# Size limit in GB (1e9 bytes): datasets whose H5AD asset is larger are skipped.
MAX_GB_FILESIZE = 0.1

## Create datasets directory

In [11]:
# Create the download folder (including missing parents) without spawning a shell.
# exist_ok=True keeps the cell idempotent: re-running it on an existing folder is a no-op.
os.makedirs(DATASET_FOLDER_NAME, exist_ok=True)

## Get dataset asset URLs and metadata
The metadata will be used to filter out datasets that are too large to be processed locally on a laptop.

In [10]:
# Asset URLs, later iterated as (dataset id, URL) pairs.
dataset_urls = get_dataset_asset_urls()

# Dataset metadata records; each record carries its own 'dataset_id'.
datasets = get_datasets_from_curation_api()

# Index the metadata by dataset id so the download loop can look up asset sizes.
datasets_by_ids = {dataset['dataset_id']: dataset for dataset in datasets}

print(f"{len(dataset_urls)} datasets")

## Download the first `NUM_DATASETS` datasets that are under the size limit

In [12]:
# Download up to NUM_DATASETS datasets whose H5AD asset is within MAX_GB_FILESIZE.
# Each dataset is saved as <DATASET_FOLDER_NAME>/<dataset_id>/local.h5ad.
num_datasets_downloaded = 0
for dataset_id, dataset_url in dataset_urls.items():
    # Checked before doing any work so that NUM_DATASETS = 0 downloads nothing.
    if num_datasets_downloaded >= NUM_DATASETS:
        break

    print(dataset_id)

    dataset = datasets_by_ids.get(dataset_id)
    if dataset is None:
        # Without metadata the file size cannot be checked; skip instead of raising KeyError.
        print("No metadata found for dataset, skipping")
        continue

    too_big = any(
        asset['filetype'] == 'H5AD' and asset['filesize'] / 1e9 > MAX_GB_FILESIZE
        for asset in dataset['assets']
    )
    if too_big:
        print(f"Dataset bigger than {MAX_GB_FILESIZE}GB, skipping")
        continue

    dataset_dir = os.path.join(DATASET_FOLDER_NAME, dataset_id)
    h5ad_path = os.path.join(dataset_dir, "local.h5ad")
    os.makedirs(dataset_dir, exist_ok=True)

    # curl is invoked with an argument list (no shell), so characters such as
    # '&' or '?' in the URL are passed through verbatim instead of being
    # interpreted by a shell.
    # --fail: exit non-zero on HTTP errors instead of saving the error page as the .h5ad.
    # --location: follow redirects to the actual asset.
    result = subprocess.run(
        ["curl", "--fail", "--location", "-o", h5ad_path, str(dataset_url)]
    )
    if result.returncode != 0:
        print(f"Download failed (curl exit code {result.returncode}), skipping")
        # Remove any partial file and the then-empty folder so the download
        # folder only ever contains complete datasets.
        if os.path.exists(h5ad_path):
            os.remove(h5ad_path)
        if not os.listdir(dataset_dir):
            os.rmdir(dataset_dir)
        continue

    # Only successful downloads count towards NUM_DATASETS.
    num_datasets_downloaded += 1

6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
Dataset bigger than 0.1GB, skipping
ebaaa22f-fbaa-4a9c-aa62-b938a4e2d319
Dataset bigger than 0.1GB, skipping
ab326369-b63c-48d8-8a7f-82ffa0df7002
Dataset bigger than 0.1GB, skipping
74cff64f-9da9-4b2a-9b3b-8a04a1598040
Dataset bigger than 0.1GB, skipping
5af90777-6760-4003-9dba-8f945fec6fdf
Dataset bigger than 0.1GB, skipping
bd65a70f-b274-4133-b9dd-0d1431b6af34
Dataset bigger than 0.1GB, skipping
96d4a4cd-db8e-4320-af10-a7d9418bc647
Dataset bigger than 0.1GB, skipping
137ca42b-33b3-48fa-ae80-02a0747027cb
Dataset bigger than 0.1GB, skipping
c05e6940-729c-47bd-a2a6-6ce3730c4919
Dataset bigger than 0.1GB, skipping
965386e9-1e4f-466d-bf59-ebdca4b66b9b
Dataset bigger than 0.1GB, skipping
7f08cbcc-5790-4576-8478-10e9a999b316
Dataset bigger than 0.1GB, skipping
d7291f04-fbbb-4d65-990a-f01fa44e915b
Dataset bigger than 0.1GB, skipping
9d8e5dca-03a3-457d-b7fb-844c75735c83
Dataset bigger than 0.1GB, skipping
842c6f5d-4a94-4eef-8510-8c792d1124bc
Dataset bigger

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 56.3M  100 56.3M    0     0   836k      0  0:01:09  0:01:09 --:--:--  831k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

be39785b-67cb-4177-be19-a40ee3747e45
Dataset bigger than 0.1GB, skipping
4c6f9f26-5470-455b-8933-c408232fbf56


100 39.3M  100 39.3M    0     0   834k      0  0:00:48  0:00:48 --:--:--  803k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

0ba636a1-4754-4786-a8be-7ab3cf760fd6
Dataset bigger than 0.1GB, skipping
3f32121d-126b-4e8d-9f69-d86502d2a1b1
Dataset bigger than 0.1GB, skipping
a13bda79-9134-46c9-9ed1-a2858be9aafe
Dataset bigger than 0.1GB, skipping
535e9336-2d8d-43c3-944d-bcbebe20df8a
Dataset bigger than 0.1GB, skipping
18e2a8c5-33f7-455e-a58a-b2ba6921db27
Dataset bigger than 0.1GB, skipping
12967895-3d58-4e93-be2c-4e1bcf4388d5
Dataset bigger than 0.1GB, skipping
ee195b7d-184d-4dfa-9b1c-51a7e601ac11


100 33.7M  100 33.7M    0     0   825k      0  0:00:41  0:00:41 --:--:--  860k


## Run the pipeline on the downloaded datasets
The pipeline requires `pygraphviz`. On macOS, installing it with `pip` may fail; if you're using conda, you can install it with `conda install -c conda-forge pygraphviz`.

At the end, the pipeline will attempt to upload the cube to the WMG S3 bucket. This upload is expected to fail. Load the snapshot locally to explore its contents and assess whether the pipeline was successful. To learn how to load the snapshot locally, take a look at `example_dev_notebooks/local_endpoint_runner.ipynb`.

In [7]:
# Run the pipeline on the datasets downloaded into DATASET_FOLDER_NAME.
# NOTE(review): extract_data=False presumably tells the pipeline to use the
# local files instead of fetching datasets itself, and validate_cube=False
# presumably skips validation of the resulting cube; confirm against
# load_data_and_create_cube.
corpus_path, stats = load_data_and_create_cube(
    DATASET_FOLDER_NAME,
    extract_data=False,
    validate_cube=False,
)

INFO:backend.wmg.pipeline.integrated_corpus.job:Processing dataset 1 of 3
INFO:backend.wmg.pipeline.integrated_corpus.job:h5ad_file_path='pipeline_test_datasets/f801b7a9-80a6-4d09-9161-71474deb58ae/local.h5ad'
INFO:backend.wmg.pipeline.integrated_corpus.extract:Extracting pipeline_test_datasets/f801b7a9-80a6-4d09-9161-71474deb58ae/local.h5ad...
INFO:backend.wmg.pipeline.integrated_corpus.transform:Applying filters: assay, and lowly-covered cells
INFO:backend.wmg.pipeline.integrated_corpus.transform:Obtaining high-level tissues
INFO:backend.wmg.pipeline.integrated_corpus.job:loaded: shape=(5766, 32922)
INFO:backend.wmg.pipeline.integrated_corpus.load:Adding 32922 gene records...
INFO:backend.wmg.pipeline.integrated_corpus.load:Global var index length: (32922, 4)
INFO:backend.wmg.pipeline.integrated_corpus.load:Function load_dataset executed in 2.7785s
INFO:backend.wmg.pipeline.integrated_corpus.job:Processing dataset 2 of 3
INFO:backend.wmg.pipeline.integrated_corpus.job:h5ad_file_path=

INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0000576, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0000623, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0000650, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0000653, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0000669, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell type: CL:0001061, organism: NCBITaxon:9606
INFO:backend.wmg.pipeline.summary_cubes.marker_genes:Calculating markers for tissue: UBERON:0002113, cell 

ModuleNotFoundError: No module named 'pygraphviz'