In [8]:
from dandi.dandiapi import DandiAPIClient
import json
from urllib.parse import quote, unquote
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
import botocore
import zarr
import s3fs
# Anonymous (anon=True) S3 filesystem handle; arrays3D_chunks_check passes it
# to s3fs.S3Map as the backing filesystem for each zarr store it opens.
fs = s3fs.S3FileSystem(anon=True)

## Define utility functions

In [10]:
# from dashboard
def assets_to_df(ds):
    """Tabulate the assets of a dandiset, one row per asset.

    Each asset's file name is split on "_" and every "key-value" token in the
    part before the first "." becomes a column (for example "sample-170" gives
    column "sample" with value "170").  Additional columns:

    * "subdir"    - text after "sub-" of the last parent directory that starts
                    with "sub-" (only set when such a directory exists)
    * "path"      - the asset path as reported by the asset object
    * "modality"  - last "_"-separated token of the file name (without
                    extension); only set for "sub-" files that live inside at
                    least one directory after the first "sub-" in the path
    * "extension" - everything after the first "." of the file name
    * "modified"  - the asset's ``modified`` attribute

    Parameters
    ----------
    ds : object exposing ``get_assets()``; each yielded asset must provide
        ``path`` and ``modified`` attributes.

    Returns
    -------
    (pandas.DataFrame, list)
        The table described above and the list of asset objects, in the order
        returned by ``ds.get_assets()``.
    """
    assets = list(ds.get_assets())
    records = []
    for asset in assets:
        *directories, filename = asset.path.split("/")

        # When several parent directories start with "sub-", the last one wins.
        subject_dir = None
        for directory in directories:
            if directory.startswith("sub-"):
                subject_dir = directory.split("sub-")[1]

        # Key/value entities come first so they lead the column order.
        record = {}
        stem = filename.split(".")[0]
        for token in stem.split("_"):
            if "-" in token:
                key, _, value = token.partition("-")
                record[key] = value

        if subject_dir:
            record["subdir"] = subject_dir
        record["path"] = asset.path

        if "_" in filename and "sub-" in filename:
            # Portion of the path following the first "sub-" marker; a "/" in
            # it means the file sits inside a directory below that marker.
            after_first_sub = asset.path.partition("sub-")[2]
            if "/" in after_first_sub:
                record["modality"] = filename.split("_")[-1].split(".")[0]

        record["extension"] = filename.partition(".")[2]
        record["modified"] = asset.modified
        records.append(record)

    return pd.DataFrame(records), assets

In [11]:
def get_url(ds, subj, sample, stain, ses):
    """List zarr store locations for one subject/session/sample/stain.

    Assets are looked up with the glob
    ``*{subj}/ses-{ses}/*_sample-{sample}_stain-{stain}_run-1*.ome.zarr``
    and ordered by the integer that follows "_chunk-" in their path.

    Parameters
    ----------
    ds : object exposing ``get_assets_by_glob(pattern)``; each yielded asset
        must provide ``path`` and ``get_content_url(regex=...)``.
    subj, sample, stain, ses : values interpolated into the glob pattern.

    Returns
    -------
    list of str
        One ``"dandiarchive/zarr/<id>/"`` entry per matching asset, where
        ``<id>`` is the second-to-last "/"-separated piece of the asset's
        content URL selected with ``regex='s3'``.
    """
    def chunk_number(asset):
        # Numeric (not lexicographic) ordering, so chunk-10 sorts after chunk-2.
        return int(asset.path.split("_chunk-")[1].split("_")[0])

    pattern = f"*{subj}/ses-{ses}/*_sample-{sample}_stain-{stain}_run-1*.ome.zarr"
    matches = list(ds.get_assets_by_glob(pattern))
    matches.sort(key=chunk_number)

    sources = []
    for asset in matches:
        zarr_id = asset.get_content_url(regex='s3').split('/')[-2]
        sources.append(f"dandiarchive/zarr/{zarr_id}/")
    return sources


In [12]:
# checking basic things for arrays from a specific level 
def arrays3D_chunks_check(url_list, level=6):
    """Load one resolution level from every zarr store and flag basic problems.

    For each URL the store is opened through the module-level ``fs`` handle,
    the array stored under ``/{level}`` is indexed with ``[0, 0, :, :, :]``
    (presumably a 5D OME-Zarr array whose last three axes are spatial - TODO
    confirm) and the resulting 3D array is collected.

    Parameters
    ----------
    url_list : iterable of str
        Store roots understood by ``s3fs.S3Map``.
    level : int, default 6
        Name of the array inside each store (looked up as ``f"/{level}"``).

    Returns
    -------
    (list, dict)
        * the arrays that could be read, in input order (stores that failed
          are skipped, so positions may not line up with ``url_list``);
        * a report with keys
          ``"Zeros"`` - positions in ``url_list`` whose array maximum is 0,
          ``"ClientError"`` - positions whose store could not be opened,
          ``f"No level {level}"`` - positions where the level lookup raised
          ``KeyError``,
          ``"Different shapes"`` - ``True`` if the collected arrays do not all
          share one shape, otherwise ``None``.
    """
    missing_level_key = f"No level {level}"
    arr_chunks = []
    arr_errors = {"Zeros": [], "ClientError": [], missing_level_key: []}

    for ii, url in enumerate(url_list):
        store_chunk = s3fs.S3Map(root=url, s3=fs, check=False)
        try:
            # NOTE(review): zarr.group() is documented as a group *creation*
            # call; zarr.open_group(store, mode="r") may be the intended
            # read-only open - confirm before changing, since the exception
            # types caught here depend on it.
            root = zarr.group(store=store_chunk)
        except (botocore.exceptions.ClientError, zarr.errors.ReadOnlyError, KeyError):
            arr_errors["ClientError"].append(ii)
            continue

        try:
            # Fetch the data once and reuse it below; the array was previously
            # read from the remote store a second time just to compute max().
            arr = root[f"/{level}"][0, 0, :, :, :]
        except KeyError:
            arr_errors[missing_level_key].append(ii)
            continue

        arr_chunks.append(arr)
        if arr.max() == 0:
            arr_errors["Zeros"].append(ii)

    distinct_shapes = {arr.shape for arr in arr_chunks}
    arr_errors["Different shapes"] = True if len(distinct_shapes) > 1 else None
    return arr_chunks, arr_errors

## Getting assets for a specific dandiset

In [13]:
dandiset = "000108"
# dandiset = "000026"

api = DandiAPIClient("https://api.dandiarchive.org/api")
ds = api.get_dandiset(dandiset)

# One row per asset of the selected dandiset, plus the raw asset objects.
df, assets = assets_to_df(ds)

# Make dandiset specific alterations to dataframe
if dandiset == "000108":
    # Normalise stain spellings; values not listed here are kept as they are.
    remap = dict(calretinin='CR', npy='NPY')

    def sample_to_int(x):
        """Return the integer before the first 'R' of a sample label.

        Non-string values (e.g. NaN for assets without a sample entity) are
        returned unchanged instead of being passed to ``str.split``.
        """
        if isinstance(x, str):
            return int(x.split('R')[0])
        return x

    df["stain"] = df["stain"].map(lambda x: remap.get(x, x))
    #df['sample'] = df['sample'].apply(sample_to_int).astype(pd.Int64Dtype())
elif dandiset == "000026":
    # Drop assets under "derivatives" unless their path also mentions EPIC or
    # STER; everything outside "derivatives" is kept.
    in_derivatives = df["path"].str.contains("derivatives", na=False)
    is_epic_or_ster = (df["path"].str.contains("EPIC", na=False)
                       | df["path"].str.contains("STER", na=False))
    df = df[~in_derivatives | is_epic_or_ster]


## Running checks on level 6 data for all subjects, samples, stains, and sessions

In [14]:
# For every subject: group its assets by (sample, stain, ses), run the level-6
# checks on each group's chunk stores, and save the per-group report as CSV.
for sub, _ in df.groupby("subdir"):
    print(f"subject: {sub}")
    df_sub = df[(df["sub"] == sub)]
    # Columns are selected with a list: the tuple form groupby(...)["chunk", "path"]
    # is deprecated and is rejected by newer pandas releases.
    df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])[["chunk", "path"]].agg(list)
    # Repeat the index levels as ordinary columns so they can be read by name.
    df_sub_agg = pd.concat((df_sub_agg.index.to_frame(), df_sub_agg), axis=1)

    # Reports are collected by (sample, stain, ses) and attached as a column at
    # the end.  Assigning through chained .loc[...].loc[...][...] writes into a
    # temporary copy, which left the "errors" column empty in the saved file.
    errors_by_key = {}
    samples = set(df_sub_agg["sample"].tolist())
    print("all samples: ", samples)
    for ii, sample in enumerate(samples):
        print("SAMPLE: ", ii, sample)
        df_sample = df_sub_agg.loc[sample]
        for stain in set(df_sample["stain"].tolist()):
            for ses in df_sample.loc[stain]["ses"].tolist():
                url_list = get_url(ds, sub, sample, stain, ses)
                _, arrays_lev6_errors = arrays3D_chunks_check(url_list, level=6)
                print("errors", stain, arrays_lev6_errors)
                errors_by_key[(sample, stain, ses)] = arrays_lev6_errors

    # Index entries are (sample, stain, ses) tuples; groups that were never
    # checked keep None, matching the previous default.
    df_sub_agg["errors"] = [errors_by_key.get(key) for key in df_sub_agg.index]

    df_sub_agg.to_csv(f'dferr_sub-{sub}.csv')

subject: MITU01
all samples:  {'170', '67', '11', '153', '32', '44', '166', '58', '116', '126', '151', '100', '123', '97', '20', '56', '53', '88', '85', '19', '92', '41', '164', '154', '127', '114', '102', '39', '40', '160', '60', '121', '156', '118', '86', '30', '57', '64', '125', '101', '6', '10', '112', '150', '152', '163', '90', '120', '83', '107', '104', '178', '26', '9', '22', '72', '25', '109', '13', '169', '59', '28', '98', '105', '99', '33', '167', '113', '157', '94', '128', '8', '103', '81', '159', '21', '168', '65', '172', '38', '117', '37', '165', '147', '73', '110', '17', '12', '162', '50', '124', '68', '179', '77', '42', '108', '16', '176R2', '161', '155', '158', '122', '150R2', '76', '91', '34', '23', '62', '115', '69', '15', '24', '36'}
SAMPLE:  0 170


  df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])["chunk", "path"].agg(list)


errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  1 67
errors YO {'Zeros': [4], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [3], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  2 11
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': True}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': True}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  3 153
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': N

  df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])["chunk", "path"].agg(list)


errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  1 16
errors IBA1 {'Zeros': [], 'ClientError': [], 'No level 6': [4], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [0], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [7], 'Different shapes': None}
SAMPLE:  2 12
errors IBA1 {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  3 10
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [0], 'Different shapes'

  df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])["chunk", "path"].agg(list)


errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  1 28
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors NN {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  2 8
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors LEC {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {

  df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])["chunk", "path"].agg(list)


errors GFAP {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors PV {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  1 mEhmAD03x15R2
errors NPY {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors SST {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  2 mEhm11206x15R3
errors NPY {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors SST {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  3 mEhm11206x15R2
errors ABETA {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors IBA1 {'Zeros': [], 'ClientError': [],

  df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])["chunk", "path"].agg(list)


errors GFAP {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors PV {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  1 ADO6R6
errors ABETA {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors CD31 {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  2 ADO6R3
errors NPY {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors SST {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
SAMPLE:  3 controlO6R4
errors GFAP {'Zeros': [], 'ClientError': [], 'No level 6': [], 'Different shapes': None}
errors YO {'Zeros': [], 'ClientError': [], 'No level 6': [],