In [None]:
from dandi.dandiapi import DandiAPIClient
import json
from urllib.parse import quote, unquote
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import botocore
import zarr
import s3fs
# Anonymous (anon=True) S3 filesystem handle, shared at module level;
# arrays3D_chunks_check passes it to s3fs.S3Map when opening each store.
fs = s3fs.S3FileSystem(anon=True)

## Define utility functions

In [None]:
# from dashboard
def assets_to_df(ds):
    """Tabulate the assets of a dandiset, one row per asset.

    The file name (last "/"-separated component of ``asset.path``) is cut at
    its first "."; the part before it is split on "_" and every piece that
    contains a "-" contributes a ``key -> value`` column (split at the first
    "-"). The following columns are then added, in this order:

    * ``subdir``    -- taken from the last parent directory starting with
                       "sub-" (omitted when there is none or it is empty)
    * ``path``      -- the full asset path
    * ``modality``  -- text between the last "_" and the first following "."
                       of the file name; only set when the file name contains
                       both "_" and "sub-" and a "/" occurs after the first
                       "sub-" in the path
    * ``extension`` -- everything after the first "." of the file name
    * ``modified``  -- ``asset.modified``

    Parameters
    ----------
    ds : object
        Must provide ``get_assets()``; each yielded asset must expose
        ``path`` (str) and ``modified`` attributes.

    Returns
    -------
    tuple
        ``(df, assets)`` where ``df`` is a ``pandas.DataFrame`` built from the
        per-asset records and ``assets`` is the materialised asset list.
    """
    assets = list(ds.get_assets())
    records = []
    for asset in assets:
        *parent_dirs, filename = asset.path.split("/")
        stem, _, extension = filename.partition(".")

        # Scan every parent directory; the last "sub-*" one wins.
        subject_dir = None
        for directory in parent_dirs:
            if directory.startswith("sub-"):
                subject_dir = directory.split("sub-")[1]

        # "key-value" pieces of the file name stem become columns.
        record = {}
        for entity in stem.split("_"):
            if "-" in entity:
                key, _, value = entity.partition("-")
                record[key] = value

        if subject_dir:
            record["subdir"] = subject_dir
        record["path"] = asset.path

        if "_" in filename and "sub-" in filename:
            after_first_sub = asset.path.partition("sub-")[2]
            # Only assets nested below some directory get a modality.
            if "/" in after_first_sub:
                record["modality"] = filename.rsplit("_", 1)[-1].partition(".")[0]

        record["extension"] = extension
        record["modified"] = asset.modified
        records.append(record)

    return pd.DataFrame(records), assets

In [None]:
def get_url(ds, subj, sample, stain, ses):
    """Return the store paths of the run-1 ``.ome.zarr`` chunks of one acquisition.

    Assets are looked up with ``ds.get_assets_by_glob`` using a pattern built
    from the subject, session, sample and stain, and ordered by the integer
    that follows ``_chunk-`` in each asset path.

    Parameters
    ----------
    ds : object
        Must provide ``get_assets_by_glob(pattern)``; each asset must expose
        ``path`` and ``get_content_url(regex=...)``.
    subj, sample, stain, ses
        Values interpolated into the glob pattern.

    Returns
    -------
    list of str
        One ``"dandiarchive/zarr/<id>/"`` entry per matching asset, where
        ``<id>`` is the second-to-last "/"-separated component of the asset's
        content URL requested with ``regex='s3'``.
    """
    pattern = f"*{subj}/ses-{ses}/*_sample-{sample}_stain-{stain}_run-1*.ome.zarr"

    def chunk_number(asset):
        # Integer between "_chunk-" and the next "_" in the asset path.
        return int(asset.path.split("_chunk-")[1].split("_")[0])

    matched = list(ds.get_assets_by_glob(pattern))
    matched.sort(key=chunk_number)

    sources = []
    for asset in matched:
        zarr_id = asset.get_content_url(regex='s3').split('/')[-2]
        sources.append(f"dandiarchive/zarr/{zarr_id}/")
    return sources


In [None]:
# Basic sanity checks on the arrays stored at a specific level of each zarr store
def arrays3D_chunks_check(url_list, level=6):
    """Load one level from every zarr store and run basic sanity checks on it.

    For each URL the store is opened through the module-level anonymous ``fs``
    filesystem and the ``[0, 0, :, :, :]`` slice of the array named
    ``/<level>`` is read. The slice is read from the store exactly once per
    URL and reused for both the returned list and the all-zeros check.

    Parameters
    ----------
    url_list : iterable of str
        Store roots, passed as ``root`` to ``s3fs.S3Map``.
    level : int, default 6
        Name of the array inside each group (``root[f"/{level}"]``).
        NOTE(review): presumably the OME-Zarr multiscale level with axes
        (t, c, z, y, x) -- confirm against the data.

    Returns
    -------
    arr_chunks : list
        The loaded slices, one per store that could be opened and that
        contains the requested level.
    arr_errors : dict
        ``"ClientError"``: positions in ``url_list`` whose group could not be
        opened because of ``botocore.exceptions.ClientError``;
        ``"No level <level>"``: positions where reading the level raised
        ``KeyError``; ``"Zeros"``: positions whose slice has a maximum of 0;
        ``"Different shapes"``: ``True`` when the loaded slices do not all
        share one shape, otherwise ``None``.
        All positions index ``url_list`` (not ``arr_chunks``).
    """
    missing_level_key = f"No level {level}"
    arr_chunks = []
    arr_errors = {"Zeros": [], "ClientError": [], missing_level_key: []}

    for ii, url in enumerate(url_list):
        store_chunk = s3fs.S3Map(root=url, s3=fs, check=False)
        try:
            root = zarr.group(store=store_chunk)
        except botocore.exceptions.ClientError:
            arr_errors["ClientError"].append(ii)
            continue

        try:
            # Single remote read; the result is reused below instead of
            # fetching the same slice from S3 a second time.
            volume = root[f"/{level}"][0, 0, :, :, :]
        except KeyError:
            arr_errors[missing_level_key].append(ii)
            continue

        arr_chunks.append(volume)
        if volume.max() == 0:
            arr_errors["Zeros"].append(ii)

    # True when shapes disagree, None (not False) otherwise.
    if len({arr.shape for arr in arr_chunks}) > 1:
        arr_errors["Different shapes"] = True
    else:
        arr_errors["Different shapes"] = None
    return arr_chunks, arr_errors

## Getting assets for a specific dandiset

In [None]:
# Dandiset whose assets are checked; switch to the commented value to inspect the other one.
dandiset = "000108"
# dandiset = "000026"

api = DandiAPIClient("https://api.dandiarchive.org/api")
ds = api.get_dandiset(dandiset)

# One row per asset, plus the raw asset objects.
df, assets = assets_to_df(ds)

# Dandiset-specific clean-up of the asset table.
if dandiset == "000108":
    # Normalise stain spellings to their upper-case abbreviations.
    remap = {"calretinin": "CR", "npy": "NPY"}

    def sample_to_int(x):
        """Convert a sample label to the integer preceding its first 'R'.

        Non-string NaN values are returned unchanged. Currently only used by
        the commented-out conversion below.
        """
        if not isinstance(x, str) and np.isnan(x):
            return x
        return int(x.split('R')[0])

    df.stain = df.stain.apply(lambda stain: remap.get(stain, stain))
    #df['sample'] = df['sample'].apply(sample_to_int).astype(pd.Int64Dtype())
if dandiset == "000026":
    # `path` is always a string (set for every asset), so these masks are plain booleans.
    in_derivatives = df.path.str.contains("derivatives")
    is_epic = df.path.str.contains("EPIC")
    is_ster = df.path.str.contains("STER")
    # Drop derivative assets unless their path mentions EPIC or STER.
    df = df[~(in_derivatives & ~is_epic & ~is_ster)]


## Running checks for level 6 data for all subjects, samples, stains and sessions

In [None]:
# For every subject: aggregate its assets per (sample, stain, ses), run the
# level-6 checks on each acquisition and save the result table as CSV.
for sub, _ in df.groupby("subdir"):
    print(f"subject: {sub}")
    # NOTE(review): subjects are enumerated from the "subdir" column but rows
    # are selected on the "sub" column -- confirm the two always agree.
    df_sub = df[(df["sub"] == sub)]
    # Columns are selected with a list: tuple selection on a GroupBy
    # (`["chunk", "path"]`) is rejected by pandas >= 2.0.
    df_sub_agg = df_sub.groupby(['sample', 'stain', 'ses'])[["chunk", "path"]].agg(list)
    # Keep the group keys available as regular columns as well as in the index.
    df_sub_agg = pd.concat((df_sub_agg.index.to_frame(), df_sub_agg), axis=1)

    # Results are collected per (sample, stain, ses) key and written into the
    # frame in one step afterwards. Assigning through chained
    # `.loc[...].loc[...].loc[...]["errors"]` only modifies a temporary copy,
    # which left the "errors" column empty in the saved CSV.
    errors_by_key = {}
    samples = set(df_sub_agg["sample"].tolist())
    print("all samples: ", samples)
    for ii, sample in enumerate(samples):
        print("SAMPLE: ", ii, sample)
        df_sample = df_sub_agg.loc[sample]
        for stain in set(df_sample["stain"].tolist()):
            for ses in df_sample.loc[stain]["ses"].tolist():
                url_list = get_url(ds, sub, sample, stain, ses)
                _, arrays_lev6_errors = arrays3D_chunks_check(url_list, level=6)
                print("errors", stain, arrays_lev6_errors)
                errors_by_key[(sample, stain, ses)] = arrays_lev6_errors

    # Rows without a recorded result keep None, as in the initial column.
    df_sub_agg["errors"] = [errors_by_key.get(key) for key in df_sub_agg.index]

    df_sub_agg.to_csv(f'dferr_sub-{sub}.csv')