In this notebook, I explore the primary data and create summary tables (saved as CSV) so that the data can be accessed easily.

In [41]:
%load_ext autoreload
%autoreload 2
import os
from pathlib import Path
from natsort import natsorted
import pandas as pd
import zarr

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def get_zarr_files(data_dir, from_csv=True):
    """
    Explores the image data and returns a dataframe describing every zarr store.

    Args:
        data_dir: root directory of the dataset (a string or a path-like object).
        from_csv: if True, read the cached table from "zarr_files.csv" inside
            data_dir instead of scanning the directory tree.

    Returns:
        A dataframe with the following columns:
        - path: relative path to the zarr file
        - split: train, or test
        - file_type: "denoised.zarr", "ctfdeconvolved.zarr", "isonetcorrected.zarr" or "wbp.zarr"
        - spacing: "VoxelSpacing10.000"
        - experiment: "TS_5_4", "TS_6_4", "TS_69_2", "TS_6_6", "TS_73_6", "TS_86_3" or "TS_99_9"
    """
    # Accept plain strings as well as Path objects; the "/" joins below need a Path.
    data_dir = Path(data_dir)
    if from_csv:
        return pd.read_csv(data_dir / "zarr_files.csv")
    # Natural sort so that e.g. TS_6_4 comes before TS_69_2.
    relative_paths = [path.relative_to(data_dir) for path in natsorted(data_dir.rglob('*.zarr'))]
    files = pd.DataFrame([str(path) for path in relative_paths], columns=["path"])
    # The remaining columns are read off the path components:
    # <split>/.../<experiment>/<spacing>/<file_type>
    files["split"] = [path.parts[0] for path in relative_paths]
    files["file_type"] = [path.parts[-1] for path in relative_paths]
    files["spacing"] = [path.parts[-2] for path in relative_paths]
    files["experiment"] = [path.parts[-3] for path in relative_paths]
    return files

In [None]:
# Root of the dataset, taken from the DATA_DIR environment variable.
data_dir = Path(os.environ['DATA_DIR'])

# Build the cached table only when it does not exist yet, so that the notebook
# also runs from a fresh kernel where the CSV was never created.
# Delete the CSV to force a rebuild.
first_time = not (data_dir / "zarr_files.csv").exists()
if first_time:
    zarr_files = get_zarr_files(data_dir, from_csv=False)
    zarr_files.to_csv(data_dir / "zarr_files.csv", index=False)

# Always read the table back from the CSV so that every run sees the same data.
zarr_files = get_zarr_files(data_dir)
zarr_files

Unnamed: 0,path,split,file_type,spacing,experiment
0,test/static/ExperimentRuns/TS_5_4/VoxelSpacing...,test,denoised.zarr,VoxelSpacing10.000,TS_5_4
1,test/static/ExperimentRuns/TS_6_4/VoxelSpacing...,test,denoised.zarr,VoxelSpacing10.000,TS_6_4
2,test/static/ExperimentRuns/TS_69_2/VoxelSpacin...,test,denoised.zarr,VoxelSpacing10.000,TS_69_2
3,train/static/ExperimentRuns/TS_5_4/VoxelSpacin...,train,ctfdeconvolved.zarr,VoxelSpacing10.000,TS_5_4
4,train/static/ExperimentRuns/TS_5_4/VoxelSpacin...,train,denoised.zarr,VoxelSpacing10.000,TS_5_4
5,train/static/ExperimentRuns/TS_5_4/VoxelSpacin...,train,isonetcorrected.zarr,VoxelSpacing10.000,TS_5_4
6,train/static/ExperimentRuns/TS_5_4/VoxelSpacin...,train,wbp.zarr,VoxelSpacing10.000,TS_5_4
7,train/static/ExperimentRuns/TS_6_4/VoxelSpacin...,train,ctfdeconvolved.zarr,VoxelSpacing10.000,TS_6_4
8,train/static/ExperimentRuns/TS_6_4/VoxelSpacin...,train,denoised.zarr,VoxelSpacing10.000,TS_6_4
9,train/static/ExperimentRuns/TS_6_4/VoxelSpacin...,train,isonetcorrected.zarr,VoxelSpacing10.000,TS_6_4


In [None]:
# Number of zarr files listed for each value of the "split" column.
zarr_files["split"].value_counts()

split
train    28
test      3
Name: count, dtype: int64

In [None]:
# Number of zarr files listed for each experiment.
zarr_files["experiment"].value_counts()

experiment
TS_5_4     5
TS_6_4     5
TS_69_2    5
TS_6_6     4
TS_73_6    4
TS_86_3    4
TS_99_9    4
Name: count, dtype: int64

In [None]:
# Number of zarr files listed for each file type (last path component).
zarr_files["file_type"].value_counts()

file_type
denoised.zarr           10
ctfdeconvolved.zarr      7
isonetcorrected.zarr     7
wbp.zarr                 7
Name: count, dtype: int64

## Non-zarr files

In [None]:
def list_files_excluding_zarr(root_dir):
    """
    Walk root_dir recursively and yield every file as a Path,
    without descending into directories whose name ends with ".zarr".
    """
    for current_dir, subdirs, names in os.walk(Path(root_dir)):
        # os.walk only honours pruning done in place on the list it handed out
        kept = list(filter(lambda name: not name.endswith('.zarr'), subdirs))
        subdirs[:] = kept

        # Files that live directly in the directory being visited
        parent = Path(current_dir)
        yield from (parent / name for name in names)

In [7]:
def get_json_files(data_dir, from_csv=True):
    """
    Lists the JSON files found under "train/overlay/ExperimentRuns/".

    Args:
        data_dir: root directory of the dataset (a string or a path-like object).
        from_csv: if True, read the cached table from "json_files.csv" inside
            data_dir instead of scanning the directory tree.

    Returns:
        A dataframe with the following columns:
        - path: relative path to the json file
        - split: train, or test
        - experiment: "TS_5_4", "TS_6_4", "TS_69_2", "TS_6_6", "TS_73_6", "TS_86_3" or "TS_99_9"
        - file_type: "ribosome.json", "virus-like-particle.json", "beta-galactosidase.json",
            "beta-amylase.json", "apo-ferritin.json", or "thyroglobulin.json"
    """
    # Accept plain strings as well as Path objects; the "/" joins below need a Path.
    data_dir = Path(data_dir)
    if from_csv:
        return pd.read_csv(data_dir / "json_files.csv")
    overlay_root = data_dir / "train/overlay/ExperimentRuns/"
    # Keep only *.json files: any other stray file would break the
    # path-component parsing below and does not belong in this table.
    relative_paths = [
        path.relative_to(data_dir)
        for path in list_files_excluding_zarr(overlay_root)
        if path.suffix == ".json"
    ]
    files = pd.DataFrame([str(path) for path in relative_paths], columns=["path"])
    # Path layout: <split>/.../<experiment>/<directory>/<file_type>
    files["split"] = [path.parts[0] for path in relative_paths]
    files["experiment"] = [path.parts[-3] for path in relative_paths]
    files["file_type"] = [path.parts[-1] for path in relative_paths]
    return files

In [8]:
# Build the cached table only when it does not exist yet, so that the notebook
# also runs from a fresh kernel where the CSV was never created.
# Delete the CSV to force a rebuild.
first_time = not (data_dir / "json_files.csv").exists()
if first_time:
    json_files = get_json_files(data_dir, from_csv=False)
    json_files.to_csv(data_dir / "json_files.csv", index=False)
# Always read the table back from the CSV so that every run sees the same data.
json_files = get_json_files(data_dir)
json_files

Unnamed: 0,path,split,experiment,file_type
0,train/overlay/ExperimentRuns/TS_5_4/Picks/ribo...,train,TS_5_4,ribosome.json
1,train/overlay/ExperimentRuns/TS_5_4/Picks/viru...,train,TS_5_4,virus-like-particle.json
2,train/overlay/ExperimentRuns/TS_5_4/Picks/beta...,train,TS_5_4,beta-galactosidase.json
3,train/overlay/ExperimentRuns/TS_5_4/Picks/beta...,train,TS_5_4,beta-amylase.json
4,train/overlay/ExperimentRuns/TS_5_4/Picks/apo-...,train,TS_5_4,apo-ferritin.json
5,train/overlay/ExperimentRuns/TS_5_4/Picks/thyr...,train,TS_5_4,thyroglobulin.json
6,train/overlay/ExperimentRuns/TS_99_9/Picks/rib...,train,TS_99_9,ribosome.json
7,train/overlay/ExperimentRuns/TS_99_9/Picks/vir...,train,TS_99_9,virus-like-particle.json
8,train/overlay/ExperimentRuns/TS_99_9/Picks/bet...,train,TS_99_9,beta-galactosidase.json
9,train/overlay/ExperimentRuns/TS_99_9/Picks/bet...,train,TS_99_9,beta-amylase.json


In [None]:
# Number of JSON files listed for each file name.
json_files.file_type.value_counts()

file_type
ribosome.json               7
virus-like-particle.json    7
beta-galactosidase.json     7
beta-amylase.json           7
apo-ferritin.json           7
thyroglobulin.json          7
Name: count, dtype: int64

In [16]:
def get_experiment_paths(data_dir, experiment):
    """
    Builds the paths of the zarr images and JSON files of one training experiment.

    The paths are only constructed, not checked for existence.

    Args:
        data_dir: root directory of the dataset (a string or a path-like object).
        experiment: name of the experiment directory, e.g. "TS_5_4".

    Returns a dictionary with the following keys:
        - images: dictionary with the following keys:
            - denoised: path to the denoised zarr file
            - ctfdeconvolved: path to the ctfdeconvolved zarr file
            - isonetcorrected: path to the isonetcorrected zarr file
            - wbp: path to the wbp zarr file
        - jsons: dictionary with the following keys:
            - ribosome: path to the ribosome json file
            - virus-like-particle: path to the virus-like-particle json file
            - beta-galactosidase: path to the beta-galactosidase json file
            - beta-amylase: path to the beta-amylase json file
            - apo-ferritin: path to the apo-ferritin json file
            - thyroglobulin: path to the thyroglobulin json file
        Every path already starts with data_dir.
    """
    # Accept plain strings as well as Path objects; the "/" joins below need a Path.
    data_dir = Path(data_dir)
    image_root = data_dir / f"train/static/ExperimentRuns/{experiment}/VoxelSpacing10.000"
    picks_root = data_dir / f"train/overlay/ExperimentRuns/{experiment}/Picks"
    images = {
        file_type: image_root / f"{file_type}.zarr"
        for file_type in ["denoised", "ctfdeconvolved", "isonetcorrected", "wbp"]
    }
    jsons = {
        file_type: picks_root / f"{file_type}.json"
        for file_type in ["ribosome", "virus-like-particle", "beta-galactosidase",
                          "beta-amylase", "apo-ferritin", "thyroglobulin"]
    }
    return dict(images=images, jsons=jsons)

In [17]:
def get_df_from_json(json_path):
    """
    Reads one JSON file and flattens its "points" entries into plain columns.

    The returned dataframe has one row per entry of "points" and these columns
    (sorted alphabetically, "points" itself removed):
        - x, y, z: taken from the "location" of each point
        - instance_id: taken from each point
        - transformation_: string representation of the point's "transformation_"
        - every other top-level field of the JSON file (e.g. pickable_object_name,
            run_name, session_id, unit, user_id, voxel_spacing)
    """
    raw = pd.read_json(json_path)
    points = raw["points"]

    def _xyz(point):
        # One small Series per point, restricted to the three coordinates
        return pd.Series(point["location"])[["x", "y", "z"]]

    raw[["x", "y", "z"]] = points.apply(_xyz)
    raw["instance_id"] = points.apply(lambda point: point["instance_id"])
    # Stored as text so that the column holds hashable values
    raw["transformation_"] = points.apply(lambda point: str(point["transformation_"]))
    kept_columns = raw.columns.difference(["points"])
    return raw[kept_columns].copy()


In [18]:
def get_objects(data_dir, experiment):
    """
    Collects the points of all six particle JSON files of one experiment.

    The per-file dataframes produced by get_df_from_json are stacked in the order
    ribosome, virus-like-particle, beta-galactosidase, beta-amylase, apo-ferritin,
    thyroglobulin, and the result gets a fresh 0..n-1 index. Columns:
        - pickable_object_name
        - x, y, z: coordinates of the point
        - instance_id, run_name, session_id, trust_orientation, unit, user_id,
            voxel_spacing, and transformation_
    """
    json_paths = get_experiment_paths(data_dir, experiment)["jsons"]
    frames = []
    for particle in ("ribosome", "virus-like-particle", "beta-galactosidase",
                     "beta-amylase", "apo-ferritin", "thyroglobulin"):
        frames.append(get_df_from_json(json_paths[particle]))
    return pd.concat(frames, ignore_index=True)

In [19]:
# Load the points of a single experiment to inspect the columns produced by get_objects.
objects = get_objects(data_dir, "TS_5_4")

In [20]:
# Distinct values per column: columns with a single value are constant within this experiment.
objects.nunique()

instance_id               1
pickable_object_name      6
run_name                  1
session_id                1
transformation_           1
trust_orientation         1
unit                      1
user_id                   1
voxel_spacing             0
x                       140
y                       140
z                       140
dtype: int64

In [30]:
# Experiment names iterated over when building tables that span all experiments.
EXPERIMENTS = ['TS_5_4', 'TS_6_4', 'TS_69_2', 'TS_6_6', 'TS_73_6', 'TS_86_3',
       'TS_99_9']
# Particle names, matching the JSON file stems used by get_experiment_paths.
# NOTE(review): not referenced by the code visible in this notebook.
OBJECT_NAMES = ['ribosome', 'virus-like-particle', 'beta-galactosidase',
         'beta-amylase', 'apo-ferritin', 'thyroglobulin']

def get_all_objects(data_dir, from_csv=True, filtered=True):
    """
    Returns the points of all experiments in EXPERIMENTS as a single dataframe.

    Args:
        data_dir: root directory of the dataset (a string or a path-like object).
        from_csv: if True, read the cached table from "objects.csv" inside data_dir
            and return it as is; `filtered` has no effect in that case.
        filtered: when rebuilding from the JSON files, keep only the columns listed
            below. With filtered=False every column produced by get_objects is kept.

    Returns:
        A dataframe with the following columns (when filtered):
        - experiment: one of "TS_5_4", "TS_6_4", "TS_69_2", "TS_6_6", "TS_73_6", "TS_86_3", "TS_99_9"
        - particle_type: one of "ribosome", "virus-like-particle", "beta-galactosidase",
            "beta-amylase", "apo-ferritin", or "thyroglobulin"
        - x: x coordinate of the point
        - y: y coordinate of the point
        - z: z coordinate of the point
    """
    # Accept plain strings as well as Path objects; the "/" join below needs a Path.
    data_dir = Path(data_dir)
    if from_csv:
        return pd.read_csv(data_dir / "objects.csv")
    # Friendlier names for the two JSON fields that are kept in the filtered table.
    renamed = {"pickable_object_name": "particle_type", "run_name": "experiment"}
    dfs = [
        get_objects(data_dir, experiment).rename(columns=renamed)
        for experiment in EXPERIMENTS
    ]
    df = pd.concat(dfs, ignore_index=True)
    if filtered:
        df = df[['experiment', 'particle_type', 'x', 'y', 'z']]
    return df

# Rebuild the table from the JSON files and keep every column (filtered=False),
# to check which columns vary across the whole dataset.
objects = get_all_objects(data_dir, from_csv=False, filtered=False)
objects

Unnamed: 0,instance_id,particle_type,experiment,session_id,transformation_,trust_orientation,unit,user_id,voxel_spacing,x,y,z
0,0,ribosome,TS_5_4,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,4601.271,601.066,600.934
1,0,ribosome,TS_5_4,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,4803.789,455.425,514.016
2,0,ribosome,TS_5_4,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,4715.436,825.374,802.166
3,0,ribosome,TS_5_4,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,5003.275,782.745,802.480
4,0,ribosome,TS_5_4,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,710.459,3815.845,1405.435
...,...,...,...,...,...,...,...,...,...,...,...,...
1264,0,thyroglobulin,TS_99_9,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,2790.000,3945.000,790.000
1265,0,thyroglobulin,TS_99_9,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,1110.308,5734.615,862.154
1266,0,thyroglobulin,TS_99_9,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,3148.098,3234.683,952.634
1267,0,thyroglobulin,TS_99_9,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",True,angstrom,curation,,1541.778,4082.630,989.996


In [31]:
# Distinct values per column over all experiments.
objects.nunique()

instance_id             1
particle_type           6
experiment              7
session_id              1
transformation_         1
trust_orientation       1
unit                    1
user_id                 1
voxel_spacing           0
x                    1268
y                    1267
z                    1219
dtype: int64

In [None]:
# Build the cached table only when it does not exist yet, so that the notebook
# also runs from a fresh kernel where the CSV was never created.
# Delete the CSV to force a rebuild.
first_time = not (data_dir / "objects.csv").exists()
if first_time:
    objects = get_all_objects(data_dir, from_csv=False, filtered=True)
    objects.to_csv(data_dir / "objects.csv", index=False)
else:
    # Same read as before, routed through the loader instead of a duplicated read_csv.
    objects = get_all_objects(data_dir)
objects

Unnamed: 0,experiment,particle_type,x,y,z
0,TS_5_4,ribosome,4601.271,601.066,600.934
1,TS_5_4,ribosome,4803.789,455.425,514.016
2,TS_5_4,ribosome,4715.436,825.374,802.166
3,TS_5_4,ribosome,5003.275,782.745,802.480
4,TS_5_4,ribosome,710.459,3815.845,1405.435
...,...,...,...,...,...
1264,TS_99_9,thyroglobulin,2790.000,3945.000,790.000
1265,TS_99_9,thyroglobulin,1110.308,5734.615,862.154
1266,TS_99_9,thyroglobulin,3148.098,3234.683,952.634
1267,TS_99_9,thyroglobulin,1541.778,4082.630,989.996


In [36]:
# Count the points per (particle_type, experiment) pair; counting "x" counts rows with a non-null x.
# The transpose puts particle types on the rows and experiments on the columns.
pivot = objects.pivot_table(index="experiment", columns="particle_type", values="x", aggfunc="count").T
pivot

experiment,TS_5_4,TS_69_2,TS_6_4,TS_6_6,TS_73_6,TS_86_3,TS_99_9
particle_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apo-ferritin,46,35,58,41,95,64,36
beta-amylase,10,12,9,14,12,9,21
beta-galactosidase,12,16,12,11,14,23,24
ribosome,31,37,74,23,46,55,65
thyroglobulin,30,34,30,35,28,45,49
virus-like-particle,11,9,10,19,22,29,13


In [38]:
# Column sums: total number of points per experiment.
pivot.sum()

experiment
TS_5_4     140
TS_69_2    143
TS_6_4     193
TS_6_6     143
TS_73_6    217
TS_86_3    225
TS_99_9    208
dtype: int64

In [39]:
# Row sums: total number of points per particle type.
pivot.sum(axis=1)

particle_type
apo-ferritin           375
beta-amylase            87
beta-galactosidase     112
ribosome               331
thyroglobulin          251
virus-like-particle    113
dtype: int64

In [43]:
# Print the array shape of the denoised zarr image of every experiment.
for experiment in EXPERIMENTS:
    paths = get_experiment_paths(data_dir, experiment)
    # This path already starts with data_dir. Joining it with data_dir again
    # only worked when DATA_DIR was absolute and doubles the prefix otherwise.
    zarr_path = paths["images"]["denoised"]
    # Open read-only: the default mode of zarr.open can create or modify the store.
    # Item 0 of the opened store is the array whose shape is reported
    # (presumably the full-resolution level - TODO confirm).
    zarray = zarr.open(zarr_path, mode="r")[0]
    print(experiment, zarray.shape)

TS_5_4 (184, 630, 630)
TS_6_4 (184, 630, 630)
TS_69_2 (184, 630, 630)
TS_6_6 (184, 630, 630)
TS_73_6 (184, 630, 630)
TS_86_3 (184, 630, 630)
TS_99_9 (184, 630, 630)
