In [1]:
# Required to load columns with extension types
import elbow.dtypes

import pandas as pd
import numpy as np

In [2]:
# Raise pandas' display limits to 100 rows and 100 columns.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

Load the table from parquet. Print the column names and pandas types.

> Note: not all types are preserved when converting Parquet to pandas. In particular, strings are mapped to `object`, and integer columns containing `None` are converted to `float64` with `NaN`.

In [3]:
# Read the parquet table into a pandas DataFrame.
df = pd.read_parquet("tables/bids-examples.parquet")

# Report the table dimensions, followed by one "name: dtype" line per column.
print("Shape: ", df.shape)
print(
    "Columns:\n"
    + "\n".join(
        f"  {column}: {dtype}" for column, dtype in df.dtypes.to_dict().items()
    )
)

Shape:  (10225, 41)
Columns:
  dataset: object
  dataset_path: object
  dataset_description: json
  sub: object
  ses: object
  sample: object
  task: object
  acq: object
  ce: object
  trc: object
  stain: object
  rec: object
  dir: object
  run: float64
  mod: object
  echo: float64
  flip: float64
  inv: float64
  mt: object
  part: object
  proc: object
  hemi: object
  space: object
  split: object
  recording: object
  chunk: float64
  atlas: object
  res: object
  den: object
  label: object
  desc: object
  datatype: object
  suffix: object
  ext: object
  entities: json
  sidecar: json
  image_header: json
  image_affine: ndarray
  file_path: object
  link_target: object
  mod_time: float64


Sort the dataset. By default, the rows of the dataset may be in arbitrary order.

Then display the first few rows.

In [4]:
# Order rows by the dataset, sub, ses, task, and run columns (in that priority).
df = df.sort_values(by=["dataset", "sub", "ses", "task", "run"], ascending=True)

In [5]:
# Preview the first four rows of the table.
df.head(n=4)

Unnamed: 0,dataset,dataset_path,dataset_description,sub,ses,sample,task,acq,ce,trc,stain,rec,dir,run,mod,echo,flip,inv,mt,part,proc,hemi,space,split,recording,chunk,atlas,res,den,label,desc,datatype,suffix,ext,entities,sidecar,image_header,image_affine,file_path,link_target,mod_time
3950,7t_trt,/ocean/projects/med220004p/clane2/code/bids2ta...,"{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}",1,1,,rest,fullbrain,,,,,,1.0,,,,,,,,,,,,,,,,,,func,physio,.tsv.gz,"{'sub': '01', 'ses': '1', 'task': 'rest', 'acq...","{'StartTime': 0, 'SamplingFrequency': 100, 'Co...",,,/ocean/projects/med220004p/clane2/code/bids2ta...,,1683159000.0
3954,7t_trt,/ocean/projects/med220004p/clane2/code/bids2ta...,"{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}",1,1,,rest,fullbrain,,,,,,1.0,,,,,,,,,,,,,,,,,,func,bold,.nii.gz,"{'sub': '01', 'ses': '1', 'task': 'rest', 'acq...",{'CogAtlasID': 'https://www.cognitiveatlas.org...,,,/ocean/projects/med220004p/clane2/code/bids2ta...,,1683159000.0
3953,7t_trt,/ocean/projects/med220004p/clane2/code/bids2ta...,"{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}",1,1,,rest,fullbrain,,,,,,2.0,,,,,,,,,,,,,,,,,,func,bold,.nii.gz,"{'sub': '01', 'ses': '1', 'task': 'rest', 'acq...",{'CogAtlasID': 'https://www.cognitiveatlas.org...,,,/ocean/projects/med220004p/clane2/code/bids2ta...,,1683159000.0
3955,7t_trt,/ocean/projects/med220004p/clane2/code/bids2ta...,"{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}",1,1,,rest,fullbrain,,,,,,2.0,,,,,,,,,,,,,,,,,,func,physio,.tsv.gz,"{'sub': '01', 'ses': '1', 'task': 'rest', 'acq...","{'StartTime': 0, 'SamplingFrequency': 100, 'Co...",,,/ocean/projects/med220004p/clane2/code/bids2ta...,,1683159000.0


Count the number of non-null entries per column.

In [6]:
# Tally the non-null entries in each column.
column_counts = df.count(axis="index")
column_counts

dataset                10225
dataset_path           10225
dataset_description    10201
sub                    10225
ses                     3506
sample                    16
task                    7992
acq                      422
ce                         0
trc                        0
stain                      8
rec                       58
dir                        3
run                     6736
mod                        1
echo                     541
flip                      53
inv                       20
mt                        25
part                      16
proc                     208
hemi                      83
space                    301
split                      0
recording                  5
chunk                      8
atlas                      0
res                       84
den                        0
label                     84
desc                     280
datatype                8836
suffix                 10163
ext                    10225
entities      

We see that some BIDS entities never appear in any of the example datasets. (The `link_target` column, which is not a BIDS entity, is also always empty.)

In [7]:
# Keep only the columns that have no non-null entries at all.
column_counts.loc[column_counts.eq(0)]

ce             0
trc            0
split          0
atlas          0
den            0
link_target    0
dtype: int64

For each dataset, count the total number of files, the number of files with sidecar metadata, and the number of files with image header metadata.

> Note, most of the image files in the bids-examples dataset have empty headers.

In [8]:
# Per-dataset non-null counts of the file_path, sidecar, and image_header columns.
df.groupby("dataset")[["file_path", "sidecar", "image_header"]].count()

Unnamed: 0_level_0,file_path,sidecar,image_header
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7t_trt,635,350,0
asl001,4,2,0
asl002,5,3,0
asl003,5,3,0
asl004,6,4,0
asl005,5,3,0
ds000001-fmriprep,420,52,0
ds000117,1089,641,0
ds000246,32,22,0
ds000247,100,70,0
