In [2]:
from argparse import ArgumentParser
import os
from pathlib import Path
import re
import functools
import operator

import datalad.api as dl
import pandas as pd

In [2]:
?dl.containers_add

[0;31mSignature:[0m
[0mdl[0m[0;34m.[0m[0mcontainers_add[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0murl[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdataset[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcall_fmt[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mimage[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mupdate[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Add a container to a dataset

Parameters
----------
name : str
  The name to register the container under. This also determines the
  default location of the container image within the dataset.
url : str or None, optional
  A URL (or local path) to get the container image from. If the URL
  scheme is one recognized by Singularity ('shub://' or 'docker://'),
  a command format string f

In [7]:
# Variable specification consumed by the cell that builds `variable_dict`:
# ';' separates entries and '==' separates a variable name from its value spec.
# A spec is a whitespace-separated list of values, optionally followed by
# '* <count>', where <count> is an integer or the name of an already-parsed
# variable whose number of values is used as the repeat count.
key_vals = 'subject == sub-001A sub-001B ; voi == acc pcc * subject'

In [4]:
# Value-expansion pattern. '<!glob>(...)' holds a glob evaluated under the
# superdataset, '<!regex>(...)' a regex applied to every globbed path, and
# '<!unique>' asks for the results to be de-duplicated.
# Raw string: '\w' has to reach `re` unchanged, and it is an invalid escape
# sequence inside a normal string literal.
pattern = r"<!glob>(inputs/mri-raw/sub-*/anat/*T1w.nii.gz)<!regex>(sub-\w+)<!unique>"
# Until the pattern is expanded, `vals` is just the raw pattern string.
vals = pattern

In [54]:
# Inspect the current value of `pattern`.
# NOTE(review): the saved output uses '<glob>'/'<regex>' markers, while the
# assignment in this notebook uses '<!glob>'/'<!regex>' -- the output looks
# stale; re-run to refresh it.
pattern

'<glob>(inputs/mri-raw/sub-*/anat/*T1w.nii.gz)<regex>()'

In [5]:
def is_numeric(x):
    """Tell whether ``x`` can be converted with ``int(x)``.

    Parameters
    ----------
    x : object
        Value to probe, typically a string token.

    Returns
    -------
    bool
        True when ``int(x)`` succeeds, False when the conversion is rejected.
    """
    try:
        int(x)
    # Only conversion failures mean "not numeric"; anything else (for example
    # KeyboardInterrupt) must propagate instead of being reported as False.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def try_search(regex, val):
    """Return the first match of ``regex`` in ``val``, or None.

    Parameters
    ----------
    regex : str
        Regular expression handed to ``re.search``.
    val : str
        Text to search.

    Returns
    -------
    str or None
        The full matched text; None when nothing matches, when ``regex`` does
        not compile, or when either argument is not searchable text.
    """
    try:
        match = re.search(regex, val)
    # An unusable pattern or a non-string value counts as "no match"; other
    # exceptions are not swallowed.
    except (re.error, TypeError, OverflowError):
        return None
    if match is None:
        return None
    return match.group()
        

In [6]:
# Directory that the '<!glob>(...)' expression is resolved against.
super_dataset_path = Path('/misc/geminis2/ramirezd/test_bet/')
# Character classes accepted inside '<!glob>(...)' and '<!regex>(...)'.
# Written as raw strings so the backslashes reach `re` without relying on
# invalid string escape sequences such as '\/' or '\-'.
glob_characters = r'[a-z,A-Z,0-9,\,\/,\-,_,\.,\*,\[,\],\:,\+,\?,\!]'
regex_characters = r'[a-z,A-Z,0-9,\\,\/,\-,_,\.,\*,\[,\],\:,\+,\?,\!,\(,\),\<,\>]'


# Explicit membership test: `not '<!in>'` on a bare string literal is always
# False and would skip the glob expansion entirely.
# NOTE(review): '<!in>' is not used anywhere else in the visible cells --
# confirm what that marker is meant to select.
if '<!glob>' in pattern and '<!in>' not in pattern:
    try:
        # Text between '<!glob>(' and the next ')'.
        globbing = re.search(rf"(?<=\<!glob\>\(){glob_characters}+(?=\))", pattern).group()
        vals = sorted(str(path) for path in Path(super_dataset_path).glob(globbing))
    except Exception as err:
        raise Exception('Not a valid globbing pattern.') from err
if '<!regex>' in pattern:
    # The regex section must close the pattern once '<!unique>' is removed.
    regex_match = re.search(
        rf"(?<=\<!regex\>\(){regex_characters}+(?=\)$)",
        pattern.replace('<!unique>', '').strip(),
    )
    if regex_match is None:
        raise Exception("Not a valid regex or no matches.")
    regex = regex_match.group()
    # Reduce every value to the part matched by the regex (None if no match).
    vals = [try_search(regex, val) for val in vals]
    if all(val is None for val in vals):
        raise Exception("Not a valid regex or no matches.")
if '<!unique>' in pattern:
    vals = sorted(set(vals))


vals

['sub-001A',
 'sub-001B',
 'sub-002A',
 'sub-002B',
 'sub-003A',
 'sub-003B',
 'sub-004A',
 'sub-004B',
 'sub-005A',
 'sub-005B',
 'sub-006A',
 'sub-006B',
 'sub-007A',
 'sub-007B',
 'sub-008A',
 'sub-008B',
 'sub-009A',
 'sub-009B',
 'sub-010A',
 'sub-010B',
 'sub-010C',
 'sub-011A',
 'sub-011B',
 'sub-012A',
 'sub-012B',
 'sub-013A',
 'sub-013B',
 'sub-014A',
 'sub-014B',
 'sub-015A',
 'sub-015B',
 'sub-016A',
 'sub-016B',
 'sub-017A',
 'sub-017B',
 'sub-018A',
 'sub-018B',
 'sub-019A',
 'sub-019B',
 'sub-020A',
 'sub-020B',
 'sub-021A',
 'sub-021B',
 'sub-022A',
 'sub-022B',
 'sub-023A',
 'sub-023B',
 'sub-024A',
 'sub-024B',
 'sub-025A',
 'sub-025B',
 'sub-026A',
 'sub-026B',
 'sub-027A',
 'sub-027B',
 'sub-028A',
 'sub-028B',
 'sub-029A',
 'sub-029B',
 'sub-030A',
 'sub-030B',
 'sub-031A',
 'sub-031B',
 'sub-032A',
 'sub-032B',
 'sub-033A',
 'sub-033B',
 'sub-034A',
 'sub-034B',
 'sub-035A',
 'sub-035B',
 'sub-036A',
 'sub-036B',
 'sub-037A',
 'sub-037B',
 'sub-038A',
 'sub-038B',

In [118]:
# Inspect the regex extracted from the '<!regex>(...)' section of `pattern`.
regex

'sub-\\w+'

In [21]:
# Inspect the first ';'-separated entry of `key_vals`.
# NOTE(review): the saved output shows a single '=', while `key_vals` is
# assigned with '==' -- the output looks stale; re-run to refresh it.
key_vals.split(';')[0]

'subject = sub-001A sub-001B '

In [9]:
# NOTE(review): `vals_plus` is not assigned in any visible cell, so this only
# works with state left over in the kernel and fails after a restart.
vals_plus

'acc pcc * subject'

In [8]:
# Expand every '<name> == <spec>' entry of `key_vals` into a list of values.
# NOTE: the loop rebinds the notebook-level names `pattern` and `vals`.
variable_dict = {}
for key_val in key_vals.split(';'):
    key_val = key_val.split('==')
    key = key_val[0].strip()
    pattern = key_val[1].strip()

    if '*' in pattern:
        # '<values> * <count>': repeat each value <count> times, keeping order.
        # Parse the spec of the current entry (`pattern`), not a leftover
        # variable from another cell.
        pattern_parts = pattern.split('*')
        vals = pattern_parts[0].split()
        len_key = pattern_parts[1].strip()

        if is_numeric(len_key):
            n_elements = int(len_key)
        else:
            # <count> names an earlier variable: use its number of values.
            n_elements = len(variable_dict[len_key])

        vals = [val for val in vals for _ in range(n_elements)]
    else:
        vals = pattern.split()

    variable_dict[key] = vals

In [7]:
# Inspect the repeat-count token taken from the last '*' spec that was parsed.
len_key

'subject'

In [83]:
# Inspect the expanded variables (name -> list of values).
variable_dict

{'subject': ['sub-001A', 'sub-001B'], 'voi': ['acc', 'acc', 'pcc', 'pcc']}

In [30]:
# Path of the superdataset; passed to `dl.create` and used as the `dataset=`
# argument of the later DataLad calls.
super_dataset = 'fair_test'
# Sources cloned as subdatasets under '<super_dataset>/inputs/'.
input_datasets = ['/misc/geminis2/twinsmx/datasets/mri_study/mri_study-raw/mri-raw/']
# Names of the subdatasets created under '<super_dataset>/outputs/'.
output_datasets = ['bet']
# Dataset cloned to '<super_dataset>/inputs/containers'.
container_dataset = '/misc/geminis2/containers/'
# Name the container is registered under; also a component of its image path.
container_name = 'fsl-6-0-4'

In [21]:
# Create the superdataset, applying the 'yoda' configuration procedure.
dl.create(super_dataset, cfg_proc='yoda')

[INFO] Running procedure cfg_yoda 
[INFO] == Command start (output follows) ===== 
[INFO] == Command exit (modification check follows) ===== 


run(ok): /misc/geminis2/ramirezd/fb_test/fair_test (dataset) [/misc/geminis/ramirezd/miniconda3/envs/p...]
create(ok): /misc/geminis2/ramirezd/fb_test/fair_test (dataset)
action summary:
  create (ok: 1)
  run (ok: 1)


Dataset('/misc/geminis2/ramirezd/fb_test/fair_test')

In [26]:
# Directory inside the superdataset that holds the cloned input datasets.
inputs_root = Path(super_dataset) / 'inputs'
# exist_ok=True keeps this cell re-runnable once the directory exists.
inputs_root.mkdir(exist_ok=True)

In [27]:
# Clone each input dataset into '<inputs_root>/<stem of the source path>' and
# register it in the superdataset.
for input_dataset in input_datasets:
    clone_dir_name = Path(input_dataset).stem
    input_dataset_path = str(inputs_root.joinpath(clone_dir_name))
    dl.clone(
        input_dataset,
        input_dataset_path,
        dataset=super_dataset,
    )

[INFO] Attempting a clone into /misc/geminis2/ramirezd/fb_test/fair_test/inputs/mri-raw 
[INFO] Attempting to clone from /misc/geminis2/twinsmx/datasets/mri_study/mri_study-raw/mri-raw/ to /misc/geminis2/ramirezd/fb_test/fair_test/inputs/mri-raw 
[INFO] Completed clone attempts for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/inputs/mri-raw) 


install(ok): inputs/mri-raw (dataset)
add(ok): inputs/mri-raw (dataset)
add(ok): .gitmodules (file)
save(ok): . (dataset)
add(ok): .gitmodules (file)
save(ok): . (dataset)
action summary:
  add (ok: 3)
  install (ok: 1)
  save (ok: 2)


In [38]:
# NOTE(review): `image_path` is assigned in the cell that follows this one, so
# this inspection fails on a fresh kernel run from top to bottom.
image_path

'fair_test/inputs/containers/.datalad/environments/fsl-6-0-4/image'

In [40]:
# Only when both a container dataset and a container name are configured:
# clone the container dataset under the inputs directory and register the
# named image with an apptainer call format.
if container_dataset and container_name:
    containers_root = inputs_root / 'containers'
    input_container_dataset_path = str(containers_root)
    image_path = str(
        containers_root.joinpath('.datalad', 'environments', container_name, 'image')
    )
    dl.clone(
        container_dataset,
        input_container_dataset_path,
        dataset=super_dataset,
    )
    dl.containers_add(
        container_name,
        call_fmt="apptainer run -e {img} {cmd}",
        image=image_path,
    )

[INFO] Attempting a clone into /misc/geminis2/ramirezd/fb_test/fair_test/inputs/containers 
[INFO] Attempting to clone from /misc/geminis2/containers/ to /misc/geminis2/ramirezd/fb_test/fair_test/inputs/containers 
[INFO] Completed clone attempts for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/inputs/containers) 


install(ok): inputs/containers (dataset)
add(ok): inputs/containers (dataset)
add(ok): .gitmodules (file)
save(ok): . (dataset)
add(ok): .gitmodules (file)
save(ok): . (dataset)
action summary:
  add (ok: 3)
  install (ok: 1)
  save (ok: 2)
add(ok): .datalad/config (file)
save(ok): . (dataset)
action summary:
  add (ok: 1)
  save (ok: 1)
add(ok): .datalad/config (file)
save(ok): . (dataset)
containers_add(ok): /misc/geminis2/ramirezd/fb_test/fair_test/inputs/containers/.datalad/environments/fsl-6-0-4/image (file)
action summary:
  add (ok: 1)
  containers_add (ok: 1)
  save (ok: 1)


In [41]:
# Directory inside the superdataset that holds the output subdatasets.
outputs_root = Path(super_dataset) / 'outputs'
# exist_ok=True keeps the directory step re-runnable.
outputs_root.mkdir(exist_ok=True)
# Create one subdataset per configured output name, registered in the
# superdataset.
for output_dataset in output_datasets:
    output_dataset_path = str(outputs_root / output_dataset)
    dl.create(output_dataset_path, dataset=super_dataset)

add(ok): outputs/bet (dataset)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): outputs/bet (dataset)
action summary:
  add (ok: 2)
  create (ok: 1)
  save (ok: 1)


In [42]:
# Keep the '.fairlybig' working directory out of version control.
gitignore_path = Path(super_dataset) / '.gitignore'
# Terminate the entry with a newline so the file is a well-formed text file and
# a later appended entry cannot end up on the same line.
gitignore_path.write_text('.fairlybig\n')
dl.save(dataset=super_dataset, message='Add .gitignore')

add(ok): .gitignore (file)
save(ok): . (dataset)
action summary:
  add (ok: 1)
  save (ok: 1)


[{'action': 'add',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test/.gitignore',
  'type': 'file',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'status': 'ok',
  'message': '',
  'key': None},
 {'action': 'save',
  'type': 'dataset',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'status': 'ok'}]

In [43]:
# DataLad ID of the superdataset; stored as 'super_id' in the job configuration.
super_dataset_id = dl.Dataset(super_dataset).id

In [46]:
# NOTE(review): `input_ria_path` is assigned in the cell that follows this one,
# so this inspection fails on a fresh kernel run from top to bottom.
input_ria_path

'/misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/input_ria'

In [47]:
# Absolute locations of the two RIA stores kept under '<super_dataset>/.fairlybig'.
ria_root = Path(super_dataset).joinpath('.fairlybig')
output_ria_path = str(ria_root.joinpath('output_ria').absolute())
input_ria_path = str(ria_root.joinpath('input_ria').absolute())

# Register the output store as sibling 'output_ria' of the superdataset;
# new_store_ok=True permits creating the store.
dl.create_sibling_ria(
    f'ria+file://{output_ria_path}',
    dataset=super_dataset,
    name='output_ria',
    new_store_ok=True,
)

# Same for the input store, registered as sibling 'input_ria'.
dl.create_sibling_ria(
    f'ria+file://{input_ria_path}',
    dataset=super_dataset,
    name='input_ria',
    new_store_ok=True,
)

[INFO] Creating a new RIA store at /misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/output_ria 
[INFO] create siblings 'output_ria' and 'output_ria-storage' ... 
[INFO] Fetching updates for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 


update(ok): . (dataset)
update(ok): . (dataset)


[INFO] Configure additional publication dependency on "output_ria-storage" 


configure-sibling(ok): . (sibling)
create-sibling-ria(ok): /misc/geminis2/ramirezd/fb_test/fair_test (dataset)
action summary:
  configure-sibling (ok: 1)
  create-sibling-ria (ok: 1)
  update (ok: 1)


[INFO] Creating a new RIA store at /misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/input_ria 
[INFO] create siblings 'input_ria' and 'input_ria-storage' ... 
[INFO] Fetching updates for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 


update(ok): . (dataset)
update(ok): . (dataset)


[INFO] Configure additional publication dependency on "input_ria-storage" 


configure-sibling(ok): . (sibling)
create-sibling-ria(ok): /misc/geminis2/ramirezd/fb_test/fair_test (dataset)
action summary:
  configure-sibling (ok: 1)
  create-sibling-ria (ok: 1)
  update (ok: 1)


[{'action': 'update',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'type': 'dataset',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'status': 'ok'},
 {'action': 'configure-sibling',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'type': 'sibling',
  'name': 'input_ria',
  'annex-ignore': True,
  'url': '/misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/input_ria/94f/905fe-dccb-4d0f-8fde-230d5f0e2520',
  'fetch': '+refs/heads/*:refs/remotes/input_ria/*',
  'datalad-publish-depends': 'input_ria-storage',
  'status': 'ok',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test'},
 {'action': 'create-sibling-ria',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'type': 'dataset',
  'status': 'ok'}]

In [48]:
# Publish the superdataset to both RIA siblings: first 'output_ria', then
# 'input_ria'.
dl.push(dataset=super_dataset, to='output_ria')
dl.push(dataset=super_dataset, to='input_ria')

[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Transfer data 
[INFO] Transfer data 
[INFO] Update availability information 
[INFO] Start enumerating objects 
[INFO] Start counting objects 
[INFO] Start compressing objects 
[INFO] Start writing objects 


publish(ok): . (dataset) [refs/heads/master->output_ria:refs/heads/master [new branch]]
publish(ok): . (dataset) [refs/heads/git-annex->output_ria:refs/heads/git-annex [new branch]]


[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 
[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 


action summary:
  publish (ok: 2)


[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Transfer data 
[INFO] Transfer data 
[INFO] Update availability information 
[INFO] Start enumerating objects 
[INFO] Start counting objects 
[INFO] Start compressing objects 
[INFO] Start writing objects 


publish(ok): . (dataset) [refs/heads/master->input_ria:refs/heads/master [new branch]]
publish(ok): . (dataset) [refs/heads/git-annex->input_ria:refs/heads/git-annex [new branch]]


[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 
[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test) 


action summary:
  publish (ok: 2)


[{'action': 'publish',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'type': 'dataset',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'status': 'ok',
  'target': 'input_ria',
  'refspec': 'refs/heads/master:refs/heads/master',
  'operations': ['new-branch'],
  'hints': None,
  'message': 'refs/heads/master->input_ria:refs/heads/master [new branch]'},
 {'action': 'publish',
  'refds': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'type': 'dataset',
  'path': '/misc/geminis2/ramirezd/fb_test/fair_test',
  'status': 'ok',
  'target': 'input_ria',
  'refspec': 'refs/heads/git-annex:refs/heads/git-annex',
  'operations': ['new-branch'],
  'hints': None,
  'message': 'refs/heads/git-annex->input_ria:refs/heads/git-annex [new branch]'}]

In [49]:
# Sibling name -> RIA store location, in the order the siblings are handled.
ria_stores = {
    'output_ria': output_ria_path,
    'input_ria': input_ria_path,
}

# For every output subdataset: register both RIA siblings, then push to each.
for output_dataset in output_datasets:
    output_dataset_path = str(outputs_root / output_dataset)

    for sibling_name, store_path in ria_stores.items():
        dl.create_sibling_ria(
            f'ria+file://{store_path}',
            name=sibling_name,
            dataset=output_dataset_path
        )

    for sibling_name in ria_stores:
        dl.push(dataset=output_dataset_path, to=sibling_name)


[INFO] Creating a new RIA store at /misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/output_ria 
[INFO] create siblings 'output_ria' and 'output_ria-storage' ... 
[INFO] Fetching updates for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 


update(ok): . (dataset)
update(ok): . (dataset)


[INFO] Configure additional publication dependency on "output_ria-storage" 


configure-sibling(ok): . (sibling)
create-sibling-ria(ok): /misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet (dataset)
action summary:
  configure-sibling (ok: 1)
  create-sibling-ria (ok: 1)
  update (ok: 1)


[INFO] Creating a new RIA store at /misc/geminis2/ramirezd/fb_test/fair_test/.fairlybig/input_ria 
[INFO] create siblings 'input_ria' and 'input_ria-storage' ... 
[INFO] Fetching updates for Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 


update(ok): . (dataset)
update(ok): . (dataset)


[INFO] Configure additional publication dependency on "input_ria-storage" 


configure-sibling(ok): . (sibling)
create-sibling-ria(ok): /misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet (dataset)
action summary:
  configure-sibling (ok: 1)
  create-sibling-ria (ok: 1)
  update (ok: 1)


[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Transfer data 
[INFO] Transfer data 
[INFO] Update availability information 
[INFO] Start enumerating objects 
[INFO] Start counting objects 
[INFO] Start compressing objects 
[INFO] Start writing objects 


publish(ok): . (dataset) [refs/heads/master->output_ria:refs/heads/master [new branch]]
publish(ok): . (dataset) [refs/heads/git-annex->output_ria:refs/heads/git-annex [new branch]]


[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 
[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 


action summary:
  publish (ok: 2)


[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Determine push target 
[INFO] Push refspecs 
[INFO] Transfer data 
[INFO] Transfer data 
[INFO] Update availability information 
[INFO] Start enumerating objects 
[INFO] Start counting objects 
[INFO] Start compressing objects 
[INFO] Start writing objects 


publish(ok): . (dataset) [refs/heads/master->input_ria:refs/heads/master [new branch]]
publish(ok): . (dataset) [refs/heads/git-annex->input_ria:refs/heads/git-annex [new branch]]


[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 
[INFO] Finished push of Dataset(/misc/geminis2/ramirezd/fb_test/fair_test/outputs/bet) 


action summary:
  publish (ok: 2)


In [50]:
# Space-separated names of the output datasets, or None when none are configured.
output_datasets_string = (
    ' '.join(f'{name}' for name in output_datasets).strip()
    if output_datasets
    else None
)

# Login name taken from the environment (None when USER is not set).
user = os.environ.get('USER')

In [53]:
# Single-row job configuration table; every value is wrapped in a one-element
# list so that `pd.DataFrame` builds exactly one row. Entries set to None are
# left unset here.
job_config_dict ={
    'job_name':[None],
    'dl_cmd':[None],
    'container':[container_name],
    'commit':[None],
    'inputs':[None],
    'outputs':[None],
    'is_explicit':[False],
    'output_datasets':[output_datasets_string],
    'prereq_get':[None],
    'message':[None],
    'super_id':[super_dataset_id],
    'clone_target':[input_ria_path],
    'push_target':[output_ria_path],
    # The concatenation raises TypeError when `user` is None (USER not set).
    'ephemeral_location':["/tmp /misc/{host}[0-9]/"+user],
    'req_disk_gb':[None],
    'queue':['all.q'],
    'slots':[None],
    'vmem':[None],
    'h_rt':[None],
    'env_vars':[None],
    'batch':['0001']
}

fairlybig_path = Path(super_dataset) / '.fairlybig'
# parents/exist_ok make the cell re-runnable and independent of whether
# '.fairlybig' has already been created; the CSV below is simply overwritten.
(fairlybig_path / 'code').mkdir(parents=True, exist_ok=True)

pd.DataFrame(job_config_dict).to_csv(str(fairlybig_path / 'code' / 'job_config.csv'), index=False)