# Tutorial
Here is how to use the `bids_selector` module (which is still under development).
The first thing to know is that the main object you will be dealing with is a BidsArchitecture.
As indicated in its name, the object will provide a view of the architecture of the dataset as a function of the user input.
The database is basically all the files parsed into a pandas dataframe. And it is from this dataframe I will work on.
For example if we want a list of all the eeg data which is a `pickle` object (extension `.pkl`) we will input the object as follow:

In [1]:
from pathlib import Path
from eeg_research.system.bids_selector  import BidsArchitecture, BidsQuery, BidsPath, BidsDescriptor
import os
root = Path("/data2/Projects/eeg_fmri_natview/derivatives")
architecture = BidsArchitecture(
    root = root,
    datatype='eeg',
    suffix='eeg',
    extension = '.pkl'
)


We can have a view of the desired dataset by getting the attribute `database` of the `architecture` instance:

In [2]:
path = architecture.database.iloc[0]['filename']

In [None]:
import eeg_research.system.bids_selector as selector
from pathlib import Path
f = Path('/data/sub-01/sus-03/sub-01_ses_01_task-rast_runass-01_dsscr-GfpBk_eeg.pkl')
selector.validate_bids_file(f)


BidsValidationError: Non standardized BIDS name
/data/sub-01/sus-03/sub-01_ses_01_task-rast_ron-01_dsc-GfpBk_eeg.pkl
                            ^^^ ^^           ^^^^^^ ^^^^^^^^^

In [None]:
import re
from pathlib import Path
import os

class BidsValidationError(Exception):
    """Custom exception for BIDS validation errors."""
    pass

def validate_bids_file(file: Path):
    """Validate the BIDS filename and pathname."""
    # Define BIDS rules
    valid_keys = {'sub', 'ses', 'task', 'run', 'acq', 'desc'}
    key_value_pattern = re.compile(r"(?P<key>\w+)-(?P<value>[\w\d]+)")
    path_pattern = re.compile(r"(sub|ses|task|run|acq|desc)-[\w\d]+")
    errors = []
    indicators = []

    # Separate root and BIDS path components
    filename = file.stem if file.suffix else None  # Check if it's a file or folder
    bids_key_values_file = filename[:filename.rfind('_')]
    bids_suffix = filename[filename.rfind('_'):]
    
    if filename:
        root = file.parents[3]
    else:
        root = file.parents[2]
        
    root_parts = root.parts
    bids_start = next((i for i, part in enumerate(root_parts) if part.startswith("sub-")), None)
    if bids_start is None:
        raise BidsValidationError("Path does not contain any BIDS elements (e.g., 'sub-*').")
    
    # 1. Validate BIDS path components
    for part in root_parts[bids_start:]:
        if path_pattern.match(part) is None:
            errors.append(f"Invalid path component: '{part}' should match the pattern '<key>-<value>'.")
            indicators.append("^" * len(part))
        else:
            indicators.append(" " * len(part))

    # 2. Validate filename components (if it's a file)
    if filename:
        elements = bids_key_values_file.split("_")
        for element in elements:
            match = key_value_pattern.match(element)
            if not match:
                errors.append(f"Invalid element in filename: '{element}' is not in the format '<key>-<value>'.")
                indicators.append("^" * len(element))
                continue
            
            key, value = match.group("key"), match.group("value")
            if key not in valid_keys:
                errors.append(
                    f"Wrong key, got '{key}'. Key should be one of the following: {', '.join(sorted(valid_keys))}."
                )
                indicators.append("^" * len(element))
            else:
                indicators.append(" " * len(element))
    else:
        # For folder validation, ignore the filename
        indicators.append(" " * len(root_parts[-1]))

    # Combine indicators and errors
    error_indicator = " "*(len(os.fspath(root))+1) + " ".join(indicators)
    if errors:
        message = (
            f"Non standardized BIDS name\n"\
            + "\n".join(f"{i + 1}. {error}" for i, error in enumerate(errors))\
            + f"\n{file}\n{error_indicator}"
        )
        raise BidsValidationError(message)

# Example usage
try:
    # File validation (both path and filename checked)
    f = Path("/data/dududu/dididi/saab-01/sucemoila-01/eeg/sub-01_ses-01_task-rast_run-01_desc-GfpBk_eeg.pkl")
    validate_bids_file(f)
except BidsValidationError as e:
    raise e

try:
    # Folder validation (only path checked)
    folder = Path("/data/sub-01/sus-01")
    validate_bids_file(folder)
except BidsValidationError as e:
    print(e)



BidsValidationError: Non standardized BIDS name
1. Invalid element in filename: 'eeg.pkl' is not in the format '<key>-<value>'.
/data/dududu/dididi/saab-01/sucemoila-01/sub-01_ses-01_task-rast_run-01_desc-GfpBk_eeg.pkl
                                                                                   ^^^^^^^

In [17]:
import re
from pathlib import Path
import os

class BidsValidationError(Exception):
    """Custom exception for BIDS validation errors."""
    pass

def validate_bids_file(file: Path):
    """Validate the BIDS filename and pathname."""
    # Define BIDS rules
    valid_datatype_pattern = re.compile(r"^[a-z0-9]+$")
    valid_filename_pattern = re.compile(
        r"^sub-(?P<sub>[\w\d]+)"
        r"(?:_ses-(?P<ses>[\w\d]+))?"
        r"_task-(?P<task>[\w\d]+)"
        r"(?:_acq-(?P<acq>[\w\d]+))?"
        r"(?:_run-(?P<run>[\w\d]+))?"
        r"(?:_recording-(?P<recording>[\w\d]+))?"
        r"_(?P<suffix>[\w]+)?"
        r".(?P<extension>[\w\.]+)$"
    )

    errors = []
    indicators = []

    filename = os.fspath(file.name) if file.suffix else None  # Check if it's a file or folder
    
    if filename:
        root = Path(*file.parent.parts[:-3])
        bids_path_parts = file.parent.parts[-3:-1]
        datatype = file.parent.parts[-1]
    else:
        root = Path(*file.parent.parts[:-2])
        bids_path_parts = file.parent.parts[-2:]
        datatype = file.parts[-1]
        
    print(type(root))
    print(bids_path_parts)
    print(datatype)
    if not bids_path_parts[0].startswith("sub-"):
        raise BidsValidationError("Path does not contain any BIDS elements (e.g., 'sub-*').")
    
    # 1. Validate datatype
    print(f"Datatype valid: {valid_datatype_pattern.match(datatype)}")
    if datatype and not valid_datatype_pattern.match(datatype):
        errors.append(f"Invalid datatype: '{datatype}' should be a lowercase alphanumeric string.")
        indicators.append("^" * len(datatype))

    # 2. Validate BIDS path components
    path_pattern = re.compile(r"(sub|ses)-[\w\d]+")
    for part in bids_path_parts:
        print(f"{part} valid: {path_pattern.match(part)}")
        if not path_pattern.match(part):
            errors.append(f"Invalid path component: '{part}' should match the pattern '<key>-<value>'.")
            indicators.append("^" * len(part))
        else:
            indicators.append(" " * len(part))

    # 3. Validate filename
    if filename:
        match = valid_filename_pattern.match(filename)
        print(f"Match filename {filename}: {match}")
        if not match:
            errors.append(f"Invalid filename: '{filename}' does not match the expected BIDS format.")
            indicators.append("^" * len(filename))

    # Combine indicators and errors
    error_indicator = " "*(len(os.fspath(root))+1) + " ".join(indicators)
    if errors:
        message = (
            f"Non standardized BIDS name\n{file}\n{error_indicator}\n\n"
            + "\n".join(f"{i + 1}. {error}" for i, error in enumerate(errors))
        )
        raise BidsValidationError(message)

# Example usage
try:
    # Valid file
    f = Path("/data/sub-01/ses-01/eeg/sub-01_ses-01_task-test_acq-fast_run-01_eeg.vhdr")
    validate_bids_file(f)
except BidsValidationError as e:
    raise e

try:
    # Invalid file
    f = Path("/data/sub-01/ses-01/EEG/sub-01_ses-01_task-test_acq-fast_run-01_bad.json")
    validate_bids_file(f)
except BidsValidationError as e:
    raise e


<class 'pathlib.PosixPath'>
('sub-01', 'ses-01')
eeg
Datatype valid: <re.Match object; span=(0, 3), match='eeg'>
sub-01 valid: <re.Match object; span=(0, 6), match='sub-01'>
ses-01 valid: <re.Match object; span=(0, 6), match='ses-01'>
Match filename sub-01_ses-01_task-test_acq-fast_run-01_eeg.vhdr: <re.Match object; span=(0, 48), match='sub-01_ses-01_task-test_acq-fast_run-01_eeg.vhdr'>
<class 'pathlib.PosixPath'>
('sub-01', 'ses-01')
EEG
Datatype valid: None
sub-01 valid: <re.Match object; span=(0, 6), match='sub-01'>
ses-01 valid: <re.Match object; span=(0, 6), match='ses-01'>
Match filename sub-01_ses-01_task-test_acq-fast_run-01_bad.json: <re.Match object; span=(0, 48), match='sub-01_ses-01_task-test_acq-fast_run-01_bad.json'>


BidsValidationError: Non standardized BIDS name
/data/sub-01/ses-01/EEG/sub-01_ses-01_task-test_acq-fast_run-01_bad.json
      ^^^              

1. Invalid datatype: 'EEG' should be a lowercase alphanumeric string.

In [9]:
f = Path('/data/sub-01/sus-03/sub-01_ses_01_task-rast_runass-01_dsscr-GfpBk_eeg.pkl')
def contain_standard_name(key):
    standard_names = [
    "sub",
    "ses",
    "task",
    "acq",
    "run",
    "record",
    "desc",
    ]
    
    
    return any([name in key for name in standard_names])
standard_names = [
"sub",
"ses",
"task",
"acq",
"run",
"record",
"desc",
]

for name in standard_names:
    if name in f.name:
        elem_boundaries = (f.name.find(name), f.name.find(name) + len(name))
        print(name)
        if f.name[elem_boundaries[1]] != "-":
            print("PROUUUUT")
        

sub
ses
PROUUUUT
task
run
PROUUUUT


In [16]:
import bids_selector
from pathlib import Path
f = Path('/data/sub-01/sus-03/eeg/')#sub-01_ses_01_tusk-rast_run-01_dsc-GfpBk_eeg.pkl')

In [16]:
Path(*f.parent.parts[-2:])

PosixPath('ses-01/eeg')

In [32]:
Path.joinpath

'sub-01_ses_01_tusk-rast_run-01_dsc-GfpBk_eeg'

In [15]:
f.is_file()

False

In [52]:
try:
    indicate_error_in_file(f)
except BidsValidationError as e:
    print(e)

Non standardized BIDS name
sub-01_ses_01_task-rast_ron-01_dsc-GfpBk
       ^   ^            ^      ^        


In [28]:
print(message)

sub-01_ses_01_task-rast_ron-01_dsc-GfpBk
       ^   ^            ^      ^        


In [15]:
has_standard_name('sub-01') 

True

In [5]:
path.name = 'sub_01_sas-01_tasking-rest_run-01_desc-GfpBk_eeg.pkl'

AttributeError: property 'name' of 'PosixPath' object has no setter

Let's say now we want only the tasks `monkey1`, `inscapes` and `rest`. We will have to call the method `select` of the architecture instance. This method will update internally the database which will then have only the desired selection.

In [3]:
architecture.select(task = ['monkey1','inscapes', 'rest'])

#Let's take a look at the database
architecture.database

Unnamed: 0,root,subject,session,datatype,task,run,acquisition,description,suffix,extension,filename
0,/data2/Projects/eeg_fmri_natview/derivatives,01,01,eeg,rest,01,,GfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
1,/data2/Projects/eeg_fmri_natview/derivatives,01,01,eeg,monkey1,02,,GfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
3,/data2/Projects/eeg_fmri_natview/derivatives,01,01,eeg,monkey1,01,,CustomGfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
4,/data2/Projects/eeg_fmri_natview/derivatives,01,01,eeg,monkey1,02,,BandsGfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
7,/data2/Projects/eeg_fmri_natview/derivatives,01,01,eeg,rest,01,,Raw,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
...,...,...,...,...,...,...,...,...,...,...,...
2145,/data2/Projects/eeg_fmri_natview/derivatives,19,02,eeg,rest,01,,CustomGfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
2147,/data2/Projects/eeg_fmri_natview/derivatives,19,02,eeg,rest,01,,BandsGfpBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
2150,/data2/Projects/eeg_fmri_natview/derivatives,19,02,eeg,rest,01,,BandsEnv,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...
2151,/data2/Projects/eeg_fmri_natview/derivatives,19,02,eeg,rest,01,,BandsEnvBk,eeg,.pkl,/data2/Projects/eeg_fmri_natview/derivatives/s...


If we check what are the selected task:

In [4]:
architecture.database['task'].unique()

array(['rest', 'monkey1', 'inscapes'], dtype=object)

The thing is now we can't get another task than the one selected. The BidsArchitecture instance has been overwritten.
However, it is possible to get the specific selection IN another object without modifying the original instance by calling the method `copy()`:

In [5]:
#For this example I have to re-initiate my BidsArchitecture instance because it has been overwritten
root = Path("/data2/Projects/eeg_fmri_natview/derivatives")
architecture = BidsArchitecture(
    root = root,
    datatype='eeg',
    suffix='eeg',
    extension = '.pkl'
)

#Let's perform a selection and put that in another instance now:
selection = architecture.copy().select(task = "checker")



As we can see the `architecture` instance is not modified, we still have all the tasks:

In [6]:
architecture.database['task'].unique()

array(['rest', 'monkey1', 'checker', 'tp', 'peer', 'inscapes', 'dme',
       'dmh', 'monkey5', 'monkey2'], dtype=object)

And the `selection` instance has the desired task:

In [7]:
selection.database['task'].unique()

array(['checker'], dtype=object)

We can also select a desired range of data that has numerical values such as `subject`, `session` or `run`.

In [None]:
import pickle
data