In [1]:
server1 = "/mnt/WarrenNAS/Data/SST/*" #ses-2YearFollowUpYArm1
server2 = "/mnt/storage/SST/*"

In [2]:
import glob
from pathlib import Path
from collections import defaultdict
import json
from datetime import datetime
import polars as pl  # ensure polars is installed in your environment

def index_root(root_path: Path):
    root = Path(root_path)
    file_map = {}   # rel_path -> {size, mtime}
    for p in root.rglob('*'):
        try:
            if p.is_file():
                st = p.stat()
                rel = p.relative_to(root).as_posix()
                file_map[rel] = {'size': st.st_size, 'mtime': st.st_mtime}
        except (FileNotFoundError, PermissionError):
            # skip files that disappear or are inaccessible
            continue

    # build directory aggregates (recursive: each directory counts files in its subtree)
    dir_map = defaultdict(lambda: {'file_count': 0, 'total_size': 0})
    for rel, meta in file_map.items():
        parts = rel.split('/')
        # include '.' as the root directory
        for i in range(len(parts)):
            dir_rel = '/'.join(parts[:i]) if i > 0 else '.'
            dir_map[dir_rel]['file_count'] += 1
            dir_map[dir_rel]['total_size'] += meta['size']
    return file_map, dict(dir_map)

def compare_indexes(f1, d1, f2, d2, name1='A', name2='B', show_limit=20):
    set1 = set(f1.keys())
    set2 = set(f2.keys())
    only1 = sorted(set1 - set2)
    only2 = sorted(set2 - set1)
    common = sorted(set1 & set2)

    same = []
    diff = []
    for k in common:
        a = f1[k]['size']
        b = f2[k]['size']
        if a == b:
            same.append((k, a, f1[k]['mtime'], f2[k]['mtime']))
        else:
            diff.append((k, a, b, f1[k]['mtime'], f2[k]['mtime']))

    # prepare richer lists including sizes/mtimes for export
    only1_rich = [(p, f1[p]['size'], f1[p]['mtime']) for p in only1]
    only2_rich = [(p, f2[p]['size'], f2[p]['mtime']) for p in only2]

    print(f"Compared {name1} vs {name2}")
    print(f"  files only in {name1}: {len(only1)}")
    print(f"  files only in {name2}: {len(only2)}")
    print(f"  files in both with same size: {len(same)}")
    print(f"  files in both with different size: {len(diff)}")

    if only1:
        print(f"\n  Sample files only in {name1}:")
        for p in only1[:show_limit]:
            print(f"    {p}  ({f1[p]['size']} bytes)")

    if only2:
        print(f"\n  Sample files only in {name2}:")
        for p in only2[:show_limit]:
            print(f"    {p}  ({f2[p]['size']} bytes)")

    if diff:
        print(f"\n  Sample differing files (path, size_in_{name1}, size_in_{name2}):")
        for k, a, b, ma, mb in diff[:show_limit]:
            ta = datetime.fromtimestamp(ma).isoformat()
            tb = datetime.fromtimestamp(mb).isoformat()
            print(f"    {k}  {a} -> {b}   mtime: {ta} | {tb}")

    # directory-level summary
    dir_keys = sorted(set(d1.keys()) | set(d2.keys()))
    dir_diff = []
    for dk in dir_keys:
        da = d1.get(dk, {'file_count': 0, 'total_size': 0})
        db = d2.get(dk, {'file_count': 0, 'total_size': 0})
        if da['file_count'] != db['file_count'] or da['total_size'] != db['total_size']:
            dir_diff.append((dk, da, db))

    print(f"\n  directories differing (count/size): {len(dir_diff)}")
    for dk, da, db in dir_diff[:show_limit]:
        print(f"    {dk}  files: {da['file_count']} -> {db['file_count']}  size: {da['total_size']} -> {db['total_size']}")

    return {
        'only_in_1': only1_rich,
        'only_in_2': only2_rich,
        'same': same,
        'different_files': diff,
        'different_dirs': dir_diff
    }

# Expand glob patterns from variables server1 and server2 defined in another cell
roots1 = sorted(glob.glob(server1))
roots2 = sorted(glob.glob(server2))

if not roots1:
    print("No matches for server1 pattern:", server1)
if not roots2:
    print("No matches for server2 pattern:", server2)
if not roots1 or not roots2:
    raise SystemExit("Cannot compare without matching roots for both server patterns.")

results = []

# If the number of matched roots is equal, compare pairwise in sorted order (useful when patterns list corresponding folders)
if len(roots1) == len(roots2):
    for r1, r2 in zip(roots1, roots2):
        print("\n=== Comparing pair ===")
        print("  ", r1)
        print("  ", r2)
        f1, d1 = index_root(Path(r1))
        f2, d2 = index_root(Path(r2))
        res = compare_indexes(f1, d1, f2, d2, name1=Path(r1).name, name2=Path(r2).name)
        results.append({'left': r1, 'right': r2, 'fmap_left': f1, 'fmap_right': f2, 'result': res})
else:
    # otherwise build combined index for each side, prefixing entries with the root name to avoid collisions
    def combined_index(roots):
        files = {}
        dirs = {}
        for root in sorted(roots):
            rootp = Path(root)
            f_map, d_map = index_root(rootp)
            prefix = rootp.name  # short prefix to keep relative paths unique
            for k, v in f_map.items():
                files[f"{prefix}/{k}"] = v
            for dk, dv in d_map.items():
                dirs[f"{prefix}/{dk}"] = dv
        return files, dirs

    print("Different number of matched roots; comparing combined indexes.")
    f1, d1 = combined_index(roots1)
    f2, d2 = combined_index(roots2)
    res = compare_indexes(f1, d1, f2, d2, name1='server1_combined', name2='server2_combined')
    results.append({'left': roots1, 'right': roots2, 'fmap_left': f1, 'fmap_right': f2, 'result': res})

# Optionally: save a compact JSON summary
with open('compare_summary.json', 'w') as fh:
    json.dump(results, fh, indent=2)

# Build Polars DataFrame for file-level comparisons
file_rows = []
for entry in results:
    left_label = entry['left']
    right_label = entry['right']
    res = entry['result']
    # only in left
    for path, size, mtime in res['only_in_1']:
        file_rows.append({
            'left_root': left_label,
            'right_root': right_label,
            'status': 'only_in_left',
            'path': path,
            'size_left': size,
            'size_right': None,
            'mtime_left': datetime.fromtimestamp(mtime).isoformat(),
            'mtime_right': None
        })
    # only in right
    for path, size, mtime in res['only_in_2']:
        file_rows.append({
            'left_root': left_label,
            'right_root': right_label,
            'status': 'only_in_right',
            'path': path,
            'size_left': None,
            'size_right': size,
            'mtime_left': None,
            'mtime_right': datetime.fromtimestamp(mtime).isoformat()
        })
    # same
    for path, size, mtime_left, mtime_right in res['same']:
        file_rows.append({
            'left_root': left_label,
            'right_root': right_label,
            'status': 'same',
            'path': path,
            'size_left': size,
            'size_right': size,
            'mtime_left': datetime.fromtimestamp(mtime_left).isoformat(),
            'mtime_right': datetime.fromtimestamp(mtime_right).isoformat()
        })
    # different
    for path, size_left, size_right, mtime_left, mtime_right in res['different_files']:
        file_rows.append({
            'left_root': left_label,
            'right_root': right_label,
            'status': 'different',
            'path': path,
            'size_left': size_left,
            'size_right': size_right,
            'mtime_left': datetime.fromtimestamp(mtime_left).isoformat(),
            'mtime_right': datetime.fromtimestamp(mtime_right).isoformat()
        })

files_df = pl.DataFrame(file_rows)

# Build Polars DataFrame for directory-level differences
dir_rows = []
for entry in results:
    left_label = entry['left']
    right_label = entry['right']
    for dk, da, db in entry['result']['different_dirs']:
        dir_rows.append({
            'left_root': left_label,
            'right_root': right_label,
            'dir_path': dk,
            'files_left': da['file_count'],
            'files_right': db['file_count'],
            'size_left': da['total_size'],
            'size_right': db['total_size']
        })

dirs_df = pl.DataFrame(dir_rows)

# Display in Jupyter
print("\nFile-level comparison (polars DataFrame):")
display(files_df)  # Jupyter display

print("\nDirectory-level differences (polars DataFrame):")
display(dirs_df)

# Export CSVs
files_df.write_csv('compare_files.csv')
dirs_df.write_csv('compare_dirs.csv')

print("\nSaved compare_files.csv and compare_dirs.csv")
print("\nSummary written to compare_summary.json")


=== Comparing pair ===
   /mnt/WarrenNAS/Data/SST/dataset_description.json
   /mnt/storage/SST/dataset_description.json
Compared dataset_description.json vs dataset_description.json
  files only in dataset_description.json: 0
  files only in dataset_description.json: 0
  files in both with same size: 0
  files in both with different size: 0

  directories differing (count/size): 0

=== Comparing pair ===
   /mnt/WarrenNAS/Data/SST/sub-NDARINV003RTV85
   /mnt/storage/SST/sub-NDARINV003RTV85
Compared sub-NDARINV003RTV85 vs sub-NDARINV003RTV85
  files only in sub-NDARINV003RTV85: 1
  files only in sub-NDARINV003RTV85: 0
  files in both with same size: 3468
  files in both with different size: 0

  Sample files only in sub-NDARINV003RTV85:
    ses-baselineYear1Arm1/func/sub-NDARINV003RTV85_ses-baselineYear1Arm1_task-sst_run-01_bold.feat/flobs.feat/tsplot/.ps_tsplot_zstat14_ev1.png.0jeiVZ  (0 bytes)

  directories differing (count/size): 6
    .  files: 3469 -> 3468  size: 3373525068 -> 33

left_root,right_root,status,path,size_left,size_right,mtime_left,mtime_right
str,str,str,str,i64,i64,str,str
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""only_in_left""","""ses-baselineYear1Arm1/func/sub…",0,,"""2025-09-04T10:42:15.640575""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.con""",679,679,"""2025-09-04T10:47:28.751583""","""2023-08-25T12:32:09.774625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.frf""",12,12,"""2025-09-04T10:47:28.739583""","""2023-08-25T12:32:09.774625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.fsf""",15474,15474,"""2025-09-04T10:47:28.731583""","""2023-08-25T12:32:09.717625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.mat""",36202,36202,"""2025-09-04T10:47:28.743583""","""2023-08-25T12:32:09.774625"""
…,…,…,…,…,…,…,…
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-2YearFollowUpYArm1/anat/su…",67109216,67109216,"""2025-09-18T12:07:51.549217""","""2022-08-01T18:43:32"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-4YearFollowUpYArm1/anat/su…",101,101,"""2025-09-18T12:07:52.375216""","""1969-12-31T18:00:00"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-4YearFollowUpYArm1/anat/su…",67109216,67109216,"""2025-09-18T12:07:52.369217""","""2022-08-01T18:44:55"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-baselineYear1Arm1/anat/sub…",101,101,"""2025-09-18T12:07:53.213217""","""1969-12-31T18:00:00"""



Directory-level differences (polars DataFrame):


left_root,right_root,dir_path,files_left,files_right,size_left,size_right
str,str,str,i64,i64,i64,i64
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…",""".""",3469,3468,3373525068,3373525068
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1""",3459,3458,3371905439,3371905439
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func""",3446,3445,3293933186,3293933186
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func/sub…",2604,2603,1069170677,1069170677
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func/sub…",1801,1800,576506206,576506206
…,…,…,…,…,…,…
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",10,0,1797323179,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",2,0,67109300,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",8,0,1730213879,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-baseli…",2,0,67109300,0



Saved compare_files.csv and compare_dirs.csv

Summary written to compare_summary.json


In [3]:
files_df

left_root,right_root,status,path,size_left,size_right,mtime_left,mtime_right
str,str,str,str,i64,i64,str,str
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""only_in_left""","""ses-baselineYear1Arm1/func/sub…",0,,"""2025-09-04T10:42:15.640575""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.con""",679,679,"""2025-09-04T10:47:28.751583""","""2023-08-25T12:32:09.774625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.frf""",12,12,"""2025-09-04T10:47:28.739583""","""2023-08-25T12:32:09.774625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.fsf""",15474,15474,"""2025-09-04T10:47:28.731583""","""2023-08-25T12:32:09.717625"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""same""","""ses-baselineYear1Arm1.mat""",36202,36202,"""2025-09-04T10:47:28.743583""","""2023-08-25T12:32:09.774625"""
…,…,…,…,…,…,…,…
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-2YearFollowUpYArm1/anat/su…",67109216,67109216,"""2025-09-18T12:07:51.549217""","""2022-08-01T18:43:32"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-4YearFollowUpYArm1/anat/su…",101,101,"""2025-09-18T12:07:52.375216""","""1969-12-31T18:00:00"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-4YearFollowUpYArm1/anat/su…",67109216,67109216,"""2025-09-18T12:07:52.369217""","""2022-08-01T18:44:55"""
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVZZ…","""same""","""ses-baselineYear1Arm1/anat/sub…",101,101,"""2025-09-18T12:07:53.213217""","""1969-12-31T18:00:00"""


In [4]:
files_df.filter(pl.col.status != "same")

left_root,right_root,status,path,size_left,size_right,mtime_left,mtime_right
str,str,str,str,i64,i64,str,str
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""only_in_left""","""ses-baselineYear1Arm1/func/sub…",0,,"""2025-09-04T10:42:15.640575""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV2Z…","""only_in_left""","""sub-NDARINV2ZA2LC3N/ses-2YearF…",84,,"""2025-10-21T12:06:31.028025""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV2Z…","""only_in_left""","""sub-NDARINV2ZA2LC3N/ses-2YearF…",67109216,,"""2025-10-21T12:06:31.020025""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV2Z…","""only_in_left""","""sub-NDARINV2ZA2LC3N/ses-2YearF…",377,,"""2025-10-21T12:06:18.931025""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV2Z…","""only_in_left""","""sub-NDARINV2ZA2LC3N/ses-2YearF…",865080352,,"""2025-10-21T12:06:18.870025""",
…,…,…,…,…,…,…,…
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""only_in_left""","""sub-NDARINVV7F3YE27/ses-2YearF…",4630,,"""2025-10-21T12:12:49.834034""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""only_in_left""","""sub-NDARINVV7F3YE27/ses-2YearF…",21652,,"""2025-10-21T12:12:49.836034""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""only_in_left""","""sub-NDARINVV7F3YE27/ses-baseli…",84,,"""2025-10-21T12:12:51.668034""",
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""only_in_left""","""sub-NDARINVV7F3YE27/ses-baseli…",67109216,,"""2025-10-21T12:12:51.636034""",


In [5]:
dirs_df

left_root,right_root,dir_path,files_left,files_right,size_left,size_right
str,str,str,i64,i64,i64,i64
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…",""".""",3469,3468,3373525068,3373525068
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1""",3459,3458,3371905439,3371905439
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func""",3446,3445,3293933186,3293933186
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func/sub…",2604,2603,1069170677,1069170677
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINV00…","""ses-baselineYear1Arm1/func/sub…",1801,1800,576506206,576506206
…,…,…,…,…,…,…
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",10,0,1797323179,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",2,0,67109300,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-2YearF…",8,0,1730213879,0
"""/mnt/WarrenNAS/Data/SST/sub-ND…","""/mnt/storage/SST/sub-NDARINVV7…","""sub-NDARINVV7F3YE27/ses-baseli…",2,0,67109300,0


import os
import shutil
from pathlib import Path

def copy_dir_merge(src: Path, dst: Path):
    """
    Merge-copy contents of src into dst, preserving metadata where possible.
    Creates dst if needed. Handles files, directories and symlinks.
    """
    src = Path(src)
    dst = Path(dst)

    if not src.exists():
        print(f"SKIP: source does not exist: {src}")
        return

    # If source is a file, copy it (into dst if dst is a directory, else to dst path)
    if src.is_file():
        if dst.exists() and dst.is_dir():
            target = dst / src.name
        else:
            target = dst
            target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, target)
        print(f"Copied file {src} -> {target}")
        return

    # Source is a directory: ensure destination directory exists
    dst.mkdir(parents=True, exist_ok=True)

    copied = 0
    for p in src.rglob('*'):
        try:
            rel = p.relative_to(src)
        except Exception:
            # shouldn't happen, but skip if it does
            continue
        target = dst.joinpath(rel)

        try:
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_symlink():
                # replicate symlink (overwrite if exists)
                try:
                    if target.exists() or target.is_symlink():
                        target.unlink()
                except Exception:
                    pass
                linkto = os.readlink(p)
                os.symlink(linkto, target)
                copied += 1
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(p, target)
                copied += 1
        except Exception as e:
            print(f"ERROR copying {p} -> {target}: {e}")

    print(f"Copied {copied} files/links from {src} -> {dst}")

# Build set of unique (left_root, right_root) pairs from dirs_df
pairs = {
    (d['left_root'], d['right_root'])
    for d in dirs_df.select(['left_root', 'right_root']).unique().to_dicts()
}

# Copy each right_root -> left_root
for left_root, right_root in sorted(pairs):
    src = Path(right_root)
    dst = Path(left_root)

    print("=" * 60)
    print(f"Copying from right_root: {src}")
    print(f"         to left_root:  {dst}")

    # If the pair points to the same path, skip
    if src.resolve() == dst.resolve():
        print("SKIP: source and destination are the same (resolved).")
        continue

    copy_dir_merge(src, dst)

# Check how many participants don't have func

In [73]:
#nii_server1 = "/mnt/WarrenNAS/Data/SST/*/*/*/*.nii"
nii_server1 = "/mnt/storage/SST/*/*/*/*.nii"

import glob as glob

In [74]:
nii_paths = glob.glob(nii_server1)

In [76]:
len(nii_paths)

45727

In [79]:
sub_parts = nii_paths[0].split("/")
sub_parts

['',
 'mnt',
 'storage',
 'SST',
 'sub-NDARINVTFRWPXDT',
 'ses-baselineYear1Arm1',
 'func',
 'sub-NDARINVTFRWPXDT_ses-baselineYear1Arm1_task-sst_run-01_bold.nii']

In [80]:
offset = 1
part_id = sub_parts[5-offset]
year = sub_parts[6-offset]
type = sub_parts[7-offset]
nii_file = sub_parts[8-offset]

In [25]:
import traceback

In [82]:
df_nii = []

for nii_path in nii_paths:

    sub_parts = nii_path.split("/")
    try:
        df_nii.append(
            dict(
                part_id = sub_parts[5-offset].replace('sub-', ""),
                year = sub_parts[6-offset].replace("ses-", ""),
                type = sub_parts[7-offset],
                nii_file = sub_parts[8-offset],
            )
        )
    except Exception as e:
        print("failed for ", sub_parts)
        traceback.print_exc()
        break

df_nii = pl.from_dicts(df_nii)
df_nii

part_id,year,type,nii_file
str,str,str,str
"""NDARINVTFRWPXDT""","""baselineYear1Arm1""","""func""","""sub-NDARINVTFRWPXDT_ses-baseli…"
"""NDARINVTFRWPXDT""","""baselineYear1Arm1""","""func""","""sub-NDARINVTFRWPXDT_ses-baseli…"
"""NDARINVTFRWPXDT""","""baselineYear1Arm1""","""anat""","""sub-NDARINVTFRWPXDT_ses-baseli…"
"""NDARINVTFRWPXDT""","""2YearFollowUpYArm1""","""func""","""sub-NDARINVTFRWPXDT_ses-2YearF…"
"""NDARINVTFRWPXDT""","""2YearFollowUpYArm1""","""func""","""sub-NDARINVTFRWPXDT_ses-2YearF…"
…,…,…,…
"""NDARINV67TUZJU7""","""2YearFollowUpYArm1""","""anat""","""sub-NDARINV67TUZJU7_ses-2YearF…"
"""NDARINVPJ2N86U5""","""baselineYear1Arm1""","""anat""","""sub-NDARINVPJ2N86U5_ses-baseli…"
"""NDARINVPJ2N86U5""","""2YearFollowUpYArm1""","""anat""","""sub-NDARINVPJ2N86U5_ses-2YearF…"
"""NDARINVJ3DFH4DA""","""baselineYear1Arm1""","""anat""","""sub-NDARINVJ3DFH4DA_ses-baseli…"


## List of years per participant

df_nii_counts = df_nii.group_by(["part_id"]).n_unique()
df_nii_counts

In [83]:
df_nii_counts = df_nii.pivot(on=["type", "year"], index=["part_id"], values="nii_file", aggregate_function= "len" )#pl.element().str.concat(delimiter=","))
df_nii_counts

part_id,"{""func"",""baselineYear1Arm1""}","{""anat"",""baselineYear1Arm1""}","{""func"",""2YearFollowUpYArm1""}","{""anat"",""2YearFollowUpYArm1""}","{""anat"",""4YearFollowUpYArm1""}","{""func"",""4YearFollowUpYArm1""}"
str,u32,u32,u32,u32,u32,u32
"""NDARINVTFRWPXDT""",2,1,2,1,,
"""NDARINVKHL7D46V""",,1,2,1,,
"""NDARINVEHVK33X7""",2,1,,1,,
"""NDARINVATE29B31""",2,1,2,1,1,
"""NDARINVVA82JDEJ""",,,2,,,
…,…,…,…,…,…,…
"""NDARINVGVC01P7F""",2,1,,1,,
"""NDARINVH2LEDF77""",2,,2,,,
"""NDARINV67TUZJU7""",,1,2,1,,
"""NDARINVPJ2N86U5""",,1,,1,,


In [84]:
df_nii_counts.columns

['part_id',
 '{"func","baselineYear1Arm1"}',
 '{"anat","baselineYear1Arm1"}',
 '{"func","2YearFollowUpYArm1"}',
 '{"anat","2YearFollowUpYArm1"}',
 '{"anat","4YearFollowUpYArm1"}',
 '{"func","4YearFollowUpYArm1"}']

In [88]:
df_nii_counts.select(pl.col("part_id",'{"func","baselineYear1Arm1"}')).filter(pl.col('{"func","baselineYear1Arm1"}').is_null()) #.filter(pl.all().is_not_null)

part_id,"{""func"",""baselineYear1Arm1""}"
str,u32
"""NDARINVKHL7D46V""",
"""NDARINVVA82JDEJ""",
"""NDARINVT6F8N4WR""",
"""NDARINVY04AT16M""",
"""NDARINV0V139J8T""",
…,…
"""NDARINV44A0LV53""",
"""NDARINVRCXPE6ZR""",
"""NDARINV67TUZJU7""",
"""NDARINVPJ2N86U5""",


In [None]:
df_nii_counts.select(pl.col("part_id",'{"func","baselineYear1Arm1"}', '{"anat","baselineYear1Arm1"}')).filter(pl.col('{"anat","baselineYear1Arm1"}').is_null())

part_id,"{""anat"",""baselineYear1Arm1""}"
str,u32
"""NDARINVVA82JDEJ""",
"""NDARINVYM9J1XYN""",
"""NDARINVKUJ7WV09""",
"""NDARINV5HGRW46V""",
"""NDARINVNDYK09VH""",
…,…
"""NDARINV4UFLY3Z0""",
"""NDARINVXKYUY7PR""",
"""NDARINVDHRT3GB2""",
"""NDARINV3N0KGZ5G""",


In [97]:
df_mri = pl.read_csv("/mnt/storage/processed_mri/file_details_with_mri.csv")
df_mri_filtered = df_mri.filter(pl.col.has_baseline_b01_b02_T01 == 'No', pl.col.eventname == 'baselineYear1Arm1')
df_mri_filtered

src_subject_id,eventname,01_T1w.nii,01_bold.nii,02_bold.nii,has_baseline_b01_b02_T01,mri_info_visitid,mri_info_manufacturer,mri_info_manufacturersmn,mri_info_deviceserialnumber,mri_info_magneticfieldstrength,mri_info_softwareversion,mri_info_studydate,imgincl_t1w_include,imgincl_t2w_include,imgincl_dmri_include,imgincl_rsfmri_include,imgincl_mid_include,imgincl_nback_include,imgincl_sst_include,use_for_analysis
str,str,str,str,str,str,str,str,str,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str
"""NDARINV005V6D2C""","""baselineYear1Arm1""","""sub-NDARINV005V6D2C_ses-baseli…",,,"""No""","""G031_INV005V6D2C_baseline""","""GE MEDICAL SYSTEMS""","""DISCOVERY MR750""","""HASHe3ce02d3""",3.0,"""27\LX\MR Software release:DV26…",2.0180422e7,1.0,1.0,1.0,0.0,1.0,0.0,0.0,"""No"""
"""NDARINV00LJVZK2""","""baselineYear1Arm1""","""sub-NDARINV00LJVZK2_ses-baseli…",,,"""No""","""S076_INV00LJVZK2_baseline""","""SIEMENS""","""Prisma""","""HASH03db707f""",3.0,"""syngo MR E11""",2.0170819e7,1.0,1.0,1.0,0.0,1.0,0.0,0.0,"""No"""
"""NDARINV00NPMHND""","""baselineYear1Arm1""","""sub-NDARINV00NPMHND_ses-baseli…",,,"""No""","""P043_INV00NPMHND_baseline""","""Philips Medical Systems""","""Achieva dStream""","""HASHdb2589d4""",3.0,"""5.3.0\5.3.0.3""",2.0170404e7,1.0,1.0,1.0,1.0,0.0,0.0,0.0,"""No"""
"""NDARINV00U4FTRU""","""baselineYear1Arm1""","""sub-NDARINV00U4FTRU_ses-baseli…","""sub-NDARINV00U4FTRU_ses-baseli…",,"""No""","""G032_INV00U4FTRU_baseline""","""GE MEDICAL SYSTEMS""","""DISCOVERY MR750""","""HASHfeb7e81a""",3.0,"""25\LX\MR Software release:DV25…",2.0180519e7,1.0,1.0,0.0,1.0,1.0,1.0,1.0,"""No"""
"""NDARINV010ZM3H9""","""baselineYear1Arm1""","""sub-NDARINV010ZM3H9_ses-baseli…",,,"""No""","""S012_INV010ZM3H9_baseline""","""SIEMENS""","""Prisma_fit""","""HASHe4f6957a""",3.0,"""syngo MR E11""",2.0180505e7,1.0,1.0,1.0,0.0,1.0,1.0,0.0,"""No"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""NDARINVZZFG6J5U""","""baselineYear1Arm1""","""sub-NDARINVZZFG6J5U_ses-baseli…","""sub-NDARINVZZFG6J5U_ses-baseli…",,"""No""","""G032_INVZZFG6J5U_baseline""","""GE MEDICAL SYSTEMS""","""DISCOVERY MR750""","""HASH4b0b8b05""",3.0,"""25\LX\MR Software release:DV25…",2.0180421e7,1.0,1.0,1.0,1.0,0.0,0.0,1.0,"""No"""
"""NDARINVZZNX6W2P""","""baselineYear1Arm1""","""sub-NDARINVZZNX6W2P_ses-baseli…",,,"""No""","""S020_INVZZNX6W2P_baseline""","""SIEMENS""","""Prisma""","""HASH11ad4ed5""",3.0,"""syngo MR E11""",2.0170313e7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"""No"""
"""NDARINVZZZ2ALR6""","""baselineYear1Arm1""","""sub-NDARINVZZZ2ALR6_ses-baseli…",,,"""No""","""G010_INVZZZ2ALR6_baseline""","""GE MEDICAL SYSTEMS""","""DISCOVERY MR750""","""HASH5b2fcf80""",3.0,"""25\LX\MR Software release:DV25…",2.0170615e7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"""No"""
"""NDARINVZZZNB0XC""","""baselineYear1Arm1""","""sub-NDARINVZZZNB0XC_ses-baseli…",,,"""No""","""S011_INVZZZNB0XC_baseline""","""SIEMENS""","""Prisma""","""HASH5b0cf1bb""",3.0,"""syngo MR E11""",2.0170103e7,1.0,1.0,1.0,1.0,1.0,0.0,1.0,"""No"""


In [98]:
df_mri_filtered.filter(pl.col.has_baseline_b01_b02_T01 == 'No').select(pl.col.mri_info_manufacturer.value_counts()).unnest("mri_info_manufacturer")

mri_info_manufacturer,count
str,u32
,1
"""Philips Medical Systems""",505
"""SIEMENS""",2738
"""GE MEDICAL SYSTEMS""",1638


## Anat and Func for each participant

## Func Runs for each participant