In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
participant = pd.read_csv("participants.tsv", sep="\t")
participant.head()

Unnamed: 0,participant_id,sex,age
0,sub-000103,M,24.45
1,sub-000148,F,25.01
2,sub-000149,F,44.32
3,sub-000159,F,27.34
4,sub-000175,M,32.6


In [36]:
scores = pd.read_csv("scores.tsv", sep="\t")
scores.head()

Unnamed: 0,bids_name,score
0,sub-000103_acq-standard_T1w,1
1,sub-000103_acq-headmotion1_T1w,2
2,sub-000103_acq-headmotion2_T1w,3
3,sub-000148_acq-standard_T1w,1
4,sub-000148_acq-headmotion1_T1w,3


Here we perform a 60-20-20 split by **participant** to ensure no anatomical data leakage

In [30]:
X_train, X_test = train_test_split(
    participant["participant_id"], test_size=0.2, random_state=1
)
X_train, X_val = train_test_split(X_train, test_size=0.25, random_state=1)
len(X_train), len(X_val), len(X_test)

(88, 30, 30)

We now need to retrieve every file and associated scores corresponding to our participants

In [37]:
def extract_sub(bids_name):
    # sub is the first 10 characters of the bids name
    return bids_name[:10]


def extract_num(bids_name):
    return bids_name[4:10]


scores["participant_id"] = scores["bids_name"].apply(extract_sub)
scores["participant_num"] = scores["bids_name"].apply(extract_num)

scores["data"] = (
    "MRIQC/sub-MR-ART-"
    + scores["participant_num"]
    + "/"
    + scores["bids_name"]
    + "/mri/reg_extracted_orig_nu.nii.gz"
)
scores["label"] = scores["score"] - 1
scores.head()

Unnamed: 0,bids_name,score,participant_id,participant_num,data,label
0,sub-000103_acq-standard_T1w,1,sub-000103,103,MRIQC/sub-MR-ART-000103/sub-000103_acq-standar...,0
1,sub-000103_acq-headmotion1_T1w,2,sub-000103,103,MRIQC/sub-MR-ART-000103/sub-000103_acq-headmot...,1
2,sub-000103_acq-headmotion2_T1w,3,sub-000103,103,MRIQC/sub-MR-ART-000103/sub-000103_acq-headmot...,2
3,sub-000148_acq-standard_T1w,1,sub-000148,148,MRIQC/sub-MR-ART-000148/sub-000148_acq-standar...,0
4,sub-000148_acq-headmotion1_T1w,3,sub-000148,148,MRIQC/sub-MR-ART-000148/sub-000148_acq-headmot...,2


In [38]:
train_files = scores.loc[scores["participant_id"].isin(X_train), ["data", "label"]]
val_files = scores.loc[scores["participant_id"].isin(X_val), ["data", "label"]]
test_files = scores.loc[scores["participant_id"].isin(X_test), ["data", "label"]]
len(train_files), len(val_files), len(test_files)

(258, 89, 89)

In [39]:
train_files.to_csv("train.csv")
val_files.to_csv("val.csv")
test_files.to_csv("test.csv")

In [40]:
train_files.to_dict("records")

[{'data': 'MRIQC/sub-MR-ART-000159/sub-000159_acq-standard_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 0},
 {'data': 'MRIQC/sub-MR-ART-000159/sub-000159_acq-headmotion1_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 2},
 {'data': 'MRIQC/sub-MR-ART-000159/sub-000159_acq-headmotion2_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 2},
 {'data': 'MRIQC/sub-MR-ART-000175/sub-000175_acq-standard_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 0},
 {'data': 'MRIQC/sub-MR-ART-000175/sub-000175_acq-headmotion1_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 1},
 {'data': 'MRIQC/sub-MR-ART-000175/sub-000175_acq-headmotion2_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 2},
 {'data': 'MRIQC/sub-MR-ART-009673/sub-009673_acq-standard_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 0},
 {'data': 'MRIQC/sub-MR-ART-009673/sub-009673_acq-headmotion1_T1w/mri/reg_extracted_orig_nu.nii.gz',
  'label': 1},
 {'data': 'MRIQC/sub-MR-ART-009673/sub-009673_acq-headmotion2_T1w/mri/reg_extract

# Get data for HCP dataset

In [8]:
participant = pd.read_csv("mriqc.csv", index_col=0)
participant = participant.drop(columns="index")
participant.head()

Unnamed: 0,bids_name,qc_issue,score,dataset
0,sub-BI02450/ses-202306231,True,2.0,AMPSCZ
1,sub-BI02450/ses-202304111,False,3.0,AMPSCZ
2,sub-BI05529/ses-202306051,False,4.0,AMPSCZ
3,sub-BI05652/ses-202304171,False,3.0,AMPSCZ
4,sub-BI05874/ses-202305181,False,3.0,AMPSCZ


In [11]:
hcp = participant[participant.dataset == "HCP-YA-1200"]
hcp.head()

Unnamed: 0,bids_name,qc_issue,score,dataset
1197,100206,False,-1.0,HCP-YA-1200
1198,100307,False,-1.0,HCP-YA-1200
1199,100408,False,-1.0,HCP-YA-1200
1200,100610,False,-1.0,HCP-YA-1200
1201,101006,False,-1.0,HCP-YA-1200


In [16]:
hcp["data"] = (
    "MRIQC/sub-HCP-YA-1200-"
    + hcp["bids_name"]
    + "/"
    + hcp["bids_name"]
    + "_3T_T1w_MPR1/mri/reg_extracted_orig_nu.nii.gz"
)
hcp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hcp["data"] = (


Unnamed: 0,bids_name,qc_issue,score,dataset,data
1197,100206,False,-1.0,HCP-YA-1200,MRIQC/sub-HCP-YA-1200-100206/100206_3T_T1w_MPR...
1198,100307,False,-1.0,HCP-YA-1200,MRIQC/sub-HCP-YA-1200-100307/100307_3T_T1w_MPR...
1199,100408,False,-1.0,HCP-YA-1200,MRIQC/sub-HCP-YA-1200-100408/100408_3T_T1w_MPR...
1200,100610,False,-1.0,HCP-YA-1200,MRIQC/sub-HCP-YA-1200-100610/100610_3T_T1w_MPR...
1201,101006,False,-1.0,HCP-YA-1200,MRIQC/sub-HCP-YA-1200-101006/101006_3T_T1w_MPR...


In [18]:
hcp[["data"]].to_csv("hcp.csv")

## Converting Old MRIQC split to new pipeline

In [8]:
import re

In [59]:
train = pd.read_csv("../src/dataset/train.csv", index_col=0)
test = pd.read_csv("../src/dataset/test.csv", index_col=0)
val = pd.read_csv("../src/dataset/val.csv", index_col=0)

train.head()

Unnamed: 0,data,label
9,MRIQC/sub-MR-ART-000159/sub-000159_acq-standar...,0
10,MRIQC/sub-MR-ART-000159/sub-000159_acq-headmot...,2
11,MRIQC/sub-MR-ART-000159/sub-000159_acq-headmot...,2
12,MRIQC/sub-MR-ART-000175/sub-000175_acq-standar...,0
13,MRIQC/sub-MR-ART-000175/sub-000175_acq-headmot...,1


In [60]:
def convert_old_to_new(old_path):
    old_path = old_path.replace("MRIQC/sub-MR-ART-", "MRART-Preproc/subjects/sub-")
    ses = ""
    sub = re.search("\/(sub-[^_\/]*)_acq", old_path)
    end = "space-MNI152NLin2009cSym_desc-Crop_res-1x1x1_T1w.nii.gz"
    if "standard" in old_path:
        ses = "ses-standard"
    elif "headmotion1" in old_path:
        ses = "ses-headmotion1"
    elif "headmotion2" in old_path:
        ses = "ses-headmotion2"

    keep = old_path[: old_path.find(sub.group(1)) + len(sub.group(1))]
    return f"{keep}/{ses}/t1_linear/{sub.group(1)}_{ses}_{end}"

In [61]:
for ds in (train, val, test):
    ds["data"] = ds["data"].apply(convert_old_to_new)
train.head()

Unnamed: 0,data,label
9,MRART-Preproc/subjects/sub-000159/ses-standard...,0
10,MRART-Preproc/subjects/sub-000159/ses-headmoti...,2
11,MRART-Preproc/subjects/sub-000159/ses-headmoti...,2
12,MRART-Preproc/subjects/sub-000175/ses-standard...,0
13,MRART-Preproc/subjects/sub-000175/ses-headmoti...,1


In [62]:
train.to_csv("../src/dataset/train_preproc.csv")
test.to_csv("../src/dataset/test_preproc.csv")
val.to_csv("../src/dataset/val_preproc.csv")