In [1]:
# Load libraries
import os, xarray
import pandas as pd
import numpy as np

In [2]:
# Base project directory and the preprocessing folder beneath it.
main_dir = '/raid/projects/Emin/AOMIC/'
data_dir = os.path.join(main_dir, 'preprocessing')

# Dataset and task to organize.
dataset = 'ID1000'
task = 'moviewatching'

# Atlases (brain parcellations) considered in this notebook.
atlases = ['Shen_268']

In [4]:
def save_conn_mat(conn_mat, output_dir, sub_name):
    """Save one connectivity matrix as ``<output_dir>/<sub_name>.csv``.

    The matrix is written without a header row and without the index, so the
    file contains only the matrix values.

    Parameters
    ----------
    conn_mat : pandas.DataFrame
        Connectivity matrix to write.
    output_dir : str
        Destination directory; created (including parents) when missing.
    sub_name : str
        Subject identifier, used as the file stem.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and makes repeated calls safe.
    os.makedirs(output_dir, exist_ok=True)
    conn_mat.to_csv(os.path.join(output_dir, f'{sub_name}.csv'), header=None, index=None)

def remove_nans(data):
    """Drop the rows of ``data`` that contain missing values.

    Parameters
    ----------
    data : array-like, pandas.Series or pandas.DataFrame
        Input values; converted to a DataFrame before filtering.

    Returns
    -------
    data : pandas.DataFrame
        The rows that contain no NaN.
    nan_mask : numpy.ndarray of bool
        One flag per input row, True where the row held at least one NaN
        (i.e. where the row was dropped).
    """
    data = pd.DataFrame(data)
    # Row-wise reduction keeps the mask at one entry per row, so inputs with
    # several columns work too; for a single column this equals the raw
    # isna() flags.
    nan_mask = data.isna().any(axis=1).to_numpy()
    data = data[~nan_mask]
    return data, nan_mask

def replace_outlier(data, cutoff=1.5):
    """Replace outliers with NaN using an interquartile-range fence.

    A value is an outlier when it lies below ``Q1 - cutoff * IQR`` or above
    ``Q3 + cutoff * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : array-like or pandas.Series
        Values to screen. The input object is not modified.
    cutoff : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    data : pandas.Series
        New Series with outliers set to NaN.
    outlier_map : pandas.Series of bool
        True at the positions that were flagged as outliers.
    """
    data = pd.Series(data)
    q25, q75 = data.quantile(0.25), data.quantile(0.75)
    iqr = q75 - q25
    lower, upper = q25 - iqr * cutoff, q75 + iqr * cutoff
    outlier_map = (data < lower) | (data > upper)
    # mask() returns a new Series (the caller's data is left untouched) and
    # upcasts integer input as needed to hold NaN.
    return data.mask(outlier_map), outlier_map

def organize_save_dataset(main_dir, atlas, dataset, task, remove_outlier=False, target=None, cutoff=1.5, fd=0.5):
    """Organize a dataset: filter subjects, then save connectomes and behavioral data.

    Subjects are excluded when
    * their ``target`` value is missing (only when ``target`` is given),
    * their ``target`` value is an IQR outlier (only when ``remove_outlier``), or
    * their frame-wise displacement for this dataset/task exceeds ``fd``.

    Reads
    -----
    ``<main_dir>/preprocessing/<atlas>/<dataset>/conn_mat_<task>.nc``
    ``<main_dir>/data/<dataset>_participants.csv``
    ``<main_dir>/data/subject_motion.csv``
    ``<main_dir>/data/subject_TIV.csv``

    Writes (under ``<main_dir>/data_organization/<atlas>/<dataset>``)
    ------
    ``conn_mat/<ID>.csv`` for every retained subject, and
    ``behavioral_data.csv`` with one row per retained subject.

    Parameters
    ----------
    main_dir : str
        Project directory containing the input folders listed above.
    atlas : str
        Atlas name used in the input/output paths.
    dataset : str
        Dataset name; also used to filter the motion and TIV tables.
    task : str
        Task name; selects the connectome file and the motion rows.
    remove_outlier : bool, default False
        Exclude subjects whose ``target`` is an IQR outlier.
    target : str or None, default None
        Behavioral column used for NaN/outlier filtering. When None, no
        target-based filtering is applied.
    cutoff : float, default 1.5
        IQR multiplier passed to ``replace_outlier``.
    fd : float, default 0.5
        Subjects with FD above this value are excluded.

    Raises
    ------
    ValueError
        If ``remove_outlier`` is True but no ``target`` is given.
    """
    # Validate before any I/O; previously the target column was indexed
    # before this check, so a missing target surfaced as a KeyError.
    if remove_outlier and target is None:
        raise ValueError('Target variable must be provided!')

    # Define output directory.
    output_dir = os.path.join(main_dir, 'data_organization', atlas, dataset)

    # Load connectome dataset.
    data = xarray.load_dataarray(os.path.join(main_dir, 'preprocessing', atlas, dataset, f'conn_mat_{task}.nc'))
    ID_array = np.array(data.ID)

    # Load behavioral data and recode sex numerically.
    beh_data = pd.read_csv(
        os.path.join(main_dir, 'data', f'{dataset}_participants.csv')).rename({'participant_id':'ID'}, axis=1)
    beh_data['sex'] = beh_data['sex'].replace({'female':1, 'male':0, 'F':1, 'M':0})

    # IDs to drop; a set gives constant-time lookups and avoids concatenating
    # arrays of possibly different dtypes.
    excluded = set()

    if target is not None:
        # Subjects with a missing target value.
        _, nan_mask = remove_nans(beh_data[target])
        excluded.update(beh_data.loc[nan_mask, 'ID'].tolist())

        # Subjects with an outlying target value, if desired.
        if remove_outlier:
            beh_data[target], outlier_map = replace_outlier(beh_data[target], cutoff=cutoff)
            excluded.update(beh_data.loc[outlier_map, 'ID'].tolist())
            beh_data = beh_data[~outlier_map].reset_index(drop=True)

    # Subjects with excessive head motion for this dataset and task.
    sub_motion = pd.read_csv(os.path.join(main_dir, 'data', 'subject_motion.csv'))
    high_motion = (
        (sub_motion['dataset'] == dataset)
        & (sub_motion['task'] == task)
        & (sub_motion['FD'] > fd)
    )
    excluded.update(sub_motion.loc[high_motion, 'ID'].tolist())

    # Total intracranial volume rows for this dataset.
    sub_TIV = pd.read_csv(os.path.join(main_dir, 'data', 'subject_TIV.csv'))
    sub_TIV = sub_TIV[sub_TIV['dataset'] == dataset]

    # Save the connectivity matrix of every retained subject into a .csv file.
    conn_dir = os.path.join(output_dir, 'conn_mat')
    clean_ID_array = []
    for ID in ID_array:
        if ID in excluded:
            continue
        save_conn_mat(pd.DataFrame(np.array(data.loc[ID])), conn_dir, ID)
        clean_ID_array.append(ID)

    # Keep behavioral rows only for retained subjects and attach TIV.
    # Building the frame from a dict guarantees an 'ID' column even when no
    # subject was retained.
    # NOTE(review): the right merge keeps retained connectome subjects that
    # have no behavioral row (their behavioral columns become NaN) — confirm
    # this is intended.
    beh_data = beh_data.merge(pd.DataFrame({'ID': clean_ID_array}), on='ID', how='right')
    beh_data = beh_data.merge(sub_TIV, on='ID', how='left')

    # The output directory may not exist yet if no matrix was saved.
    os.makedirs(output_dir, exist_ok=True)
    beh_data.to_csv(os.path.join(output_dir, 'behavioral_data.csv'), index=None)

In [5]:
# Build the final dataset:
#   - drop subjects with outlier target values (very liberal threshold, IQR*2.5)
#   - drop subjects with excessive head motion (FD > 0.5)
organize_save_dataset(
    main_dir,
    atlas='Shen_268',
    dataset=dataset,
    task=task,
    remove_outlier=True,
    target='IST_fluid',
    cutoff=2.5,
    fd=0.5,
)