In [1]:
from preproc_functions_controls import *

In [94]:
root_path = '/Users/dg519/Documents/normative_paper_github/data'
remote_data_folders='/data_ic3online_cognition'
supervised_data_folders=['/data_healthy_v1','/data_healthy_v2']
folder_structure=['/summary_data','/trial_data','/speech']
output_clean_folder ='/data_healthy_cleaned'
list_of_tasks = ['IC3_NVtrailMaking','IC3_Orientation', 'IC3_PearCancellation', 'IC3_rs_digitSpan', 'IC3_rs_spatialSpan', 'IC3_rs_PAL', 'IC3_rs_SRT', 'IC3_rs_CRT', 'IC3_SemanticJudgment', 'IC3_i4i_IDED', 'IC3_i4i_motorControl','IC3_calculation', 'IC3_GestureRecognition', 'IC3_AuditorySustainedAttention','IC3_BBCrs_blocks', 'IC3_Comprehension','IC3_rs_oddOneOut', 'IC3_TaskRecall']
list_of_questionnaires = ['q_IC3_demographics', 'q_IC3_fatigue','q_IC3_GDS','q_IC3_apathy']
list_of_speech = ['IC3_NamingTest','IC3_Reading', 'IC3_Repetition']
folder_list = ['data_healthy_v1', 'data_healthy_v2', 'data_ic3online_cognition', 'data_healthy_merged']
merged_data_folder ='/data_healthy_combined'
data_format = '.csv'

In [None]:
main_preprocessing(root_path, list_of_tasks, list_of_questionnaires, list_of_speech)

In [60]:
df_demographics = demographics_preproc(root_path, merged_data_folder, questionnaire_name, inclusion_criteria, folder_structure, data_format,clean_file_extension)


In [67]:
import os
import pandas as pd
import numpy as np

def general_outlier_detection_remoteSetting(root_path, remote_data_folders, folder_structure, 
                                              screening_list=None):
    """
    Detect outliers in remote setting data by loading multiple screening questionnaires,
    cleaning and merging them, and then applying exclusion criteria.

    Parameters
    ----------
    root_path : str
        The root directory.
    remote_data_folder : str
        Remote data folder name (e.g. '/data_ic3online_cognition').
    folder_structure : list of str
        List containing folder names; the first element is used to locate summary data.
    screening_list : list of str, optional
        List of questionnaire filenames. Defaults to:
        ['q_IC3_demographicsHealthy_questionnaire.csv',
         'q_IC3_metacog_questionnaire.csv',
         'IC3_Orientation.csv',
         'IC3_PearCancellation.csv'].

    Returns
    -------
    pandas.Series
        A Series of user IDs that pass all exclusion criteria.
    """
    
    if screening_list is None:
        screening_list = [
            'q_IC3_demographicsHealthy_questionnaire.csv',
            'q_IC3_metacog_questionnaire.csv',
            'IC3_Orientation.csv',
            'IC3_PearCancellation.csv'
        ]
        
    # Build the base path for the remote data
    base_path = os.path.join(root_path, remote_data_folders.lstrip('/'), folder_structure[0].lstrip('/'))
    
    # Load screening questionnaires from file paths
    df_dem    = pd.read_csv(os.path.join(base_path, screening_list[0]), low_memory=False)
    df_cheat  = pd.read_csv(os.path.join(base_path, screening_list[1]), low_memory=False)
    df_orient = pd.read_csv(os.path.join(base_path, screening_list[2]), low_memory=False)
    df_pear   = pd.read_csv(os.path.join(base_path, screening_list[3]), low_memory=False)
    
    # Define a helper to remove duplicate and missing user IDs
    def clean_df(df):
        return df.drop_duplicates(subset=['user_id'], keep="first").dropna(subset=['user_id']).reset_index(drop=True)
    
    df_dem    = clean_df(df_dem)
    df_cheat  = clean_df(df_cheat)
    df_orient = clean_df(df_orient)
    df_pear   = clean_df(df_pear)
        
    # Keep only users present in both demographics and Pear Cancellation data
    common_ids = set(df_dem['user_id']) ^ set(df_pear['user_id'])

    for df in (df_dem, df_orient, df_pear, df_cheat):
        df.drop(df[df['user_id'].isin(common_ids)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)
        
    # Clean the questionnaire data
    df_dem['Q30_R'] = df_dem['Q30_R'].replace("No", "SKIPPED").replace("SKIPPED", 999999).astype(float)
    for col in ["Q2_S", "Q3_S"]:
        df_cheat[col].replace([0, 1], np.nan, inplace=True)
    df_cheat.dropna(subset=["Q2_S", "Q3_S"], inplace=True)
    df_cheat.reset_index(drop=True, inplace=True)
        
    # Remove users who fail screening tests based on SummaryScore criteria
    # Vectorized approach: identify user_ids that meet both failure conditions.
    fail_orient = set(df_orient.loc[df_orient['SummaryScore'] < 3, 'user_id'])
    fail_pear   = set(df_pear.loc[df_pear['SummaryScore'] <= 0.80, 'user_id'])
    failed_ids  = fail_orient & fail_pear    
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(failed_ids)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)

    print(f'We removed {len(failed_ids)} people who failed both Orientation and Pear Cancellation, from all tasks.')
    
    # Remove users not neurologically healthy (using demographics responses)
    unhealthy_mask = ((df_dem['Q12_R'] != "SKIPPED") | (df_dem['Q14_R'] != "SKIPPED") |
                      (df_dem['Q30_R'] <= 60) | (df_dem['Q1_R'] < 40))
    unhealthy_mask = df_dem.loc[unhealthy_mask,'user_id']
    neuro_removed = ((df_dem['Q12_R'] != "SKIPPED") | (df_dem['Q14_R'] != "SKIPPED")).sum()
    dementia_removed = (df_dem['Q30_R'] <= 60).sum()
    age_removed = (df_dem['Q1_R'] < 40).sum()
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(unhealthy_mask)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)

    print(f'We removed {neuro_removed} who indicated neurological disorder, '
          f'{dementia_removed} with a history of dementia and {age_removed} who are younger than 40.')
    
    # Remove users who self-reported lack of engagement ("cheating")
    cheating_mask = df_pear['user_id'].isin(df_cheat['user_id'])
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(df_cheat['user_id'])].index, inplace=True)
        df.reset_index(drop=True, inplace=True)
    
    print(f'We removed {cheating_mask.sum()} people who cheated.')
    
    # Return the final cleaned user IDs from df_pear as a Series
    cleaned_ids_remote = df_pear['user_id']
    
    return cleaned_ids_remote

In [68]:
ids2 = general_outlier_detection_remoteSetting(root_path, remote_data_folders, folder_structure)
print(len(ids2))

We removed 6 people who failed both Orientation and Pear Cancellation, from all tasks.
We removed 407 who indicated neurological disorder, 91 with a history of dementia and 92 who are younger than 40.
We removed 11 people who cheated.
6374


In [74]:
def general_outlier_detection_supervisedSetting(root_path, supervised_data_folders, folder_structure, screening_list = ['q_IC3_demographics_questionnaire.csv', 'IC3_Orientation.csv', 'IC3_PearCancellation.csv']): 

    # Read the data for v1

    os.chdir(root_path + supervised_data_folders[0] + folder_structure[0])

    df_dem_tp1 = pd.read_csv(screening_list[0], low_memory=False)
    df_orient_tp1 = pd.read_csv(screening_list[1], low_memory=False)
    df_pear_tp1 = pd.read_csv(screening_list[2], low_memory=False)

    # Read the data for v2

    os.chdir(root_path + supervised_data_folders[1] + folder_structure[0])

    df_dem_tp2 = pd.read_csv(screening_list[0], low_memory=False)
    df_orient_tp2 = pd.read_csv(screening_list[1], low_memory=False)
    df_pear_tp2 = pd.read_csv(screening_list[2], low_memory=False)

    # Concatenate the two timepoints

    df_dem = pd.concat([df_dem_tp1, df_dem_tp2], ignore_index=True)
    df_orient = pd.concat([df_orient_tp1, df_orient_tp2], ignore_index=True)
    df_pear = pd.concat([df_pear_tp1, df_pear_tp2], ignore_index=True)

    # Remove duplicates, NAs and reset index

    df_dem.drop_duplicates(subset=['user_id'],keep="first", inplace=True)
    df_dem.dropna(subset=['user_id'], inplace=True)
    df_dem.reset_index(drop=True, inplace=True)

    df_orient.drop_duplicates(subset=['user_id'],keep="first", inplace=True)
    df_orient.dropna(subset=['user_id'], inplace=True)
    df_orient.reset_index(drop=True, inplace=True)

    df_pear.drop_duplicates(subset=['user_id'],keep="first", inplace=True)
    df_pear.dropna(subset=['user_id'], inplace=True)
    df_pear.reset_index(drop=True, inplace=True)

    # Find the user_id who are in both df_pear and df_pear

    cleaned_ids_supervised = list(set(df_dem['user_id']).intersection(set(df_orient['user_id'])).intersection(set(df_pear['user_id'])))

    df_pear = df_pear[df_pear.user_id.isin(cleaned_ids_supervised)]  
    df_orient = df_orient[df_orient.user_id.isin(cleaned_ids_supervised)]  
    df_dem = df_dem[df_dem.user_id.isin(cleaned_ids_supervised)]  

    # Drop the user_id, if they fail the screening test

    ids_failed_screen = []
    for subs in cleaned_ids_supervised:
        if (df_orient[df_orient.user_id == subs].SummaryScore < 3).bool() and (df_pear[df_pear.user_id == subs].SummaryScore <= 0.80).bool():
            
            ids_failed_screen.append(subs)
            df_dem = df_dem.drop(df_dem[df_dem.user_id == subs].index).reset_index(drop=True)
            df_orient = df_orient.drop(df_orient[df_orient.user_id == subs].index).reset_index(drop=True)
            df_pear = df_pear.drop(df_pear[df_pear.user_id == subs].index).reset_index(drop=True)

    print(f'We removed {len(ids_failed_screen)} people who failed both Orientation and Pear Cancellation, from all tasks.')


    cleaned_ids_supervised = df_pear.user_id

    # Remove people who are not neurologically healthy

    to_remove = (df_dem.Q12_R != "SKIPPED") | (df_dem.Q14_R != "SKIPPED") | (df_dem.Q1_R < 40)    
    print(f'We removed {sum((df_dem.Q12_R != "SKIPPED") | (df_dem.Q14_R != "SKIPPED"))} who indicated they have a neurological disorder, and {sum(df_dem.Q1_R < 40)} who are younger than 40.')

    df_dem = df_dem[~to_remove].reset_index(drop=True)
    df_pear = df_pear[~to_remove].reset_index(drop=True)
    df_orient = df_orient[~to_remove].reset_index(drop=True)

    cleaned_ids_supervised = df_pear.user_id

    return cleaned_ids_supervised # Return participant ids that passed exclusion criteria



In [75]:
ids = general_outlier_detection_supervisedSetting(root_path, supervised_data_folders, folder_structure)
print(len(ids))

We removed 17 people who failed both Orientation and Pear Cancellation, from all tasks.
We removed 33 who indicated they have a neurological disorder, and 4 who are younger than 40.
338


  if (df_orient[df_orient.user_id == subs].SummaryScore < 3).bool() and (df_pear[df_pear.user_id == subs].SummaryScore <= 0.80).bool():


In [None]:
import os
import pandas as pd
import numpy as np

def general_outlier_detection_supervisedSetting(root_path, supervised_data_folders, folder_structure, 
                                                  screening_list=None):
    """
    Detect outliers in supervised setting data by loading two timepoints of screening questionnaires,
    cleaning and merging them, and then applying exclusion criteria.

    Parameters
    ----------
    root_path : str
        The root directory.
    supervised_data_folders : list of str
        List containing folder names for the two timepoints (e.g. [folder_v1, folder_v2]).
    folder_structure : list of str
        List containing folder names; the first element is used to locate summary data.
    screening_list : list of str, optional
        List of questionnaire filenames. Defaults to:
        ['q_IC3_demographics_questionnaire.csv', 'IC3_Orientation.csv', 'IC3_PearCancellation.csv'].

    Returns
    -------
    pandas.Series
        A Series of user IDs that pass all exclusion criteria.
    """
    if screening_list is None:
        screening_list = [
            'q_IC3_demographics_questionnaire.csv',
            'IC3_Orientation.csv',
            'IC3_PearCancellation.csv'
        ]
    
    def load_timepoint_data(data_folder):
        """Helper function to load data for a given timepoint."""
        base_path = os.path.join(root_path, data_folder.lstrip('/'), folder_structure[0].lstrip('/'))
        df_dem    = pd.read_csv(os.path.join(base_path, screening_list[0]), low_memory=False)
        df_orient = pd.read_csv(os.path.join(base_path, screening_list[1]), low_memory=False)
        df_pear   = pd.read_csv(os.path.join(base_path, screening_list[2]), low_memory=False)
        return df_dem, df_orient, df_pear
    
    # Load data for both timepoints
    df_dem_tp1, df_orient_tp1, df_pear_tp1 = load_timepoint_data(supervised_data_folders[0])
    df_dem_tp2, df_orient_tp2, df_pear_tp2 = load_timepoint_data(supervised_data_folders[1])
    
    # Concatenate the two timepoints
    df_dem    = pd.concat([df_dem_tp1, df_dem_tp2], ignore_index=True)
    df_orient = pd.concat([df_orient_tp1, df_orient_tp2], ignore_index=True)
    df_pear   = pd.concat([df_pear_tp1, df_pear_tp2], ignore_index=True)
    
    def clean_df(df):
        """Remove duplicates and missing user IDs, then reset index."""
        return df.drop_duplicates(subset=['user_id'], keep='first') \
                 .dropna(subset=['user_id']) \
                 .reset_index(drop=True)
    
    # Clean dataframes
    df_dem    = clean_df(df_dem)
    df_orient = clean_df(df_orient)
    df_pear   = clean_df(df_pear)
    
    # Retain only common user IDs across all dataframes
    common_ids = set(df_dem['user_id']) ^ set(df_pear['user_id'])
    
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(common_ids)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)
    
    # Remove users who fail screening tests based on SummaryScore criteria
    # Vectorized approach: identify user_ids that meet both failure conditions.
    fail_orient = set(df_orient.loc[df_orient['SummaryScore'] < 3, 'user_id'])
    fail_pear   = set(df_pear.loc[df_pear['SummaryScore'] <= 0.80, 'user_id'])
    failed_ids  = fail_orient & fail_pear    
    
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(failed_ids)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)
    
    print(f'We removed {len(failed_ids)} people who failed both Orientation and Pear Cancellation, from all tasks.')

    # Remove users not neurologically healthy (using demographics responses)
    unhealthy_mask = ((df_dem['Q12_R'] != "SKIPPED") | (df_dem['Q14_R'] != "SKIPPED") |
                    (df_dem['Q1_R'] < 40))
    unhealthy_mask = df_dem.loc[unhealthy_mask,'user_id']
    neuro_removed = ((df_dem['Q12_R'] != "SKIPPED") | (df_dem['Q14_R'] != "SKIPPED")).sum()
    age_removed = (df_dem['Q1_R'] < 40).sum()
    
    for df in (df_dem, df_orient, df_pear):
        df.drop(df[df['user_id'].isin(unhealthy_mask)].index, inplace=True)
        df.reset_index(drop=True, inplace=True)

    print(f'We removed {neuro_removed} who indicated neurological disorder, '
          f'{age_removed} who are younger than 40.')
    
    # Return the final cleaned user IDs from df_pear as a Series
    cleaned_ids_supervised = df_pear['user_id']
    
    return cleaned_ids_supervised

In [73]:
ids = general_outlier_detection_supervisedSetting(root_path, supervised_data_folders, folder_structure)
print(len(ids))

407 409 407
392 392 392
We removed 17 people who failed both Orientation and Pear Cancellation, from all tasks.
375 375 375
We removed 33 who indicated neurological disorder, 4 who are younger than 40.
338 338 338
338


In [91]:
def merge_control_data_across_sites(root_path, folder_structure, supervised_data_folders, remote_data_folders, list_of_tasks,list_of_questionnaires,list_of_speech, data_format, merged_data_folder):
        
    os.chdir(root_path)

    # Create folder structure
    
    if not os.path.isdir(merged_data_folder[1:]):
        os.mkdir(merged_data_folder[1:])
    os.chdir(merged_data_folder[1:])

    if not os.path.isdir(folder_structure[0][1:]):
        os.mkdir(folder_structure[0][1:])
        
    if not os.path.isdir(folder_structure[1][1:]):
        os.mkdir(folder_structure[1][1:])
    
    if not os.path.isdir(folder_structure[2][1:]):
        os.mkdir(folder_structure[2][1:])

    # Merge data from clinical tests
    
    if 'IC3_NVtrailMaking' in list_of_tasks:
        list_of_tasks.append('IC3_NVtrailMaking2')
        list_of_tasks.append('IC3_NVtrailMaking3')

    for taskName in list_of_tasks:
        
        summary_task_path = root_path + supervised_data_folders[0] + folder_structure[0]
        raw_task_path = root_path + supervised_data_folders[0] + folder_structure[1]
        
        df_v1 = pd.read_csv(f'{summary_task_path}/{taskName}{data_format}', low_memory=False)
        df_v1_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw{data_format}'), low_memory=False)
        
        summary_task_path = root_path + supervised_data_folders[1] + folder_structure[0]
        raw_task_path = root_path + supervised_data_folders[1] + folder_structure[1]
        
        # Special case for IDED task that has two versions
        
        if taskName == "IC3_i4i_IDED": 
            df_v2 = pd.read_csv((f'{summary_task_path}/{taskName}2{data_format}'), low_memory=False)
            df_v2_raw = pd.read_csv((f'{raw_task_path}/{taskName}2_raw{data_format}'), low_memory=False)
        else:   
            df_v2 = pd.read_csv(f'{summary_task_path}/{taskName}{data_format}', low_memory=False)
            df_v2_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw{data_format}'), low_memory=False)
        
        
        summary_task_path = root_path + remote_data_folders + folder_structure[0]
        raw_task_path = root_path + remote_data_folders + folder_structure[1]

        df_cog = pd.read_csv(f'{summary_task_path}/{taskName}{data_format}', low_memory=False)
        df_cog_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw{data_format}'), low_memory=False)
        
        df = pd.concat([df_v1, df_v2, df_cog], ignore_index=True)
        df_raw = pd.concat([df_v1_raw, df_v2_raw, df_cog_raw], ignore_index=True)
        
        output_folder = root_path + merged_data_folder
        
        df.to_csv(f'{output_folder}/{folder_structure[0]}/{taskName}{data_format}', index=False)
        df_raw.to_csv(f'{output_folder}/{folder_structure[1]}/{taskName}_raw{data_format}', index=False)
        print(f'Merged {taskName}')
        print(len(df))
        
    if 'IC3_NVtrailMaking' in list_of_tasks:
        list_of_tasks = list_of_tasks[:-2]
        
    # Merge data from speech
        
    for taskName in list_of_speech:
        
        raw_speech_path = root_path + supervised_data_folders[0] + folder_structure[1]
        df_v1_raw = pd.read_csv((f'{raw_speech_path}/{taskName}_raw{data_format}'), low_memory=False)
        
        raw_speech_path = root_path + supervised_data_folders[1] + folder_structure[1]
        df_v2_raw = pd.read_csv((f'{raw_speech_path}/{taskName}_raw{data_format}'), low_memory=False)

        df_raw = pd.concat([df_v1_raw, df_v2_raw], ignore_index=True)
        
        output_folder = root_path + merged_data_folder
        df_raw.to_csv(f'{output_folder}/{folder_structure[1]}/{taskName}_raw.csv', index=False)
        print(f'Merged {taskName}')
        print(len(df_raw))
        
    # Merge data from questionnaires

    for taskName in list_of_questionnaires:
        
        if taskName == 'q_IC3_demographics':
            
            summary_task_path = root_path + supervised_data_folders[0] + folder_structure[0]
            raw_task_path = root_path + supervised_data_folders[0] + folder_structure[1]
            
            df_v1 = pd.read_csv((f'{summary_task_path}/{taskName}Healthy_questionnaire.csv'), low_memory=False)
            df_v1_2 = pd.read_csv((f'{summary_task_path}/{taskName}_questionnaire.csv'), low_memory=False)
            df_v1_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw.csv'), low_memory=False)
            df_v1_raw_2 = pd.read_csv((f'{raw_task_path}/{taskName}Healthy_raw.csv'), low_memory=False)

            summary_task_path = root_path + supervised_data_folders[1] + folder_structure[0]
            raw_task_path = root_path + supervised_data_folders[1] + folder_structure[1]
 
            df_v2 = pd.read_csv((f'{summary_task_path}/{taskName}_questionnaire.csv'), low_memory=False)
            df_v2_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw.csv'), low_memory=False)
            
            summary_task_path = root_path + remote_data_folders + folder_structure[0]
            raw_task_path = root_path + remote_data_folders + folder_structure[1]

            df_cog = pd.read_csv((f'{summary_task_path}/{taskName}Healthy_questionnaire.csv'), low_memory=False)
            df_cog_raw = pd.read_csv((f'{raw_task_path}/{taskName}Healthy_raw.csv'), low_memory=False)

            df = pd.concat([df_v1, df_v1_2, df_v2, df_cog], ignore_index=True)
            df_raw = pd.concat([df_v1_raw, df_v1_raw_2, df_v2_raw, df_cog_raw], ignore_index=True)
            
        else:
            
            summary_task_path = root_path + supervised_data_folders[0] + folder_structure[0]
            raw_task_path = root_path + supervised_data_folders[0] + folder_structure[1]
            
            df_v1 = pd.read_csv(f'{summary_task_path}/{taskName}_questionnaire.csv', low_memory=False)
            df_v1_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw.csv'), low_memory=False)
            
            summary_task_path = root_path + supervised_data_folders[1] + folder_structure[0]
            raw_task_path = root_path + supervised_data_folders[1] + folder_structure[1]
            
            df_v2 = pd.read_csv(f'{summary_task_path}/{taskName}_questionnaire.csv', low_memory=False)
            df_v2_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw.csv'), low_memory=False)
            
            summary_task_path = root_path + remote_data_folders + folder_structure[0]
            raw_task_path = root_path + remote_data_folders + folder_structure[1]
            
            df_cog = pd.read_csv(f'{summary_task_path}/{taskName}_questionnaire.csv', low_memory=False)
            df_cog_raw = pd.read_csv((f'{raw_task_path}/{taskName}_raw.csv'), low_memory=False)
            
            df = pd.concat([df_v1, df_v2, df_cog], ignore_index=True)
            df_raw = pd.concat([df_v1_raw, df_v2_raw, df_cog_raw], ignore_index=True)
        
        output_folder = root_path + merged_data_folder

        #df.to_csv(f'{output_folder}/{folder_structure[0]}/{taskName}{data_format}', index=False)
        #df_raw.to_csv(f'{output_folder}/{folder_structure[1]}/{taskName}_raw.csv', index=False)
        print(f'Merged {taskName}')
    
        print(len(df))
    
    return list_of_tasks


In [93]:
list_of_tasks

['IC3_NVtrailMaking',
 'IC3_Orientation',
 'IC3_PearCancellation',
 'IC3_rs_digitSpan',
 'IC3_rs_spatialSpan',
 'IC3_rs_PAL',
 'IC3_rs_SRT',
 'IC3_rs_CRT',
 'IC3_SemanticJudgment',
 'IC3_i4i_IDED',
 'IC3_i4i_motorControl',
 'IC3_calculation',
 'IC3_GestureRecognition',
 'IC3_AuditorySustainedAttention',
 'IC3_BBCrs_blocks',
 'IC3_Comprehension',
 'IC3_rs_oddOneOut',
 'IC3_TaskRecall',
 'IC3_NVtrailMaking2',
 'IC3_NVtrailMaking3']

In [95]:
list_of_tasks = merge_control_data_across_sites(root_path, folder_structure, supervised_data_folders, remote_data_folders, list_of_tasks,list_of_questionnaires,list_of_speech, data_format, merged_data_folder)
print(len(list_of_tasks))

Merged IC3_NVtrailMaking
6330
Merged IC3_Orientation
7438
Merged IC3_PearCancellation
7401
Merged IC3_rs_digitSpan
6354
Merged IC3_rs_spatialSpan
6961
Merged IC3_rs_PAL
6763
Merged IC3_rs_SRT
6455
Merged IC3_rs_CRT
6425
Merged IC3_SemanticJudgment
5980
Merged IC3_i4i_IDED
6469
Merged IC3_i4i_motorControl
6401
Merged IC3_calculation
6656
Merged IC3_GestureRecognition
6273
Merged IC3_AuditorySustainedAttention
7143
Merged IC3_BBCrs_blocks
6242
Merged IC3_Comprehension
6034
Merged IC3_rs_oddOneOut
6700
Merged IC3_TaskRecall
5639
Merged IC3_NVtrailMaking2
6313
Merged IC3_NVtrailMaking3
6332
Merged IC3_NamingTest
10666
Merged IC3_Reading
3748
Merged IC3_Repetition
7028
Merged q_IC3_demographics
7659
Merged q_IC3_fatigue
5579
Merged q_IC3_GDS
5592
Merged q_IC3_apathy
5610
18


In [None]:
import os
import pandas as pd

def ensure_directory(path):
    """Checks if directory exists, creates it if it does not exist."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)

def load_task_data(root, data_folder, subfolder, task, data_format, raw=False, version_suffix=''):
    """
    Helper to load a CSV file.
    :param root: Root directory.
    :param data_folder: The data folder (e.g. supervised or remote).
    :param subfolder: The subfolder name (e.g. summary or raw folder).
    :param task: Task name.
    :param data_format: File extension (e.g. '.csv').
    :param raw: If True, load the raw file.
    :param version_suffix: Optional suffix (for special cases like IDED).
    """
    
    base = os.path.join(root, data_folder.strip("/"), subfolder.strip("/"), task)
    suffix = "_raw" if raw else ""
    filename = f"{base}{version_suffix}{suffix}{data_format}"

    return pd.read_csv(filename, low_memory=False)

def merge_and_save(df_list, output_file):
    """
    Concatenate list of dataframes and save to a CSV file.
    :param df_list: List of dataframes to merge.
    :param output_file: Full path of the output file.
    """
    merged_df = pd.concat(df_list, ignore_index=True)
    print(len(merged_df))
    merged_df.to_csv(output_file, index=False)

def merge_clinical_tests(root, folder_structure, supervised_data_folders, remote_data_folder, tasks, data_format, output_base):
    
    """
    Merge clinical test data across sources.
    This function handles the special case for 'IC3_i4i_IDED' and temporary addition of NVtrailMaking versions.
    """
    
    # Use a copy of the tasks list to add temporary tasks
    tasks_to_merge = tasks.copy()
    if 'IC3_NVtrailMaking' in tasks_to_merge:
        tasks_to_merge += ['IC3_NVtrailMaking2', 'IC3_NVtrailMaking3']

    for task in tasks_to_merge:
        # Supervised source 1
        df_v1 = load_task_data(root, supervised_data_folders[0], folder_structure[0], task, data_format, raw=False)
        df_v1_raw = load_task_data(root, supervised_data_folders[0], folder_structure[1], task, data_format, raw=True)
        
        # Supervised source 2 (with special case for IDED)
        if task == "IC3_i4i_IDED":
            df_v2 = load_task_data(root, supervised_data_folders[1], folder_structure[0], task, data_format, raw=False, version_suffix='2')
            df_v2_raw = load_task_data(root, supervised_data_folders[1], folder_structure[1], task, data_format, raw=True, version_suffix='2')
        else:
            df_v2 = load_task_data(root, supervised_data_folders[1], folder_structure[0], task, data_format, raw=False)
            df_v2_raw = load_task_data(root, supervised_data_folders[1], folder_structure[1], task, data_format, raw=True)
        
        # Remote data source
        df_cog = load_task_data(root, remote_data_folder, folder_structure[0], task, data_format, raw=False)
        df_cog_raw = load_task_data(root, remote_data_folder, folder_structure[1], task, data_format, raw=True)
        
        print(task)
        
        # Merge the three sources
        summary_out = os.path.join(output_base, folder_structure[0].strip("/"), f"{task}{data_format}")
        merge_and_save([df_v1, df_v2, df_cog], summary_out)
        
        raw_out = os.path.join(output_base, folder_structure[1].strip("/"), f"{task}_raw{data_format}")
        merge_and_save([df_v1_raw, df_v2_raw, df_cog_raw],raw_out)

        # Optionally log progress: print(f'Merged clinical test data for {task}')

def merge_speech_data(root, folder_structure, supervised_data_folders, list_of_speech,  data_format, output_base):
    """
    Merge speech data from two supervised sources.
    """
    for task in list_of_speech:
        print(task)
        
        df_v1_raw = load_task_data(root, supervised_data_folders[0], folder_structure[1].strip("/"), task, data_format, raw=True)
        df_v2_raw = load_task_data(root, supervised_data_folders[1], folder_structure[1].strip("/"), task, data_format, raw=True)
        
        raw_out = os.path.join(output_base, folder_structure[1].strip("/"), f"{task}_raw{data_format}")
        merge_and_save([df_v1_raw, df_v2_raw], raw_out)
        
        
        # Optionally log progress: print(f'Merged speech data for {task}')

def merge_questionnaire_data(root, folder_structure, supervised_data_folders, remote_data_folder, list_of_questionnaires,  data_format, output_base):
    """
    Merge questionnaire data. Handles the special case for demographics.
    """
    for task in list_of_questionnaires:
        
        print(task)

        if task == 'q_IC3_demographics':
            
            # Supervised source 1 (two versions for healthy)
            df_v1 = load_task_data(root, supervised_data_folders[0], folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='Healthy_questionnaire')
            df_v1_2 = load_task_data(root, supervised_data_folders[0], folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='_questionnaire')
            df_v1_raw = load_task_data(root, supervised_data_folders[0], folder_structure[1].strip("/"), task, data_format, raw=True)
            df_v1_raw_2 = load_task_data(root, supervised_data_folders[0], folder_structure[1].strip("/"), task, data_format, raw=True, version_suffix='Healthy')

            # Supervised source 2
            df_v2 = load_task_data(root, supervised_data_folders[1], folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='_questionnaire')
            df_v2_raw = load_task_data(root, supervised_data_folders[1], folder_structure[1].strip("/"), task, data_format, raw=True)
            
            # Remote data source
            df_cog =  load_task_data(root, remote_data_folder, folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='Healthy_questionnaire')
            df_cog_raw =  load_task_data(root, remote_data_folder, folder_structure[1].strip("/"), task, data_format, raw=True, version_suffix='Healthy')

            summary_out = os.path.join(output_base, folder_structure[0].strip("/"), f"{task}{data_format}")
            merge_and_save([df_v1, df_v1_2, df_v2, df_cog], summary_out)
            
            raw_out = os.path.join(output_base, folder_structure[1].strip("/"), f"{task}_raw{data_format}")
            merge_and_save([df_v1_raw, df_v1_raw_2, df_v2_raw, df_cog_raw], raw_out)

        else:
            # For all other questionnaires
            # Supervised source 1

            df_v1 = load_task_data(root, supervised_data_folders[0], folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='_questionnaire')
            df_v1_raw = load_task_data(root, supervised_data_folders[0], folder_structure[1].strip("/"), task, data_format, raw=True)

            # Supervised source 2
            df_v2 = load_task_data(root, supervised_data_folders[1], folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='_questionnaire')
            df_v2_raw = load_task_data(root, supervised_data_folders[1], folder_structure[1].strip("/"), task, data_format, raw=True)
            
            # Remote data source
            df_cog =  load_task_data(root, remote_data_folder, folder_structure[0].strip("/"), task, data_format, raw=False, version_suffix='_questionnaire')
            df_cog_raw =  load_task_data(root, remote_data_folder, folder_structure[1].strip("/"), task, data_format, raw=True)
            
            summary_out = os.path.join(output_base, folder_structure[0].strip("/"), f"{task}{data_format}")
            merge_and_save([df_v1, df_v2, df_cog], summary_out)
            
            raw_out = os.path.join(output_base, folder_structure[1].strip("/"), f"{task}_raw{data_format}")
            merge_and_save([df_v1_raw, df_v2_raw, df_cog_raw], raw_out)
            
        # Optionally log progress: print(f'Merged questionnaire data for {task}')

def merge_control_data_across_sites(root_path, folder_structure, supervised_data_folders,
                                    remote_data_folders, list_of_tasks, list_of_questionnaires,
                                    list_of_speech, data_format, merged_data_folder):
    """
    Merge control data across sites from clinical tests, speech, and questionnaires.
    """
    # Build output base folder and ensure directory structure exists
    output_base = os.path.join(root_path, merged_data_folder.strip("/"))
    ensure_directory(output_base)
    
    for folder in folder_structure:
        ensure_directory(os.path.join(output_base, folder.strip("/")))
    
    # Merge clinical test data
    merge_clinical_tests(root_path, folder_structure, supervised_data_folders,
                                         remote_data_folders, list_of_tasks, data_format, output_base)
    
    # Merge speech data
    merge_speech_data(root_path, folder_structure, supervised_data_folders, list_of_speech,  data_format, output_base)
    
    # Merge questionnaire data
    merge_questionnaire_data(root_path, folder_structure, supervised_data_folders, remote_data_folders,
                             list_of_questionnaires,  data_format, output_base)
    
    return None

In [139]:
list_of_tasks

['IC3_NVtrailMaking',
 'IC3_Orientation',
 'IC3_PearCancellation',
 'IC3_rs_digitSpan',
 'IC3_rs_spatialSpan',
 'IC3_rs_PAL',
 'IC3_rs_SRT',
 'IC3_rs_CRT',
 'IC3_SemanticJudgment',
 'IC3_i4i_IDED',
 'IC3_i4i_motorControl',
 'IC3_calculation',
 'IC3_GestureRecognition',
 'IC3_AuditorySustainedAttention',
 'IC3_BBCrs_blocks',
 'IC3_Comprehension',
 'IC3_rs_oddOneOut',
 'IC3_TaskRecall']

In [138]:
merge_control_data_across_sites(root_path, folder_structure, supervised_data_folders,
                                    remote_data_folders, list_of_tasks, list_of_questionnaires,
                                    list_of_speech, data_format, merged_data_folder)
print(len(list_of_tasks))

IC3_NVtrailMaking
6330
93565
IC3_Orientation
7438
29727
IC3_PearCancellation
7401
168409
IC3_rs_digitSpan
6354
55975
IC3_rs_spatialSpan
6961
48779
IC3_rs_PAL
6763
165256
IC3_rs_SRT
6455
394523
IC3_rs_CRT
6425
405563
IC3_SemanticJudgment
5980
137628
IC3_i4i_IDED
6469
378462
IC3_i4i_motorControl
6401
191790
IC3_calculation
6656
58849
IC3_GestureRecognition
6273
50050
IC3_AuditorySustainedAttention
7143
252867
IC3_BBCrs_blocks
6242
158113
IC3_Comprehension
6034
119656
IC3_rs_oddOneOut
6700
93002
IC3_TaskRecall
5639
22524
IC3_NVtrailMaking2
6313
89333
IC3_NVtrailMaking3
6332
92736
IC3_NamingTest
10666
IC3_Reading
3748
IC3_Repetition
7028
q_IC3_demographics
7659
536548
q_IC3_fatigue
5579
55860
q_IC3_GDS
5592
179222
q_IC3_apathy
5610
213784
18


In [1]:
from preproc_functions_patients import *

In [2]:
root_path = '/Users/dg519/Documents/normative_paper_github/data'
list_of_tasks = ['IC3_Orientation', 'IC3_PearCancellation', 'IC3_rs_digitSpan', 'IC3_rs_spatialSpan', 'IC3_rs_PAL', 'IC3_rs_SRT', 'IC3_rs_CRT', 'IC3_SemanticJudgment', 'IC3_i4i_IDED', 'IC3_i4i_motorControl','IC3_calculation', 'IC3_GestureRecognition', 'IC3_AuditorySustainedAttention','IC3_BBCrs_blocks', 'IC3_Comprehension', 'IC3_NVtrailMaking','IC3_rs_oddOneOut', 'IC3_TaskRecall']
list_of_questionnaires = ['q_IC3_demographics', 'q_IC3_fatigue','q_IC3_GDS','q_IC3_apathy', 'q_IC3_IADL']
list_of_speech = ['IC3_NamingTest','IC3_Reading', 'IC3_Repetition']
patient_data_folders=['/data_patients_v1','/data_patients_v2']
folder_structure=['/summary_data','/trial_data','/speech']
output_clean_folder ='/data_patients_cleaned'
merged_data_folder ='/data_patients_merged'
clean_file_extension='_cleaned'
data_format='.csv'
clinical_information = '/Users/dg519/Documents/normative_paper_github/data/output_data/patients_database260924.xlsx'

In [3]:
df= main_preprocessing(root_path, list_of_tasks, list_of_questionnaires, list_of_speech, clinical_information)

Starting preprocessing...
Merging data across sites...Merged IC3_Orientation
Merged IC3_PearCancellation
Merged IC3_rs_digitSpan
Merged IC3_rs_spatialSpan
Merged IC3_rs_PAL
Merged IC3_rs_SRT
Merged IC3_rs_CRT
Merged IC3_SemanticJudgment
Merged IC3_i4i_IDED
Merged IC3_i4i_motorControl
Merged IC3_calculation
Merged IC3_GestureRecognition
Merged IC3_AuditorySustainedAttention
Merged IC3_BBCrs_blocks
Merged IC3_Comprehension
Merged IC3_NVtrailMaking
Merged IC3_rs_oddOneOut
Merged IC3_TaskRecall
Merged IC3_NamingTest
Merged IC3_Reading
Merged IC3_Repetition
Merged q_IC3_demographics
Merged q_IC3_fatigue
Merged q_IC3_GDS
Merged q_IC3_apathy
Merged q_IC3_IADL
Done
Pre-processing IC3_Orientation...Done
Pre-processing IC3_PearCancellation...Done
Pre-processing IC3_rs_digitSpan...Done
Pre-processing IC3_rs_spatialSpan...Done
Pre-processing IC3_rs_PAL...Done
Pre-processing IC3_rs_SRT...Done
Pre-processing IC3_rs_CRT...Done
Pre-processing IC3_SemanticJudgment...Done
Pre-processing IC3_i4i_IDED...D