In [17]:
from preproc_functions_controls import *

In [18]:
# --- Configuration for the healthy-control preprocessing cells ---

# Root directory that all folder names below are appended to.
root_path = '/Users/dg519/Documents/normative_paper_github/data'

# Input folders handed to create_inclusion_criteria further down.
remote_data_folders = '/data_ic3online_cognition'
supervised_data_folders = [
    '/data_healthy_v1',
    '/data_healthy_v2',
]

# Sub-folders inside each data folder; demographics_preproc reads the summary
# file from index 0 and the raw per-question file from index 1.
folder_structure = [
    '/summary_data',
    '/trial_data',
    '/speech',
]

# Folder names and file extension used when reading and writing files.
merged_data_folder = '/data_healthy_combined'
output_clean_folder = '/data_healthy_cleaned'
data_format = '.csv'

# Base file names of the cognitive tasks.
list_of_tasks = [
    'IC3_NVtrailMaking',
    'IC3_Orientation',
    'IC3_PearCancellation',
    'IC3_rs_digitSpan',
    'IC3_rs_spatialSpan',
    'IC3_rs_PAL',
    'IC3_rs_SRT',
    'IC3_rs_CRT',
    'IC3_SemanticJudgment',
    'IC3_i4i_IDED',
    'IC3_i4i_motorControl',
    'IC3_calculation',
    'IC3_GestureRecognition',
    'IC3_AuditorySustainedAttention',
    'IC3_BBCrs_blocks',
    'IC3_Comprehension',
    'IC3_rs_oddOneOut',
    'IC3_TaskRecall',
]

# Base file names of the questionnaires and of the speech tasks.
list_of_questionnaires = [
    'q_IC3_demographics',
    'q_IC3_fatigue',
    'q_IC3_GDS',
    'q_IC3_apathy',
]
list_of_speech = [
    'IC3_NamingTest',
    'IC3_Reading',
    'IC3_Repetition',
]

# NOTE(review): this list names 'data_healthy_merged' while merged_data_folder
# is '/data_healthy_combined' -- confirm which one is intended.
folder_list = [
    'data_healthy_v1',
    'data_healthy_v2',
    'data_ic3online_cognition',
    'data_healthy_merged',
]

In [19]:
#main_preprocessing(root_path, list_of_tasks, list_of_questionnaires, list_of_speech)

In [20]:
# Destination of the inclusion list: <root_path>/<merged_data_folder>/inclusion_criteria.csv
merged_data_dir = os.path.join(root_path, merged_data_folder.lstrip('/'))
inclusion_criteria_path = os.path.join(merged_data_dir, 'inclusion_criteria.csv')

# Build the inclusion criteria from the remote and supervised data folders,
# then persist them next to the merged data.
inclusion_criteria = create_inclusion_criteria(
    root_path,
    remote_data_folders,
    supervised_data_folders,
    folder_structure,
)
inclusion_criteria.to_csv(inclusion_criteria_path, index=False)


We removed 6 people who failed both Orientation and Pear Cancellation, from all tasks.
We removed 407 who indicated neurological disorder, 91 with a history of dementia and 92 who are younger than 40.
We removed 11 people who cheated.
We removed 17 people who failed both Orientation and Pear Cancellation, from all tasks.
We removed 33 who indicated neurological disorder, 4 who are younger than 40.


In [21]:
def demographics_preproc(root_path, merged_data_folder, questionnaire_name, inclusion_criteria, folder_structure, data_format, clean_file_extension):
    """
    Clean the demographics questionnaire and return it one-hot encoded.

    Two CSV files are read from ``root_path + merged_data_folder``:

    * ``<folder_structure[0]>/<questionnaire_name><data_format>`` -- summary file,
      used here only for its ``user_id`` and ``os`` columns;
    * ``<folder_structure[1]>/<questionnaire_name>_raw<data_format>`` -- raw file
      with ``user_id``, ``question`` and ``response`` columns.

    Rows whose ``user_id`` is not in ``inclusion_criteria`` are discarded, each
    question of interest is recoded, the per-question tables are inner-joined on
    ``user_id``, ``device`` and ``education`` are one-hot encoded, and 0.5 is
    subtracted from every column between ``gender`` and
    ``education_postBachelors``. The result is written to
    ``<folder_structure[0]>/<questionnaire_name><clean_file_extension><data_format>``
    inside the same merged-data folder.

    The process working directory is not modified.

    Parameters
    ----------
    root_path : str
        Root data directory.
    merged_data_folder : str
        Folder (with leading separator) appended to ``root_path``.
    questionnaire_name : str
        Base file name of the questionnaire.
    inclusion_criteria : list-like
        User IDs to keep; passed to ``Series.isin``.
    folder_structure : list of str
        Sub-folders (with leading separator); index 0 holds the summary file and
        index 1 the raw file.
    data_format : str
        File extension, e.g. ``'.csv'``.
    clean_file_extension : str
        Suffix inserted before the extension of the output file.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned table, or ``None`` when either input file cannot be loaded.

    Raises
    ------
    KeyError
        If one of the expected questions has no rows in the raw file, or if no
        participant produces the ``education_postBachelors`` column.
    """

    # Build every path from root_path instead of calling os.chdir, so running
    # this function does not change where later relative paths resolve.
    base_dir = root_path + merged_data_folder
    summary_file = f'{base_dir}{folder_structure[0]}/{questionnaire_name}{data_format}'
    raw_file = f'{base_dir}{folder_structure[1]}/{questionnaire_name}_raw{data_format}'
    output_file = f'{base_dir}{folder_structure[0]}/{questionnaire_name}{clean_file_extension}{data_format}'

    try:
        df_dem_summary = pd.read_csv(summary_file, low_memory=False)
        df_dem_summary = df_dem_summary[df_dem_summary.user_id.isin(inclusion_criteria)]

        df_dem = pd.read_csv(raw_file, low_memory=False)
        df_dem = df_dem[df_dem.user_id.isin(inclusion_criteria)]
    except Exception as e:
        # Report the underlying cause instead of silently swallowing everything.
        print(f'Error in loading {questionnaire_name}: {e}. File might not exist.')
        return None

    # The index column written by an earlier to_csv is dropped when present.
    df_dem = df_dem.drop(columns=['Unnamed: 0'], errors='ignore')
    df_dem = df_dem.drop_duplicates(subset=['user_id','question'], keep='last').reset_index(drop=True)
    df_dem_summary = df_dem_summary.drop_duplicates(subset='user_id', keep='last').reset_index(drop=True)

    def _responses_to(question):
        """Return the ``response``/``user_id`` rows recorded for one question."""
        rows = df_dem.loc[df_dem['question'] == question, ['response','user_id']]
        if rows.empty:
            # Same failure mode as looking up a group that does not exist.
            raise KeyError(question)
        return rows.reset_index(drop=True)

    # Extract demographics of interest

    age = _responses_to('<center>Howoldareyou?</center>')
    gender = _responses_to('<center>Whatisyourbiologicalsex?</center>')
    education = _responses_to('<center>Whatisyourhighestlevelofeducation?</center>')
    device = _responses_to('<center>Whatdeviceareyouusingatthemoment?</center>')
    english = _responses_to('<center>HowwouldyourateyourproficiencyinEnglish?</center>')
    depression = _responses_to('<center>Areyoucurrentlytakinganymedicationfordepression?</center>')
    anxiety = _responses_to('<center>Areyoucurrentlytakinganymedicationforanxiety?</center>')
    dyslexia = _responses_to('<center>DoyouhaveDyslexia,oranyotherproblemswithreadingandwriting?</center>')
    risks = _responses_to('<center>Haveyoueverbeentoldyouhavethefollowing?Tickallthatapplies</center>')

    # Clean each variable. Results are assigned back to the column rather than
    # using Series.replace(..., inplace=True) on an attribute-accessed column,
    # which is chained assignment and has no effect under pandas Copy-on-Write.

    # Ages below 40 become missing (and are dropped further down).
    ages = pd.to_numeric(age['response'])
    age['response'] = ages.where(~(ages < 40))

    # NOTE(review): the numeric strings look like misplaced answers -- confirm.
    gender['response'] = gender['response'].replace(['53','55','65','78','71','72'], np.nan)
    gender = gender.replace(['Male','Female'], [0,1])

    education['response'] = education['response'].replace('1', np.nan)
    education['response'] = education['response'].replace('Secondary/HighSchoolDiploma','Secondary/HighSchool-A-levels')
    education['response'] = education['response'].replace('Primary/ElementarySchool','SecondarySchool-GCSE')
    education = education.replace(['SecondarySchool-GCSE','Secondary/HighSchool-A-levels','ProfessionalDegree','Bachelor\'sDegree','Master\'sDegree','PhD'],[0,1,1,2,3,3])

    # Missing device answers fall back to the `os` value of the summary file.
    device = device.merge(df_dem_summary.loc[:,['user_id','os']], on='user_id', how='outer')
    device['response'] = device['response'].fillna(device['os'])
    device = device.drop('os', axis=1)

    english['response'] = english['response'].replace({'3': 1, '4': 1, 'No': np.nan, '2':1,'1':0})
    depression['response'] = depression['response'].replace({'No':0, 'Yes':1, 'SKIPPED':0})
    anxiety['response'] = anxiety['response'].replace({'No':0, 'Yes':1, 'SKIPPED':0})
    dyslexia['response'] = dyslexia['response'].replace({'Yes':1,'No':0,'Tablet':0,'Touchscreen':0,np.nan:0})

    # Risk factors: one 0/1 flag per condition, found by substring search in the
    # lower-cased free-text answer.
    risks = risks.drop_duplicates(keep='last')
    risks = risks.replace(np.nan, ' ')
    risks['response'] = risks['response'].str.lower()
    risk_terms = {
        'diabetes': ('diabetes',),
        'highbloodpressure': ('highbloodpressure',),
        'highcholesterol': ('highcholesterol', 'highcholesterole'),
        'heartdisease': ('heartdisease',),
        'kidneydisease': ('kidneydisease',),
        'alcoholdependency': ('alcoholdependency',),
        'over-weight': ('over-weight', 'overweight'),
        'long-termsmoker': ('long-termsmoker', 'longtermsmoker'),
        'ex-smoker': ('ex-smoker', 'exsmoker'),
    }
    for flag, terms in risk_terms.items():
        risks[flag] = risks['response'].apply(lambda text, terms=terms: any(term in text for term in terms)).astype(int)
    # A participant flagged as both long-term smoker and ex-smoker counts once.
    risks.loc[(risks['long-termsmoker'] & risks['ex-smoker']).astype(bool),'ex-smoker'] = 0
    # The free-text answer is replaced by the number of flagged conditions.
    risks['response'] = risks[list(risk_terms)].sum(axis=1)

    # One row per user, `response` renamed after the variable, missing rows dropped.
    tables = {
        'age': age,
        'gender': gender,
        'education': education,
        'device': device,
        'english': english,
        'depression': depression,
        'anxiety': anxiety,
        'dyslexia': dyslexia,
        'risks': risks,
    }
    for name, table in tables.items():
        tables[name] = (
            table.drop_duplicates(subset="user_id", keep='last')
            .rename(columns={'response': name})
            .dropna()
        )

    # Merge and format (inner joins: only users with every variable remain)

    merge_order = ['age','gender','education','device','english','depression','anxiety','risks','dyslexia']
    healthy_demographics = tables[merge_order[0]]
    for name in merge_order[1:]:
        healthy_demographics = healthy_demographics.merge(tables[name], on='user_id')
    healthy_demographics['education'] = healthy_demographics['education'].astype(int)

    one_hot_encoded_data = pd.get_dummies(healthy_demographics, columns = ['device', 'education'])
    one_hot_encoded_data = one_hot_encoded_data.rename(columns={'education_1':'education_Alevels', 'education_2':'education_bachelors','education_3':'education_postBachelors'})
    one_hot_encoded_data = one_hot_encoded_data.rename(columns={'device_1':'device_tablet', 'device_0':'device_phone'})
    one_hot_encoded_data = one_hot_encoded_data.rename(columns={'english':'english_secondLanguage'})
    one_hot_encoded_data = one_hot_encoded_data.replace({True:1, False:0})

    # Subtract 0.5 from every column between `gender` and
    # `education_postBachelors`. Whole columns are replaced so integer columns
    # are not written into in place with float values.
    # NOTE(review): the slice also contains the `risks` count, which is shifted
    # by 0.5 as well -- confirm this is intended.
    centred_cols = one_hot_encoded_data.loc[:,'gender':'education_postBachelors'].columns
    one_hot_encoded_data[centred_cols] = one_hot_encoded_data[centred_cols] - 0.5

    # Save

    one_hot_encoded_data.to_csv(output_file, index=False)

    return one_hot_encoded_data


In [22]:
# Clean the demographics questionnaire with the 7-argument demographics_preproc
# defined in the cell above. A later cell redefines demographics_preproc with an
# extra `output_clean_folder` parameter, so this call only works while the
# 7-argument definition is the one currently bound.
df_demographics = demographics_preproc(root_path, merged_data_folder, 'q_IC3_demographics', inclusion_criteria, folder_structure, data_format, '_cleaned')


In [23]:
# Preview the first rows of the cleaned, one-hot encoded demographics table.
df_demographics.head()

Unnamed: 0,age,user_id,gender,english_secondLanguage,depression,anxiety,risks,diabetes,highbloodpressure,highcholesterol,...,long-termsmoker,ex-smoker,dyslexia,device_Laptop/Computer,device_Phone,device_Tablet,education_0,education_Alevels,education_bachelors,education_postBachelors
0,66.0,6f181486-0e14-450e-bfe3-fb85e925adba,-0.5,-0.5,0.5,-0.5,4.5,-0.5,0.5,0.5,...,-0.5,0.5,0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5,-0.5
1,54.0,12e7f6c0-f77e-4064-88fc-abb9e1952693,0.5,-0.5,-0.5,-0.5,1.5,-0.5,-0.5,-0.5,...,-0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0.5
2,68.0,54270eee-d9a7-4d44-b72f-f0137372acce,0.5,-0.5,-0.5,-0.5,2.5,-0.5,-0.5,0.5,...,-0.5,0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,-0.5
3,41.0,68732c3e-9d23-4212-ad3c-22acaddff7e0,-0.5,-0.5,-0.5,-0.5,2.5,-0.5,-0.5,0.5,...,-0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5
4,63.0,4b21c478-5926-4c59-b37f-60d6a40ccf43,0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,...,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5


In [24]:
# Number of rows in the cleaned demographics table (one row per user_id).
len(df_demographics)

6355

In [None]:
import os
import pandas as pd
import numpy as np


def demographics_preproc(root_path, merged_data_folder, output_clean_folder, questionnaire_name,
                         inclusion_criteria, folder_structure, data_format,
                         clean_file_extension):
    """
    Preprocess and clean demographic questionnaire data.

    This function loads demographic data from two CSV files (a summary file and a raw file)
    located under ``<root_path>/<merged_data_folder>``, filters the data based on the provided
    inclusion criteria, cleans and transforms the demographic variables, and returns a one-hot
    encoded DataFrame of the cleaned data. The processed DataFrame is also saved to
    ``<root_path>/<output_clean_folder>/<folder_structure[0]>/``
    ``<questionnaire_name><clean_file_extension><data_format>``; the destination folder is
    created when it does not exist.

    All paths are built from ``root_path``; the process working directory is not changed.

    Parameters
    ----------
    root_path : str
        Root directory path.
    merged_data_folder : str
        Folder name where the merged data is stored (leading/trailing "/" are ignored).
    output_clean_folder : str
        Folder name where the cleaned file is written (leading/trailing "/" are ignored).
    questionnaire_name : str
        Name of the questionnaire (used in file naming).
    inclusion_criteria : list-like
        User IDs to be included in the analysis; passed to ``Series.isin``.
    folder_structure : list of str
        Sub-folder names; element 0 holds the summary file, element 1 the raw file.
        Further elements are ignored.
    data_format : str
        File extension (e.g. ".csv").
    clean_file_extension : str
        Suffix to be appended to the output file name (e.g. "_clean").

    Returns
    -------
    pd.DataFrame or None
        A one-hot encoded DataFrame containing the cleaned demographic data, or
        None if there was an error loading the data.

    Raises
    ------
    KeyError
        If no participant produces the ``education_postBachelors`` column, which is
        the end point of the column range shifted by 0.5.

    Examples
    --------
    >>> df = demographics_preproc("/data", "merged", "cleaned", "survey", [1, 2, 3],
    ...                           ["/summary", "/raw"], ".csv", "_clean")
    >>> df.head()
    """

    # Locate the merged-data folder without changing the working directory
    merged_dir = os.path.join(root_path, merged_data_folder.strip('/'))

    # Construct file paths (strip / if any)
    raw_path = os.path.join(merged_dir, folder_structure[1].strip('/'), f"{questionnaire_name}_raw{data_format}")
    summary_path = os.path.join(merged_dir, folder_structure[0].strip('/'), f"{questionnaire_name}{data_format}")

    try:
        df_dem = pd.read_csv(raw_path, low_memory=False)
        df_dem = df_dem[df_dem.user_id.isin(inclusion_criteria)]

        df_dem_summary = pd.read_csv(summary_path, low_memory=False)
        df_dem_summary = df_dem_summary[df_dem_summary.user_id.isin(inclusion_criteria)]
    except Exception as e:
        print(f"Error loading {questionnaire_name}: {e}. File might not exist.")
        return None

    # Drop unwanted columns and duplicates
    df_dem = df_dem.drop(columns=['Unnamed: 0'], errors='ignore')
    df_dem = df_dem.drop_duplicates(subset=['user_id', 'question'],keep='last').reset_index(drop=True)
    df_dem_summary = df_dem_summary.drop_duplicates(subset='user_id', keep='last').reset_index(drop=True)

    def extract_response(df, question_text):
        """Extract the response and user_id for a given question."""
        return df.loc[df['question'] == question_text, ['user_id', 'response']].copy().reset_index(drop=True)

    # Extract demographics of interest using the question text
    age_df = extract_response(df_dem, '<center>Howoldareyou?</center>')
    gender_df = extract_response(df_dem, '<center>Whatisyourbiologicalsex?</center>')
    education_df = extract_response(df_dem, '<center>Whatisyourhighestlevelofeducation?</center>')
    device_df = extract_response(df_dem, '<center>Whatdeviceareyouusingatthemoment?</center>')
    english_df = extract_response(df_dem, '<center>HowwouldyourateyourproficiencyinEnglish?</center>')
    depression_df = extract_response(df_dem, '<center>Areyoucurrentlytakinganymedicationfordepression?</center>')
    anxiety_df = extract_response(df_dem, '<center>Areyoucurrentlytakinganymedicationforanxiety?</center>')
    dyslexia_df = extract_response(df_dem, '<center>DoyouhaveDyslexia,oranyotherproblemswithreadingandwriting?</center>')
    risks_df = extract_response(df_dem, '<center>Haveyoueverbeentoldyouhavethefollowing?Tickallthatapplies</center>')

    # --- Cleaning Functions for Each Variable ---
    # Each helper recodes `response`, keeps the last row per user, drops rows with
    # missing values and renames `response` after the variable.

    def clean_age(df):
        """Convert age to numeric; unparsable values and ages below 40 are dropped."""
        df['response'] = pd.to_numeric(df['response'], errors='coerce')
        df.loc[df['response'] < 40, 'response'] = np.nan
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'age'}, inplace=True)
        return df

    def clean_gender(df):
        """Code Male as 0 and Female as 1; the listed numeric strings are dropped."""
        # NOTE(review): the numeric strings look like misplaced answers -- confirm.
        df['response'] = df['response'].replace(['53', '55', '65', '78', '71', '72'], np.nan)
        df.replace({'Male': 0, 'Female': 1}, inplace=True)
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'gender'}, inplace=True)
        return df

    def clean_education(df):
        """Collapse education answers into the ordinal codes 0-3."""
        df['response'] = df['response'].replace('1', np.nan)
        df['response'] = df['response'].replace({
            'Secondary/HighSchoolDiploma': 'Secondary/HighSchool-A-levels',
            'Primary/ElementarySchool': 'SecondarySchool-GCSE'
        })
        mapping = {
            'SecondarySchool-GCSE': 0,
            'Secondary/HighSchool-A-levels': 1,
            'ProfessionalDegree': 1,
            "Bachelor'sDegree": 2,
            "Master'sDegree": 3,
            'PhD': 3
        }
        df.replace({'response': mapping}, inplace=True)
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'education'}, inplace=True)
        return df

    def clean_device(df, summary_df):
        """Fill missing device answers from the summary `os` column and map OS names to device types."""
        df = df.merge(summary_df[['user_id', 'os']], on='user_id', how='outer')
        df['response'] = df['response'].fillna(df['os'])
        # NOTE(review): 'Mac OS X' is mapped to 'Tablet' -- confirm this is intended.
        df['response'] = df['response'].replace({'Mac OS X': 'Tablet', 'Android': 'Phone', 'Windows': 'Laptop/Computer', 'iOS': 'Phone', 'Chrome OS': 'Laptop/Computer'})
        df.drop(columns='os', inplace=True)
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'device'}, inplace=True)
        return df

    def clean_english(df):
        """Recode the English-proficiency answer to 0/1; 'No' is dropped."""
        df['response'] = df['response'].replace({'3': 1, '4': 1, 'No': np.nan, '2': 1, '1': 0})
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'english'}, inplace=True)
        return df

    def clean_binary_response(df, col_name, mapping):
        """Apply `mapping` to the answer and rename the column to `col_name`."""
        df['response'] = df['response'].replace(mapping)
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': col_name}, inplace=True)
        return df

    def clean_risks(df):
        """Turn the free-text risk answer into one 0/1 flag per condition plus their sum."""
        df.drop_duplicates(keep='last', inplace=True)
        df.replace(np.nan, ' ', inplace=True)
        df['response'] = df['response'].str.lower()
        risk_conditions = {
            'diabetes': ['diabetes'],
            'highbloodpressure': ['highbloodpressure'],
            'highcholesterol': ['highcholesterol', 'highcholesterole'],
            'heartdisease': ['heartdisease'],
            'kidneydisease': ['kidneydisease'],
            'alcoholdependency': ['alcoholdependency'],
            'over-weight': ['over-weight', 'overweight'],
            'long-termsmoker': ['long-termsmoker', 'longtermsmoker'],
            'ex-smoker': ['ex-smoker', 'exsmoker']
        }
        for key, terms in risk_conditions.items():
            df[key] = df['response'].apply(lambda x: any(term in x for term in terms)).astype(int)
        # A participant flagged as both long-term smoker and ex-smoker counts once.
        df.loc[(df['long-termsmoker'] & df['ex-smoker']).astype(bool), 'ex-smoker'] = 0

        # Sum risk flags to obtain a risk score
        df['response'] = df[list(risk_conditions.keys())].sum(axis=1)
        df.drop_duplicates(subset='user_id', keep='last', inplace=True)
        df.dropna(inplace=True)
        df.rename(columns={'response': 'risks'}, inplace=True)
        return df

    # --- Clean each extracted DataFrame ---
    age_df = clean_age(age_df)
    gender_df = clean_gender(gender_df)
    education_df = clean_education(education_df)
    device_df = clean_device(device_df, df_dem_summary)
    english_df = clean_english(english_df)
    depression_df = clean_binary_response(depression_df, 'depression',
                                          {'No': 0, 'Yes': 1, 'SKIPPED': 0})
    anxiety_df = clean_binary_response(anxiety_df, 'anxiety',
                                       {'No': 0, 'Yes': 1, 'SKIPPED': 0})
    dyslexia_df = clean_binary_response(dyslexia_df, 'dyslexia',
                                     {'Yes': 1, 'No': 0, 'Tablet': 0, 'Touchscreen': 0, np.nan: 0})
    risks_df = clean_risks(risks_df)

    # Merge all cleaned data on 'user_id' (left joins starting from the age table)
    dfs = [age_df, gender_df, education_df, device_df, english_df,
           depression_df, anxiety_df, risks_df, dyslexia_df]
    healthy_demographics = dfs[0]

    for df in dfs[1:]:
        healthy_demographics = healthy_demographics.merge(df, how='left', on='user_id')

    # Users without a dyslexia answer are coded 0; users missing any other variable are dropped.
    healthy_demographics['dyslexia'] = healthy_demographics['dyslexia'].replace(np.nan,0)
    # .copy() so the column assignment below writes to an independent frame.
    healthy_demographics = healthy_demographics.dropna().copy()

    # Ensure education is of integer type
    healthy_demographics['education'] = healthy_demographics['education'].astype(int)

    # One-hot encode categorical variables
    one_hot_encoded_data = pd.get_dummies(healthy_demographics, columns=['device', 'education'])
    one_hot_encoded_data.rename(columns={
        'education_1': 'education_Alevels',
        'education_2': 'education_bachelors',
        'education_3': 'education_postBachelors',
        'device_1': 'device_tablet',
        'device_0': 'device_phone',
        'english': 'english_secondLanguage'
    }, inplace=True)
    one_hot_encoded_data.replace({True: 1, False: 0}, inplace=True)

    # Adjust numerical columns by subtracting 0.5 (from gender to education_postBachelors)
    # NOTE(review): this range also contains the `risks` count, which is shifted by
    # 0.5 as well -- confirm this is intended.
    cols_to_adjust = one_hot_encoded_data.loc[:, 'gender':'education_postBachelors'].columns
    one_hot_encoded_data[cols_to_adjust] = one_hot_encoded_data[cols_to_adjust] - 0.5

    # Save the final DataFrame, creating the destination folder if needed
    output_path = os.path.join(root_path, output_clean_folder.strip('/'), folder_structure[0].strip('/'),
                               f"{questionnaire_name}{clean_file_extension}{data_format}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    one_hot_encoded_data.to_csv(output_path, index=False)

    return one_hot_encoded_data

In [151]:
# Re-run the demographics cleaning with the 8-argument demographics_preproc
# defined in the cell above; the cleaned file is written under output_clean_folder.
# This rebinds df_demographics to a freshly cleaned, demographics-only table.
df_demographics = demographics_preproc(root_path, merged_data_folder, output_clean_folder, 'q_IC3_demographics', inclusion_criteria, folder_structure, data_format, '_cleaned')



user_id      0
education    0
dtype: int64


In [148]:
def combine_demographics_and_cognition(root_path, output_clean_folder, folder_structure, list_of_tasks, df_demographics, clean_file_extension, data_format):
    """
    Left-join every task's summary score onto the demographics table.

    For each entry of ``list_of_tasks`` the file
    ``<name><clean_file_extension><data_format>`` is read from
    ``root_path + output_clean_folder + folder_structure[0]``. The last row per
    ``user_id`` is kept, ``SummaryScore`` is renamed to the file's first
    ``taskID`` value and merged into ``df_demographics`` on ``user_id``. The
    combined table is written (index included) to
    ``summary_cognition_and_demographics_old.csv`` in the same folder.

    The process working directory is not modified.

    Parameters
    ----------
    root_path : str
        Root data directory.
    output_clean_folder : str
        Folder (with leading separator) containing the cleaned data.
    folder_structure : list of str
        Sub-folders (with leading separator); only element 0 is used.
    list_of_tasks : list of str
        Base names of the task files.
    df_demographics : pandas.DataFrame
        Table with a ``user_id`` column; it is not modified in place.
    clean_file_extension : str
        Suffix between the task name and the extension.
    data_format : str
        File extension, e.g. ``'.csv'``.

    Returns
    -------
    pandas.DataFrame
        ``df_demographics`` with one additional column per task.

    Raises
    ------
    FileNotFoundError
        If a task file does not exist.
    IndexError
        If a task file contains no rows.
    """
    # Absolute folder instead of os.chdir: callers keep their working directory.
    summary_dir = root_path + output_clean_folder + folder_structure[0]

    for file in list_of_tasks:

        # Read the cleaned task data and keep the last entry of every user
        task_path = os.path.join(summary_dir, f'{file}{clean_file_extension}{data_format}')
        temp_healthy_cog = pd.read_csv(task_path, low_memory=False)
        temp_healthy_cog = temp_healthy_cog.drop_duplicates(subset='user_id', keep='last').reset_index(drop=True)

        # The score column is named after the task ID stored in the file
        task_id = temp_healthy_cog.taskID.iloc[0]
        temp_healthy_cog = temp_healthy_cog.loc[:, ['user_id', 'SummaryScore']].rename(columns={'SummaryScore': task_id})

        df_demographics = pd.merge(df_demographics, temp_healthy_cog, on="user_id", how="left")

    # The index is written as well, matching the existing file layout.
    df_demographics.to_csv(os.path.join(summary_dir, 'summary_cognition_and_demographics_old.csv'))
    return df_demographics


In [149]:
# Legacy merge, kept for comparison with the redefined function further down.
# The result is bound to its own name: rebinding df_demographics here would
# make the later combine call merge the same task columns a second time,
# producing `_x`/`_y` suffixed duplicates when the notebook runs top to bottom.
df_demographics_old_merge = combine_demographics_and_cognition(
    root_path, output_clean_folder, folder_structure, list_of_tasks, df_demographics, '_cleaned', data_format
)

In [152]:
import os
import pandas as pd


def combine_demographics_and_cognition(root_path, output_clean_folder, folder_structure,
                                         list_of_tasks, df_demographics,
                                         clean_file_extension, data_format):
    """
    Combine demographic data with cognitive task scores.

    This function iterates over a list of cognitive task files, reads each file,
    cleans the data by dropping duplicates and renaming the summary score column to the task ID,
    and then merges the resulting summary scores into the provided demographics DataFrame.
    Finally, it saves the combined DataFrame as
    ``summary_cognition_and_demographics_new.csv`` in the tasks folder.

    A task file is skipped (with a printed message) when it cannot be read, when it
    lacks one of the ``user_id``, ``SummaryScore`` or ``taskID`` columns, or when it
    contains no rows.

    Parameters
    ----------
    root_path : str
        The root directory path.
    output_clean_folder : str
        The folder name containing cleaned cognitive data.
    folder_structure : list of str
        List of folder structure components; the first element is used to locate the tasks.
    list_of_tasks : list of str
        List of task file base names (without extension or clean_file_extension).
    df_demographics : pd.DataFrame
        DataFrame containing the demographic data; it is not modified in place.
    clean_file_extension : str
        Suffix appended to task file names (e.g., '_clean').
    data_format : str
        File format extension (e.g., '.csv').

    Returns
    -------
    pd.DataFrame
        Merged DataFrame containing demographics and cognitive task scores.

    Examples
    --------
    >>> merged_df = combine_demographics_and_cognition(
    ...     "/data", "cleaned", ["/tasks"], ["task1", "task2"], demographics_df,
    ...     "_clean", ".csv"
    ... )
    >>> merged_df.head()
    """
    # Build the folder path without changing the working directory
    tasks_folder = os.path.join(root_path, output_clean_folder.strip('/'), folder_structure[0].strip('/'))

    # Columns every task file must provide for the merge below
    required_columns = ('user_id', 'SummaryScore', 'taskID')

    # Loop through each cognitive task file
    for task_file in list_of_tasks:
        task_file_path = os.path.join(tasks_folder, f"{task_file}{clean_file_extension}{data_format}")
        try:
            temp_cog = pd.read_csv(task_file_path, low_memory=False)
        except Exception as e:
            print(f"Error reading file {task_file_path}: {e}")
            continue

        # Skip files that cannot be merged instead of failing half-way through
        missing_columns = [col for col in required_columns if col not in temp_cog.columns]
        if missing_columns:
            print(f"Column(s) {missing_columns} missing in {task_file_path}. Skipping this file.")
            continue

        # Drop duplicate entries for each user and reset the index
        temp_cog = temp_cog.drop_duplicates(subset='user_id', keep='last').reset_index(drop=True)

        # A header-only file has no first row to take the task ID from
        if temp_cog.empty:
            print(f"No rows in {task_file_path}. Skipping this file.")
            continue

        # Use the first 'taskID' value as the new column name
        task_id = temp_cog.loc[0, 'taskID']

        # Select only the 'user_id' and 'SummaryScore' columns and rename 'SummaryScore' to the task_id
        temp_cog = temp_cog[['user_id', 'SummaryScore']].rename(columns={'SummaryScore': task_id})

        # Merge the cognitive data with the demographics data on 'user_id'
        df_demographics = pd.merge(df_demographics, temp_cog, on='user_id', how='left')

    # Save the merged DataFrame
    output_file = os.path.join(tasks_folder, "summary_cognition_and_demographics_new.csv")
    df_demographics.to_csv(output_file, index=False)

    return df_demographics

In [153]:
# Merge the task summary scores into the demographics table using the redefined
# combine function above; it also writes summary_cognition_and_demographics_new.csv.
# NOTE(review): df_demographics is rebound to the merged table, so running this
# cell twice merges the task columns again and yields suffixed duplicate columns.
df_demographics = combine_demographics_and_cognition(
    root_path, output_clean_folder, folder_structure, list_of_tasks, df_demographics,  '_cleaned', data_format
)

In [1]:
from preproc_functions_patients import *

In [2]:
# --- Configuration for the patient preprocessing cells ---

# Root directory that all folder names below are appended to.
root_path = '/Users/dg519/Documents/normative_paper_github/data'

# Patient data folders and the sub-folder layout inside each of them.
patient_data_folders = [
    '/data_patients_v1',
    '/data_patients_v2',
]
folder_structure = [
    '/summary_data',
    '/trial_data',
    '/speech',
]

# Folder names, file suffix and file extension.
merged_data_folder = '/data_patients_merged'
output_clean_folder = '/data_patients_cleaned'
clean_file_extension = '_cleaned'
data_format = '.csv'

# Spreadsheet handed to main_preprocessing as `clinical_information`.
clinical_information = '/Users/dg519/Documents/normative_paper_github/data/output_data/patients_database260924.xlsx'

# Base file names of the cognitive tasks.
list_of_tasks = [
    'IC3_Orientation',
    'IC3_PearCancellation',
    'IC3_rs_digitSpan',
    'IC3_rs_spatialSpan',
    'IC3_rs_PAL',
    'IC3_rs_SRT',
    'IC3_rs_CRT',
    'IC3_SemanticJudgment',
    'IC3_i4i_IDED',
    'IC3_i4i_motorControl',
    'IC3_calculation',
    'IC3_GestureRecognition',
    'IC3_AuditorySustainedAttention',
    'IC3_BBCrs_blocks',
    'IC3_Comprehension',
    'IC3_NVtrailMaking',
    'IC3_rs_oddOneOut',
    'IC3_TaskRecall',
]

# Base file names of the questionnaires and of the speech tasks.
list_of_questionnaires = [
    'q_IC3_demographics',
    'q_IC3_fatigue',
    'q_IC3_GDS',
    'q_IC3_apathy',
    'q_IC3_IADL',
]
list_of_speech = [
    'IC3_NamingTest',
    'IC3_Reading',
    'IC3_Repetition',
]

In [None]:
# Run the patient preprocessing pipeline on the configured tasks, questionnaires,
# speech tasks and clinical spreadsheet. main_preprocessing is presumably provided
# by the `from preproc_functions_patients import *` cell above -- confirm.
df= main_preprocessing(root_path, list_of_tasks, list_of_questionnaires, list_of_speech, clinical_information)

Starting preprocessing...
Merging data across sites...IC3_NamingTest
IC3_Reading
IC3_Repetition
Done
Pre-processing IC3_Orientation...Done
Pre-processing IC3_PearCancellation...Done
Pre-processing IC3_rs_digitSpan...Done
Pre-processing IC3_rs_spatialSpan...Done
Pre-processing IC3_rs_PAL...Done
Pre-processing IC3_rs_SRT...Done
Pre-processing IC3_rs_CRT...Done
Pre-processing IC3_SemanticJudgment...Done
Pre-processing IC3_i4i_IDED...Done
Pre-processing IC3_i4i_motorControl...Done
Pre-processing IC3_calculation...Done
Pre-processing IC3_GestureRecognition...Done
Pre-processing IC3_AuditorySustainedAttention...Done
Pre-processing IC3_BBCrs_blocks...Done
Pre-processing IC3_Comprehension...Done
Pre-processing IC3_NVtrailMaking...Done
Pre-processing IC3_rs_oddOneOut...Done
Pre-processing IC3_TaskRecall...Done
Pre-processing q_IC3_demographics...Done
Pre-processing q_IC3_fatigue...Questionnaire q_IC3_fatigue does not have a specific preprocessing function.
Pre-processing q_IC3_GDS...Questionnai