# BANDA behavioral, clinical, NIH task data from NDA
#### francesca morfini; susan whitfield-gabrieli lab; Northeastern University

this script:
- loads all files specified in {what_to_load}
- scores the loaded questionnaires specified in {what_to_score} - note: this only works for questionnaires that have a scoring function in myfx_scoring.py
- combines data and scored subscales (dropping item-level questionnaire responses) into a long dataframe (this is 4 times N x Cols)
- (optional) transforms from long to wide - by creating one column_T1|_T2|_T3|_T4 per subscale (this is N x 4 times the Cols)
- saves to file

Note: need to run this script only once

## Load and Score

### Set up environment

In [1]:
#general
import pandas as pd
import numpy as np
from glob import glob
import functools as ft

#custom
import myfx_scoring as myfx

## User inputs

In [2]:
## paths
# everything hangs off the dataset root: inputs are read from behave_d_in, outputs go to behave_d_out
main_d = '/work/swglab/data/BANDA'
behave_d_in = main_d + '/NDA/BANDAImgManifestBeh'
behave_d_out = main_d + '/sourcedata/behavioral_data'

## general parameters
# columns used as the shared key when merging the loaded files
merge_cols = [
    'src_subject_id',
    'visit',
    'respondent',
    'subjectkey',
    'sex',
    'interview_date',
    'interview_age',
]
# columns present in all source files but whose values should stay specific to each source file
custom_cols = [
    'respondent',
    'interview_age',
    'interview_date',
]

## files to load

In [3]:
## default: load every .txt file found in {behave_d_in}
# files that only carry general dataset info (not data per se) are skipped
dont_load = [
    'dataset_collection',
    'datastructure_manifest',
    'md5_values',
    'package_info',
    'fmriresults01',
    'imagingcollection01',
    'fhs01',
]

what_to_load = []
for txt_path in glob(f'{behave_d_in}/*.txt'):
    # keep only the bare file name: strip the parent directory and the .txt extension
    file_label = txt_path.replace(f'{behave_d_in}/', '').replace('.txt', '')
    if file_label not in dont_load:
        what_to_load.append(file_label)

## alternative: load only a subset of files
# what_to_load = ['mfq01','ksads_diagnoses01'] #these names need to be a substring of the filename in behave_d_in (without extensions). e.g., 'cbcl' works to load 'cbcl01.txt'

## files to score

note: a questionnaire can only be scored if it has been loaded via {what_to_load} and a scoring function for it exists in myfx_scoring.py


In [4]:
## to score everything possible (ie use all scoring functions included in the myfx module)
# collect every attribute of the myfx module whose name contains 'score_' (e.g. myfx.score_xxx);
# getattr does the same attribute lookup as evaluating the string 'myfx.<name>', without needing eval
what_to_score = [getattr(myfx, fx) for fx in dir(myfx) if 'score_' in fx]

## if you want to specify a subsample of questionnaires
# what_to_score = [myfx.score_mfq,
#                  myfx.score_rcads]

### Helper functions

In [5]:
def myfx_load_behave(behave_d_in, questionnaire, merge_cols):

    '''
    Load one tab-separated file from {behave_d_in}, clean it, and index it by {merge_cols}.

    behave_d_in : string representing an absolute path of parent directory for where file.txt live
    questionnaire : file name to load. no need to be exact name but should be included in filename e.g. 'demo' would work for demographic01.txt
    merge_cols : list of columns used as index to merge data from different files on

    returns : tuple (dataframe, prefix) where dataframe is indexed by {merge_cols} and every remaining
              column name starts with prefix, which is f'{questionnaire}_'
    raises : AssertionError if the number of files matching {questionnaire} is not exactly one
    '''
    print(questionnaire)

    # load
    matches = glob(f'{behave_d_in}/*{questionnaire}*')
    assert len(matches) == 1, f'watch out: expected exactly one file corresponding to {questionnaire} but found {len(matches)}. Specify which is needed by passing a keyword that is unique:\n{matches}'
    # skiprows=[1] skips the second line of the file (presumably the row of column descriptions - TODO confirm)
    grab = pd.read_csv(matches[0], sep='\t', skiprows=[1])

    # harmonize info between files
    # these have no info about Respondent but it was always Child, so hardcoding it here
    # note: this is an exact-name match, so it only applies when {questionnaire} is the full name listed below
    to_fix = ['dccs01','flanker01','lswmt01','orrt01','pcps01','ndar_subject01','ksads_diagnoses01']
    if questionnaire in to_fix:
        if 'respondent' not in grab.columns or grab['respondent'].isnull().all():
            grab['respondent'] = 'Child'

    grab = grab.set_index(merge_cols)

    # clean
    # grab columns e.g., ['rcads_id', 'dataset_id', 'flanker01_version_form'] to be dropped
    # (src_subject_id is safe because it is already part of the index at this point)
    id_or_version = grab.columns.str.endswith('_id') | grab.columns.str.contains('version')
    drop_cols = grab.columns[id_or_version].tolist() + ['collection_title']
    grab = grab.drop(columns = drop_cols)
    grab = grab.replace('NaN', np.nan)

    # these files may have 999 which are meaningful values so not dropping those;
    # for all other files 999/9999/9998 represent missing - so substituting
    if questionnaire not in ['pwmt01','pmat01','er4001','deldisk01']:
        grab = grab.replace(999,np.nan).replace(9999,np.nan).replace(9998,np.nan)

    grab = grab.dropna(axis=1, how='all') # drop if every value that column is NaN
    # keep only the columns in which no cell equals the string "None" (respectively "No")
    grab = grab.loc[:, (grab!="None").all(axis = 0)]
    grab = grab.loc[:, (grab!="No").all(axis = 0)]

    # adding prefix corresponding to file name so that it'd be easy to track each variable back to each file
    # note this operation is reverted inside of each scoring_fx so that variable names included there match with original NDA nomenclature
    # ultimately, questionnaires that have been scored will have mycustolabel_subscale_name
    # those that are not scored (eg nih toolbox etc) will have origfilename_originalcolname
    prefix = f'{questionnaire}_'
    grab = grab.add_prefix(prefix)

    return grab, prefix

## Run

### Load and combine data

In [6]:
# one cleaned dataframe (and its column prefix) per file listed in what_to_load
dfs, filename = [], []
for q in what_to_load:
    loaded_df, file_prefix = myfx_load_behave(behave_d_in, q, merge_cols)
    dfs.append(loaded_df)
    filename.append(file_prefix)


def _outer_merge(left, right):
    # outer join on the shared key; validate makes pandas raise if a key is repeated on either side
    return pd.merge(left, right, on=merge_cols, how='outer', validate="one_to_one")


# combine dataframes
print(f'\nMERGING by {merge_cols}')
df = ft.reduce(_outer_merge, dfs)
df = df.replace('NaN', np.nan)
df = df.sort_values(by=merge_cols)

bisbas01
cbcl01
chaphand01
cssrs01
dccs01
deldisk01
demographics02
er4001
flanker01
ksads_diagnoses01
ksads_diagnosesp201
lswmt01
masq01
mfq01
ndar_subject01
nffi01
orrt01
pcps01
pmat01
pwmt01
rbqa01
rcads01
rmbi01
shaps01
stai01
strain01
tanner_sms01
wasi201

MERGING by ['src_subject_id', 'visit', 'respondent', 'subjectkey', 'sex', 'interview_date', 'interview_age']


### Score questionnaires
for each questionnaire indicated to be scored in {what_to_score}, the corresponding scoring function:
- generates scored subscales and a total score
- returns the number of NaNs among the item-level responses that would have been used to calculate each subscale or total
- raises an error if any subscale has values outside the theoretical range assumed by the questionnaire
- raises an error if more or fewer items than expected are used to generate the subscales

if specified:
- removes item-level columns
- returns count of all items which should have been used to generate each individual subscale or total (ie item_count - nan_count = number of items used/available)

output: one row per respondent {parent|child} and assessment timepoint {T1|T2|T3|T4} and one column per subscale or per original columns found in the loaded files (in cases when a questionnaire was loaded but not scored)

note: if you specify to drop original items, you can't run this twice in a row without re-loading original data

In [7]:
# options handed to every scoring function
# item_level : 'drop' or ''; grab_item_count : 'yes' or ''
scoring_options = {'item_level': 'drop', 'grab_item_count': 'no'}

# each scoring function takes the combined dataframe and returns the updated one
for score_questionnaire in what_to_score:
    df = score_questionnaire(df, **scoring_options)
print('DONE')

Scored bisbas
scored chaphand
scored cssrs
scored ksads
scored ksadsp201
Scored masq
Scored mfq
scored nffi
Remaned PENN tasks and NIH toolbox tasks
scored rbqa
scored rcads
scored rmbi
scored shaps
Scored stai
scored tanner
scored wasi
DONE


# Reality Checks at the Dataframe level

In [13]:
# check that there are no duplicate cases (ie no full index tuple appears more than once)
assert not df.index.duplicated(keep=False).any(), 'Watch out: there are duplicated indexes'
print('OK: no duplicate indexes found')

# more specific: check that each respondent is included only once (regardless of sex, interview_date, and interview_age)
respondent_keys = df.index.droplevel(['sex','interview_date','interview_age'])
assert not respondent_keys.duplicated().any(), 'Watch out: there are duplicated indexes'
print('OK: no duplicate assessment dates for respondents')

# check that there are no columns with just NaNs for all participants
assert not df.isna().all(axis = 0).any(), 'Watch out: there are some columns that only have all NaNs' #looking for columns that are all nan - there should not be any
print ('OK: no columns have elements that are all NaNs')

# check total N by respondent {parent, child} and timepoints x {T1, T2, T3, T4} is <= 215 (which is N of recruited participants)
# rows are counted with boolean masks on the named index levels, so the check does not depend on
# the index being sorted or on every respondent x timepoint combination being present
visit_labels = df.index.get_level_values('visit')
respondent_labels = df.index.get_level_values('respondent')
for r in ['Parent','Child']:
    for t in ['T1','T2','T3','T4']:
        n_cases = int(((visit_labels == t) & (respondent_labels == r)).sum())
        assert n_cases <= 215, f'Watch out: there are more indexes corresponding to {r} at {t} than 215'
print('OK: no extra participants: sample size of respondend x timepoint is always < n=215')
print('\nAll good')

OK: no duplicate indexes found
OK: no duplicate assessment dates for respondents
OK: no columns have elements that are all NaNs
OK: no extra participants: sample size of respondend x timepoint is always < n=215

All good


### Save scored data to file

In [None]:
# write the combined table (index levels included) as a comma-separated file in the output folder
out_csv = f'{behave_d_out}/banda_behave.csv'
df.to_csv(out_csv, sep=",")