In [1]:
import pandas as pd
import numpy as np
import os
import os.path as op

In [2]:
datadir = "/data/RocklandSample/assessments/"
files = sorted([op.join(datadir, f) for f in os.listdir(datadir) if f.endswith('csv')])
files

['/data/RocklandSample/assessments/3760_Age_20140917.csv',
 '/data/RocklandSample/assessments/3760_HT_WT_Vitals_20140917.csv',
 '/data/RocklandSample/assessments/3760_MRI_Log_Sheet_20140917.csv',
 '/data/RocklandSample/assessments/scan_visit_list.csv']

In [3]:
age_df = pd.read_csv(files[0])
htwt_df = pd.read_csv(files[1])
log_df = pd.read_csv(files[2])
visits_df = pd.read_csv(files[3])

In [4]:
multi_df = pd.DataFrame(columns=["Anonymized ID", "Study Codes", "Visit Codes", "DWI and T1w"])
count = 0
for aid in visits_df['Anonymized ID'].unique():
    tmpdf = visits_df.loc[visits_df['Anonymized ID'] == aid] 
    if len(tmpdf) > 1:
        ims = [tmpdf["DWI"].unique, tmpdf["T1w"].unique]
        multi_df = multi_df.append({"Anonymized ID": aid,
                                    "Study Codes": tmpdf["Study Code"].unique(),
                                    "Visit Codes": tmpdf["Visit Code"].unique(),
                                    "DWI and T1w": all(ims)}, ignore_index=True)
        count += 1
print(count)

290


In [5]:
multi_df

Unnamed: 0,Anonymized ID,Study Codes,Visit Codes,DWI and T1w
0,A00018030,"[CLG, DS]","[4, 4R, 2]",True
1,A00027159,"[CLG, DS]","[4, 4R, 2]",True
2,A00027167,"[CLG, DS]","[4, 4R, 2]",True
3,A00027439,"[CLG, DS]","[4, 4R, 2]",True
4,A00028185,"[DS, NFB]","[2, 2R, 3]",True
...,...,...,...,...
285,A00066860,"[CLG, DS]","[2R, A]",True
286,A00066926,[NFB],"[3, A]",True
287,A00072203,[NFB],"[3, A]",True
288,A00073600,[NFB],"[3, A]",True


In [6]:
filehandle = "/data/RocklandSample/dmriprep_filelist.txt"
with open(filehandle, 'r') as fhandle:
    data = fhandle.readlines()

In [14]:
columns = ["subject", "dataset", "n_sessions",  "data"]
df = pd.DataFrame(columns=columns)

lut = {"aparc+aseg.nii.gz": 0,
       "T1w.nii.gz": 1,
       "dwi.bvals": 2,
       "dwi.bvecs": 3,
       "dwi.nii.gz": 4}

for line in data:
    l = line.strip()
    issub = l.startswith("sub-") and not l.endswith("gz") and not l.endswith("s")

    if issub:
        # if there's a previous subj DF, add it to the stack
        try:
            df = df.append(tmpdf, ignore_index=True, sort=False)
            del tmpdf
        except NameError as e:
            pass

        # create new subj DF
        sub = l.strip('sub-')
        tmpdf = {"subject": sub,
                 "dataset": [],
                 "n_sessions": 0,
                 "data": [   0,    0,    0,    0,    0]}
                #        aparc,   t1, bval, bvec,  dwi 
        continue

    isses = l.startswith("ses-")
    if isses:
        ses = l.strip('ses-')
        tmpdf["dataset"] += [ses]
        tmpdf["n_sessions"] += 1
        continue

    if l == "anat" or l == "dwi" or l is '':
        continue

    ending = l.split("_")[-1]
    tmpdf["data"][lut[ending]] += 1

Unnamed: 0,subject,dataset,n_sessions,data
0,A00008326,[DS2],1,"[1, 1, 1, 1, 1]"
1,A00010893,[DS2],1,"[1, 1, 1, 1, 1]"
2,A00013809,[DS2],1,"[0, 1, 1, 1, 1]"
3,A00018030,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
4,A00019903,[DS2],1,"[1, 1, 1, 1, 1]"
...,...,...,...,...
887,A00073677,[ALGA],1,"[1, 1, 1, 1, 1]"
888,A00073705,[DSA],1,"[1, 1, 1, 1, 1]"
889,A00073942,[ALGA],1,"[1, 1, 1, 1, 1]"
890,A00073953,[ALGA],1,"[1, 1, 1, 1, 1]"


In [24]:
multi_df = df.copy()

single_sessions = multi_df[multi_df["n_sessions"] < 2].index
multi_df.drop(single_sessions, inplace=True)

missing_data = multi_df[multi_df["data"].map(sum) < 5*multi_df["n_sessions"]].index
multi_df.drop(missing_data, inplace=True)

ds = lambda x: "DS2" not in x and "DSA" not in x
only_x_sectional = multi_df[multi_df["dataset"].map(ds)].index
multi_df.drop(only_x_sectional, inplace=True)

multi_df.reset_index(inplace=True, drop=True)

In [25]:
multi_df

Unnamed: 0,subject,dataset,n_sessions,data
0,A00018030,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
1,A00027159,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
2,A00027167,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
3,A00027439,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
4,A00031881,"[CLG4, CLG4R, CLG5, DS2]",4,"[4, 4, 4, 4, 4]"
5,A00032875,"[DS2, NFBAR]",2,"[2, 2, 2, 2, 2]"
6,A00033714,"[CLG4, CLG4R, CLG5, DS2]",4,"[4, 4, 4, 4, 4]"
7,A00034350,"[CLG4, CLG4R, DS2]",3,"[3, 3, 3, 3, 3]"
8,A00034854,"[DS2, NFB2R]",2,"[2, 2, 2, 2, 2]"
9,A00035291,"[CLG4, DS2]",2,"[2, 2, 2, 2, 2]"


In [99]:
newdf.to_csv("/data/RocklandSample/subjects_sessions_to_process.csv", index=False)

In [111]:
sum(newdf["n_sessions"])

256

In [107]:
flatten = [f for l in newdf.dataset.values for f in l]
[(x, flatten.count(x)) for x in set(flatten)]

[('NFBAR', 3),
 ('NFB2R', 1),
 ('ALGA', 5),
 ('DSA', 10),
 ('CLG4', 35),
 ('CLG5', 7),
 ('CLG2R', 83),
 ('NFB2', 4),
 ('CLGA', 16),
 ('CLG4R', 14),
 ('CLG2', 46),
 ('DS2', 32)]

In [112]:
flatten = [f[:3] for l in df.dataset.values for f in l]
[(x, flatten.count(x)) for x in set(flatten)]

[('DSA', 133), ('NFB', 102), ('ALG', 23), ('DS2', 538), ('CLG', 265)]