In [2]:
from hyppo.ksample import KSample
from hyppo.independence import Dcorr
from combat import combat
import pandas as pd
import glob
import os
import graspy as gp
import numpy as np
from dask.distributed import Client, progress
import dask.dataframe as ddf
from scipy.stats import zscore, rankdata, mannwhitneyu
import copy
import math
import networkx as nx
from graspy.models import SIEMEstimator as siem

In [3]:
def get_sub(fname):
    """Build a scan identifier from a file name.

    The base name of ``fname`` is split on underscores; the first, second and
    fourth fields are joined back together with underscores (the third field
    is dropped).

    Parameters
    ----------
    fname : str
        Path to a file; only its base name is used.

    Returns
    -------
    str
        ``"<field0>_<field1>_<field3>"``.

    Raises
    ------
    IndexError
        If the base name has fewer than four underscore-separated fields.
    """
    fields = os.path.basename(fname).split('_')
    return '_'.join((fields[0], fields[1], fields[3]))

def get_sub_pheno_dat(subid, scan, pheno_dat):
    """Return the ``SEX`` value of the first phenotypic row for a subject.

    The row is located by position, so the lookup is correct regardless of
    the index carried by ``pheno_dat`` (the previous implementation mixed
    index labels with positional ``.iloc`` access, which is only valid for a
    default ``RangeIndex``).

    Parameters
    ----------
    subid : int or str
        Subject identifier; converted with ``int`` before comparison against
        the ``SUBID`` column.
    scan : object
        Unused; kept so existing callers keep working.
    pheno_dat : pandas.DataFrame
        Table with at least the columns ``SUBID`` and ``SEX``.

    Returns
    -------
    int
        ``SEX`` of the first row whose ``SUBID`` equals ``subid``.

    Raises
    ------
    ValueError
        If no row matches ``subid`` or the stored value cannot be converted
        to ``int``.
    """
    is_subject = np.asarray(pheno_dat["SUBID"] == int(subid))
    positions = np.flatnonzero(is_subject)
    if positions.size == 0:
        raise ValueError("No phenotypic row found for SUBID {}.".format(subid))
    # First matching row, by position rather than by index label.
    return int(pheno_dat["SEX"].iloc[positions[0]])

def get_age_pheno_dat(subid, scan, pheno_dat):
    """Return the ``AGE_AT_SCAN_1`` value of the first row for a subject.

    The row is located by position, so the lookup is correct regardless of
    the index carried by ``pheno_dat`` (the previous implementation mixed
    index labels with positional ``.iloc`` access, which is only valid for a
    default ``RangeIndex``).

    Parameters
    ----------
    subid : int or str
        Subject identifier; converted with ``int`` before comparison against
        the ``SUBID`` column.
    scan : object
        Unused; kept so existing callers keep working.
    pheno_dat : pandas.DataFrame
        Table with at least the columns ``SUBID`` and ``AGE_AT_SCAN_1``.

    Returns
    -------
    int
        Age of the first row whose ``SUBID`` equals ``subid``. The value is
        passed through ``int``, so fractional ages are truncated.

    Raises
    ------
    ValueError
        If no row matches ``subid`` or the stored value cannot be converted
        to ``int``.
    """
    is_subject = np.asarray(pheno_dat["SUBID"] == int(subid))
    positions = np.flatnonzero(is_subject)
    if positions.size == 0:
        raise ValueError("No phenotypic row found for SUBID {}.".format(subid))
    # First matching row, by position rather than by index label.
    return int(pheno_dat["AGE_AT_SCAN_1"].iloc[positions[0]])

def apply_along_dataset(scs, dsets, fn):
    """Apply ``fn`` to every column of ``scs``, separately for each dataset.

    Parameters
    ----------
    scs : numpy.ndarray
        2-D array; rows are grouped according to ``dsets``.
    dsets : numpy.ndarray
        One label per row of ``scs``; rows sharing a label are transformed
        together.
    fn : callable
        Called through ``np.apply_along_axis`` on each column of each group.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``scs`` holding the transformed
        values.
    """
    result = np.zeros(scs.shape)
    for label in np.unique(dsets):
        in_dataset = dsets == label
        # Axis 0: `fn` sees one column restricted to this dataset's rows.
        result[in_dataset, :] = np.apply_along_axis(fn, 0, scs[in_dataset, :])
    return result

def apply_along_individual(scs, fn):
    """Apply ``fn`` to every row of ``scs``.

    This function was previously an unfinished stub that allocated the
    output array and implicitly returned ``None``. It now performs the
    row-wise counterpart of ``apply_along_dataset``.

    Parameters
    ----------
    scs : numpy.ndarray
        2-D array; ``fn`` is applied to each row.
    fn : callable
        Called through ``np.apply_along_axis`` on each row; must return an
        array of the same length as its input.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``scs`` holding the transformed
        rows.
    """
    scs = np.asarray(scs)
    scs_xfmd = np.zeros(scs.shape)
    # Axis 1: `fn` sees one full row at a time.
    scs_xfmd[...] = np.apply_along_axis(fn, 1, scs)
    return scs_xfmd

def zsc(x):
    """Standardize ``x`` to zero mean and unit standard deviation.

    Uses NumPy's default (population) variance and standard deviation. The
    input is not modified.

    Parameters
    ----------
    x : numpy.ndarray
        Values to standardize.

    Returns
    -------
    numpy.ndarray
        ``(x - mean) / std`` when the variance is strictly positive,
        otherwise an array of zeros with the shape of ``x``.
    """
    values = copy.deepcopy(x)
    # Guard clause: without strictly positive variance the ratio is undefined.
    if not (np.var(values) > 0):
        return np.zeros(values.shape)
    centered = values - np.mean(values)
    return centered / np.std(values)
    
    
def ptr(x):
    """Rank-transform the non-zero entries of ``x`` and rescale to [0, 1].

    Non-zero entries are replaced by ``rank * 2 / (n_nonzero + 1)`` where the
    ranks come from ``scipy.stats.rankdata``; zeros stay zero. If the result
    is not constant it is then min-max scaled. The input is not modified.

    The computation is done on a float copy: the previous implementation
    copied the input with its original dtype, so integer input silently
    truncated the fractional ranks when they were written back.

    Parameters
    ----------
    x : array_like
        Values to transform.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``x``. An empty input is returned as an
        empty float array.
    """
    x_ch = np.array(x, dtype=float)
    if x_ch.size == 0:
        # Nothing to rank; np.min/np.max below would raise on an empty array.
        return x_ch
    nonzero = x_ch != 0
    n_nonzero = int(nonzero.sum())
    x_ch[nonzero] = rankdata(x_ch[nonzero]) * 2 / (n_nonzero + 1)
    lowest, highest = np.min(x_ch), np.max(x_ch)
    # Skip the rescale for constant output to avoid dividing by zero.
    if lowest != highest:
        x_ch = (x_ch - lowest) / (highest - lowest)
    return x_ch

In [4]:
basepath = '/mnt/nfs2/MR/cpac_3-9-2/'
pheno_basepath = '/mnt/nfs2/MR/all_mr/phenotypic/'
# Every entry directly under `basepath` is treated as a dataset name.
datasets = os.listdir(basepath)
# Drop a "phenotypic" entry if the listing contains one; it is not a dataset.
try:
    datasets.remove("phenotypic")
except ValueError:
    # list.remove raises ValueError when the entry is absent. Catching only
    # that keeps unrelated errors (and KeyboardInterrupt) visible.
    print("No phenotypic folder in `datasets`.")
print(datasets)

No phenotypic folder in `datasets`.
['UPSM1', 'BNU1', 'HNU1', 'IBATRT', 'IPCAS1', 'Utah1', 'SWU4', 'NYU2', 'UWM', 'NKI24_std2500', 'BNU2', 'BNU3', 'NYU1', 'XHCUMS', 'IPCAS5', 'IPCAS6', 'JHNU', 'IPCAS8', 'LMU3', 'DC1', 'IACAS', 'IPCAS', 'MRNTRT', 'NKI24_mx1400', 'KKI2009', 'SWU2', 'NKI24_mx645', 'SWU3', 'MPG1', 'IPCAS2', 'SWU1', 'UM']


In [5]:
# fmri_dict[dataset] holds the loaded scans and their covariates;
# pheno_dat[dataset] holds the phenotypic table read for that dataset.
fmri_dict = {}
pheno_dat = {}
# load_failures[dataset] lists (file, error) pairs for scans that could not be
# loaded, so that skipped files can be inspected instead of vanishing silently.
load_failures = {}
# Datasets with fewer successfully loaded scans than this are excluded.
MIN_SUCCESSES = 5

for i, dataset in enumerate(datasets):
    try:
        try:
            pheno_dat[dataset] = pd.read_csv('{}{}_phenotypic_data.csv'.format(pheno_basepath, dataset))
        except Exception as err:
            # Chain the original error so the real cause is not lost.
            raise ValueError("Dataset: {} does not have a phenotypic file.".format(dataset)) from err
        # One record per scan identifier: (flattened graph, sex, age). Keeping
        # all three under the same key guarantees that scans, sex and age stay
        # aligned even if two files map to the same identifier (previously the
        # scan was overwritten while sex/age were appended a second time).
        records = {}
        failures = []
        dset_dir = os.path.join('{}{}/graphs/FSL_nff_nsc_gsr_des'.format(basepath, dataset), '*.ssv')
        files_ds = glob.glob(dset_dir)
        for f in files_ds:
            try:
                gr_dat = gp.utils.import_edgelist(f)
                sub = get_sub(f)
                scansub = sub.split('_')
                sex = get_sub_pheno_dat(scansub[1], scansub[2], pheno_dat[dataset])
                age = get_age_pheno_dat(scansub[1], scansub[2], pheno_dat[dataset])
                records[sub] = (gr_dat.flatten(), sex, age)
            except Exception as e:
                # Best effort: skip this file but remember why it failed.
                failures.append((f, repr(e)))
        load_failures[dataset] = failures
        successes = len(files_ds) - len(failures)
        print("Dataset: {} has {}/{} successes.".format(dataset, successes, len(files_ds)))
        if (successes < MIN_SUCCESSES):
            raise ValueError("Dataset: {} does not have enough successes.".format(dataset))
        loaded = list(records.values())
        # Build the entry completely before storing it, so a failure here
        # (e.g. in np.vstack) cannot leave a half-filled entry in fmri_dict.
        entry = {
            "scans": np.vstack([rec[0] for rec in loaded]),
            "subs": list(records.keys()),
            "sex": [rec[1] for rec in loaded],
            "age": [rec[2] for rec in loaded],
            # 1-based position of the dataset in `datasets`, repeated per scan.
            "dataset": [i + 1] * len(loaded),
        }
        fmri_dict[dataset] = entry
    except Exception as e:
        print("Error in {} Dataset.".format(dataset))
        print(e)

Dataset: UPSM1 has 230/230 successes.
Dataset: BNU1 has 100/100 successes.
Dataset: HNU1 has 300/300 successes.
Dataset: IBATRT has 50/50 successes.
Dataset: IPCAS1 has 60/60 successes.
Dataset: Utah1 has 52/52 successes.
Dataset: SWU4 has 466/467 successes.
Dataset: NYU2 has 4/252 successes.
Error in NYU2 Dataset.
Dataset: NYU2 does not have enough successes.
Dataset: UWM has 50/50 successes.
Error in NKI24_std2500 Dataset.
Dataset: NKI24_std2500 does not have a phenotypic file.
Dataset: BNU2 has 10/100 successes.
Dataset: BNU3 has 48/48 successes.
Dataset: NYU1 has 75/75 successes.
Dataset: XHCUMS has 115/120 successes.
Dataset: IPCAS5 has 44/44 successes.
Dataset: IPCAS6 has 30/30 successes.
Dataset: JHNU has 60/60 successes.
Dataset: IPCAS8 has 26/26 successes.
Dataset: LMU3 has 50/50 successes.
Error in DC1 Dataset.
Dataset: DC1 does not have a phenotypic file.
Dataset: IACAS has 59/59 successes.
Error in IPCAS Dataset.
Dataset: IPCAS does not have a phenotypic file.
Dataset: MRNT

In [6]:
def run_experiment(row):
    """Run the batch, sex and age effect tests for one pair of datasets.

    The scans of both datasets are stacked, edges that are zero in every scan
    are dropped, the requested transforms are applied, and three tests are
    run on the result: a two-sample DCorr test between the datasets, a
    two-sample DCorr test between ``sex == 1`` and ``sex == 2``, and a Dcorr
    independence test against age.

    Parameters
    ----------
    row : sequence
        Four leading values ``(dataset1, dataset2, sxfm, dxfm)``. They are
        read by position, so a list, tuple or pandas Series all work. ``sxfm``
        equal to ``"ptr"`` applies ``ptr`` to each scan; ``dxfm`` selects the
        per-dataset transform (``"zscore"``, ``"ptr"`` or ``"combat"``; any
        other value leaves the scans unchanged).

    Returns
    -------
    tuple
        ``(dataset1, dataset2, sxfm, dxfm, batch[0], batch[1], sex[0],
        sex[1], age[0], age[1])`` where each ``[0]``/``[1]`` pair holds the
        first two values returned by the corresponding test. A pair is
        ``(None, None)`` when that test failed; all three pairs are
        ``(None, None)`` when the data preparation itself failed.
    """
    # Read the row by position once; `row[0]` on a string-labelled Series
    # relies on a deprecated positional fallback.
    ds1, ds2, sxfm, dxfm = list(row)[:4]
    no_result = (None, None)
    eff_batch = eff_sex = eff_age = no_result
    try:
        scans_1 = fmri_dict[ds1]["scans"]
        scans_2 = fmri_dict[ds2]["scans"]
        scans = np.vstack((scans_1, scans_2))
        # Drop edges that are zero in every scan of both datasets.
        scans = scans[:, ~np.all(scans == 0, axis=0)]
        sex = np.array(fmri_dict[ds1]["sex"] + fmri_dict[ds2]["sex"])
        age = np.array(fmri_dict[ds1]["age"] + fmri_dict[ds2]["age"])
        # 1 for rows from the first dataset, 2 for rows from the second.
        batch = np.array([1] * scans_1.shape[0] + [2] * scans_2.shape[0])
        # apply per-individual transform
        if sxfm == "ptr":
            scans = np.apply_along_axis(ptr, 1, scans)
        # apply per-dataset edgewise transform ("raw" leaves scans untouched)
        if dxfm == "zscore":
            scans = apply_along_dataset(scans, batch, zsc)
        elif dxfm == "ptr":
            scans = apply_along_dataset(scans, batch, ptr)
        elif dxfm == "combat":
            scans = np.array(combat(pd.DataFrame(scans.T), batch, model=None, numerical_covariates=None)).T
        # Each test is best effort: a failure yields (None, None) for that
        # test only. `except Exception` keeps KeyboardInterrupt/SystemExit
        # propagating, unlike a bare `except`.
        try:
            eff_batch = KSample("DCorr").test(scans[batch == 1, :], scans[batch == 2, :])
        except Exception:
            eff_batch = no_result
        try:
            eff_sex = KSample("DCorr").test(scans[sex == 1, :], scans[sex == 2, :])
        except Exception:
            eff_sex = no_result
        try:
            eff_age = Dcorr().test(scans, age)
        except Exception:
            eff_age = no_result
    except Exception:
        eff_batch = eff_sex = eff_age = no_result
    return (ds1, ds2, sxfm, dxfm, eff_batch[0], eff_batch[1], eff_sex[0], eff_sex[1], eff_age[0], eff_age[1])

# Experiments

## Effects

In [7]:
# Number of dask workers to start; each worker runs a single thread.
# `ncores` is also reused later as the partition count of the experiment table.
ncores = 99
client = Client(n_workers=ncores, threads_per_worker=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43081 instead
  http_address["port"], self.http_server.port


In [8]:
# Only datasets that were loaded successfully take part in the experiments.
datasets = list(fmri_dict.keys())
# One experiment per (individual transform, unordered dataset pair, dataset
# transform); each pair is listed once, with the earlier dataset first.
exps = [
    [ds_first, ds_second, indiv_xfm, dset_xfm]
    for indiv_xfm in ["raw", "ptr"]
    for pos, ds_first in enumerate(datasets)
    for ds_second in datasets[pos + 1:]
    for dset_xfm in ["raw", "ptr", "zscore", "combat"]
]
sim_exps = pd.DataFrame(exps, columns=["Dataset1", "Dataset2", "Sxfm", "Dxfm"])
print(sim_exps.head(n=30))

   Dataset1 Dataset2 Sxfm    Dxfm
0     UPSM1     BNU1  raw     raw
1     UPSM1     BNU1  raw     ptr
2     UPSM1     BNU1  raw  zscore
3     UPSM1     BNU1  raw  combat
4     UPSM1     HNU1  raw     raw
5     UPSM1     HNU1  raw     ptr
6     UPSM1     HNU1  raw  zscore
7     UPSM1     HNU1  raw  combat
8     UPSM1   IBATRT  raw     raw
9     UPSM1   IBATRT  raw     ptr
10    UPSM1   IBATRT  raw  zscore
11    UPSM1   IBATRT  raw  combat
12    UPSM1   IPCAS1  raw     raw
13    UPSM1   IPCAS1  raw     ptr
14    UPSM1   IPCAS1  raw  zscore
15    UPSM1   IPCAS1  raw  combat
16    UPSM1    Utah1  raw     raw
17    UPSM1    Utah1  raw     ptr
18    UPSM1    Utah1  raw  zscore
19    UPSM1    Utah1  raw  combat
20    UPSM1     SWU4  raw     raw
21    UPSM1     SWU4  raw     ptr
22    UPSM1     SWU4  raw  zscore
23    UPSM1     SWU4  raw  combat
24    UPSM1      UWM  raw     raw
25    UPSM1      UWM  raw     ptr
26    UPSM1      UWM  raw  zscore
27    UPSM1      UWM  raw  combat
28    UPSM1   

In [9]:
# Split the experiment table into `ncores` partitions for dask.
sim_exps = ddf.from_pandas(sim_exps, npartitions=ncores)
# run_experiment returns a 10-tuple per row: positions 0-3 echo the experiment
# description (strings), positions 4-9 are the test outputs (floats).
sim_results = sim_exps.apply(
    run_experiment,
    axis=1,
    result_type='expand',
    meta={**{col: str for col in range(4)}, **{col: float for col in range(4, 10)}},
)
sim_results

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,object,object,object,object,float64,float64,float64,float64,float64,float64
25,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
2375,...,...,...,...,...,...,...,...,...,...
2399,...,...,...,...,...,...,...,...,...,...


In [10]:
# NOTE(review): the scheduler is passed explicitly as "multiprocessing", so this
# compute presumably does not run on the distributed `client` created earlier.
# Confirm that this is intended.
sim_results = sim_results.compute(scheduler="multiprocessing")
# Give the positional output columns of run_experiment descriptive names.
sim_results = sim_results.rename(columns={0: "Dataset1", 1: "Dataset2", 2: "Sxfm", 3: "Dxfm", 4: "Effect.Batch",
                                          5: "pvalue.Batch", 6: "Effect.Sex", 7: "pvalue.Sex",
                                          8: "Effect.Age", 9: "pvalue.Age"})
results_path = '../data/dcorr/batch_results.csv'
# Create the output directory first, so a missing folder cannot discard the
# results of the computation above.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
sim_results.to_csv(results_path)
sim_results.head(n=20)

Unnamed: 0,Dataset1,Dataset2,Sxfm,Dxfm,Effect.Batch,pvalue.Batch,Effect.Sex,pvalue.Sex,Effect.Age,pvalue.Age
0,UPSM1,BNU1,raw,raw,0.409943,1.7319320000000002e-31,0.017449,0.009331731,0.34416,9.761183000000001e-27
1,UPSM1,BNU1,raw,ptr,-0.043275,1.0,0.021491,0.004446183,-0.004665,1.0
2,UPSM1,BNU1,raw,zscore,-0.036737,1.0,0.020744,0.005094442,-0.000674,0.3778837
3,UPSM1,BNU1,raw,combat,-0.022192,1.0,0.022375,0.003785871,0.014722,0.01550477
4,UPSM1,HNU1,raw,raw,0.542544,1.0300849999999999e-64,0.0538,5.55204e-08,0.510462,5.226817e-61
5,UPSM1,HNU1,raw,ptr,-0.023164,1.0,0.065299,2.41247e-09,0.021454,0.000436191
6,UPSM1,HNU1,raw,zscore,-0.018839,1.0,0.06614,1.919265e-09,0.024762,0.0001711462
7,UPSM1,HNU1,raw,combat,-0.01199,1.0,0.071285,4.7406e-10,0.032701,1.856073e-05
8,UPSM1,IBATRT,raw,raw,0.21347,6.409222e-15,0.027246,0.003308564,0.18352,4.56078e-13
9,UPSM1,IBATRT,raw,ptr,-0.048904,1.0,0.028761,0.002622536,0.006928,0.08641645
