In [20]:
from hyppo.ksample import KSample
from combat import combat
import pandas as pd
import glob
import os
import graspy as gp
import numpy as np
from dask.distributed import Client, progress
import dask.dataframe as ddf
from scipy.stats import zscore, rankdata
import copy
import math

In [24]:
def get_sub(fname):
    stext = os.path.basename(fname).split('_')
    return('{}_{}'.format(stext[1], stext[3]))

def get_sub_pheno_dat(subid, scan, pheno_dat):
    matches = pheno_dat.index[pheno_dat["SUBID"] == int(subid)].tolist()
    match = np.min(matches)
    return(int(pheno_dat.iloc[match]["SEX"]))

def apply_along_dataset(scs, dsets, fn):
    scs_xfmd = np.zeros(scs.shape) 
    for dset in np.unique(dsets):
        scs_xfmd[dsets == dset,:] = np.apply_along_axis(fn, 0, scs[dsets == dset,:])
    return(scs_xfmd)

def ptr(x):
    #x_ch = copy.deepcopy(x)
    nz = x[x != 0]
    x_rank = rankdata(nz)*2/(len(nz) + 1)
    x[x != 0] = x_rank
    x = (x - np.min(x))/(np.max(x) - np.min(x))
    return(x)

In [25]:
basepath = '/data/'
pheno_basepath = '/phenotypic/'
datasets = os.listdir(basepath)
datasets.remove("phenotypic")
print(datasets)

['BNU1', 'HNU1']


In [26]:
fmri_dict = {}
pheno_dat = {}

for i, dataset in enumerate(datasets):
    try:
        pheno_dat[dataset] = pd.read_csv('{}{}_phenotypic_data.csv'.format(pheno_basepath, dataset))
        scan_dict = {}
        sex_dict = []
        dset_dir = os.path.join('{}{}/graphs/FSL_nff_nsc_gsr_des'.format(basepath, dataset), '*.ssv')
        for f in glob.glob(dset_dir):
            gr_dat = gp.utils.import_edgelist(f)
            sub = get_sub(f)
            scan_dict[sub] = gr_dat.flatten()
            scansub = sub.split('_')
            sex_dict.append(get_sub_pheno_dat(scansub[0], scansub[1], pheno_dat[dataset]))
        fmri_dict[dataset] = {}
        fmri_dict[dataset]["scans"] = np.vstack(list(scan_dict.values()))
        fmri_dict[dataset]["subs"] = list(scan_dict.keys())
        fmri_dict[dataset]["sex"] = sex_dict
        fmri_dict[dataset]["dataset"] = [i + 1 for j in range(0, fmri_dict[dataset]["scans"].shape[0])]
    except Exception as e:
        print("Error in {} Dataset.".format(dataset))
        print(e)

In [27]:
def run_experiment(row):
    try:
        ds1 = row[0]; ds2 = row[1]; xfm = row[2]
        scans = np.vstack((fmri_dict[ds1]["scans"], fmri_dict[ds2]["scans"]))
        scans = scans[:,~np.all(scans == 0, axis=0)]
        sex = np.array(fmri_dict[ds1]["sex"] + fmri_dict[ds2]["sex"])
        datasets = np.array([1 for i in range(0, fmri_dict[ds1]["scans"].shape[0])] + [2 for i in range(0, fmri_dict[ds2]["scans"].shape[0])])
        if xfm == "raw":
            scans = scans
        elif xfm == "zscore":
            scans = apply_along_dataset(scans, datasets, zscore)
        elif xfm == "ptr":
            scans = apply_along_dataset(scans, datasets, ptr)
        elif xfm == "combat":
            scans = np.array(combat(pd.DataFrame(scans.T), datasets, model=None, numerical_covariates=None)).T
        eff_batch = KSample("DCorr").test(scans[datasets == 1,:], scans[datasets == 2,:])
        eff_sex = KSample("DCorr").test(scans[sex == 1,:], scans[sex == 2,:])
    except:
        eff_batch = (None, None)
        eff_sex = (None, None)
    return (row[0], row[1], row[2], eff_batch[0], eff_batch[1], eff_sex[0], eff_sex[1])

## Experiments

In [28]:
ncores = 30
client = Client(threads_per_worker=1, n_workers=ncores)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34353 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:36971  Dashboard: http://127.0.0.1:34353/status,Cluster  Workers: 6  Cores: 6  Memory: 33.51 GB


In [29]:
exps = []
datasets = list(fmri_dict.keys())
for i, ds1 in enumerate(datasets):
    for j in range(i+1, len(datasets)):
        for xfm in ["raw", "ptr", "zscore", "combat"]:
            exps.append([ds1, datasets[j], xfm])
sim_exps = pd.DataFrame(exps, columns=["Dataset1", "Dataset2", "Transform"])
print(sim_exps.head())

  Dataset1 Dataset2 Transform
0     BNU1     HNU1       raw
1     BNU1     HNU1       ptr
2     BNU1     HNU1    zscore
3     BNU1     HNU1    combat


In [None]:
sim_exps = ddf.from_pandas(sim_exps, npartitions=ncores)
sim_results = sim_exps.apply(lambda x: run_experiment(x), axis=1, result_type='expand',
                             meta={0: str, 1: str, 2: str, 3: float, 4: float, 5: float, 6: float})
sim_results

In [30]:
sim_results = sim_results.compute(scheduler="multiprocessing")
sim_results = sim_results.rename(columns={0: "Dataset1", 1: "Dataset2", 2: "Transform", 3: "Effect.Batch",
                                          4: "pvalue.Batch", 5: "Effect.Sex", 6: "pvalue.Sex"})
sim_results.to_csv('./data/batch_results.csv')

NameError: name 'sim_results' is not defined

(0.26217454597493484, 7.874182809594927e-25)

## Combatted

In [10]:
combat_scans = combat(pd.DataFrame(scans.T), datasets, model=None, numerical_covariates=None)


found 2 batches
found 0 numerical covariates...
found 0 categorical variables:	
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


Adjusting data


In [11]:
eff_batch = KSample("DCorr").test(combat_scans[datasets == 1,:], combat_scans[datasets == 2,:])

TypeError: '(array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False]), slice(None, None, None))' is an invalid key

In [12]:
eff_sex = KSample("DCorr").test(combat_scans[sex == 1,:], combat_scans[sex == 2,:])

TypeError: '(array([False, False, False,  True, False,  True,  True, False, False,
       False, False,  True, False, False,  True, False,  True, False,
        True, False, False, False, False, False,  True,  True,  True,
       False, False, False, False, False,  True, False,  True, False,
       False, False, False,  True, False,  True, False,  True,  True,
        True,  True,  True, False, False,  True, False,  True,  True,
       False, False,  True,  True, False, False, False,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False, False, False,  True,  True,  True, False,  True,
        True,  True, False,  True, False,  True, False,  True,  True,
        True, False, False, False, False,  True,  True, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False,  True,  True,  True, False, False,  True, False, False,
        True, False,  True,  True, False, False,  True, False,  True,
        True,  True, False, False, False,  True,  True, False,  True,
        True,  True, False, False,  True, False, False,  True, False,
        True,  True, False,  True,  True,  True, False, False, False,
        True, False, False, False, False,  True, False,  True, False,
       False,  True, False,  True,  True, False, False, False,  True,
       False, False, False, False, False, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False,  True,
       False,  True,  True, False, False,  True, False,  True, False,
        True, False, False,  True,  True, False,  True, False,  True,
       False, False,  True,  True, False,  True, False,  True, False,
        True, False,  True,  True, False,  True, False,  True, False,
        True, False,  True,  True,  True, False, False, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False,  True,  True,  True, False,
        True,  True, False,  True, False,  True,  True, False,  True,
       False,  True, False,  True, False, False, False, False,  True,
       False,  True, False,  True,  True, False, False,  True,  True,
       False, False,  True,  True,  True,  True, False,  True,  True,
       False, False,  True,  True, False, False,  True, False,  True,
       False,  True,  True,  True, False,  True, False,  True, False,
        True,  True,  True, False,  True,  True, False,  True, False,
       False, False,  True,  True, False,  True,  True, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
       False, False, False,  True, False, False, False,  True, False,
        True, False, False,  True,  True, False,  True,  True, False,
       False, False,  True,  True]), slice(None, None, None))' is an invalid key

In [1]:
eff_batch

NameError: name 'eff_sex' is not defined

In [14]:
combat_scans

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.079961,0.169019,0.013939,0.075546,0.324519,0.066174,0.130587,0.173839,0.027809,0.138987,...,0.026226,0.295544,0.012179,0.163396,0.162671,0.170088,0.141930,0.364129,0.032054,0.260242
1,0.136965,0.027485,0.068595,0.092722,0.471526,0.045892,0.182966,0.067399,0.201577,0.040022,...,0.067657,0.358156,0.301873,0.012701,0.259114,0.020551,0.250263,0.030788,0.374616,0.017455
2,0.017515,0.174713,0.117976,0.201546,0.455948,0.258870,0.026215,0.256086,0.036762,0.208059,...,0.196671,0.402563,0.194152,0.284823,0.378998,0.048994,0.315422,0.385325,0.016784,0.383083
3,0.149464,0.226378,0.040478,0.104469,0.385379,0.033653,0.089574,0.173290,0.236768,0.077180,...,0.103193,0.201414,0.212215,0.021843,0.224856,0.048098,0.208125,0.252847,0.229207,0.019543
4,0.051282,0.035035,0.142592,0.324699,0.102383,0.108207,0.029470,0.039923,0.037383,0.062477,...,0.057409,0.138324,0.156167,0.059341,0.244549,0.134895,0.147321,0.256275,0.241786,0.074325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825,0.043034,0.107710,0.079939,0.012422,0.228633,0.314604,0.183382,0.042760,0.163151,0.205678,...,0.023906,0.018651,0.162969,0.164098,0.243588,0.275008,0.128743,0.166689,0.152616,0.084700
4826,0.188554,0.037594,0.218519,0.310148,0.224841,0.019094,0.245050,0.104442,0.200475,0.211149,...,0.090584,0.056020,0.067998,0.208495,0.184677,0.021908,0.078296,0.200886,0.292110,0.092889
4827,0.040945,0.012145,0.093545,0.167038,0.037455,0.383224,0.020973,0.151532,0.173338,0.199138,...,0.068557,0.255053,0.139579,0.199687,0.009020,0.062191,0.092607,0.258694,0.029665,0.025783
4828,0.358038,0.401410,0.033422,0.208878,0.389438,0.402954,0.256473,0.398740,0.259403,0.187288,...,0.107166,0.335564,0.130142,0.344607,0.179272,0.445320,0.330615,0.357884,0.242037,0.104853
