In [1]:
from hyppo.ksample import KSample
from hyppo.independence import Dcorr
from combat import combat
import pandas as pd
import glob
import os
import graspy as gp
import numpy as np
from dask.distributed import Client, progress
import dask.dataframe as ddf
from scipy.stats import zscore, rankdata, mannwhitneyu
import copy
import math
import networkx as nx
from graspy.models import SIEMEstimator as siem
import re

In [2]:
def get_sub_pheno_dat(subid, scan, pheno_dat):
    """
    Look up the recorded sex code of a subject in the phenotypic table.

    Parameters
    ----------
    subid : str or int
        Subject identifier. It is converted with ``int`` before being compared
        with the ``SUBID`` column, so leading zeros are ignored.
    scan : object
        Unused; kept so that existing callers do not need to change.
    pheno_dat : pandas.DataFrame
        Table with at least the columns ``SUBID`` and ``SEX``.

    Returns
    -------
    int
        ``SEX`` value of the first row whose ``SUBID`` equals ``subid``.

    Raises
    ------
    ValueError
        If no row matches ``subid``, or if the stored value cannot be
        converted to ``int`` (e.g. it is missing).
    """
    # Work with row *positions* throughout. Index labels must not be handed to
    # ``.iloc``: that only happens to work when the frame has a default
    # 0..n-1 index.
    positions = np.flatnonzero(np.asarray(pheno_dat["SUBID"] == int(subid)))
    if positions.size == 0:
        raise ValueError("No phenotypic entry for subject {}.".format(subid))
    return int(pheno_dat["SEX"].iloc[positions[0]])

def get_age_pheno_dat(subid, scan, pheno_dat):
    """
    Look up the recorded age of a subject in the phenotypic table.

    Parameters
    ----------
    subid : str or int
        Subject identifier. It is converted with ``int`` before being compared
        with the ``SUBID`` column, so leading zeros are ignored.
    scan : object
        Unused; kept so that existing callers do not need to change.
    pheno_dat : pandas.DataFrame
        Table with at least the columns ``SUBID`` and ``AGE_AT_SCAN_1``.

    Returns
    -------
    int
        ``AGE_AT_SCAN_1`` of the first row whose ``SUBID`` equals ``subid``,
        truncated towards zero by ``int``.

    Raises
    ------
    ValueError
        If no row matches ``subid``, or if the stored value cannot be
        converted to ``int`` (e.g. it is missing).
    """
    # Work with row *positions* throughout. Index labels must not be handed to
    # ``.iloc``: that only happens to work when the frame has a default
    # 0..n-1 index.
    positions = np.flatnonzero(np.asarray(pheno_dat["SUBID"] == int(subid)))
    if positions.size == 0:
        raise ValueError("No phenotypic entry for subject {}.".format(subid))
    return int(pheno_dat["AGE_AT_SCAN_1"].iloc[positions[0]])

def apply_along_dataset(scs, dsets, fn):
    """
    Transform every column of ``scs`` separately within each dataset.

    Parameters
    ----------
    scs : numpy.ndarray
        2-d array; rows are grouped by the labels in ``dsets``.
    dsets : numpy.ndarray
        One label per row of ``scs``.
    fn : callable
        Receives the 1-d column of one dataset and returns the transformed
        column.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``scs`` holding the transformed values.
    """
    transformed = np.zeros(scs.shape)
    for label in np.unique(dsets):
        rows = dsets == label
        # axis 0: fn sees one column (restricted to this dataset's rows) at a time
        transformed[rows, :] = np.apply_along_axis(fn, 0, scs[rows, :])
    return transformed

def apply_along_individual(scs, fn):
    """
    Transform every row of ``scs`` separately.

    Row-wise counterpart of ``apply_along_dataset``. The previous body only
    allocated the output array and implicitly returned ``None``.

    Parameters
    ----------
    scs : numpy.ndarray
        2-d array with one row per individual scan.
    fn : callable
        Receives one 1-d row and returns the transformed row (same length).

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``scs`` holding the transformed rows.
    """
    scs_xfmd = np.zeros(scs.shape)
    # axis 1: fn sees one row at a time
    scs_xfmd[:, :] = np.apply_along_axis(fn, 1, scs)
    return scs_xfmd

def zsc(x):
    """
    Standardise ``x`` to zero mean and unit (population) standard deviation.

    Returns an array of zeros with the shape of ``x`` when the variance is
    not strictly positive, so constant input never divides by zero. The
    input is not modified.
    """
    # guard clause: nothing to scale when there is no spread
    if not (np.var(x) > 0):
        return np.zeros(x.shape)
    centred = x - np.mean(x)
    return centred / np.std(x)

def ptr(x):
    """
    Pass-to-ranks transform followed by min-max scaling.

    Non-zero entries are replaced by ``2 * rank / (n_nonzero + 1)`` (ties get
    the average rank, as computed by ``scipy.stats.rankdata``); zeros stay
    zero. Unless all resulting values are equal, the result is then rescaled
    linearly so that its minimum is 0 and its maximum is 1.

    Parameters
    ----------
    x : array_like
        Values to transform. The input is not modified.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``x``.
    """
    # Always work on a float copy: writing fractional ranks into an
    # integer-typed copy would silently truncate them.
    x_ch = np.array(x, dtype=float)
    if x_ch.size == 0:
        return x_ch

    nonzero = x_ch != 0
    n_nonzero = np.count_nonzero(nonzero)
    if n_nonzero > 0:
        x_ch[nonzero] = rankdata(x_ch[nonzero]) * 2 / (n_nonzero + 1)

    lo, hi = np.min(x_ch), np.max(x_ch)
    if lo != hi:
        x_ch = (x_ch - lo) / (hi - lo)
    return x_ch

In [3]:
# Directory produced by download_aws.sh. Each entry inside it is treated as
# one dataset (see the os.listdir call below).
# NOTE(review): absolute path of the original machine — adjust before re-running.
basepath = '/mnt/nfs2/MR/corr/corr_m2g/graphs/m2g/fmri/'

# Path to the CSV *file* with the phenotypic annotations that accompanies the
# download_aws.sh script; it is read into a DataFrame right away.
pheno_basepath = '/mnt/nfs2/MR/corr/corr_m2g/phenotypic/CoRR_AggregatedPhenotypicData.csv'
pheno_dat = pd.read_csv(pheno_basepath)
# Names of the entries found under basepath, in the order os.listdir returns them.
datasets = os.listdir(basepath)
print(datasets)

['IBATRT', 'Utah1', 'BMB_1', 'IPCAS_2', 'ABIDEII-TCD_1', 'SWU1', 'UWM', 'BNU2', 'XHCUMS', 'SWU4', 'IPCAS_3', 'ABIDEII-SDSU_1', 'SWU3', 'IPCAS_4', 'NYU_2', 'IPCAS_1', 'IPCAS_7', 'UPSM_1', 'ABIDEII-BNI_1', 'IACAS_1', 'IPCAS_5', 'NYU_1', 'BNU1', 'MRN_1', 'BNU3', 'HNU1', 'SWU2', 'IPCAS_8', 'JHNU', 'IPCAS_6']


In [4]:
# Minimum number of successfully loaded scans a dataset needs to be kept.
min_successes = 5

# dataset name -> {"Data": (n_scans, n_edges) array, plus per-scan label lists}
fmri_dict = {}
# dataset name -> list of (file path, error repr) for every file that could not
# be loaded. Kept for inspection instead of discarding the errors silently.
load_failures = {}

for dataset in datasets:
    try:
        # os.path.join works whether or not basepath ends with a separator
        files_ds = glob.glob(os.path.join(basepath, dataset, '*.csv'))

        scans = []
        sexs = []
        ages = []
        ds_lab = []
        subjects = []
        subids = []
        sessions = []
        failed = []
        for f in files_ds:
            # obtain graph for this subject
            try:
                gr_dat = gp.utils.import_edgelist(f).flatten()
                # The file name is split on '-' and '_'; field 1 is used as the
                # subject id and field 3 as the session id.
                scansub = re.split('-|_', os.path.basename(f))
                sex = get_sub_pheno_dat(scansub[1], scansub[3], pheno_dat)
                age = get_age_pheno_dat(scansub[1], scansub[3], pheno_dat)
                subid = "dataset-{}_sub-{}_ses-{}".format(dataset, scansub[1], scansub[3])
                # Append only after everything above succeeded, so the lists
                # always stay aligned with each other.
                scans.append(gr_dat)
                sexs.append(sex)
                ages.append(age)
                subjects.append(scansub[1])
                ds_lab.append(dataset)
                subids.append(subid)
                sessions.append(scansub[3])
            except Exception as e:
                # Best effort: skip the file, but remember why it was skipped.
                failed.append((f, repr(e)))

        if failed:
            load_failures[dataset] = failed

        successes = len(files_ds) - len(failed)
        if (successes < min_successes):
            raise ValueError("Dataset: {} does not have enough successes.".format(dataset))

        # add it in assuming there are enough unique files with metadata annotation
        scans = np.vstack(scans)
        fmri_dict[dataset] = {"Data": scans, "Subject": subjects, "Session": sessions, "Subid": subids,
                              "Sex": sexs, "Age": ages, "Dataset": ds_lab}

    except Exception as e:
        # A failing dataset is reported and left out; the remaining ones still load.
        print("Error in {} Dataset.".format(dataset))
        print(e)

Error in BMB_1 Dataset.
Dataset: BMB_1 does not have enough successes.
Error in ABIDEII-TCD_1 Dataset.
Dataset: ABIDEII-TCD_1 does not have enough successes.
Error in ABIDEII-SDSU_1 Dataset.
Dataset: ABIDEII-SDSU_1 does not have enough successes.
Error in ABIDEII-BNI_1 Dataset.
Dataset: ABIDEII-BNI_1 does not have enough successes.


In [5]:
# Number of datasets that were kept in fmri_dict by the loading loop.
len(fmri_dict)

26

In [6]:
# Start a dask.distributed client with `ncores` workers, one thread each.
# `ncores` is also reused later as the number of dask partitions.
# NOTE(review): 99 is hard-coded, presumably for the original machine — adjust to the host.
ncores = 99
client = Client(threads_per_worker=1, n_workers=ncores)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33077 instead
  http_address["port"], self.http_server.port


## Preservation of Network Statistics

In [7]:
def diag_edges(n):
    """
    Build the edge-community matrix for the "diagonal" SIEM.

    With ``m = int(n/2)``, entry ``(i, j)`` is 1 when the two vertices are
    exactly ``m`` apart (``|i - j| == m``), 2 for every other off-diagonal
    pair, and 0 on the diagonal.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float array of community labels.
    """
    half = int(n/2)
    idx = np.arange(n)
    # |i - j| for every vertex pair, computed by broadcasting
    offset = np.abs(idx[:, None] - idx[None, :])
    communities = np.where(offset == half, 1.0, 2.0)
    np.fill_diagonal(communities, 0)
    return communities

def modular_edges(n):
    """
    Build the edge-community matrix for a two-block (modular) SBM.

    Vertices ``0 .. m-1`` form one block and ``m .. n-1`` the other, with
    ``m = int(n/2)``. Entry ``(i, j)`` is 1 when both vertices lie in the same
    block, 2 when they lie in different blocks, and 0 on the diagonal.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float array of community labels.
    """
    half = int(n/2)
    in_second_block = np.arange(n) >= half
    # True where both endpoints fall in the same block
    same_block = in_second_block[:, None] == in_second_block[None, :]
    communities = np.where(same_block, 1.0, 2.0)
    np.fill_diagonal(communities, 0)
    return communities

# Edge-community matrices for graphs with 70 vertices; they are compared
# element-wise against each reshaped (nv x nv) connectome in singlegraph_exp.
# NOTE(review): 70 must equal the vertex count of the loaded graphs
# (sqrt of the flattened edge count) — confirm if the parcellation changes.
des_diag = diag_edges(70)
des_mod = modular_edges(70)

def mww(G, C):
    """
    One-sided Mann-Whitney U test between two edge communities of a graph.

    Parameters
    ----------
    G : numpy.ndarray
        Edge weights.
    C : numpy.ndarray
        Community label of every entry of ``G`` (same shape); labels 1 and 2
        are compared, all other entries are ignored.

    Returns
    -------
    list
        ``[statistic, pvalue, mean of community 1, mean of community 2]`` for
        the alternative that community 1 is stochastically greater than
        community 2.
    """
    first = G[C == 1]
    second = G[C == 2]
    outcome = mannwhitneyu(first, second, alternative='greater')
    return [*outcome, np.mean(first), np.mean(second)]

In [8]:
def _flatten_field(field):
    """Concatenate one per-scan field of fmri_dict across datasets, in dict order."""
    return np.array([value for ds in fmri_dict for value in fmri_dict[ds][field]])

# Stack the per-dataset scan matrices into one (n_scans, n_edges) array.
dset_ls = [entry["Data"] for entry in fmri_dict.values()]
raw_dat = np.vstack(dset_ls)

# Per-scan labels, aligned row-for-row with raw_dat.
# Note: `datasets` is rebound here from the directory listing to one label per scan.
datasets = _flatten_field("Dataset")
subjects = _flatten_field("Subject")
sessions = _flatten_field("Session")
subids = _flatten_field("Subid")
sexs = _flatten_field("Sex")
ages = _flatten_field("Age")

In [9]:
# (number of scans, number of flattened edges per scan) of the stacked data.
raw_dat.shape

(2617, 4900)

In [10]:
def prepare_aggregate_data(scans, datasets):
    """
    Produce the batch-corrected variants of a stacked scan matrix.

    Parameters
    ----------
    scans : numpy.ndarray
        2-d array with one row per scan and one column per edge.
    datasets : numpy.ndarray
        Dataset (batch) label of every row of ``scans``.

    Returns
    -------
    dict
        ``"raw"``    : untouched copy of ``scans``;
        ``"zscore"`` : every edge z-scored within each dataset, NaNs set to 0;
        ``"ptr"``    : every edge passed through ``ptr`` within each dataset;
        ``"combat"`` : output of ``combat`` with ``datasets`` as the batch
        variable. Edges that are zero in every scan are not given to
        ``combat`` and keep their original values.
    """
    # Columns with at least one non-zero entry; only these are handed to combat.
    informative_edges = ~np.all(scans == 0, axis=0)

    # z-score within dataset; undefined scores (e.g. constant edges) become 0
    zscored = apply_along_dataset(copy.deepcopy(scans), datasets, zscore)
    zscored[np.isnan(zscored)] = 0

    ranked = apply_along_dataset(copy.deepcopy(scans), datasets, ptr)

    # combat receives the data transposed (edges as rows, scans as columns),
    # so transpose going in and coming back out
    harmonized = copy.deepcopy(scans)
    edges_by_scan = pd.DataFrame(harmonized[:, informative_edges].T)
    adjusted = combat(edges_by_scan, datasets, model=None, numerical_covariates=None)
    harmonized[:, informative_edges] = np.array(adjusted).T

    return {
        "raw": copy.deepcopy(scans),
        "zscore": zscored,
        "ptr": ranked,
        "combat": harmonized,
    }

# Two-level lookup used by singlegraph_exp: data_preproc[sxfm][dxfm].
#   first key  - transform applied to each scan (row) on its own:
#                "raw" = none, "ptr" = ptr applied along every row;
#   second key - the per-dataset variant returned by prepare_aggregate_data
#                ("raw", "zscore", "ptr", "combat").
data_preproc = {}
data_preproc["raw"] = prepare_aggregate_data(raw_dat, datasets)
data_preproc["ptr"] = prepare_aggregate_data(np.apply_along_axis(ptr, 1, raw_dat), datasets)

found 26 batches
found 0 numerical covariates...
found 0 categorical variables:	
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


Adjusting data


found 26 batches
found 0 numerical covariates...
found 0 categorical variables:	
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


Adjusting data


In [11]:
# One experiment row per (scan, per-scan transform, per-dataset transform).
# "Ix" is the row of the scan in the stacked data matrices.
# NOTE(review): "Fullname" is filled with the subject id and therefore
# duplicates "Subject"; `subids` may have been intended here — confirm before
# changing, since the saved results would change too.
exps = [
    [datasets[i], subjects[i], sessions[i], sexs[i], ages[i], i, sub, sxfm, dxfm]
    for i, sub in enumerate(subjects)
    for sxfm in ["raw", "ptr"]
    for dxfm in ["raw", "zscore", "ptr", "combat"]
]
exp_columns = ["Dataset", "Subject", "Retest", "Sex", "Age",
               "Ix", "Fullname", "Sxfm", "Dxfm"]
sim_exps = pd.DataFrame(exps, columns=exp_columns)
print(sim_exps.head(n=20))

   Dataset  Subject Retest  Sex  Age  Ix Fullname Sxfm    Dxfm
0   IBATRT  0027241     11    2   27   0  0027241  raw     raw
1   IBATRT  0027241     11    2   27   0  0027241  raw  zscore
2   IBATRT  0027241     11    2   27   0  0027241  raw     ptr
3   IBATRT  0027241     11    2   27   0  0027241  raw  combat
4   IBATRT  0027241     11    2   27   0  0027241  ptr     raw
5   IBATRT  0027241     11    2   27   0  0027241  ptr  zscore
6   IBATRT  0027241     11    2   27   0  0027241  ptr     ptr
7   IBATRT  0027241     11    2   27   0  0027241  ptr  combat
8   IBATRT  0027240     22    2   40   1  0027240  raw     raw
9   IBATRT  0027240     22    2   40   1  0027240  raw  zscore
10  IBATRT  0027240     22    2   40   1  0027240  raw     ptr
11  IBATRT  0027240     22    2   40   1  0027240  raw  combat
12  IBATRT  0027240     22    2   40   1  0027240  ptr     raw
13  IBATRT  0027240     22    2   40   1  0027240  ptr  zscore
14  IBATRT  0027240     22    2   40   1  0027240  ptr 

In [12]:
def singlegraph_exp(row):
    """
    Compute the network statistics of one scan under one preprocessing choice.

    Parameters
    ----------
    row : sequence or pandas.Series
        Nine values in the order
        (Dataset, Subject, Retest, Sex, Age, Ix, Fullname, Sxfm, Dxfm).
        ``Sxfm``/``Dxfm`` select the matrix ``data_preproc[Sxfm][Dxfm]`` and
        ``Ix`` selects the scan (row) inside it.

    Returns
    -------
    tuple
        The nine input values followed by: average weighted clustering
        coefficient, mean weighted degree, then for the modular ("homophilic")
        and diagonal ("homotopic") edge communities, interleaved in that
        order: mean of community 1, mean of community 2, Mann-Whitney p-value
        and Mann-Whitney statistic.
    """
    # Read the fields by position from a plain tuple. Integer keys on a
    # label-indexed Series (row[7]) rely on a positional fallback that pandas
    # has deprecated.
    fields = tuple(row)
    ix, sxfm, dxfm = fields[5], fields[7], fields[8]

    # grab data, and reshape it to nv x nv matrix
    flat_gr = data_preproc[sxfm][dxfm][ix, :]
    nv = int(np.sqrt(np.max(flat_gr.shape)))
    exp_gr = flat_gr.reshape((nv, nv))

    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
    # equivalent constructor for a plain ndarray.
    G = nx.from_numpy_array(exp_gr)
    # NOTE(review): with negative edge weights (e.g. after z-scoring) the
    # recorded outputs show complex-valued clustering coefficients — confirm
    # how these should be treated downstream.
    cc = nx.average_clustering(G, weight="weight")
    deg = np.array(list(dict(G.degree(weight="weight")).values())).mean()

    # each result is [statistic, pvalue, mean community 1, mean community 2]
    homophilic = mww(exp_gr, des_mod)
    homotopic = mww(exp_gr, des_diag)
    return(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7],
           fields[8], cc, deg, homophilic[2], homotopic[2], homophilic[3], homotopic[3],
           homophilic[1], homotopic[1], homophilic[0], homotopic[0])

In [13]:
# Spread the experiment table over `ncores` dask partitions.
# Note: this rebinds sim_exps from a pandas frame to a dask frame.
sim_exps = ddf.from_pandas(sim_exps, npartitions=ncores)

# Declared output layout of singlegraph_exp: nine pass-through columns (0-8)
# followed by ten float statistics (9-18).
result_meta = {col: str for col in range(9)}
result_meta.update({col: float for col in range(9, 19)})

# Lazy: nothing is evaluated until .compute() is called.
sim_results = sim_exps.apply(singlegraph_exp, axis=1, result_type='expand',
                             meta=result_meta)
sim_results

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
npartitions=99,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,object,object,object,object,object,object,object,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
212,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20776,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20935,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [14]:
# Evaluate the lazy dask frame.
# NOTE(review): an explicit scheduler is passed here, so the distributed
# Client created earlier is presumably not used for this step — confirm.
sim_results = sim_results.compute(scheduler="multiprocessing")

# Names for the positional columns 0..18 returned by singlegraph_exp, in order.
result_columns = [
    "Dataset", "Subject", "Retest", "Sex", "Age", "Ix",
    "Fullname", "Sxfm", "Dxfm", "Clustering",
    "Degree", "Homophilic_mean", "Homotopic_mean",
    "Heterophilic_mean", "Heterotopic_mean",
    "Homophilic_pvalue", "Homotopic_pvalue",
    "Homophilic_stat", "Homotopic_stat",
]
sim_results = sim_results.rename(columns=dict(enumerate(result_columns)))
sim_results.to_csv('../data/summary/batch_statistics.csv')
sim_results.head(n=30)

Unnamed: 0,Dataset,Subject,Retest,Sex,Age,Ix,Fullname,Sxfm,Dxfm,Clustering,Degree,Homophilic_mean,Homotopic_mean,Heterophilic_mean,Heterotopic_mean,Homophilic_pvalue,Homotopic_pvalue,Homophilic_stat,Homotopic_stat
0,IBATRT,27241,11,2,27,0,27241,raw,raw,0.412969+0.000000j,29.562112,0.454585,0.735618,0.403034,0.423919,8.963445e-20,2.565275e-27,3352780.0,291260.0
1,IBATRT,27241,11,2,27,0,27241,raw,zscore,0.210416+0.080516j,6.282878,0.20338,-0.386291,-0.018059,0.098076,7.399481e-20,0.9989675,3353796.0,130920.0
2,IBATRT,27241,11,2,27,0,27241,raw,ptr,0.480426+0.000000j,36.602165,0.564583,0.400866,0.497324,0.532372,4.358851e-19,0.9999165,3344282.0,123006.0
3,IBATRT,27241,11,2,27,0,27241,raw,combat,0.354152+0.002292j,25.28256,0.392937,0.67385,0.340649,0.361893,5.555193000000001e-22,2.0481210000000003e-27,3378980.0,291500.0
4,IBATRT,27241,11,2,27,0,27241,ptr,raw,0.439415+0.000000j,34.510715,0.538199,0.868901,0.463199,0.494733,8.963445e-20,2.565275e-27,3352780.0,291260.0
5,IBATRT,27241,11,2,27,0,27241,ptr,zscore,0.231486+0.095148j,-1.820399,0.105771,-0.930452,-0.15476,-0.013087,5.1520169999999996e-20,0.9997842,3355708.0,125832.0
6,IBATRT,27241,11,2,27,0,27241,ptr,ptr,0.437185+0.000000j,34.460576,0.536755,0.365125,0.463168,0.501404,3.8809989999999995e-19,0.9999464,3344920.0,121736.0
7,IBATRT,27241,11,2,27,0,27241,ptr,combat,0.426985+0.001816j,34.348067,0.538679,0.842393,0.458085,0.492731,2.4901750000000002e-23,2.747167e-24,3394292.0,283584.0
8,IBATRT,27240,22,2,40,1,27240,raw,raw,0.330571+0.000000j,24.022427,0.353413,0.78668,0.343039,0.341702,0.009105984,2.2594620000000003e-43,3029906.0,325952.0
9,IBATRT,27240,22,2,40,1,27240,raw,zscore,0.189065+0.118694j,-26.740332,-0.399016,-0.058594,-0.376394,-0.392378,0.8562283,0.0002577096,2863972.0,206820.0
