In [5]:
import os
import sys
import pandas as pd 
import random
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection as fdr
sys.path.append('../src')
import abd_profile
import numpy as np
import se
import tree_util
import copy
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
random.seed(0)



In [6]:
def fr(d_df, profile, sname):
    """Build the pairwise weighted matrix of one sample.

    Only labels found both in ``profile.columns`` and in ``d_df.index`` are
    used, in sorted order. Entry ``(a, b)`` of the result equals
    ``profile.loc[sname, a] * profile.loc[sname, b] * (1 - d_df.loc[a, b])``;
    the diagonal is forced to zero.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Square matrix indexed by the same labels on rows and columns
        (presumably pairwise distances in [0, 1] -- confirm with the data).
    profile : pandas.DataFrame
        Samples on the rows, labels on the columns.
    sname : hashable
        Row label of ``profile`` selecting the sample.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed by the sorted shared labels.
    """
    shared = sorted(set(profile.columns) & set(d_df.index))

    # Product of every pair of abundances, as an (n, n) array.
    column = np.array(profile.loc[sname, shared]).reshape(len(shared), 1)
    pair_product = column * column.T

    # 1 - d off the diagonal, 0 on it.
    size = pair_product.shape[0]
    complement = np.ones((size, size)) - d_df.loc[shared, shared].values
    np.fill_diagonal(complement, 0)

    return pd.DataFrame(pair_product * complement, index=shared, columns=shared)

def _scatter_pair_values(matrix, row_index, col_index, pairs, values):
    """Write ``values[k]`` into ``matrix`` at the labels of ``pairs[k]`` (in place).

    ``pairs`` holds ``(row_label, column_label)`` tuples. Only the first
    ``len(pairs)`` entries of ``values`` are used.

    Raises
    ------
    IndexError
        If there are fewer values than pairs.
    KeyError
        If a label of a pair is not present in the index/columns.
    """
    pairs = list(pairs)
    if not pairs:
        return
    if len(values) < len(pairs):
        raise IndexError(
            '{} pairs but only {} values to assign'.format(len(pairs), len(values))
        )
    row_labels, col_labels = zip(*pairs)
    rows = row_index.get_indexer(list(row_labels))
    cols = col_index.get_indexer(list(col_labels))
    # get_indexer marks unknown labels with -1, which would otherwise silently
    # address the last row/column.
    if (rows < 0).any() or (cols < 0).any():
        raise KeyError('a pair refers to a label missing from the network')
    matrix[rows, cols] = np.asarray(values[:len(pairs)], dtype=float)


def _symmetrize(matrix, df):
    """Wrap ``matrix`` with the labels of ``df`` and add its transpose."""
    one_sided = pd.DataFrame(matrix, index=df.index, columns=df.columns)
    return one_sided + one_sided.T


def shuffle_and_assign_dataframe(df, pairs, remaining_pairs):
    """Redistribute the edge weights of ``df``, favouring ``pairs``.

    The strict upper-triangle values of ``df`` are collected. If ``pairs`` is
    non-empty, the ``len(pairs)`` largest values are written to ``pairs`` in
    descending order and the rest are shuffled; otherwise all values are
    shuffled. The shuffled values are then written to ``remaining_pairs`` in
    order (surplus values are dropped). Positions listed in neither argument
    stay 0. The result is the one-sided matrix plus its transpose.

    Parameters
    ----------
    df : pandas.DataFrame
        Square network; only its strict upper triangle is read.
    pairs : sequence of (row_label, column_label)
        Positions receiving the largest values.
    remaining_pairs : sequence of (row_label, column_label)
        Positions receiving the shuffled remainder.

    Returns
    -------
    pandas.DataFrame
        Float frame with the index/columns of ``df``.

    Raises
    ------
    IndexError
        If there are not enough values for the requested positions.
    KeyError
        If a pair contains a label that is not in ``df``.
    """
    upper_triangle_values = df.values[np.triu_indices_from(df.values, k=1)]

    n = len(pairs)
    if n > 0:
        ordered = sorted(upper_triangle_values.tolist(), reverse=True)
        largest_n_values = ordered[:n]
        shuffled_values = ordered[n:]
    else:
        largest_n_values = []
        shuffled_values = upper_triangle_values.tolist()
    # Same single shuffle (same length) as before, so a seeded run consumes
    # the random stream identically.
    random.shuffle(shuffled_values)

    # Fill a plain float array and label it once, instead of one scalar
    # ``.loc`` write per pair on an object-dtype frame.
    matrix = np.zeros(df.shape, dtype=float)
    _scatter_pair_values(matrix, df.index, df.columns, pairs, largest_n_values)
    _scatter_pair_values(matrix, df.index, df.columns, remaining_pairs, shuffled_values)
    return _symmetrize(matrix, df)


def shuffle_and_assign_dataframe_low(df, pairs, remaining_pairs):
    """Redistribute the edge weights of ``df``, disfavouring ``pairs``.

    The strict upper-triangle values are sorted in descending order. The first
    ``len(remaining_pairs)`` of them are shuffled and written to
    ``remaining_pairs``; whatever is left (the smallest values) is shuffled and
    written to ``pairs``. The result is the one-sided matrix plus its
    transpose.

    Parameters, return value and exceptions are the same as for
    ``shuffle_and_assign_dataframe``.
    """
    upper_triangle_values = df.values[np.triu_indices_from(df.values, k=1)]
    ordered = sorted(upper_triangle_values.tolist(), reverse=True)

    # The two branches of the earlier implementation both reduce to this split.
    outside_need_n = len(remaining_pairs)
    outside_values = ordered[:outside_need_n]
    inside_values = ordered[outside_need_n:]
    # Shuffle order (outside first, then inside) is kept for reproducibility.
    random.shuffle(outside_values)
    random.shuffle(inside_values)

    matrix = np.zeros(df.shape, dtype=float)
    _scatter_pair_values(matrix, df.index, df.columns, pairs, inside_values)
    _scatter_pair_values(matrix, df.index, df.columns, remaining_pairs, outside_values)
    return _symmetrize(matrix, df)


In [7]:
# n: number of repeats
# k: number of samples

def summary(se_list_rand, se_list_high, se_list_low):
    """Compare the three simulated groups and tabulate the outcome.

    Runs a paired Wilcoxon test (``scipy.stats.wilcoxon``) for high vs rand,
    low vs rand and high vs low, prints the mean/std of each group followed by
    the three p-values, and returns the same numbers as two tables.

    Parameters
    ----------
    se_list_rand, se_list_high, se_list_low : array-like
        Paired observations of equal length.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` indexed by ``rand``/``high``/``low`` with columns ``mean`` and
        ``std``; ``p_df`` indexed by the comparison name with column
        ``pvalue``.
    """
    groups = {'rand': se_list_rand, 'high': se_list_high, 'low': se_list_low}
    comparisons = (
        ('rand vs high', se_list_high, se_list_rand),
        ('rand vs low', se_list_low, se_list_rand),
        ('high vs low', se_list_high, se_list_low),
    )

    # All tests run before anything is printed.
    p_values = {}
    for label, first, second in comparisons:
        _, p_values[label] = stats.wilcoxon(first, second)

    for label, values in groups.items():
        print(label + " mean: ", np.mean(values), label + ' std: ', np.std(values))
    for label, _, _ in comparisons:
        print(label + ": ", p_values[label])

    df = pd.DataFrame(index=list(groups), columns=['mean', 'std'])
    for label, values in groups.items():
        df.loc[label] = [np.mean(values), np.std(values)]

    p_df = pd.DataFrame(index=[label for label, _, _ in comparisons], columns=['pvalue'])
    for label, _, _ in comparisons:
        p_df.loc[label] = p_values[label]
    return df, p_df

def get_ori_nets(profile, d_df):
    """Build one network per sample of ``profile`` with ``fr``.

    Every label of ``d_df.index`` has ``_`` replaced by ``-``; the same
    mapping is applied to the rows/columns of ``d_df`` and to the columns of
    ``profile``. The profile is then restricted to the columns that also
    appear in the renamed ``d_df`` index before ``fr`` is called per sample.

    Parameters
    ----------
    profile : pandas.DataFrame
        Samples on the rows, labels on the columns.
    d_df : pandas.DataFrame
        Square matrix passed on to ``fr``.

    Returns
    -------
    dict
        ``{sample_name: DataFrame returned by fr}`` for every row label of
        ``profile``.
    """
    hyphenated = {label: label.replace('_', '-') for label in d_df.index}
    renamed_d = d_df.rename(index=hyphenated, columns=hyphenated)

    renamed_profile = profile.rename(columns=hyphenated)
    shared = set(renamed_profile.columns) & set(renamed_d.index)
    renamed_profile = renamed_profile[list(shared)]

    return {
        sname: copy.deepcopy(fr(renamed_d, renamed_profile, sname))
        for sname in profile.index
    }

def shuffle_nets(net_dict, pairs, remaining_pairs, n=1):
    """Generate ``n`` rand/high/low reshuffles of every network.

    Three passes are made over ``net_dict``, in this order (the order matters
    because every call draws from the global ``random`` state):

    1. rand -- ``shuffle_and_assign_dataframe`` with an empty pair list;
    2. high -- ``shuffle_and_assign_dataframe`` with ``pairs``;
    3. low  -- ``shuffle_and_assign_dataframe_low`` with ``pairs``.

    NOTE(review): in the rand pass only ``remaining_pairs`` positions are
    filled; any position that is not listed there stays 0. Confirm that this
    is the intended null model when ``remaining_pairs`` excludes ``pairs``.

    Parameters
    ----------
    net_dict : dict
        ``{sample_name: network DataFrame}``.
    pairs, remaining_pairs : sequence of (label, label)
        Forwarded to the shuffling functions.
    n : int, default 1
        Number of reshuffles per network and per kind.

    Returns
    -------
    (dict, dict, dict)
        ``rand_dict, high_dict, low_dict``; each maps a sample name to a list
        of ``n`` reshuffled networks.
    """
    def _replicates(shuffler, inside_pairs):
        # n independent reshuffles per sample, samples visited in dict order.
        return {
            sname: [
                copy.deepcopy(shuffler(net_dict[sname], inside_pairs, remaining_pairs))
                for _ in range(n)
            ]
            for sname in net_dict.keys()
        }

    rand_dict = _replicates(shuffle_and_assign_dataframe, [])
    high_dict = _replicates(shuffle_and_assign_dataframe, pairs)
    low_dict = _replicates(shuffle_and_assign_dataframe_low, pairs)
    return rand_dict, high_dict, low_dict

def split_newick(newick_tree):
    """Parse a Newick string and derive the lookup tables used downstream.

    Parameters
    ----------
    newick_tree : str
        Newick text, handed to ``tree_util.parse``.

    Returns
    -------
    tuple
        ``(parent_dict, node_leaves, subtree_nodes, direct_children_dict)``:

        * ``parent_dict`` -- filled in place by ``tree_util.parents``; it is
          read below as ``node -> parent``.
        * ``node_leaves`` -- ``node -> list`` copied from the per-layer
          dictionary (presumably the leaves under the node -- confirm in
          ``tree_util``).
        * ``subtree_nodes`` -- ``node -> list`` accumulated below; every list
          ends with the node itself.
        * ``direct_children_dict`` -- ``parent -> [children]``, the inverse of
          ``parent_dict``.
    """
    json_tree = tree_util.parse(newick_tree)
    # ``largest`` is passed by reference; its 'largest' entry is read back
    # after the call and used as the number of layers.
    largest = {'largest': 0}
    leaf_list, l = tree_util.recu_compute(json_tree, 0, largest)
    largest_level = largest['largest']
    nlayer = largest_level
    # NOTE(review): same call as above with the same arguments; looks
    # redundant unless recu_compute depends on the updated ``largest`` -- confirm.
    leaf_list, l = tree_util.recu_compute(json_tree, 0, largest)
    layer_leaves_dict = tree_util.make_layer_dict(nlayer)

    # Both helpers work on ``layer_leaves_dict`` in place (return values unused).
    tree_util.recu_layer(json_tree, layer_leaves_dict)
    tree_util.to_layer_leaves(layer_leaves_dict, nlayer)
    result = {}
    # NOTE(review): ``result`` is filled here but never read or returned.
    result['leaves_dict'] = copy.deepcopy(layer_leaves_dict)
    parent_dict = {}
    tree_util.parents(json_tree, parent_dict)
    # Flatten the per-layer dictionaries; the first layer that mentions a node wins.
    node_leaves = {}
    for level in layer_leaves_dict.keys():
        for node, sp_list in layer_leaves_dict[level].items():
            if node in node_leaves.keys():
                continue
            node_leaves[node] = copy.deepcopy(sp_list)
    # Seed every parent with the leaves attached directly to it.
    # (``l`` is reused here as the loop variable.)
    subtree_nodes = {}
    for l in leaf_list:
        parent = parent_dict[l]
        if parent not in subtree_nodes.keys():
            subtree_nodes[parent] = []
        subtree_nodes[parent].append(l)

    # Push each node's accumulated list, plus the node, up to its parent.
    # This requires ``subtree_nodes[node]`` and ``parent_dict[node]`` to exist
    # already, so it depends on the iteration order of ``node_leaves``
    # (presumably children before parents -- TODO confirm) and raises
    # KeyError otherwise.
    for node in node_leaves.keys():
        parent = parent_dict[node]
        if parent not in subtree_nodes.keys():
            subtree_nodes[parent] = []
        subtree_nodes[parent] += subtree_nodes[node]
        subtree_nodes[parent].append(node)

    # Every node is also a member of its own list.
    for node in subtree_nodes.keys():
        subtree_nodes[node].append(node)

    # Invert ``parent_dict`` to get the immediate children of each parent.
    direct_children_dict = {}
    for node, parent in parent_dict.items():
        if parent not in direct_children_dict:
            direct_children_dict[parent] = []
        direct_children_dict[parent].append(node)
    return parent_dict, node_leaves, subtree_nodes, direct_children_dict

def get_se_list(net_dict, parent_dict, node_leaves, subtree_nodes, direct_children_dict, param=0.5):
    """Evaluate ``se.subtree_se_adj`` on every network and tabulate the results.

    Parameters
    ----------
    net_dict : dict
        ``{sample_name: [network, ...]}``.
    parent_dict, node_leaves, subtree_nodes, direct_children_dict :
        Tree lookup tables, forwarded unchanged to ``se.subtree_se_adj``.
    param : float, default 0.5
        Forwarded unchanged to ``se.subtree_se_adj``.

    Returns
    -------
    pandas.DataFrame
        One row per network, labelled ``"{sample_name}_{position}"``, and one
        column per key returned by ``se.subtree_se_adj`` (columns in order of
        first appearance, missing entries are NaN). Networks for which the
        call returns no keys do not produce a row.
    """
    # Collect plain dicts and build the frame once; growing a DataFrame one
    # cell at a time with ``.loc`` reallocates on every new row/column.
    rows = {}
    columns = {}  # used as an insertion-ordered set of node names
    for sname in net_dict.keys():
        for i, edge_df in enumerate(net_dict[sname]):
            nid = "{}_{}".format(sname, i)
            tmp = se.subtree_se_adj(edge_df, parent_dict, node_leaves, subtree_nodes, direct_children_dict, param)
            row = {node: tmp[node] for node in tmp.keys()}
            if not row:
                continue
            rows.setdefault(nid, {}).update(row)
            columns.update(dict.fromkeys(row))

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=list(columns))


def read_cohort(indir):
    """Load the abundance profile of one cohort directory.

    Reads ``abd.tsv`` through ``abd_profile.input_profile(..., transfer=True)``
    and ``metadata.tsv`` with pandas, keeps the profile rows listed in the
    metadata ``sample_id`` column (in that order), then applies
    ``abd_profile.rename_s_level`` and ``abd_profile.clean``.

    Parameters
    ----------
    indir : str
        Directory containing ``abd.tsv`` and ``metadata.tsv``.

    Returns
    -------
    pandas.DataFrame
        The value returned by ``abd_profile.clean``.

    Raises
    ------
    KeyError
        If the metadata has no ``sample_id`` column or lists a sample that is
        missing from the profile.
    """
    abd_path = os.path.join(indir, 'abd.tsv')
    metadata_path = os.path.join(indir, 'metadata.tsv')

    raw_profile = abd_profile.input_profile(abd_path, transfer=True)
    metadata = pd.read_csv(metadata_path, sep='\t', header=0)

    selected_raw_profile = raw_profile.loc[list(metadata['sample_id']), :]
    selected_raw_profile = abd_profile.rename_s_level(selected_raw_profile)
    # The per-phenotype sample grouping that used to be computed here was never
    # returned or used, so it has been dropped; only the cleaned profile is
    # needed by callers.
    return abd_profile.clean(selected_raw_profile)
    

def select_clusters(diff_df, data_dir):
    """List the columns of ``diff_df`` that are populated for the cohorts in ``data_dir``.

    The rows of ``diff_df`` named after the entries of ``data_dir`` are
    selected, and a column is kept when it has at least ``n_rows / 2`` non-NA
    values among them. A column called ``root`` is removed from the result.

    Parameters
    ----------
    diff_df : pandas.DataFrame
        Table indexed by cohort name.
    data_dir : str
        Directory whose entry names are used as row labels.

    Returns
    -------
    list
        Names of the retained columns, in their original order.
    """
    cohort_rows = diff_df.loc[os.listdir(data_dir), ]
    min_present = len(cohort_rows) / 2
    populated = cohort_rows.dropna(axis=1, thresh=min_present)

    clusters = list(populated.columns)
    if 'root' in clusters:
        clusters.remove('root')
    return clusters

def sampling(total, cluster_list, slist):
    """Draw ``total`` (sample, cluster) combinations with replacement.

    Clusters are drawn first and samples second, both with
    ``random.choices`` on the global ``random`` state.

    Returns
    -------
    (list, list)
        ``samples, clusters`` -- two lists of length ``total``.
    """
    drawn_clusters = random.choices(cluster_list, k=total)
    drawn_samples = random.choices(slist, k=total)
    return drawn_samples, drawn_clusters

In [8]:
# --- Locations -------------------------------------------------------------
outer_dir = '../data'
outer_se = '../result/large_scale_cohort'
output_dir = '../result/validation/se_structure_simulation'

# --- Simulation settings ---------------------------------------------------
total_sample = 100   # number of (sample, cluster) draws per phenotype
plist = ['CRC']      # phenotypes (sub-directories of outer_dir) to process

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# --- Shared inputs ---------------------------------------------------------
d_df = pd.read_csv('../data/sp_d.tsv', sep='\t', header=0, index_col=0)
diff_df = pd.read_csv(os.path.join(outer_se, 'p_all_cohorts_se.tsv'), sep='\t', header=0, index_col=0)

with open('../result/GCN_fix_tree/renamed_GCN_tree.newick') as fp:
    newick_tree = fp.read()
parent_dict, node_leaves, subtree_nodes, direct_children_dict = split_newick(newick_tree)

# --- Result containers filled by the following cells -----------------------
pheno_result = {}
ori_se_dict = {}


In [13]:

# For every phenotype: merge the cohort profiles, draw (sample, cluster)
# combinations, reshuffle each drawn sample's network in the rand/high/low
# fashion with respect to the drawn cluster, and record the value that
# get_se_list reports for that cluster.
for pheno in plist:
    data_dir = os.path.join(outer_dir, pheno)
    clusters = select_clusters(diff_df, data_dir)
    se_dir = os.path.join(outer_se, pheno)  # not used further in this cell
    print('== {} =='.format(pheno))

    # Outer-join all cohorts into one samples x species table (absent -> 0).
    # os.listdir order is arbitrary, so sort it to keep runs comparable.
    merged_crc_profile = pd.DataFrame()
    for cohort in sorted(os.listdir(data_dir)):
        print(f'-- {pheno} -- {cohort} --')
        se_cohort_dir = os.path.join(se_dir, cohort)  # not used further in this cell
        abd_dir = os.path.join(data_dir, cohort)
        crc_profile = read_cohort(abd_dir)
        merged_crc_profile, crc_profile_tmp = merged_crc_profile.align(crc_profile, join='outer', fill_value=0)
        merged_crc_profile.loc[crc_profile.index, crc_profile.columns] = crc_profile

    # Random (sample, cluster) draws, with replacement.
    sub_slist, sub_clusters = sampling(total_sample, clusters, merged_crc_profile.index)
    sample_df = pd.DataFrame({'sample': sub_slist, 'cluster': sub_clusters})

    # De-duplicate in draw order. Going through set() would make the
    # processing order -- and therefore the consumption of the seeded random
    # stream -- depend on string hashing, which changes between kernels.
    take_clusters = list(dict.fromkeys(sub_clusters))
    drawn_samples = list(dict.fromkeys(sub_slist))

    crc_profile = merged_crc_profile.loc[drawn_samples, ]
    known_species = set(d_df.index)
    crc_profile = crc_profile[[sp for sp in crc_profile.columns if sp in known_species]]
    fr_dict = get_ori_nets(crc_profile, d_df)

    # The networks use '-' instead of '_' in labels; rename the profile to match.
    renamed_dict = {sp: sp.replace('_', '-') for sp in crc_profile.columns}
    crc_profile = crc_profile.rename(columns=renamed_dict)
    species = list(crc_profile.columns)

    records = []
    for tmp_cluster in take_clusters:
        # Pairs whose two members both belong to the cluster.
        leaves = sorted(set(node_leaves[tmp_cluster]).intersection(species))
        leaf_set = set(leaves)  # O(1) membership for the pair scan below
        pairs = [
            (leaves[i], leaves[j])
            for i in range(len(leaves))
            for j in range(i + 1, len(leaves))
        ]
        print('pairs: ', len(pairs))

        # Every other unordered pair of species.
        remaining_pairs = [
            (species[i], species[j])
            for i in range(len(species))
            for j in range(i + 1, len(species))
            if not (species[i] in leaf_set and species[j] in leaf_set)
        ]
        print('remaining pairs: ', len(remaining_pairs))

        tmp_df = sample_df[sample_df['cluster'] == tmp_cluster]
        for sample in tmp_df['sample']:
            tmp_dict = {sample: copy.deepcopy(fr_dict[sample])}
            rand_dict, high_dict, low_dict = shuffle_nets(tmp_dict, pairs, remaining_pairs, 1)
            rand_se = get_se_list(rand_dict, parent_dict, node_leaves, subtree_nodes, direct_children_dict)
            high_se = get_se_list(high_dict, parent_dict, node_leaves, subtree_nodes, direct_children_dict)
            low_se = get_se_list(low_dict, parent_dict, node_leaves, subtree_nodes, direct_children_dict)
            for nid in rand_se.index:
                records.append({
                    'sample': nid,
                    'cluster': tmp_cluster,
                    'rand': rand_se.loc[nid, tmp_cluster],
                    'high': high_se.loc[nid, tmp_cluster],
                    'low': low_se.loc[nid, tmp_cluster],
                })

    # Build the table once; DataFrame.append is gone in pandas >= 2.0 and was
    # quadratic when called per row.
    result = pd.DataFrame(records, columns=['sample', 'cluster', 'rand', 'high', 'low'])

    tmp_dir = os.path.join(output_dir, pheno)
    os.makedirs(tmp_dir, exist_ok=True)
    result.to_csv(os.path.join(tmp_dir, 'se_summary.tsv'), sep='\t')
    pheno_result[pheno] = copy.deepcopy(result)



== CRC ==
-- CRC -- CRC1 --
-- CRC -- CRC2 --
-- CRC -- CRC3 --
-- CRC -- CRC4 --
-- CRC -- CRC5 --
-- CRC -- CRC6 --
-- CRC -- CRC7 --
-- CRC -- CRC8 --
-- CRC -- CRC9 --
pairs:  2485
remaining pairs:  379016
pairs:  325
remaining pairs:  381176
pairs:  325
remaining pairs:  381176
pairs:  2628
remaining pairs:  378873
pairs:  990
remaining pairs:  380511
pairs:  3160
remaining pairs:  378341
pairs:  78
remaining pairs:  381423
pairs:  136
remaining pairs:  381365
pairs:  36
remaining pairs:  381465
pairs:  435
remaining pairs:  381066
pairs:  45
remaining pairs:  381456
pairs:  741
remaining pairs:  380760
pairs:  496
remaining pairs:  381005
pairs:  496
remaining pairs:  381005
pairs:  276
remaining pairs:  381225
pairs:  36
remaining pairs:  381465
pairs:  1225
remaining pairs:  380276
pairs:  191890
remaining pairs:  189611
pairs:  1891
remaining pairs:  379610
pairs:  78
remaining pairs:  381423
pairs:  105
remaining pairs:  381396
pairs:  78
remaining pairs:  381423
pairs:  55
r

In [None]:
# Summarise the simulated values of every phenotype and write the two tables
# next to that phenotype's se_summary.tsv.
for pheno in plist:
    tmp_dir = os.path.join(output_dir, pheno)
    result = pheno_result[pheno]
    rand_se, high_se, low_se = (result[column] for column in ('rand', 'high', 'low'))
    df, p_df = summary(rand_se, high_se, low_se)
    for table, filename in ((df, 'se_mean_std.tsv'), (p_df, 'se_p_values.tsv')):
        table.to_csv(os.path.join(tmp_dir, filename), sep='\t')

rand mean:  0.32876340479264526 rand std:  0.48543600464450826
high mean:  3.4978149723563536 high std:  1.0174569051290732
low mean:  0.33227911354466755 low std:  0.5687596463086205
rand vs high:  3.896559845095909e-18
rand vs low:  0.12515445955953292
high vs low:  3.896559845095909e-18
