In [1]:
# This procedure aims to analyze the NAFLD dataset using NAFLD GCN
import sys
sys.path.append('..')
import abd_profile
import GCN
import warnings
import pandas as pd
import os
import copy
import analysis
warnings.filterwarnings('ignore')

# Read the three NAFLD inputs: the profile table (abd.tsv), the GCN table and
# the per-sample metadata sheet.
profile_path = '../data/NAFLD/abd.tsv'
gcn_path = '../data/NAFLD/NASH_GCN.tsv'
metadata_path = '../data/NAFLD/NASH_forward_63_map.txt'

ori_profile = abd_profile.input_profile(profile_path, transfer=True)
ori_GCN = GCN.input_GCN(gcn_path, transfer=True)

# Print both shapes as a quick sanity check on what was loaded.
for loaded_table in (ori_profile, ori_GCN):
    print(loaded_table.shape)

# Tab-separated sheet: the first row is the header, the first column the row index.
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0, header=0)


(63, 97)
(4324, 97)


In [2]:
# Compute the distance table from the GCN with GCN.sp_d and save it as a
# tab-separated file alongside the input data.
d_df = GCN.sp_d(ori_GCN)
d_df.to_csv('../data/NAFLD/NASH_distance.tsv', sep='\t')

# Settings handed to analysis.main in the next cell.
params = {
    # Sample-clustering mode; per the original note, 'seat' is the other option.
    'sample cluster': 'enterotype',
    'max_cluster': 10,
    'max_depth': 6,
}

In [3]:
# Create the result directory if it is not there yet.
outdir = '../result/NAFLD'
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Split the samples by phenotype, using the 'DiseaseStatus' metadata column.
disease_status = metadata['DiseaseStatus']
cluster_labels = list(set(disease_status))
label_dict = {}
phenos = ['NASH', 'Normal']
cluster_profiles = {}
for pheno in phenos:
    # Row labels of the metadata entries that carry this phenotype.
    idx_list = metadata.loc[disease_status == pheno].index.tolist()
    # Store an independent copy of the matching profile rows.
    cluster_profiles[pheno] = copy.deepcopy(ori_profile.loc[idx_list, :])

# Per the original notes, analysis.main builds the personalized functional
# redundancy network, constructs the tree for each phenotype, and finds the
# keystone species and keystone cluster. Its outputs go under `outdir`.
result, c = analysis.main(ori_GCN, ori_profile, outdir, params, d_df, cluster_profiles)


In [4]:
# Compare the layer-1 keystone cluster of the two phenotypes: which species are
# shared, and which appear in only one of them.
sp_dict = {}
for pheno in phenos:
    cdir = os.path.join(outdir, 'cluster_{}'.format(pheno))
    keystone_path = os.path.join(cdir, 'keystone_node.tsv')
    keystone_df = pd.read_csv(keystone_path, sep='\t', header=0, index_col=0)
    # Select rows flagged as keystone at layer 1; the first match's 'leaves'
    # field is a comma-separated list of the cluster's member species.
    is_layer1_keystone = (keystone_df['is_keystone'] == True) & (keystone_df['layer'] == 1)
    keystone_cluster = keystone_df.loc[is_layer1_keystone, 'leaves'].iloc[0].split(',')
    # A freshly built set of strings is already independent; no deep copy needed.
    sp_dict[pheno] = set(keystone_cluster)

common = sp_dict[phenos[0]] & sp_dict[phenos[1]]
diff0 = sp_dict[phenos[0]] - sp_dict[phenos[1]]
diff1 = sp_dict[phenos[1]] - sp_dict[phenos[0]]

print('Common: {} species: \n{}\n'.format(len(common), '\n'.join(sorted(common))))
# The phenotype name fills the first placeholder and the count the second, so
# the line reads "In NASH only: 15 species" rather than "In 15 only: NASH species".
print('In {} only: {} species: \n{}\n'.format(phenos[0], len(diff0), '\n'.join(sorted(diff0))))
print('In {} only: {} species: \n{}\n'.format(phenos[1], len(diff1), '\n'.join(sorted(diff1))))


Common: 6 species: 
s__A. ruminis-OTU75
s__B. barnesiae
s__B. caecicola-OTU4
s__B. producta-OTU32
s__F. prausnitzii-OTU8
s__F. prausnitzii-OTU80

In 15 only: NASH species: 
s__A. octavius
s__B. coprophilus
s__C. pinnipediorum
s__D. pneumosintes
s__E. peruensis
s__Escherichia-Shigella-OTU88
s__Ezakiella-OTU94
s__F. magna
s__P. buccalis
s__P. ivorii
s__P. lacrimalis
s__P. loveana
s__P. olsenii
s__P. sp. 2007b
s__S5-A14a-OTU76

In 19 only: Normal species: 
s__A. caccae
s__A. indistinctus-OTU57
s__B. caecicola-OTU51
s__B. coprosuis-OTU67
s__B. hydrogenotrophica
s__B. magnum
s__B. producta-OTU81
s__C. comes
s__D. formicigenerans
s__D. longicatena
s__E. oxidoreducens
s__F. saccharivorans
s__P. chartae-OTU69
s__R. gnavus
s__R. torques
s__Ruminiclostridium-OTU96
s__Ruminococcaceae-OTU46
s__S. variabile-OTU44
s__S. variabile-OTU93

