In [1]:
# compute taxonomy abundance difference
import sys
sys.path.append('..')
import abd_profile
import os
import pandas as pd
import copy
import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr
from scipy.stats import mannwhitneyu

In [2]:
def compare(pheno_profiles, p_cutoff):
    """Compare per-taxon abundance between a case group and the 'Normal' group.

    Parameters
    ----------
    pheno_profiles : dict
        Maps a phenotype label to an abundance table that is indexed as
        ``table[taxon]`` and exposes ``.columns`` (the taxa). It must contain
        the key 'Normal' (controls) and at least one other key. When several
        non-'Normal' keys are present, the last one in iteration order is
        used as the case group.
    p_cutoff : float
        Forwarded as ``alpha`` to the FDR routine. Per the statsmodels
        ``fdrcorrection`` documentation, ``alpha`` only determines the
        rejection mask, which is discarded here, so it does not change the
        returned values.

    Returns
    -------
    pandas.DataFrame
        One row per taxon of the case table, with columns 'p',
        'mean(case)', 'mean(control)', 'var(case)', 'var(control)',
        'fc(case/control)', 'diff(case-control)', 'abs', 'non_zero(control)',
        'non_zero(case)', 'non_zero%(control)', 'non_zero%(case)' and 'p_adj'.
        'fc(case/control)' is inf or NaN when the control mean is zero.

    Raises
    ------
    KeyError
        If there is no non-'Normal' group, or no 'Normal' group.
    """
    columns = ['p', 'mean(case)', 'mean(control)', 'var(case)', 'var(control)',
               'fc(case/control)', 'diff(case-control)', 'abs',
               'non_zero(control)', 'non_zero(case)',
               'non_zero%(control)', 'non_zero%(case)']

    case_labels = [pheno for pheno in pheno_profiles.keys() if pheno != 'Normal']
    if not case_labels:
        # Previously this surfaced as an opaque ``KeyError: ''``.
        raise KeyError("pheno_profiles needs a non-'Normal' (case) group to compare against 'Normal'")
    disease = case_labels[-1]

    case_profile = pheno_profiles[disease]
    control_profile = pheno_profiles['Normal']
    taxa = list(case_profile.columns)

    # Collect one record per taxon and build the frame once; this keeps
    # numeric dtypes instead of filling an object-dtype frame cell by cell.
    records = []
    for sp in taxa:
        disease_v = np.asarray(case_profile[sp], dtype=float)
        control_v = np.asarray(control_profile[sp], dtype=float)

        mean_case = disease_v.mean()
        mean_control = control_v.mean()
        diff = mean_case - mean_control
        # A zero control mean legitimately yields inf/NaN; silence the warning.
        with np.errstate(divide='ignore', invalid='ignore'):
            fold_change = mean_case / mean_control

        nz_case = int(np.count_nonzero(disease_v))
        nz_control = int(np.count_nonzero(control_v))

        pooled = np.concatenate([disease_v, control_v])
        if pooled.min() == pooled.max():
            # Every sample has the same value (e.g. a taxon absent everywhere):
            # there is no evidence of a shift, and older SciPy releases raise
            # ValueError on such input instead of returning a p-value.
            p = 1.0
        else:
            # Request the two-sided test explicitly: the default alternative
            # differs between SciPy versions.
            _, p = mannwhitneyu(disease_v, control_v, alternative='two-sided')

        records.append({
            'p': float(p),
            'mean(case)': mean_case,
            'mean(control)': mean_control,
            'var(case)': disease_v.var(),
            'var(control)': control_v.var(),
            'fc(case/control)': fold_change,
            'diff(case-control)': diff,
            'abs': abs(diff),
            'non_zero(control)': nz_control,
            'non_zero(case)': nz_case,
            'non_zero%(control)': nz_control / len(control_v),
            'non_zero%(case)': nz_case / len(disease_v),
        })

    p_df = pd.DataFrame(records, columns=columns, index=taxa)
    # Element [1] of the FDR result holds the adjusted p-values.
    p_df['p_adj'] = fdr(p_df['p'].to_numpy(dtype=float), p_cutoff)[1]
    return p_df


In [3]:
# Output directory and the alpha level handed to compare().
outdir = '../result/taxa_abd_check_NAFLD'
p_cutoff = 0.05

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

# Abundance table loaded by the project helper; it is row-indexed with the
# sample IDs from the metadata index below.
# NOTE(review): the effect of transfer=True is defined in abd_profile — confirm.
ori_profile = abd_profile.input_profile('../data/NAFLD/abd.tsv' , transfer=True)
metadata = pd.read_csv('../data/NAFLD/NASH_forward_63_map.txt', sep = '\t', header=0, index_col=0)
# Not used in this cell; kept in case later cells read them.
cluster_labels = list(set(metadata['DiseaseStatus']))
label_dict = {}

# Split the abundance table into one sub-table per phenotype.
cluster_profiles = {}
phenos = ['NASH', 'Normal']
for pheno in phenos:
    idx_list = list(metadata[metadata['DiseaseStatus'] == pheno].index)
    # .copy() gives an independent table, so deepcopy is unnecessary.
    cluster_profiles[pheno] = ori_profile.loc[idx_list, :].copy()

# Per-taxon case-vs-control statistics, written as a tab-separated table.
p_df = compare(cluster_profiles, p_cutoff)
p_df.to_csv(os.path.join(outdir, 'p_values.tsv'), sep='\t')