This script checks taxonomy abundance differences between the disease and health groups.

In [1]:
# check taxonomy abundance differences between the disease and health groups
import sys
sys.path.append('..')
import abd_profile
import os
import pandas as pd
import copy
import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr
from scipy.stats import mannwhitneyu
import warnings
warnings.filterwarnings("ignore")

In [2]:
def profiles(indir, outdir='valid_data'):
    """Load one cohort, write its validated tables, and split it by phenotype.

    Parameters
    ----------
    indir : str
        Directory containing ``abd.tsv`` (abundance table) and
        ``metadata.tsv`` (tab-separated, with at least the columns
        ``sample_id`` and ``disease``).
    outdir : str
        Root directory for the validated copies. Files are written to
        ``<outdir>/<parent of indir>/<last component of indir>/``.

    Returns
    -------
    dict
        Maps each value of the ``disease`` column to the rows of the cleaned
        profile belonging to that phenotype. A phenotype whose samples were
        all dropped by ``abd_profile.clean`` maps to an empty frame.

    Notes
    -----
    The written ``abd.tsv`` is taken from the renamed but *uncleaned* profile,
    restricted to the samples that survived cleaning, and transposed before
    writing. What ``input_profile(transfer=True)``, ``rename_s_level`` and
    ``clean`` do internally is not visible here -- presumably load/transposing,
    species renaming and filtering; confirm against ``abd_profile``.
    """
    abd_path = os.path.join(indir, 'abd.tsv')
    metadata_path = os.path.join(indir, 'metadata.tsv')

    # first process data from gutmeta
    raw_profile = abd_profile.input_profile(abd_path, transfer=True)
    metadata = pd.read_csv(metadata_path, sep='\t', header=0)
    selected_raw_profile = raw_profile.loc[list(metadata['sample_id']), :]
    selected_raw_profile = abd_profile.rename_s_level(selected_raw_profile)
    crc_profile = abd_profile.clean(selected_raw_profile)

    # Build the lookup once; rebuilding a list of the index for every
    # metadata row made the membership test quadratic.
    kept_samples = set(crc_profile.index)

    # classify according to the metadata (metadata row order is preserved)
    pheno_list = {}
    valid_sname = []
    for pheno, sample_id in zip(metadata['disease'], metadata['sample_id']):
        # Register the phenotype even if none of its samples survive.
        members = pheno_list.setdefault(pheno, [])
        if sample_id in kept_samples:
            members.append(sample_id)
            valid_sname.append(sample_id)

    # set_index returns a new frame, so the filtered slice of `metadata`
    # is never mutated in place.
    valid_metadata = metadata[metadata['sample_id'].isin(valid_sname)]
    valid_metadata = valid_metadata.set_index('sample_id')
    valid_profile = selected_raw_profile.loc[valid_sname, :]

    # normpath drops a trailing separator, which would otherwise make the
    # last path component empty and shift the output one level up.
    parent, pdir = os.path.split(os.path.normpath(indir))
    ddir = os.path.basename(parent)
    out = os.path.join(outdir, ddir, pdir)
    os.makedirs(out, exist_ok=True)

    valid_metadata.to_csv(os.path.join(out, 'metadata.tsv'), sep='\t')
    valid_profile.T.to_csv(os.path.join(out, 'abd.tsv'), sep='\t')

    # Independent copies so callers can modify one group freely.
    return {
        pheno: copy.deepcopy(crc_profile.loc[samples, :])
        for pheno, samples in pheno_list.items()
    }


In [3]:
def compare(pheno_profiles, p_cutoff):
    """Compare per-taxon abundance between a case group and the 'Health' group.

    Parameters
    ----------
    pheno_profiles : dict
        Maps phenotype name to a DataFrame whose columns are taxa (one row
        per sample, as indexed by the code below). Must contain the key
        ``'Health'`` and at least one other key. If several non-'Health'
        keys are present, the last one in dict order is used as the case
        group.
    p_cutoff : float
        Alpha level passed to the FDR correction.

    Returns
    -------
    pandas.DataFrame
        One row per taxon with the Mann-Whitney U p-value (``p``), group
        means and population variances (``ddof=0``), fold change and
        difference of means, absolute difference (``abs``), non-zero sample
        counts and fractions, and the FDR-adjusted p-value (``p_adj``).

    Raises
    ------
    KeyError
        If ``'Health'`` or a non-'Health' group is missing.
    """
    if 'Health' not in pheno_profiles:
        raise KeyError("pheno_profiles has no 'Health' control group")
    case_names = [pheno for pheno in pheno_profiles if pheno != 'Health']
    if not case_names:
        raise KeyError("pheno_profiles has no non-'Health' case group")
    disease = case_names[-1]

    case_df = pheno_profiles[disease]
    control_df = pheno_profiles['Health']
    n_case = len(case_df)
    n_control = len(control_df)

    columns = ['p', 'mean(case)', 'mean(control)', 'var(case)', 'var(control)',
               'fc(case/control)', 'diff(case-control)', 'abs',
               'non_zero(control)', 'non_zero(case)',
               'non_zero%(control)', 'non_zero%(case)']

    # check each taxon; rows are collected first so the frame is built once
    # with numeric dtypes instead of being filled cell by cell.
    rows = []
    for sp in case_df.columns:
        disease_v = case_df[sp].to_numpy()
        control_v = control_df[sp].to_numpy()

        mean_case = np.mean(disease_v)
        mean_control = np.mean(control_v)
        diff = mean_case - mean_control
        nz_case = int(np.count_nonzero(disease_v))
        nz_control = int(np.count_nonzero(control_v))

        # Stated explicitly so the p-value does not depend on the default
        # `alternative` of the installed SciPy version.
        _, p = mannwhitneyu(disease_v, control_v, alternative='two-sided')

        rows.append({
            'p': p,
            'mean(case)': mean_case,
            'mean(control)': mean_control,
            'var(case)': np.var(disease_v),
            'var(control)': np.var(control_v),
            # A zero control mean yields inf or nan here rather than raising.
            'fc(case/control)': mean_case / mean_control,
            'diff(case-control)': diff,
            'abs': abs(diff),
            'non_zero(control)': nz_control,
            'non_zero(case)': nz_case,
            'non_zero%(control)': nz_control / n_control,
            'non_zero%(case)': nz_case / n_case,
        })

    p_df = pd.DataFrame(rows, index=list(case_df.columns), columns=columns)

    # Hand the correction a plain float array; element [1] of the result
    # holds the adjusted p-values in the input order.
    p_df['p_adj'] = fdr(p_df['p'].to_numpy(dtype=float), p_cutoff)[1]
    return p_df


In [4]:
# Run the case-vs-Health comparison for every cohort of every listed project
# and write one p-value table per cohort.
outdir = '../result/taxa_abd_check'
p_cutoff = 0.05
# Sub-directory names expected under `outer`, one per phenotype.
plist = [
    'ACVD',
    'BD',
    'CRC',
    'IBD',
    'IGT',
    'CFS',
    'STH',
    'T2D',
    'adenoma',
    'asthma',
    'carcinoma_surgery_history',
    'hypertension',
    'migraine',
    'schizofrenia',
]

os.makedirs(outdir, exist_ok=True)
outer = '../data'
for project in plist:
    out = os.path.join(outdir, project)
    os.makedirs(out, exist_ok=True)
    projects_dir = os.path.join(outer, project)
    # `cohort` rather than `dir`, so the builtin dir() is not shadowed for
    # the rest of the session.
    for cohort in os.listdir(projects_dir):
        indir = os.path.join(projects_dir, cohort)
        # Skip stray files (e.g. hidden OS files); only directories can
        # hold the abd.tsv / metadata.tsv pair that profiles() reads.
        if not os.path.isdir(indir):
            continue
        # profiles() is called with its default output root ('valid_data').
        pheno_profiles = profiles(indir)
        # A comparison needs at least two phenotype groups.
        if len(pheno_profiles) > 1:
            p_df = compare(pheno_profiles, p_cutoff)
            outpath = os.path.join(out, 'p_{}.tsv'.format(cohort))
            p_df.to_csv(outpath, sep='\t', index=True)
            