Output detailed statistics for SE/nFR, including the variance, mean, non-zero rate, and difference of mean values

In [1]:
# Output detailed statistical information for SE
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def compare(pheno_profiles, p_line):
    """Build a per-column statistics table for a case group versus 'Health'.

    Parameters
    ----------
    pheno_profiles : dict
        Maps a phenotype name to a DataFrame. The entry keyed 'Health' is the
        control group. The last key that is not 'Health' is used as the case
        group (if there is none, the empty string is looked up and the dict
        raises KeyError). Every column of the case DataFrame must also exist
        in the 'Health' DataFrame.
    p_line : pandas.Series
        Values written to the 'p_adj' column, matched by index label. A label
        that is not already a row of the result is added by the `.loc`
        assignment, leaving its other columns empty.

    Returns
    -------
    pandas.DataFrame
        One row per column of the case DataFrame, with the columns:
        mean and variance (``np.mean`` / ``np.var`` with default arguments)
        of each group, 'fc(case/control)' (ratio of the means),
        'diff(case-control)' (difference of the means), 'abs' (absolute value
        of that difference), the count and fraction of entries not equal to 0
        in each group, and 'p_adj'.
    """
    # The original selection rule: the last non-'Health' key wins, '' if none.
    case_candidates = [name for name in pheno_profiles.keys() if name != 'Health']
    disease = case_candidates[-1] if case_candidates else ''
    case_df = pheno_profiles[disease]

    stat_columns = [
        'mean(case)', 'mean(control)',
        'var(case)', 'var(control)',
        'fc(case/control)', 'diff(case-control)', 'abs',
        'non_zero(control)', 'non_zero(case)',
        'non_zero%(control)', 'non_zero%(case)',
        'p_adj',
    ]
    p_df = pd.DataFrame(columns=stat_columns, index=list(case_df.columns))

    # check each taxon
    for taxon in case_df.columns:
        case_values = list(case_df[taxon])
        control_values = list(pheno_profiles['Health'][taxon])

        case_mean = np.mean(case_values)
        control_mean = np.mean(control_values)
        case_var = np.var(case_values)
        control_var = np.var(control_values)
        mean_diff = case_mean - control_mean
        fold_change = case_mean / control_mean
        # Count the entries that do not compare equal to 0 (plain Python ints).
        case_nonzero = sum(1 for value in case_values if value != 0)
        control_nonzero = sum(1 for value in control_values if value != 0)

        row = {
            'mean(case)': case_mean,
            'mean(control)': control_mean,
            'var(case)': case_var,
            'var(control)': control_var,
            'diff(case-control)': mean_diff,
            'fc(case/control)': fold_change,
            'non_zero(case)': case_nonzero,
            'non_zero(control)': control_nonzero,
            'non_zero%(case)': case_nonzero / len(case_values),
            'non_zero%(control)': control_nonzero / len(control_values),
        }
        # Written cell by cell so each value keeps its own type in the table.
        for column, value in row.items():
            p_df.loc[taxon, column] = value

    p_df['abs'] = p_df['diff(case-control)'].abs()

    # Attach the supplied p-values by label.
    for cluster in p_line.index:
        p_df.loc[cluster, 'p_adj'] = p_line[cluster]

    return p_df


In [11]:
# p-value cutoff; defined here but not used by the statements in this cell.
p_cutoff = 0.05
# Disease sub-directories of `indir` that are scanned for cohorts.
plist = [
    'ACVD', 'BD', 'CRC', 'IBD', 'IGT', 'CFS', 'STH', 'T2D',
    'adenoma', 'asthma', 'carcinoma_surgery_history',
    'hypertension', 'migraine', 'schizofrenia',
]

# test SE
indir = '../result/large_scale_cohort'
# Table indexed by its first column; one row is looked up per cohort name.
all_cohort_df = pd.read_csv(os.path.join(indir, 'p_all_cohorts_se.tsv'), sep='\t', index_col=0)
for disease in plist:
    d_dir = os.path.join(indir, disease)
    for cohort in os.listdir(d_dir):
        # Only sub-directories are cohorts; skip any other entry.
        if not os.path.isdir(os.path.join(d_dir, cohort)):
            continue
        idir = os.path.join(d_dir, cohort, 'SE')

        # Load every 'se*' file, keyed by the second '_'-separated token of
        # the file name taken up to its first '.'.
        pheno_profiles = {}
        for pheno in os.listdir(idir):
            if pheno.startswith('se'):
                pheno_short = pheno.partition('.')[0].split('_')[1]
                pheno_profiles[pheno_short] = pd.read_csv(os.path.join(idir, pheno), sep='\t', index_col=0)

        # A comparison needs at least two phenotype profiles.
        if len(pheno_profiles) < 2:
            continue
        p_df = compare(pheno_profiles, all_cohort_df.loc[cohort])
        outpath = os.path.join(idir, 'p_detail.tsv')
        p_df.to_csv(outpath, sep='\t', index=True)


In [12]:
# test nFR
# Same walk as for SE, but reading the nFR table and each cohort's 'nFR'
# directory; relies on `indir`, `plist` and `compare` already being defined.
all_cohort_df = pd.read_csv(os.path.join(indir, 'p_all_cohorts_nfr.tsv'), sep='\t', index_col=0)
for disease in plist:
    d_dir = os.path.join(indir, disease)
    for cohort in os.listdir(d_dir):
        # Only sub-directories are cohorts; skip any other entry.
        if not os.path.isdir(os.path.join(d_dir, cohort)):
            continue
        idir = os.path.join(d_dir, cohort, 'nFR')

        # Load every 'fr*' file, keyed by the second '_'-separated token of
        # the file name taken up to its first '.'.
        pheno_profiles = {}
        for pheno in os.listdir(idir):
            if pheno.startswith('fr'):
                pheno_short = pheno.partition('.')[0].split('_')[1]
                pheno_profiles[pheno_short] = pd.read_csv(os.path.join(idir, pheno), sep='\t', index_col=0)

        # A comparison needs at least two phenotype profiles.
        if len(pheno_profiles) < 2:
            continue
        p_df = compare(pheno_profiles, all_cohort_df.loc[cohort])
        outpath = os.path.join(idir, 'p_detail.tsv')
        p_df.to_csv(outpath, sep='\t', index=True)