In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, truncnorm, bernoulli
import cloudpickle as pickle
import matplotlib.pyplot as plt
import torch
import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #last_expr

In [2]:
def simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E):
    """Draw the environment covariate for the control and the case group.

    For each group, N values are sampled from a normal distribution
    (control: MEAN_E0 / SIGMA_E0, case: MEAN_E1 / SIGMA_E1), rounded to three
    decimals and then clipped to the interval [MIN_E, MAX_E].

    Returns:
        tuple: (control_values, case_values), two arrays of length N.
    """
    # The control group is sampled first, then the case group, so the random
    # draws are consumed in a fixed order.
    group_params = ((MEAN_E0, SIGMA_E0), (MEAN_E1, SIGMA_E1))
    control_env, case_env = (
        np.clip(np.round(norm.rvs(loc=mean, scale=sigma, size=N), 3), MIN_E, MAX_E)
        for mean, sigma in group_params
    )
    return control_env, case_env

def simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A):
    """Draw the age covariate for the control and the case group.

    Control values come from N(MEAN_A0, SIGMA_A0) and case values from
    N(MEAN_A1, SIGMA_A1). Both are rounded to three decimals and clipped to
    [MIN_A, MAX_A].

    Returns:
        tuple: (control_values, case_values), two arrays of length N.
    """
    control_draws = norm.rvs(loc=MEAN_A0, scale=SIGMA_A0, size=N)
    control_age = np.clip(np.round(control_draws, 3), MIN_A, MAX_A)

    case_draws = norm.rvs(loc=MEAN_A1, scale=SIGMA_A1, size=N)
    case_age = np.clip(np.round(case_draws, 3), MIN_A, MAX_A)

    return control_age, case_age

def simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S):
    """Draw the sex covariate for the control and the case group.

    The covariate is simulated as a continuous value: N normal draws per group
    (control: MEAN_S0 / SIGMA_S0, case: MEAN_S1 / SIGMA_S1), rounded to three
    decimals and clipped to [MIN_S, MAX_S].

    Returns:
        tuple: (control_values, case_values), two arrays of length N.
    """
    def _sample_group(mean, sigma):
        # One group's worth of rounded, clipped normal draws.
        draws = norm.rvs(loc=mean, scale=sigma, size=N)
        return np.round(draws, 3).clip(MIN_S, MAX_S)

    control_sex = _sample_group(MEAN_S0, SIGMA_S0)
    case_sex = _sample_group(MEAN_S1, SIGMA_S1)
    return control_sex, case_sex

def simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N):
    """Load the pre-generated noise component from ``nc_n.pkl``.

    The noise vector is not sampled here: it is read from a pickle file in the
    current working directory, so every call returns the same stored values.
    All parameters are accepted for signature compatibility but are not used
    by the current implementation.

    Returns:
        The object stored in ``nc_n.pkl``.

    Raises:
        FileNotFoundError: if ``nc_n.pkl`` is not in the working directory.
    """
    # Recipe that was used to create nc_n.pkl (kept for provenance):
    #   np.random.seed(0)
    #   nc_n = np.round(norm.rvs(loc=MEAN_N, scale=SIGMA_N, size=N*2), 3).clip(MIN_N, MAX_N)
    #   pickle.dump(nc_n, open('nc_n.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # nc_n.pkl that was produced locally by the recipe above.
    # The context manager closes the file handle deterministically.
    with open('nc_n.pkl', 'rb') as noise_file:
        nc_n = pickle.load(noise_file)
    return nc_n

def simi_metabolic_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S,MEAN_N, SIGMA_N, MIN_N, MAX_N):
    """Assemble the per-sample driver table for N controls followed by N cases.

    Columns of the returned frame, in order:
        'Environment', 'Age', 'Sex' : simulated covariates (control rows first),
        'Noise'                      : noise vector from simu_noise_component,
        '1'                          : constant 1.0 (intercept term),
        'Group'                      : 0 for the first N rows (control), 1 for
                                       the last N rows (case).

    Raises:
        ValueError: if the noise vector does not contain exactly 2*N values.
    """
    dc_D0, dc_D1 = simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E)
    ac_a0, ac_a1 = simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A)
    sc_s0, sc_s1 = simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S)

    # The original `np.hstack((nc_n))` had no tuple comma, so hstack iterated
    # over the individual noise values; flattening gives the same 1-D vector.
    nc_n = np.ravel(simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N))
    # The noise vector is loaded with a fixed length, independent of N, so a
    # mismatch is reported explicitly instead of surfacing inside pandas.
    if nc_n.shape[0] != 2 * N:
        raise ValueError(
            'noise component has %d values but 2*N = %d samples are required'
            % (nc_n.shape[0], 2 * N)
        )

    simudata = pd.DataFrame({
        'Environment': np.hstack((dc_D0, dc_D1)),
        'Age': np.hstack((ac_a0, ac_a1)),
        'Sex': np.hstack((sc_s0, sc_s1)),
        'Noise': nc_n,
        '1': np.ones(N * 2),
    })
    # Group=0 : control  Group=1 : case
    simudata['Group'] = np.repeat([0, 1], N)
    return simudata

def simu_weight(N_Species, MEAN_WE, MEAN_WA, MEAN_WS,MEAN_WN, MEAN_WD, rsize=50):
    """Simulate driver weights for N_Species taxa.

    For every taxon, `rsize` labels are drawn from ['E', 'A', 'S', 'N', 'D']
    with probabilities [MEAN_WE, MEAN_WA, MEAN_WS, MEAN_WN, MEAN_WD]. The
    weight of each driver is the fraction of draws carrying its label, rounded
    to three decimals. A sixth weight, column '1', is a uniform random number
    from np.random.random().

    Returns:
        pandas.DataFrame: one row per taxon (index 0..N_Species-1) with
        columns ['E', 'A', 'S', 'N', 'D', '1'].
    """
    drivers = ['E', 'A', 'S','N', 'D']
    probabilities = [MEAN_WE, MEAN_WA, MEAN_WS, MEAN_WN,MEAN_WD]

    weight_rows = []
    for _ in range(N_Species):
        labels = np.random.choice(drivers, size=rsize, p=probabilities)
        shares = np.round([(labels == driver).mean() for driver in drivers], 3)
        # The constant-term weight is drawn after the label sample.
        constant_weight = np.random.random()
        weight_rows.append([*shares, constant_weight])

    return pd.DataFrame(weight_rows, index=range(N_Species), columns=drivers + ['1'])

def cal_abundance(metabolic, weights, windex):
    """Append simulated taxon abundances to the driver table.

    The abundance of a taxon is the dot product of a sample's driver values
    ('Environment', 'Age', 'Sex', 'Noise', 'Group', '1', in that order) with
    the taxon's row of `weights`. New columns are named 'W<windex>X<i>', where
    i runs over `weights.index`.

    Returns:
        pandas.DataFrame: `metabolic` with the abundance columns appended on
        the right; the input frame itself is not modified.
    """
    driver_columns = ['Environment', 'Age', 'Sex','Noise','Group','1']
    driver_matrix = metabolic[driver_columns].values
    taxon_names = ['W{}X{}'.format(windex, species) for species in weights.index]

    abundance = pd.DataFrame(
        np.dot(driver_matrix, weights.values.T),
        index=metabolic.index,
        columns=taxon_names,
    )
    return pd.concat([metabolic, abundance], axis=1, sort=False)

def simu_main_meta(config,Liner):
    """Simulate the sample-level driver table described by `config`.

    The distribution settings are read from `config` and forwarded to
    simi_metabolic_component. When `Liner` equals True, each covariate of the
    case group is additionally shifted by a fixed offset: LE_E for
    'Environment', LE_A for 'Age' and LE_S for 'Sex' (offset * Group is added,
    so control rows are unchanged).

    Returns:
        The frame produced by simi_metabolic_component, shifted if requested.
    """
    # Positional argument order expected by simi_metabolic_component.
    component_keys = (
        'N',
        'MEAN_E0', 'MEAN_E1', 'SIGMA_E0', 'SIGMA_E1', 'MIN_E', 'MAX_E',
        'MEAN_A0', 'MEAN_A1', 'SIGMA_A0', 'SIGMA_A1', 'MIN_A', 'MAX_A',
        'MEAN_S0', 'MEAN_S1', 'SIGMA_S0', 'SIGMA_S1', 'MIN_S', 'MAX_S',
        'MEAN_N', 'SIGMA_N', 'MIN_N', 'MAX_N',
    )
    metabolic = simi_metabolic_component(*[config[key] for key in component_keys])

    # Equality with True (not mere truthiness) decides whether the shift is applied.
    if Liner == True:
        shifts = (('Environment', 'LE_E'), ('Age', 'LE_A'), ('Sex', 'LE_S'))
        for column, effect_key in shifts:
            metabolic[column] = config[effect_key] * metabolic['Group'] + metabolic[column]
    return metabolic

def simu_main_microbiome(metabolic,config):
    """Append simulated taxa for every weight block in config['Weights_proportion'].

    Blocks are processed in list order and numbered from 1; block k contributes
    columns named 'W<k>X<i>' through cal_abundance, using weights freshly drawn
    by simu_weight.

    Returns:
        The driver table with all abundance columns appended.
    """
    for windex, Ws in enumerate(config['Weights_proportion'], start=1):
        weights = simu_weight(
            Ws['N_Species'],
            Ws['MEAN_WE'],
            Ws['MEAN_WA'],
            Ws['MEAN_WS'],
            Ws['MEAN_WN'],
            Ws['MEAN_WD'],
        )
        metabolic = cal_abundance(metabolic, weights, windex)
    return metabolic

In [3]:
def _classification_metrics(predicted_diff, predicted_other, true_diff, true_other):
    """Return (accuracy, precision, recall, F1) for one set of predictions."""
    tp = len(predicted_diff.intersection(true_diff))
    fp = len(predicted_diff) - tp
    tn = len(predicted_other.intersection(true_other))
    fn = len(predicted_other) - tn

    total = tp + fn + fp + tn
    accuracy = (tp + tn) / total if total else None
    # Undefined ratios use the convention the PSM branch already had:
    # precision/recall fall back to 0, F1 falls back to None.
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    if (precision + recall) == 0:
        f1 = None
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return accuracy, precision, recall, f1


def validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax):
    """Score raw and PSM-based differential-taxa calls against the ground truth.

    Args:
        DiffTax, OtherTax: taxa called differential / not differential by the
            raw (unmatched) test.
        PSMDiffTax, PSMOtherTax: the same two sets for the PSM-matched test.
        TrueDiffTax, TrueOtherTax: the truly differential / non-differential
            taxa.
        All arguments must support ``len()`` and ``.intersection()``
        (e.g. pandas Index objects).

    Returns:
        pandas.DataFrame: a single row with columns "Accuracy", "Precision",
        "Recall", "F1" (raw test) and "psm-Accuracy", "psm-Precision",
        "psm-Recall", "psm-F1" (PSM test).

    The raw metrics used to raise ZeroDivisionError when the raw test called
    no taxon differential, while the PSM metrics were partly guarded. Both
    now share the same guarded computation.
    """
    raw_metrics = _classification_metrics(DiffTax, OtherTax, TrueDiffTax, TrueOtherTax)
    psm_metrics = _classification_metrics(PSMDiffTax, PSMOtherTax, TrueDiffTax, TrueOtherTax)

    score=pd.DataFrame(data=None,columns=["Accuracy","Precision","Recall","F1","psm-Accuracy","psm-Precision","psm-Recall","psm-F1"])
    score.loc[len(score.index)] = [*raw_metrics, *psm_metrics]
    return score

In [18]:
def save_files(pi,seed,config,metadata,microbiome,result):
    """Write one simulation run to ``../simuData/<pi>/``.

    Files written:
        ../simuData/<pi>/config.txt             str(config), rewritten on every call
        ../simuData/<pi>/<seed>/metadata.csv    metadata
        ../simuData/<pi>/<seed>/microbiome.csv  microbiome
        ../simuData/<pi>/<seed>/PSMresult.csv   result

    Missing directories are created first; previously the call failed unless
    something else had already created ``../simuData/<pi>/<seed>/``.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    config_dir = os.path.join('..', 'simuData', pi)
    run_dir = os.path.join(config_dir, str(seed))
    os.makedirs(run_dir, exist_ok=True)

    with open(os.path.join(config_dir, 'config.txt'), 'w') as f:
        f.write(str(config))
    metadata.to_csv(os.path.join(run_dir, 'metadata.csv'))
    microbiome.to_csv(os.path.join(run_dir, 'microbiome.csv'))
    result.to_csv(os.path.join(run_dir, 'PSMresult.csv'))

In [13]:
# Execute miMatch.py inside the notebook namespace so that the names it
# defines become available to the cells below.
# NOTE(review): run_simu_match, diff_by_rank_sum and diff_by_signed_rank are
# used later but not defined in this notebook - presumably they come from
# miMatch.py; confirm. This cell must run before any benchmark cell.
%run miMatch.py
psm = miMatch()

# How disease affects taxa

In [7]:
# Dose-response configuration: both groups (suffix 0 = control, suffix 1 = case)
# share identical covariate distributions, and five blocks of 10 taxa are
# simulated whose disease weight MEAN_WD increases from 0.0001 to 0.2.
config_E_A_S = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.5,
    'MEAN_E1':0.5,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.5,
    'MEAN_A1':0.5,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.5,
    'MEAN_S1':0.5,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
    ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    
    # One dict per block of taxa. The MEAN_W* values are used by simu_weight
    # as sampling probabilities for the five drivers, so each block must sum to 1.
    'Weights_proportion':[
        { 
        'N_Species':10,
        'MEAN_WE':0.1333,
        'MEAN_WA':0.1333,
        'MEAN_WS':0.1333,
        'MEAN_WN':0.6,
        'MEAN_WD':0.0001,
        },
        { 
        'N_Species':10,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { 
        'N_Species':10,
        'MEAN_WE':0.131,
        'MEAN_WA':0.132,
        'MEAN_WS':0.132,
        'MEAN_WN':0.6,
        'MEAN_WD':0.005,
        },
        { 
        'N_Species':10,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
        { 
        'N_Species':10,
        'MEAN_WE':0.1,
        'MEAN_WA':0.05,
        'MEAN_WS':0.05,
        'MEAN_WN':0.6,
        'MEAN_WD':0.2,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}


In [19]:
# For each of 50 simulated datasets, record which taxa the PSM-matched test
# calls differential (p < 0.05), then count how often each taxon was called.
pi='config_E_A_S'
DiffTax=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_E_A_S,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_E_A_S)
    # Keep only the taxon columns. The first six columns are the covariates,
    # the constant and Group; this cell used 5 while every other cell uses 6,
    # which only changed where the Group column ended up. The copy avoids
    # assigning into a slice of the full table below.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.05)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)
    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    DiffTax.append(PSMDiffTax) 
    save_files(pi,seed,config_E_A_S,metadata,microbiome,result)
# Flatten the per-run hits into one list of taxon names (no hard-coded run count).
resoult = [taxon for run_hits in DiffTax for taxon in run_hits]
# Count in how many runs each taxon was called differential.
d = {}
for word in resoult: 
    d[word] = d.get(word, 0) + 1
# Sort the (taxon, count) pairs by count, most frequent first.
d_order=sorted(d.items(),key=lambda x:x[1],reverse=True)
d_order 

[('W5X9', 50),
 ('W5X5', 50),
 ('W5X0', 50),
 ('W5X4', 50),
 ('W5X2', 50),
 ('W5X8', 50),
 ('W5X7', 50),
 ('W5X1', 49),
 ('W5X3', 49),
 ('W5X6', 49),
 ('W4X0', 48),
 ('W4X7', 47),
 ('W4X9', 46),
 ('W4X3', 46),
 ('W4X4', 45),
 ('W4X1', 45),
 ('W4X2', 43),
 ('W4X8', 43),
 ('W4X6', 42),
 ('W4X5', 42),
 ('W3X1', 5),
 ('W3X5', 2),
 ('W3X2', 2),
 ('W3X6', 2),
 ('W2X0', 2),
 ('W3X0', 2),
 ('W3X9', 2),
 ('W3X8', 2),
 ('W3X3', 1),
 ('W3X4', 1),
 ('W2X1', 1),
 ('W2X3', 1),
 ('W3X7', 1)]

### Disease weight ≥ 0.1: defined as differential taxa
### Disease weight ≤ 0.001: defined as non-differential taxa

# No background difference

In [20]:
# No background difference: control (suffix 0) and case (suffix 1) groups use
# the same mean (0.5) for every covariate. 50 non-differential taxa
# (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_NE_NA_NS = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.5,
    'MEAN_E1':0.5,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.5,
    'MEAN_A1':0.5,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.5,
    'MEAN_S1':0.5,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
    ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}


In [21]:
# Simulate a single dataset from config_NE_NA_NS and display both tables as a
# quick sanity check before running the 50-dataset benchmark.
metadata = simu_main_meta(config_NE_NA_NS, Liner=False)
microbiome = simu_main_microbiome(metadata, config_NE_NA_NS)
# Columns from position 6 onwards are the simulated taxa.
microbiome = microbiome.iloc[:, 6:]
microbiome['Group'] = metadata['Group']
# Relabel the samples s0, s1, ... in both tables.
metadata.index = ['s{}'.format(sample) for sample in metadata.index]
microbiome.index = ['s{}'.format(sample) for sample in microbiome.index]
metadata, microbiome


(      Environment    Age    Sex  Noise    1  Group
 s0          0.413  0.270  0.492  0.853  1.0      0
 s1          0.269  0.272  0.619  0.580  1.0      0
 s2          0.372  0.616  0.088  0.696  1.0      0
 s3          0.336  0.493  0.688  0.948  1.0      0
 s4          0.629  0.628  0.132  0.874  1.0      0
 ...           ...    ...    ...    ...  ...    ...
 s195        0.565  0.287  0.516  0.466  1.0      1
 s196        0.529  0.518  0.476  0.654  1.0      1
 s197        0.883  0.751  0.613  0.665  1.0      1
 s198        0.865  0.392  0.707  0.933  1.0      1
 s199        0.531  0.372  0.203  0.767  1.0      1
 
 [200 rows x 6 columns],
           W1X0      W1X1      W1X2      W1X3      W1X4      W1X5      W1X6  \
 s0    1.364864  0.934379  0.733533  1.300633  0.918548  1.390242  0.709014   
 s1    1.182084  0.779519  0.562253  1.150693  0.739788  1.235762  0.535414   
 s2    1.214284  0.856839  0.643133  1.175773  0.809288  1.328522  0.604394   
 s3    1.448164  1.037019  0.8255

In [25]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_NE_NA_NS (no background difference between the groups).
pi='config_NE_NA_NS'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_NE_NA_NS,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_NE_NA_NS)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.05)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_NE_NA_NS,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
N_diff=pd.concat(score_frames)

In [23]:
# Label the 50 score rows s0..s49, save them next to the simulated data and
# display the table.
N_diff.index = ['s{}'.format(run) for run in range(50)]
N_diff.to_csv('../simuData/NE_NA_NS_changed.csv')
N_diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.95,1.0,0.9,0.947368,0.98,1.0,0.96,0.979592
s1,0.99,1.0,0.98,0.989899,0.99,1.0,0.98,0.989899
s2,0.97,1.0,0.94,0.969072,0.88,1.0,0.76,0.863636
s3,0.98,1.0,0.96,0.979592,0.96,1.0,0.92,0.958333
s4,0.98,1.0,0.96,0.979592,0.97,1.0,0.94,0.969072
s5,0.99,1.0,0.98,0.989899,0.97,1.0,0.94,0.969072
s6,0.94,1.0,0.88,0.93617,0.95,1.0,0.9,0.947368
s7,0.97,1.0,0.94,0.969072,0.86,1.0,0.72,0.837209
s8,0.98,1.0,0.96,0.979592,0.98,1.0,0.96,0.979592
s9,0.95,1.0,0.9,0.947368,0.86,1.0,0.72,0.837209


In [26]:
# Column-wise mean of every metric over the rows of N_diff (one row per simulated dataset).
N_diff.mean(axis=0)

Accuracy         0.961600
Precision        0.998205
Recall           0.924800
F1               0.959571
psm-Accuracy     0.949000
psm-Precision    0.996107
psm-Recall       0.902000
psm-F1           0.943681
dtype: float64

# 0.02 background difference

In [27]:
# Background difference of 0.02: every covariate has mean 0.49 in the control
# group (suffix 0) and 0.51 in the case group (suffix 1). 50 non-differential
# taxa (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_SE_SA_SS_1 = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.49,
    'MEAN_E1':0.51,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.49,
    'MEAN_A1':0.51,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.49,
    'MEAN_S1':0.51,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
     ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}


In [28]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_SE_SA_SS_1 (0.02 background difference between the groups).
pi='config_SE_SA_SS_1'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_SE_SA_SS_1,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_SE_SA_SS_1)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.05)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_SE_SA_SS_1,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
diff0=pd.concat(score_frames)
diff0.index=['s'+str(i) for i in range(0,50)]
diff0.to_csv('../simuData/SE_SA_SS_changed_1.csv')
diff0

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.99,1.0,0.98,0.989899,0.97,1.0,0.94,0.969072
s1,0.98,0.98,0.98,0.98,0.98,1.0,0.96,0.979592
s2,0.95,1.0,0.9,0.947368,0.92,1.0,0.84,0.913043
s3,0.95,1.0,0.9,0.947368,0.97,0.979592,0.96,0.969697
s4,0.97,0.979592,0.96,0.969697,0.97,1.0,0.94,0.969072
s5,0.96,1.0,0.92,0.958333,0.92,1.0,0.84,0.913043
s6,0.94,1.0,0.88,0.93617,0.9,1.0,0.8,0.888889
s7,0.98,1.0,0.96,0.979592,0.98,1.0,0.96,0.979592
s8,0.97,1.0,0.94,0.969072,0.97,1.0,0.94,0.969072
s9,0.99,1.0,0.98,0.989899,0.94,1.0,0.88,0.93617


In [29]:
# Column-wise mean of every metric over the rows of diff0 (one row per simulated dataset).
diff0.mean(axis=0)

Accuracy         0.968800
Precision        0.993692
Recall           0.944000
F1               0.967649
psm-Accuracy     0.927600
psm-Precision    0.980615
psm-Recall       0.884000
psm-F1           0.924404
dtype: float64

## 0.04 background difference

In [30]:
# Background difference of 0.04: every covariate has mean 0.48 in the control
# group (suffix 0) and 0.52 in the case group (suffix 1). 50 non-differential
# taxa (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_SE_SA_SS_2 = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.48,
    'MEAN_E1':0.52,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.48,
    'MEAN_A1':0.52,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.48,
    'MEAN_S1':0.52,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
     ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}


In [31]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_SE_SA_SS_2 (0.04 background difference between the groups).
pi='config_SE_SA_SS_2'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_SE_SA_SS_2,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_SE_SA_SS_2)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.05)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_SE_SA_SS_2,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
diff1=pd.concat(score_frames)
diff1.index=['s'+str(i) for i in range(0,50)]
diff1.to_csv('../simuData/SE_SA_SS_changed_2.csv')
diff1

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.99,1.0,0.98,0.989899,0.92,1.0,0.84,0.913043
s1,0.99,0.980392,1.0,0.990099,0.99,1.0,0.98,0.989899
s2,0.97,1.0,0.94,0.969072,0.97,1.0,0.94,0.969072
s3,0.96,0.979167,0.94,0.959184,0.99,0.980392,1.0,0.990099
s4,0.99,1.0,0.98,0.989899,0.97,1.0,0.94,0.969072
s5,0.99,1.0,0.98,0.989899,0.99,1.0,0.98,0.989899
s6,0.99,1.0,0.98,0.989899,0.84,1.0,0.68,0.809524
s7,0.96,0.979167,0.94,0.959184,0.93,1.0,0.86,0.924731
s8,0.95,1.0,0.9,0.947368,0.94,1.0,0.88,0.93617
s9,0.94,0.94,0.94,0.94,0.96,1.0,0.92,0.958333


In [32]:
# Column-wise mean of every metric over the rows of diff1 (one row per simulated dataset).
diff1.mean(axis=0)

Accuracy         0.965400
Precision        0.972565
Recall           0.960800
F1               0.965571
psm-Accuracy     0.938400
psm-Precision    0.988766
psm-Recall       0.898000
psm-F1           0.936414
dtype: float64

# 0.06 background difference

In [33]:
# Background difference of 0.06: every covariate has mean 0.47 in the control
# group (suffix 0) and 0.53 in the case group (suffix 1). 50 non-differential
# taxa (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_SE_SA_SS_3 = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.47,
    'MEAN_E1':0.53,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.47,
    'MEAN_A1':0.53,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.47,
    'MEAN_S1':0.53,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
     ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}

In [35]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_SE_SA_SS_3 (0.06 background difference between the groups).
# NOTE(review): the caliper is 0.1 here, but 0.05 in the cells for the 0, 0.02
# and 0.04 background differences - confirm this is intentional.
pi='config_SE_SA_SS_3'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_SE_SA_SS_3,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_SE_SA_SS_3)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.1)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.1,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_SE_SA_SS_3,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
diff2=pd.concat(score_frames)
diff2.index=['s'+str(i) for i in range(0,50)]
diff2.to_csv('../simuData/SE_SA_SS_changed_3.csv')
diff2

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.85,0.769231,1.0,0.869565,0.97,1.0,0.94,0.969072
s1,0.79,0.710145,0.98,0.823529,0.5,0.5,1.0,0.666667
s2,0.89,0.819672,1.0,0.900901,0.98,1.0,0.96,0.979592
s3,0.94,0.907407,0.98,0.942308,0.98,1.0,0.96,0.979592
s4,0.8,0.720588,0.98,0.830508,0.95,1.0,0.9,0.947368
s5,1.0,1.0,1.0,1.0,0.99,1.0,0.98,0.989899
s6,0.85,0.777778,0.98,0.867257,0.92,1.0,0.84,0.913043
s7,0.91,0.93617,0.88,0.907216,0.87,1.0,0.74,0.850575
s8,0.97,1.0,0.94,0.969072,0.93,1.0,0.86,0.924731
s9,0.88,0.806452,1.0,0.892857,0.89,1.0,0.78,0.876404


In [36]:
# Column-wise mean of every metric over the rows of diff2 (one row per simulated dataset).
diff2.mean(axis=0)

Accuracy         0.926000
Precision        0.898899
Recall           0.974000
F1               0.932175
psm-Accuracy     0.945600
psm-Precision    0.983531
psm-Recall       0.918000
psm-F1           0.945511
dtype: float64

# 0.08 background difference

In [37]:
# Background difference of 0.08: every covariate has mean 0.46 in the control
# group (suffix 0) and 0.54 in the case group (suffix 1). 50 non-differential
# taxa (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_SE_SA_SS_4 = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.46,
    'MEAN_E1':0.54,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.46,
    'MEAN_A1':0.54,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.46,
    'MEAN_S1':0.54,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
     ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}

In [39]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_SE_SA_SS_4 (0.08 background difference between the groups).
# NOTE(review): the caliper is 0.1 here, but 0.05 in the cells for the 0, 0.02
# and 0.04 background differences - confirm this is intentional.
pi='config_SE_SA_SS_4'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_SE_SA_SS_4,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_SE_SA_SS_4)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.1)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.1,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_SE_SA_SS_4,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
diff3=pd.concat(score_frames)
diff3.index=['s'+str(i) for i in range(0,50)]
diff3.to_csv('../simuData/SE_SA_SS_changed_4.csv')
diff3

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.96,0.942308,0.98,0.960784,0.98,1.0,0.96,0.979592
s1,0.75,0.671233,0.98,0.796748,0.96,1.0,0.92,0.958333
s2,0.81,0.724638,1.0,0.840336,0.48,0.4875,0.78,0.6
s3,0.78,0.694444,1.0,0.819672,0.87,1.0,0.74,0.850575
s4,0.95,1.0,0.9,0.947368,0.91,1.0,0.82,0.901099
s5,0.72,0.641026,1.0,0.78125,0.51,0.505051,1.0,0.671141
s6,0.81,0.731343,0.98,0.837607,0.9,1.0,0.8,0.888889
s7,0.95,0.959184,0.94,0.949495,0.85,1.0,0.7,0.823529
s8,0.93,0.877193,1.0,0.934579,0.95,1.0,0.9,0.947368
s9,0.88,0.816667,0.98,0.890909,0.5,0.5,1.0,0.666667


In [40]:
# Column-wise mean of every metric over the rows of diff3 (one row per simulated dataset).
diff3.mean(axis=0)

Accuracy         0.829800
Precision        0.776734
Recall           0.981200
F1               0.860024
psm-Accuracy     0.918200
psm-Precision    0.962366
psm-Recall       0.900400
psm-F1           0.922481
dtype: float64

# 0.1 background difference

In [41]:
# Background difference of 0.1: every covariate has mean 0.45 in the control
# group (suffix 0) and 0.55 in the case group (suffix 1). 50 non-differential
# taxa (MEAN_WD = 0.001) are followed by 50 differential taxa (MEAN_WD = 0.1).
config_SE_SA_SS_5 = {
    'N':100,  # samples per group (N controls followed by N cases)
    ### Environment-drivers
    'MEAN_E0':0.45,
    'MEAN_E1':0.55,
    'SIGMA_E0':0.2,
    'SIGMA_E1':0.2,
    'MIN_E':0,
    'MAX_E':1,
    ### Age
    'MEAN_A0':0.45,
    'MEAN_A1':0.55,
    'SIGMA_A0':0.2,
    'SIGMA_A1':0.2,
    'MIN_A':0,
    'MAX_A':1,
    ### Sex
    'MEAN_S0':0.45,
    'MEAN_S1':0.55,
    'SIGMA_S0':0.2,
    'SIGMA_S1':0.2,
    'MIN_S':0,
    'MAX_S':1,
     ### Noise
    'MEAN_N':0.5,
    'SIGMA_N':0.2,
    'MIN_N':0,
    'MAX_N':1,
    # MEAN_W* are used by simu_weight as sampling probabilities for the five
    # drivers, so each block must sum to 1.
    'Weights_proportion':[
        { ### 1: non-differential taxa
        'N_Species':50,
        'MEAN_WE':0.133,
        'MEAN_WA':0.133,
        'MEAN_WS':0.133,
        'MEAN_WN':0.6,
        'MEAN_WD':0.001,
        },
        { ### 2: differential taxa
        'N_Species':50,
        'MEAN_WE':0.1,
        'MEAN_WA':0.1,
        'MEAN_WS':0.1,
        'MEAN_WN':0.6,
        'MEAN_WD':0.1,
        },
    ],
    # Case-group offsets; only read by simu_main_meta when Liner is True.
    'LE_E':0.0,
    'LE_A':0.0,
    'LE_S':0.0,
}

In [42]:
# Benchmark the raw test against the PSM-matched test on 50 simulated datasets
# drawn from config_SE_SA_SS_5 (0.1 background difference between the groups).
# NOTE(review): the caliper is 0.1 here, but 0.05 in the cells for the 0, 0.02
# and 0.04 background differences - confirm this is intentional.
pi='config_SE_SA_SS_5'
score_frames=[]
for seed in range(0,50):
    # Seed NumPy's global RNG so that dataset `seed` can be regenerated; the
    # loop variable was previously only used as an output directory name.
    np.random.seed(seed)
    metadata= simu_main_meta(config_SE_SA_SS_5,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_SE_SA_SS_5)
    # Keep only the taxon columns (the first six are the covariates, the
    # constant and Group); copy so adding Group does not assign into a slice.
    microbiome=microbiome.iloc[:,6:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/'+pi+'/'+str(seed)+'/'), ('psm', 'caliper', str(0.1)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params,features=['Environment', 'Age', 'Sex'],caliper=0.1,ratio=1)

    ### difference microbiome
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    # Ground truth by column position: taxa 0-49 come from the non-differential
    # block, taxa 50-99 from the differential block of the config.
    TrueDiffTax=microbiome.columns[50:100]
    TrueOtherTax=microbiome.columns[0:50]
    DiffTax=result.index[result['p-value(raw)']<0.05]
    OtherTax=microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    PSMOtherTax=microbiome.columns[0:100].difference(PSMDiffTax)
    score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
    score_frames.append(score)
    save_files(pi,seed,config_SE_SA_SS_5,metadata,microbiome,result)
# Concatenate once instead of growing the frame (from an empty one) in the loop.
diff4=pd.concat(score_frames)
diff4.index=['s'+str(i) for i in range(0,50)]
diff4.to_csv('../simuData/SE_SA_SS_changed_5.csv')
diff4

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.71,0.632911,1.0,0.775194,0.97,1.0,0.94,0.969072
s1,0.96,0.96,0.96,0.96,0.95,1.0,0.9,0.947368
s2,0.61,0.561798,1.0,0.719424,0.93,1.0,0.86,0.924731
s3,0.86,0.821429,0.92,0.867925,0.93,1.0,0.86,0.924731
s4,0.69,0.617284,1.0,0.763359,0.98,1.0,0.96,0.979592
s5,0.87,0.803279,0.98,0.882883,0.95,0.978723,0.92,0.948454
s6,0.83,0.753846,0.98,0.852174,0.92,1.0,0.84,0.913043
s7,0.7,0.625,1.0,0.769231,0.99,1.0,0.98,0.989899
s8,0.86,0.78125,1.0,0.877193,0.95,1.0,0.9,0.947368
s9,0.53,0.515789,0.98,0.675862,0.95,1.0,0.9,0.947368


In [43]:
# Column-wise average of every score over the replicates stored in diff4.
diff4.mean(axis='index')

Accuracy         0.743400
Precision        0.683475
Recall           0.988400
F1               0.801659
psm-Accuracy     0.920800
psm-Precision    0.967993
psm-Recall       0.886800
psm-F1           0.922502
dtype: float64

## 0.12 background difference

In [44]:
# Simulation settings: the *_0 / *_1 means of Environment, Age and Sex are 0.44 and 0.56.
config_HE_HA_HS_1 = dict(
    N=100,
    ### Environment-drivers
    MEAN_E0=0.44, MEAN_E1=0.56,
    SIGMA_E0=0.2, SIGMA_E1=0.2,
    MIN_E=0, MAX_E=1,
    ### Age
    MEAN_A0=0.44, MEAN_A1=0.56,
    SIGMA_A0=0.2, SIGMA_A1=0.2,
    MIN_A=0, MAX_A=1,
    ### Sex
    MEAN_S0=0.44, MEAN_S1=0.56,
    SIGMA_S0=0.2, SIGMA_S1=0.2,
    MIN_S=0, MAX_S=1,
    ### Noise
    MEAN_N=0.5, SIGMA_N=0.2,
    MIN_N=0, MAX_N=1,
    Weights_proportion=[
        ### 1: non-differential taxa
        dict(N_Species=50, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        ### 2: differential taxa
        dict(N_Species=50, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    LE_E=0.0, LE_A=0.0, LE_S=0.0,
)

In [45]:
# Replicated benchmark for config_HE_HA_HS_1.
# Each replicate simulates metadata and a microbiome table, matches samples on
# Environment/Age/Sex with run_simu_match, tests every taxon twice (rank-sum on
# all samples, signed-rank on the matched pairs) and scores both calls against
# the simulated truth. One row of scores per replicate ends up in `diff`.
pi = 'config_HE_HA_HS_1'  # label used for the output directory and by save_files
n_runs = 50
caliper = 0.1  # single source for both the params entry and the keyword argument
scores = []  # per-replicate score frames; concatenated once after the loop
# NOTE(review): `seed` only names the output directory and is passed to save_files;
# no RNG is seeded in this cell, so the replicates are reproducible only if the
# simulation helpers seed internally -- confirm.
for seed in range(n_runs):
    metadata = simu_main_meta(config_HE_HA_HS_1, Liner=False)
    microbiome = simu_main_microbiome(metadata, config_HE_HA_HS_1)
    # Drop the first six columns; copy so the 'Group' assignment below writes to
    # an independent frame instead of a slice of the simulated table.
    microbiome = microbiome.iloc[:, 6:].copy()
    microbiome['Group'] = metadata['Group']
    metadata.index = ['s' + str(i) for i in metadata.index]
    microbiome.index = ['s' + str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/' + pi + '/' + str(seed) + '/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params, features=['Environment', 'Age', 'Sex'], caliper=caliper, ratio=1)

    ### difference microbiome
    taxa = [c for c in microbiome.columns if c != 'Group']
    res_raw = diff_by_rank_sum(microbiome, target='Group', features=taxa)
    res_raw.columns = [c + '(raw)' for c in res_raw.columns]
    res_psm = diff_by_signed_rank(microbiome, pairs, features=taxa)
    res_psm.columns = [c + '(PSM)' for c in res_psm.columns]
    result = pd.concat([res_raw, res_psm], axis=1, sort=False).sort_values(['fdr(PSM)'])

    # Ground truth by column position: columns 0-49 vs. columns 50-99 of the taxa table.
    TrueDiffTax = microbiome.columns[50:100]
    TrueOtherTax = microbiome.columns[0:50]
    # Calls are made on the unadjusted p-values at 0.05.
    DiffTax = result.index[result['p-value(raw)'] < 0.05]
    OtherTax = microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax = result.index[result['p-value(PSM)'] < 0.05]
    PSMOtherTax = microbiome.columns[0:100].difference(PSMDiffTax)
    score = validation_score(DiffTax, PSMDiffTax, TrueDiffTax, OtherTax, PSMOtherTax, TrueOtherTax)
    scores.append(score)
    save_files(pi, seed, config_HE_HA_HS_1, metadata, microbiome, result)
diff = pd.concat(scores)
diff.index = ['s' + str(i) for i in range(n_runs)]
diff.to_csv('../simuData/HE_HA_HS_changed_1.csv')
diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.81,0.738462,0.96,0.834783,0.91,1.0,0.82,0.901099
s1,0.64,0.581395,1.0,0.735294,0.98,0.961538,1.0,0.980392
s2,0.74,0.657895,1.0,0.793651,0.88,1.0,0.76,0.863636
s3,0.57,0.537634,1.0,0.699301,0.98,0.961538,1.0,0.980392
s4,0.59,0.550562,0.98,0.705036,0.91,1.0,0.82,0.901099
s5,0.56,0.531915,1.0,0.694444,0.87,1.0,0.74,0.850575
s6,0.8,0.714286,1.0,0.833333,0.95,1.0,0.9,0.947368
s7,0.63,0.574713,1.0,0.729927,0.96,1.0,0.92,0.958333
s8,0.79,0.710145,0.98,0.823529,0.95,1.0,0.9,0.947368
s9,0.56,0.533333,0.96,0.685714,0.97,1.0,0.94,0.969072


In [46]:
# Column-wise average of every score over the replicates currently stored in diff.
diff.mean(axis='index')

Accuracy         0.647800
Precision        0.595606
Recall           0.996000
F1               0.742800
psm-Accuracy     0.934600
psm-Precision    0.976760
psm-Recall       0.897200
psm-F1           0.931244
dtype: float64

## 0.14 background difference

In [47]:
# Simulation settings: the *_0 / *_1 means of Environment, Age and Sex are 0.43 and 0.57.
config_HE_HA_HS_2 = dict(
    N=100,
    ### Environment-drivers
    MEAN_E0=0.43, MEAN_E1=0.57,
    SIGMA_E0=0.2, SIGMA_E1=0.2,
    MIN_E=0, MAX_E=1,
    ### Age
    MEAN_A0=0.43, MEAN_A1=0.57,
    SIGMA_A0=0.2, SIGMA_A1=0.2,
    MIN_A=0, MAX_A=1,
    ### Sex
    MEAN_S0=0.43, MEAN_S1=0.57,
    SIGMA_S0=0.2, SIGMA_S1=0.2,
    MIN_S=0, MAX_S=1,
    ### Noise
    MEAN_N=0.5, SIGMA_N=0.2,
    MIN_N=0, MAX_N=1,
    Weights_proportion=[
        ### 1: non-differential taxa
        dict(N_Species=50, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        ### 2: differential taxa
        dict(N_Species=50, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    LE_E=0.0, LE_A=0.0, LE_S=0.0,
)

In [48]:
# Replicated benchmark for config_HE_HA_HS_2.
# Each replicate simulates metadata and a microbiome table, matches samples on
# Environment/Age/Sex with run_simu_match, tests every taxon twice (rank-sum on
# all samples, signed-rank on the matched pairs) and scores both calls against
# the simulated truth. One row of scores per replicate ends up in `diff`.
pi = 'config_HE_HA_HS_2'  # label used for the output directory and by save_files
n_runs = 50
caliper = 0.1  # single source for both the params entry and the keyword argument
scores = []  # per-replicate score frames; concatenated once after the loop
# NOTE(review): `seed` only names the output directory and is passed to save_files;
# no RNG is seeded in this cell, so the replicates are reproducible only if the
# simulation helpers seed internally -- confirm.
for seed in range(n_runs):
    metadata = simu_main_meta(config_HE_HA_HS_2, Liner=False)
    microbiome = simu_main_microbiome(metadata, config_HE_HA_HS_2)
    # Drop the first six columns; copy so the 'Group' assignment below writes to
    # an independent frame instead of a slice of the simulated table.
    microbiome = microbiome.iloc[:, 6:].copy()
    microbiome['Group'] = metadata['Group']
    metadata.index = ['s' + str(i) for i in metadata.index]
    microbiome.index = ['s' + str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/' + pi + '/' + str(seed) + '/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params, features=['Environment', 'Age', 'Sex'], caliper=caliper, ratio=1)

    ### difference microbiome
    taxa = [c for c in microbiome.columns if c != 'Group']
    res_raw = diff_by_rank_sum(microbiome, target='Group', features=taxa)
    res_raw.columns = [c + '(raw)' for c in res_raw.columns]
    res_psm = diff_by_signed_rank(microbiome, pairs, features=taxa)
    res_psm.columns = [c + '(PSM)' for c in res_psm.columns]
    result = pd.concat([res_raw, res_psm], axis=1, sort=False).sort_values(['fdr(PSM)'])

    # Ground truth by column position: columns 0-49 vs. columns 50-99 of the taxa table.
    TrueDiffTax = microbiome.columns[50:100]
    TrueOtherTax = microbiome.columns[0:50]
    # Calls are made on the unadjusted p-values at 0.05.
    DiffTax = result.index[result['p-value(raw)'] < 0.05]
    OtherTax = microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax = result.index[result['p-value(PSM)'] < 0.05]
    PSMOtherTax = microbiome.columns[0:100].difference(PSMDiffTax)
    score = validation_score(DiffTax, PSMDiffTax, TrueDiffTax, OtherTax, PSMOtherTax, TrueOtherTax)
    scores.append(score)
    save_files(pi, seed, config_HE_HA_HS_2, metadata, microbiome, result)
diff = pd.concat(scores)
diff.index = ['s' + str(i) for i in range(n_runs)]
diff.to_csv('../simuData/HE_HA_HS_changed_2.csv')
diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.65,0.588235,1.0,0.740741,0.96,1.0,0.92,0.958333
s1,0.51,0.505051,1.0,0.671141,0.94,1.0,0.88,0.93617
s2,0.6,0.555556,1.0,0.714286,0.19,0.268657,0.36,0.307692
s3,0.57,0.537634,1.0,0.699301,0.77,0.864865,0.64,0.735632
s4,0.53,0.515464,1.0,0.680272,0.89,1.0,0.78,0.876404
s5,0.61,0.561798,1.0,0.719424,0.93,1.0,0.86,0.924731
s6,0.57,0.538462,0.98,0.695035,0.9,1.0,0.8,0.888889
s7,0.56,0.531915,1.0,0.694444,0.98,0.961538,1.0,0.980392
s8,0.54,0.521277,0.98,0.680556,0.97,0.979592,0.96,0.969697
s9,0.56,0.531915,1.0,0.694444,0.93,1.0,0.86,0.924731


In [49]:
# Column-wise average of every score over the replicates currently stored in diff.
diff.mean(axis='index')

Accuracy         0.557600
Precision        0.531616
Recall           0.997600
F1               0.693304
psm-Accuracy     0.857200
psm-Precision    0.907291
psm-Recall       0.856400
psm-F1           0.870944
dtype: float64

## 0.16 background difference

In [50]:
# Simulation settings: the *_0 / *_1 means of Environment, Age and Sex are 0.42 and 0.58.
config_HE_HA_HS_3 = dict(
    N=100,
    ### Environment-drivers
    MEAN_E0=0.42, MEAN_E1=0.58,
    SIGMA_E0=0.2, SIGMA_E1=0.2,
    MIN_E=0, MAX_E=1,
    ### Age
    MEAN_A0=0.42, MEAN_A1=0.58,
    SIGMA_A0=0.2, SIGMA_A1=0.2,
    MIN_A=0, MAX_A=1,
    ### Sex
    MEAN_S0=0.42, MEAN_S1=0.58,
    SIGMA_S0=0.2, SIGMA_S1=0.2,
    MIN_S=0, MAX_S=1,
    ### Noise
    MEAN_N=0.5, SIGMA_N=0.2,
    MIN_N=0, MAX_N=1,
    Weights_proportion=[
        ### 1: non-differential taxa
        dict(N_Species=50, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        ### 2: differential taxa
        dict(N_Species=50, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    LE_E=0.0, LE_A=0.0, LE_S=0.0,
)

In [51]:
# Replicated benchmark for config_HE_HA_HS_3.
# Each replicate simulates metadata and a microbiome table, matches samples on
# Environment/Age/Sex with run_simu_match, tests every taxon twice (rank-sum on
# all samples, signed-rank on the matched pairs) and scores both calls against
# the simulated truth. One row of scores per replicate ends up in `diff`.
pi = 'config_HE_HA_HS_3'  # label used for the output directory and by save_files
n_runs = 50
# Single source for both the params entry and the keyword argument.
# NOTE(review): the caliper is 0.2 here, whereas the runs for config_HE_HA_HS_1 and
# config_HE_HA_HS_2 use 0.1 -- confirm this is intentional before comparing scores
# across settings.
caliper = 0.2
scores = []  # per-replicate score frames; concatenated once after the loop
# NOTE(review): `seed` only names the output directory and is passed to save_files;
# no RNG is seeded in this cell, so the replicates are reproducible only if the
# simulation helpers seed internally -- confirm.
for seed in range(n_runs):
    metadata = simu_main_meta(config_HE_HA_HS_3, Liner=False)
    microbiome = simu_main_microbiome(metadata, config_HE_HA_HS_3)
    # Drop the first six columns; copy so the 'Group' assignment below writes to
    # an independent frame instead of a slice of the simulated table.
    microbiome = microbiome.iloc[:, 6:].copy()
    microbiome['Group'] = metadata['Group']
    metadata.index = ['s' + str(i) for i in metadata.index]
    microbiome.index = ['s' + str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/' + pi + '/' + str(seed) + '/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params, features=['Environment', 'Age', 'Sex'], caliper=caliper, ratio=1)

    ### difference microbiome
    taxa = [c for c in microbiome.columns if c != 'Group']
    res_raw = diff_by_rank_sum(microbiome, target='Group', features=taxa)
    res_raw.columns = [c + '(raw)' for c in res_raw.columns]
    res_psm = diff_by_signed_rank(microbiome, pairs, features=taxa)
    res_psm.columns = [c + '(PSM)' for c in res_psm.columns]
    result = pd.concat([res_raw, res_psm], axis=1, sort=False).sort_values(['fdr(PSM)'])

    # Ground truth by column position: columns 0-49 vs. columns 50-99 of the taxa table.
    TrueDiffTax = microbiome.columns[50:100]
    TrueOtherTax = microbiome.columns[0:50]
    # Calls are made on the unadjusted p-values at 0.05.
    DiffTax = result.index[result['p-value(raw)'] < 0.05]
    OtherTax = microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax = result.index[result['p-value(PSM)'] < 0.05]
    PSMOtherTax = microbiome.columns[0:100].difference(PSMDiffTax)
    score = validation_score(DiffTax, PSMDiffTax, TrueDiffTax, OtherTax, PSMOtherTax, TrueOtherTax)
    scores.append(score)
    save_files(pi, seed, config_HE_HA_HS_3, metadata, microbiome, result)
diff = pd.concat(scores)
diff.index = ['s' + str(i) for i in range(n_runs)]
diff.to_csv('../simuData/HE_HA_HS_changed_3.csv')
diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.55,0.526316,1.0,0.689655,0.98,0.961538,1.0,0.980392
s1,0.51,0.505051,1.0,0.671141,0.5,0.5,1.0,0.666667
s2,0.58,0.544444,0.98,0.7,0.39,0.432099,0.7,0.534351
s3,0.5,0.5,1.0,0.666667,0.97,0.960784,0.98,0.970297
s4,0.51,0.505051,1.0,0.671141,0.48,0.481481,0.52,0.5
s5,0.51,0.505051,1.0,0.671141,0.46,0.470588,0.64,0.542373
s6,0.53,0.515464,1.0,0.680272,0.93,1.0,0.86,0.924731
s7,0.58,0.543478,1.0,0.704225,0.95,1.0,0.9,0.947368
s8,0.5,0.5,1.0,0.666667,0.98,1.0,0.96,0.979592
s9,0.53,0.515464,1.0,0.680272,0.92,1.0,0.84,0.913043


In [52]:
# Column-wise average of every score over the replicates currently stored in diff.
diff.mean(axis='index')

Accuracy         0.529400
Precision        0.515646
Recall           0.997600
F1               0.679726
psm-Accuracy     0.833800
psm-Precision    0.875284
psm-Recall       0.858800
psm-F1           0.851704
dtype: float64

## 0.18 background difference

In [53]:
# Simulation settings: the *_0 / *_1 means of Environment, Age and Sex are 0.41 and 0.59.
config_HE_HA_HS_4 = dict(
    N=100,
    ### Environment-drivers
    MEAN_E0=0.41, MEAN_E1=0.59,
    SIGMA_E0=0.2, SIGMA_E1=0.2,
    MIN_E=0, MAX_E=1,
    ### Age
    MEAN_A0=0.41, MEAN_A1=0.59,
    SIGMA_A0=0.2, SIGMA_A1=0.2,
    MIN_A=0, MAX_A=1,
    ### Sex
    MEAN_S0=0.41, MEAN_S1=0.59,
    SIGMA_S0=0.2, SIGMA_S1=0.2,
    MIN_S=0, MAX_S=1,
    ### Noise
    MEAN_N=0.5, SIGMA_N=0.2,
    MIN_N=0, MAX_N=1,
    Weights_proportion=[
        ### 1: non-differential taxa
        dict(N_Species=50, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        ### 2: differential taxa
        dict(N_Species=50, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    LE_E=0.0, LE_A=0.0, LE_S=0.0,
)

In [54]:
# Replicated benchmark for config_HE_HA_HS_4.
# Each replicate simulates metadata and a microbiome table, matches samples on
# Environment/Age/Sex with run_simu_match, tests every taxon twice (rank-sum on
# all samples, signed-rank on the matched pairs) and scores both calls against
# the simulated truth. One row of scores per replicate ends up in `diff`.
pi = 'config_HE_HA_HS_4'  # label used for the output directory and by save_files
n_runs = 50
# Single source for both the params entry and the keyword argument.
# NOTE(review): the caliper is 0.2 here, whereas the runs for config_HE_HA_HS_1 and
# config_HE_HA_HS_2 use 0.1 -- confirm this is intentional before comparing scores
# across settings.
caliper = 0.2
scores = []  # per-replicate score frames; concatenated once after the loop
# NOTE(review): `seed` only names the output directory and is passed to save_files;
# no RNG is seeded in this cell, so the replicates are reproducible only if the
# simulation helpers seed internally -- confirm.
for seed in range(n_runs):
    metadata = simu_main_meta(config_HE_HA_HS_4, Liner=False)
    microbiome = simu_main_microbiome(metadata, config_HE_HA_HS_4)
    # Drop the first six columns; copy so the 'Group' assignment below writes to
    # an independent frame instead of a slice of the simulated table.
    microbiome = microbiome.iloc[:, 6:].copy()
    microbiome['Group'] = metadata['Group']
    metadata.index = ['s' + str(i) for i in metadata.index]
    microbiome.index = ['s' + str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/' + pi + '/' + str(seed) + '/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params, features=['Environment', 'Age', 'Sex'], caliper=caliper, ratio=1)

    ### difference microbiome
    taxa = [c for c in microbiome.columns if c != 'Group']
    res_raw = diff_by_rank_sum(microbiome, target='Group', features=taxa)
    res_raw.columns = [c + '(raw)' for c in res_raw.columns]
    res_psm = diff_by_signed_rank(microbiome, pairs, features=taxa)
    res_psm.columns = [c + '(PSM)' for c in res_psm.columns]
    result = pd.concat([res_raw, res_psm], axis=1, sort=False).sort_values(['fdr(PSM)'])

    # Ground truth by column position: columns 0-49 vs. columns 50-99 of the taxa table.
    TrueDiffTax = microbiome.columns[50:100]
    TrueOtherTax = microbiome.columns[0:50]
    # Calls are made on the unadjusted p-values at 0.05.
    DiffTax = result.index[result['p-value(raw)'] < 0.05]
    OtherTax = microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax = result.index[result['p-value(PSM)'] < 0.05]
    PSMOtherTax = microbiome.columns[0:100].difference(PSMDiffTax)
    score = validation_score(DiffTax, PSMDiffTax, TrueDiffTax, OtherTax, PSMOtherTax, TrueOtherTax)
    scores.append(score)
    save_files(pi, seed, config_HE_HA_HS_4, metadata, microbiome, result)
diff = pd.concat(scores)
diff.index = ['s' + str(i) for i in range(n_runs)]
diff.to_csv('../simuData/HE_HA_HS_changed_4.csv')
diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.5,0.5,1.0,0.666667,0.51,0.505051,1.0,0.671141
s1,0.5,0.5,1.0,0.666667,1.0,1.0,1.0,1.0
s2,0.5,0.5,1.0,0.666667,0.5,0.5,1.0,0.666667
s3,0.5,0.5,1.0,0.666667,0.9,1.0,0.8,0.888889
s4,0.51,0.505051,1.0,0.671141,0.92,1.0,0.84,0.913043
s5,0.51,0.505051,1.0,0.671141,0.96,0.979167,0.94,0.959184
s6,0.5,0.5,1.0,0.666667,1.0,1.0,1.0,1.0
s7,0.5,0.5,1.0,0.666667,0.95,1.0,0.9,0.947368
s8,0.51,0.505051,1.0,0.671141,0.5,0.5,1.0,0.666667
s9,0.54,0.520833,1.0,0.684932,0.97,0.979592,0.96,0.969697


In [55]:
# Column-wise average of every score over the replicates currently stored in diff.
diff.mean(axis='index')

Accuracy         0.511600
Precision        0.505964
Recall           1.000000
F1               0.671918
psm-Accuracy     0.796000
psm-Precision    0.819510
psm-Recall       0.888800
psm-F1           0.834252
dtype: float64

## 0.20 background difference

In [56]:
# Simulation settings: the *_0 / *_1 means of Environment, Age and Sex are 0.4 and 0.6.
config_HE_HA_HS_5 = dict(
    N=100,
    ### Environment-drivers
    MEAN_E0=0.4, MEAN_E1=0.6,
    SIGMA_E0=0.2, SIGMA_E1=0.2,
    MIN_E=0, MAX_E=1,
    ### Age
    MEAN_A0=0.4, MEAN_A1=0.6,
    SIGMA_A0=0.2, SIGMA_A1=0.2,
    MIN_A=0, MAX_A=1,
    ### Sex
    MEAN_S0=0.4, MEAN_S1=0.6,
    SIGMA_S0=0.2, SIGMA_S1=0.2,
    MIN_S=0, MAX_S=1,
    ### Noise
    MEAN_N=0.5, SIGMA_N=0.2,
    MIN_N=0, MAX_N=1,
    Weights_proportion=[
        ### 1: non-differential taxa
        dict(N_Species=50, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        ### 2: differential taxa
        dict(N_Species=50, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    LE_E=0.0, LE_A=0.0, LE_S=0.0,
)

In [57]:
# Replicated benchmark for config_HE_HA_HS_5.
# Each replicate simulates metadata and a microbiome table, matches samples on
# Environment/Age/Sex with run_simu_match, tests every taxon twice (rank-sum on
# all samples, signed-rank on the matched pairs) and scores both calls against
# the simulated truth. One row of scores per replicate ends up in `diff`.
pi = 'config_HE_HA_HS_5'  # label used for the output directory and by save_files
n_runs = 50
# Single source for both the params entry and the keyword argument.
# NOTE(review): the caliper is 0.2 here, whereas the runs for config_HE_HA_HS_1 and
# config_HE_HA_HS_2 use 0.1 -- confirm this is intentional before comparing scores
# across settings.
caliper = 0.2
scores = []  # per-replicate score frames; concatenated once after the loop
# NOTE(review): `seed` only names the output directory and is passed to save_files;
# no RNG is seeded in this cell, so the replicates are reproducible only if the
# simulation helpers seed internally -- confirm.
for seed in range(n_runs):
    metadata = simu_main_meta(config_HE_HA_HS_5, Liner=False)
    microbiome = simu_main_microbiome(metadata, config_HE_HA_HS_5)
    # Drop the first six columns; copy so the 'Group' assignment below writes to
    # an independent frame instead of a slice of the simulated table.
    microbiome = microbiome.iloc[:, 6:].copy()
    microbiome['Group'] = metadata['Group']
    metadata.index = ['s' + str(i) for i in metadata.index]
    microbiome.index = ['s' + str(i) for i in microbiome.index]
    params = [('output', 'output_dir', '../simuData/' + pi + '/' + str(seed) + '/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(1))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_simu_match(data=metadata, target='Group', params=params, features=['Environment', 'Age', 'Sex'], caliper=caliper, ratio=1)

    ### difference microbiome
    taxa = [c for c in microbiome.columns if c != 'Group']
    res_raw = diff_by_rank_sum(microbiome, target='Group', features=taxa)
    res_raw.columns = [c + '(raw)' for c in res_raw.columns]
    res_psm = diff_by_signed_rank(microbiome, pairs, features=taxa)
    res_psm.columns = [c + '(PSM)' for c in res_psm.columns]
    result = pd.concat([res_raw, res_psm], axis=1, sort=False).sort_values(['fdr(PSM)'])

    # Ground truth by column position: columns 0-49 vs. columns 50-99 of the taxa table.
    TrueDiffTax = microbiome.columns[50:100]
    TrueOtherTax = microbiome.columns[0:50]
    # Calls are made on the unadjusted p-values at 0.05.
    DiffTax = result.index[result['p-value(raw)'] < 0.05]
    OtherTax = microbiome.columns[0:100].difference(DiffTax)
    PSMDiffTax = result.index[result['p-value(PSM)'] < 0.05]
    PSMOtherTax = microbiome.columns[0:100].difference(PSMDiffTax)
    score = validation_score(DiffTax, PSMDiffTax, TrueDiffTax, OtherTax, PSMOtherTax, TrueOtherTax)
    scores.append(score)
    save_files(pi, seed, config_HE_HA_HS_5, metadata, microbiome, result)
diff = pd.concat(scores)
diff.index = ['s' + str(i) for i in range(n_runs)]
diff.to_csv('../simuData/HE_HA_HS_changed_5.csv')
diff

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
s0,0.5,0.5,1.0,0.666667,0.94,1.0,0.88,0.93617
s1,0.5,0.5,1.0,0.666667,0.96,1.0,0.92,0.958333
s2,0.5,0.5,1.0,0.666667,0.94,1.0,0.88,0.93617
s3,0.5,0.5,0.98,0.662162,0.86,1.0,0.72,0.837209
s4,0.5,0.5,1.0,0.666667,0.87,1.0,0.74,0.850575
s5,0.5,0.5,1.0,0.666667,0.8,0.96875,0.62,0.756098
s6,0.52,0.510204,1.0,0.675676,0.24,0.314286,0.44,0.366667
s7,0.5,0.5,1.0,0.666667,0.9,1.0,0.8,0.888889
s8,0.53,0.515464,1.0,0.680272,0.98,1.0,0.96,0.979592
s9,0.52,0.510204,1.0,0.675676,0.96,0.925926,1.0,0.961538


In [58]:
# Column-wise average of every score over the replicates currently stored in diff.
diff.mean(axis='index')

Accuracy         0.508000
Precision        0.504105
Recall           0.998000
F1               0.669833
psm-Accuracy     0.782000
psm-Precision    0.822469
psm-Recall       0.849200
psm-F1           0.814902
dtype: float64