In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, truncnorm, bernoulli
import cloudpickle as pickle
import matplotlib.pyplot as plt
import torch
import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #last_expr

In [2]:
def simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E):
    """Sample the environment covariate for the control and case groups.

    For each group, ``N`` values are drawn from a normal distribution
    (``MEAN_E0``/``SIGMA_E0`` for controls, ``MEAN_E1``/``SIGMA_E1`` for
    cases), rounded to three decimals and then clipped to ``[MIN_E, MAX_E]``.

    Returns:
        ``(control_values, case_values)`` as two arrays.
    """
    groups = []
    # Controls are drawn before cases so the RNG stream is consumed in a fixed order.
    for mean, sigma in ((MEAN_E0, SIGMA_E0), (MEAN_E1, SIGMA_E1)):
        sample = norm.rvs(loc=mean, scale=sigma, size=N)
        groups.append(np.clip(np.round(sample, 3), MIN_E, MAX_E))
    control_values, case_values = groups
    return control_values, case_values

def simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A):
    """Sample the age covariate for the control and case groups.

    Each group receives ``N`` normal draws (controls use
    ``MEAN_A0``/``SIGMA_A0``, cases use ``MEAN_A1``/``SIGMA_A1``) that are
    rounded to three decimals and clipped to ``[MIN_A, MAX_A]``.

    Returns:
        ``(control_values, case_values)`` as two arrays.
    """
    def _draw(mean, sigma):
        # Round first, then clip, so the bounds themselves are never rounded away.
        raw = norm.rvs(loc=mean, scale=sigma, size=N)
        return np.round(raw, 3).clip(MIN_A, MAX_A)

    control_values = _draw(MEAN_A0, SIGMA_A0)
    case_values = _draw(MEAN_A1, SIGMA_A1)
    return control_values, case_values

def simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S):
    """Sample the sex covariate for the control and case groups.

    Values are continuous: ``N`` normal draws per group (controls use
    ``MEAN_S0``/``SIGMA_S0``, cases use ``MEAN_S1``/``SIGMA_S1``), rounded to
    three decimals and clipped to ``[MIN_S, MAX_S]``.

    Returns:
        ``(control_values, case_values)`` as two arrays.
    """
    group_params = ((MEAN_S0, SIGMA_S0), (MEAN_S1, SIGMA_S1))
    # The generator is unpacked in order, so controls are sampled before cases.
    control_values, case_values = (
        np.clip(np.round(norm.rvs(loc=mean, scale=sigma, size=N), 3), MIN_S, MAX_S)
        for mean, sigma in group_params
    )
    return control_values, case_values

def simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N):
    """Load the cached noise covariate from ``nc_n.pkl``.

    The file is read from the current working directory. All parameters are
    accepted for signature compatibility only; none of them influences the
    returned values.

    Returns:
        The object stored in ``nc_n.pkl``.

    Raises:
        FileNotFoundError: if ``nc_n.pkl`` does not exist.
    """
    # Recipe that appears to have produced the cache (kept for provenance):
    #np.random.seed(0)
    #nc_n = np.round(norm.rvs(loc=MEAN_N, scale=SIGMA_N, size=N*2), 3).clip(MIN_N, MAX_N)
    #pickle.dump(nc_n, open('nc_n.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code -- only load cache
    # files that were produced by this notebook.
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open('nc_n.pkl', 'rb') as handle:
        nc_n = pickle.load(handle)
    return nc_n

def simi_metabolic_component(
    N,
    MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E,
    MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A,
    MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S,
    MEAN_N, SIGMA_N, MIN_N, MAX_N,
):
    """Assemble the per-sample covariate table for ``2 * N`` simulated samples.

    The environment, age and sex components are obtained per group and
    stacked controls-first; the noise component is taken as a single vector.
    A constant column ``'1'`` (all ones) and a ``'Group'`` label column are
    added.

    Returns:
        DataFrame with one row per sample (index ``0 .. 2N-1``) and columns
        ``Environment``, ``Age``, ``Sex``, ``Noise``, ``1``, ``Group``.
    """
    env_ctrl, env_case = simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E)
    age_ctrl, age_case = simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A)
    sex_ctrl, sex_case = simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S)
    noise = simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N)

    n_samples = N * 2
    row_labels = ['Environment', 'Age', 'Sex', 'Noise', '1']
    rows = [
        np.hstack((env_ctrl, env_case)),
        np.hstack((age_ctrl, age_case)),
        np.hstack((sex_ctrl, sex_case)),
        np.hstack(noise),
        np.ones(n_samples),  # constant term
    ]
    # Build covariates-by-samples, then transpose to samples-by-covariates.
    simudata = pd.DataFrame(rows, index=row_labels, columns=range(n_samples)).T
    # Group=0 : control  Group=1 : case
    simudata['Group'] = np.hstack(([0] * N, [1] * N))
    return simudata

def simu_weight(N_Species, MEAN_WE, MEAN_WA, MEAN_WS,MEAN_WN, MEAN_WD, rsize=50):
    """Simulate one weight vector per species.

    For every species, ``rsize`` labels are drawn from ``E/A/S/N/D`` with the
    given probabilities; the weight of each component is the fraction of
    draws carrying its label (rounded to three decimals). A sixth weight,
    column ``'1'``, is a uniform random number from ``np.random.random()``.

    Returns:
        DataFrame of shape ``(N_Species, 6)`` with columns
        ``E, A, S, N, D, 1`` and index ``0 .. N_Species-1``.
    """
    labels = ['E', 'A', 'S', 'N', 'D']
    probabilities = [MEAN_WE, MEAN_WA, MEAN_WS, MEAN_WN, MEAN_WD]
    rows = []
    for _ in range(N_Species):
        draws = np.random.choice(labels, size=rsize, p=probabilities)
        fractions = np.round([np.mean(draws == label) for label in labels], 3)
        # The uniform draw comes after the label draws to keep the RNG order.
        rows.append([*fractions, np.random.random()])
    return pd.DataFrame(rows, index=range(N_Species), columns=labels + ['1'])

def cal_abundance(metabolic, weights, windex):
    """Append simulated abundances for one weight group to ``metabolic``.

    Each abundance is the dot product of a sample's
    ``Environment, Age, Sex, Noise, Group, 1`` values with a species' weight
    row (taken positionally, so ``weights`` columns must be in the matching
    order). New columns are named ``W<windex>X<species index>``.

    Returns:
        A new DataFrame: the input columns followed by the abundance columns.
    """
    covariate_order = ['Environment', 'Age', 'Sex', 'Noise', 'Group', '1']
    design = metabolic[covariate_order].values
    # (samples x 6) . (6 x species) -> samples x species
    values = np.dot(design, weights.values.T)
    species_names = [f'W{windex}X{species}' for species in weights.index]
    abundance = pd.DataFrame(values, index=metabolic.index, columns=species_names)
    return pd.concat([metabolic, abundance], axis=1, sort=False)

def simu_main_meta(config,Liner):
    """Build the covariate table described by ``config``.

    The component settings are read from ``config`` and forwarded to
    ``simi_metabolic_component``. When ``Liner == True`` the Environment, Age
    and Sex columns are additionally shifted by ``config['LE_E']``,
    ``config['LE_A']`` and ``config['LE_S']`` times the ``Group`` column.

    Returns:
        The covariate DataFrame produced by ``simi_metabolic_component``.
    """
    component_keys = (
        'N',
        'MEAN_E0', 'MEAN_E1', 'SIGMA_E0', 'SIGMA_E1', 'MIN_E', 'MAX_E',
        'MEAN_A0', 'MEAN_A1', 'SIGMA_A0', 'SIGMA_A1', 'MIN_A', 'MAX_A',
        'MEAN_S0', 'MEAN_S1', 'SIGMA_S0', 'SIGMA_S1', 'MIN_S', 'MAX_S',
        'MEAN_N', 'SIGMA_N', 'MIN_N', 'MAX_N',
    )
    metabolic = simi_metabolic_component(*[config[key] for key in component_keys])
    if not (Liner == True):
        return metabolic
    # Linear group effect on each matched covariate.
    for column, shift_key in (('Environment', 'LE_E'), ('Age', 'LE_A'), ('Sex', 'LE_S')):
        metabolic[column] = config[shift_key] * metabolic['Group'] + metabolic[column]
    return metabolic

def simu_main_microbiome(metabolic,config):
    """Append simulated abundances for every weight group in ``config``.

    Iterates over ``config['Weights_proportion']``; group number ``k``
    (1-based) gets fresh weights from ``simu_weight`` and its abundance
    columns are added through ``cal_abundance`` with ``windex=k``.

    Returns:
        The table returned by the last ``cal_abundance`` call (or the input
        unchanged when there are no weight groups).
    """
    for windex, group_cfg in enumerate(config['Weights_proportion'], start=1):
        weights = simu_weight(
            group_cfg['N_Species'],
            group_cfg['MEAN_WE'],
            group_cfg['MEAN_WA'],
            group_cfg['MEAN_WS'],
            group_cfg['MEAN_WN'],
            group_cfg['MEAN_WD'],
        )
        metabolic = cal_abundance(metabolic, weights, windex)
    return metabolic

In [3]:
def validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax):
    """Score two sets of differential-taxa calls against the ground truth.

    Args:
        DiffTax / OtherTax: taxa called differential / not differential by
            the raw (unmatched) analysis.
        PSMDiffTax / PSMOtherTax: the same two call sets for the matched
            (PSM) analysis.
        TrueDiffTax / TrueOtherTax: taxa that truly are / are not
            differential.
        All arguments must support ``len()`` and ``.intersection()``
        (e.g. ``pandas.Index`` or ``set``).

    Returns:
        One-row DataFrame with columns ``Accuracy, Precision, Recall, F1``
        and their ``psm-`` counterparts. A ratio whose denominator is zero is
        reported as ``0``; F1 is ``None`` when precision + recall is zero.
    """
    def _confusion(called_diff, called_other):
        # TP/FP come from the "differential" calls, TN/FN from the rest.
        tp = len(called_diff.intersection(TrueDiffTax))
        fp = len(called_diff) - tp
        tn = len(called_other.intersection(TrueOtherTax))
        fn = len(called_other) - tn
        return tp, fp, tn, fn

    def _ratio(numerator, denominator):
        # Guard every ratio: an empty call set must not raise ZeroDivisionError.
        return numerator / denominator if denominator else 0

    def _metrics(tp, fp, tn, fn):
        accuracy = _ratio(tp + tn, tp + fn + fp + tn)
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        if (precision + recall) == 0:
            f1 = None
        else:
            f1 = 2 * precision * recall / (precision + recall)
        return [accuracy, precision, recall, f1]

    raw_metrics = _metrics(*_confusion(DiffTax, OtherTax))
    psm_metrics = _metrics(*_confusion(PSMDiffTax, PSMOtherTax))

    score=pd.DataFrame(data=None,columns=["Accuracy","Precision","Recall","F1","psm-Accuracy","psm-Precision","psm-Recall","psm-F1"])
    score.loc[len(score.index)] = raw_metrics + psm_metrics
    return score

In [4]:
# Execute miMatch.py inside this notebook's namespace so that the names it
# defines become available here: the miMatch class instantiated below and,
# presumably, the diff_by_rank_sum / diff_by_signed_rank helpers called in
# later cells (they are not defined in this notebook -- TODO confirm).
%run miMatch.py
# Matcher instance used by the simulation cells (psm.simu_match).
psm = miMatch()

# How abundance affects miMatch

#  Noise

In [47]:
def simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E):
    """Load the cached environment covariate for both groups.

    Reads ``dc_d0.pkl`` (controls) and ``dc_d1.pkl`` (cases) from the current
    working directory. All parameters are accepted for signature
    compatibility only; none of them influences the returned values.

    Returns:
        ``(dc_D0, dc_D1)`` -- the objects stored in the two files.

    Raises:
        FileNotFoundError: if either cache file does not exist.
    """
    # Recipe that appears to have produced the caches (kept for provenance):
    #np.random.seed(0)
    #dc_D0 = np.round(norm.rvs(loc=MEAN_E0, scale=SIGMA_E0, size=N), 3).clip(MIN_E, MAX_E)
    #dc_D1 = np.round(norm.rvs(loc=MEAN_E1, scale=SIGMA_E1, size=N), 3).clip(MIN_E, MAX_E)
    #pickle.dump(dc_D0, open('dc_d0.pkl', 'wb'))
    #pickle.dump(dc_D1, open('dc_d1.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code -- only load cache
    # files that were produced by this notebook.
    # Context managers close the handles deterministically.
    with open('dc_d0.pkl', 'rb') as handle:
        dc_D0 = pickle.load(handle)
    with open('dc_d1.pkl', 'rb') as handle:
        dc_D1 = pickle.load(handle)
    return dc_D0, dc_D1

def simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A):
    """Load the cached age covariate for both groups.

    Reads ``ac_a0.pkl`` (controls) and ``ac_a1.pkl`` (cases) from the current
    working directory. All parameters are accepted for signature
    compatibility only; none of them influences the returned values.

    Returns:
        ``(ac_a0, ac_a1)`` -- the objects stored in the two files.

    Raises:
        FileNotFoundError: if either cache file does not exist.
    """
    # Recipe that appears to have produced the caches (kept for provenance):
    #np.random.seed(1)
    #ac_a0 = np.round(norm.rvs(loc=MEAN_A0, scale=SIGMA_A0, size=N),3).clip(MIN_A, MAX_A)
    #ac_a1 = np.round(norm.rvs(loc=MEAN_A1, scale=SIGMA_A1, size=N),3).clip(MIN_A, MAX_A)
    #pickle.dump(ac_a0, open('ac_a0.pkl', 'wb'))
    #pickle.dump(ac_a1, open('ac_a1.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code -- only load cache
    # files that were produced by this notebook.
    # Context managers close the handles deterministically.
    with open('ac_a0.pkl', 'rb') as handle:
        ac_a0 = pickle.load(handle)
    with open('ac_a1.pkl', 'rb') as handle:
        ac_a1 = pickle.load(handle)
    return ac_a0, ac_a1

def simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S):
    """Load the cached sex covariate for both groups.

    Reads ``sc_s0.pkl`` (controls) and ``sc_s1.pkl`` (cases) from the current
    working directory. All parameters are accepted for signature
    compatibility only; none of them influences the returned values.

    Returns:
        ``(sc_s0, sc_s1)`` -- the objects stored in the two files.

    Raises:
        FileNotFoundError: if either cache file does not exist.
    """
    # Recipe that appears to have produced the caches (kept for provenance):
    #np.random.seed(2)
    #sc_s0 = np.round(norm.rvs(loc=MEAN_S0, scale=SIGMA_S0, size=N), 3).clip(MIN_S, MAX_S)
    #sc_s1 = np.round(norm.rvs(loc=MEAN_S1, scale=SIGMA_S1, size=N), 3).clip(MIN_S, MAX_S)
    #pickle.dump(sc_s0, open('sc_s0.pkl', 'wb'))
    #pickle.dump(sc_s1, open('sc_s1.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code -- only load cache
    # files that were produced by this notebook.
    # Context managers close the handles deterministically.
    with open('sc_s0.pkl', 'rb') as handle:
        sc_s0 = pickle.load(handle)
    with open('sc_s1.pkl', 'rb') as handle:
        sc_s1 = pickle.load(handle)
    return sc_s0, sc_s1

def simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N):
    """Load the cached noise covariate from ``nc_n.pkl``.

    The file is read from the current working directory. All parameters are
    accepted for signature compatibility only; none of them influences the
    returned values.

    Returns:
        The object stored in ``nc_n.pkl``.

    Raises:
        FileNotFoundError: if ``nc_n.pkl`` does not exist.
    """
    # Recipe that appears to have produced the cache (kept for provenance):
    #np.random.seed(0)
    #nc_n = np.round(norm.rvs(loc=MEAN_N, scale=SIGMA_N, size=N*2), 3).clip(MIN_N, MAX_N)
    #pickle.dump(nc_n, open('nc_n.pkl', 'wb'))
    # NOTE(review): unpickling can execute arbitrary code -- only load cache
    # files that were produced by this notebook.
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open('nc_n.pkl', 'rb') as handle:
        nc_n = pickle.load(handle)
    return nc_n

def simi_metabolic_component(
    N,
    MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E,
    MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A,
    MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S,
    MEAN_N, SIGMA_N, MIN_N, MAX_N,
):
    """Assemble the per-sample covariate table for ``2 * N`` simulated samples.

    The environment, age and sex components are obtained per group and
    stacked controls-first; the noise component is taken as a single vector.
    A constant column ``'1'`` (all ones) and a ``'Group'`` label column are
    added.

    Returns:
        DataFrame with one row per sample (index ``0 .. 2N-1``) and columns
        ``Environment``, ``Age``, ``Sex``, ``Noise``, ``1``, ``Group``.
    """
    env_ctrl, env_case = simu_environment_component(N, MEAN_E0, MEAN_E1, SIGMA_E0, SIGMA_E1, MIN_E, MAX_E)
    age_ctrl, age_case = simu_age_component(N, MEAN_A0, MEAN_A1, SIGMA_A0, SIGMA_A1, MIN_A, MAX_A)
    sex_ctrl, sex_case = simu_sex_component(N, MEAN_S0, MEAN_S1, SIGMA_S0, SIGMA_S1, MIN_S, MAX_S)
    noise = simu_noise_component(N, MEAN_N, SIGMA_N, MIN_N, MAX_N)

    n_samples = N * 2
    row_labels = ['Environment', 'Age', 'Sex', 'Noise', '1']
    rows = [
        np.hstack((env_ctrl, env_case)),
        np.hstack((age_ctrl, age_case)),
        np.hstack((sex_ctrl, sex_case)),
        np.hstack(noise),
        np.ones(n_samples),  # constant term
    ]
    # Build covariates-by-samples, then transpose to samples-by-covariates.
    simudata = pd.DataFrame(rows, index=row_labels, columns=range(n_samples)).T
    # Group=0 : control  Group=1 : case
    simudata['Group'] = np.hstack(([0] * N, [1] * N))
    return simudata

def simu_weight(N_Species, MEAN_WE, MEAN_WA, MEAN_WS,MEAN_WN, MEAN_WD, rsize=50):
    """Simulate one weight vector per species.

    For every species, ``rsize`` labels are drawn from ``E/A/S/N/D`` with the
    given probabilities; the weight of each component is the fraction of
    draws carrying its label (rounded to three decimals). A sixth weight,
    column ``'1'``, is a uniform random number from ``np.random.random()``.

    Returns:
        DataFrame of shape ``(N_Species, 6)`` with columns
        ``E, A, S, N, D, 1`` and index ``0 .. N_Species-1``.
    """
    labels = ['E', 'A', 'S', 'N', 'D']
    probabilities = [MEAN_WE, MEAN_WA, MEAN_WS, MEAN_WN, MEAN_WD]
    rows = []
    for _ in range(N_Species):
        draws = np.random.choice(labels, size=rsize, p=probabilities)
        fractions = np.round([np.mean(draws == label) for label in labels], 3)
        # The uniform draw comes after the label draws to keep the RNG order.
        rows.append([*fractions, np.random.random()])
    return pd.DataFrame(rows, index=range(N_Species), columns=labels + ['1'])

def cal_abundance(metabolic, weights, windex):
    """Append simulated abundances for one weight group to ``metabolic``.

    Each abundance is the dot product of a sample's
    ``Environment, Age, Sex, Noise, Group, 1`` values with a species' weight
    row (taken positionally, so ``weights`` columns must be in the matching
    order). New columns are named ``W<windex>X<species index>``.

    Returns:
        A new DataFrame: the input columns followed by the abundance columns.
    """
    covariate_order = ['Environment', 'Age', 'Sex', 'Noise', 'Group', '1']
    design = metabolic[covariate_order].values
    # (samples x 6) . (6 x species) -> samples x species
    values = np.dot(design, weights.values.T)
    species_names = [f'W{windex}X{species}' for species in weights.index]
    abundance = pd.DataFrame(values, index=metabolic.index, columns=species_names)
    return pd.concat([metabolic, abundance], axis=1, sort=False)

def simu_main_meta(config,Liner):
    """Build the covariate table described by ``config``.

    The component settings are read from ``config`` and forwarded to
    ``simi_metabolic_component``. When ``Liner == True`` the Environment, Age
    and Sex columns are additionally shifted by ``config['LE_E']``,
    ``config['LE_A']`` and ``config['LE_S']`` times the ``Group`` column.

    Returns:
        The covariate DataFrame produced by ``simi_metabolic_component``.
    """
    component_keys = (
        'N',
        'MEAN_E0', 'MEAN_E1', 'SIGMA_E0', 'SIGMA_E1', 'MIN_E', 'MAX_E',
        'MEAN_A0', 'MEAN_A1', 'SIGMA_A0', 'SIGMA_A1', 'MIN_A', 'MAX_A',
        'MEAN_S0', 'MEAN_S1', 'SIGMA_S0', 'SIGMA_S1', 'MIN_S', 'MAX_S',
        'MEAN_N', 'SIGMA_N', 'MIN_N', 'MAX_N',
    )
    metabolic = simi_metabolic_component(*[config[key] for key in component_keys])
    if not (Liner == True):
        return metabolic
    # Linear group effect on each matched covariate.
    for column, shift_key in (('Environment', 'LE_E'), ('Age', 'LE_A'), ('Sex', 'LE_S')):
        metabolic[column] = config[shift_key] * metabolic['Group'] + metabolic[column]
    return metabolic

def simu_main_microbiome(metabolic,config):
    """Append simulated abundances for every weight group in ``config``.

    Iterates over ``config['Weights_proportion']``; group number ``k``
    (1-based) gets fresh weights from ``simu_weight`` and its abundance
    columns are added through ``cal_abundance`` with ``windex=k``.

    Returns:
        The table returned by the last ``cal_abundance`` call (or the input
        unchanged when there are no weight groups).
    """
    for windex, group_cfg in enumerate(config['Weights_proportion'], start=1):
        weights = simu_weight(
            group_cfg['N_Species'],
            group_cfg['MEAN_WE'],
            group_cfg['MEAN_WA'],
            group_cfg['MEAN_WS'],
            group_cfg['MEAN_WN'],
            group_cfg['MEAN_WD'],
        )
        metabolic = cal_abundance(metabolic, weights, windex)
    return metabolic

In [5]:
def validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax):
    """Score two sets of differential-taxa calls against the ground truth.

    Args:
        DiffTax / OtherTax: taxa called differential / not differential by
            the raw (unmatched) analysis.
        PSMDiffTax / PSMOtherTax: the same two call sets for the matched
            (PSM) analysis.
        TrueDiffTax / TrueOtherTax: taxa that truly are / are not
            differential.
        All arguments must support ``len()`` and ``.intersection()``
        (e.g. ``pandas.Index`` or ``set``).

    Returns:
        One-row DataFrame with columns ``Accuracy, Precision, Recall, F1``
        and their ``psm-`` counterparts. A ratio whose denominator is zero is
        reported as ``0``; F1 is ``None`` when precision + recall is zero.
    """
    def _confusion(called_diff, called_other):
        # TP/FP come from the "differential" calls, TN/FN from the rest.
        tp = len(called_diff.intersection(TrueDiffTax))
        fp = len(called_diff) - tp
        tn = len(called_other.intersection(TrueOtherTax))
        fn = len(called_other) - tn
        return tp, fp, tn, fn

    def _ratio(numerator, denominator):
        # Guard every ratio: an empty call set must not raise ZeroDivisionError.
        return numerator / denominator if denominator else 0

    def _metrics(tp, fp, tn, fn):
        accuracy = _ratio(tp + tn, tp + fn + fp + tn)
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        if (precision + recall) == 0:
            f1 = None
        else:
            f1 = 2 * precision * recall / (precision + recall)
        return [accuracy, precision, recall, f1]

    raw_metrics = _metrics(*_confusion(DiffTax, OtherTax))
    psm_metrics = _metrics(*_confusion(PSMDiffTax, PSMOtherTax))

    score=pd.DataFrame(data=None,columns=["Accuracy","Precision","Recall","F1","psm-Accuracy","psm-Precision","psm-Recall","psm-F1"])
    score.loc[len(score.index)] = raw_metrics + psm_metrics
    return score

###  simulation

In [6]:
# Simulation settings: two weight groups of 1000 species each that differ
# mainly in MEAN_WD, the probability of drawing the disease ('D') label.
config_1 = dict(
    N=100,
    # Environment-drivers: mean / sigma per group (0 and 1), then clip bounds
    MEAN_E0=0.45,
    MEAN_E1=0.55,
    SIGMA_E0=0.2,
    SIGMA_E1=0.2,
    MIN_E=0,
    MAX_E=1,
    # Age
    MEAN_A0=0.45,
    MEAN_A1=0.55,
    SIGMA_A0=0.2,
    SIGMA_A1=0.2,
    MIN_A=0,
    MAX_A=1,
    # Sex
    MEAN_S0=0.45,
    MEAN_S1=0.55,
    SIGMA_S0=0.2,
    SIGMA_S1=0.2,
    MIN_S=0,
    MAX_S=1,
    # Noise
    MEAN_N=0.5,
    SIGMA_N=0.2,
    MIN_N=0,
    MAX_N=1,
    # One entry per weight group; the five MEAN_W* values of an entry sum to 1.
    Weights_proportion=[
        dict(N_Species=1000, MEAN_WE=0.133, MEAN_WA=0.133, MEAN_WS=0.133, MEAN_WN=0.6, MEAN_WD=0.001),
        dict(N_Species=1000, MEAN_WE=0.1, MEAN_WA=0.1, MEAN_WS=0.1, MEAN_WN=0.6, MEAN_WD=0.1),
    ],
    # Linear group shifts for Environment / Age / Sex
    LE_E=0.0,
    LE_A=0.0,
    LE_S=0.0,
)


In [7]:
# Build the covariate table and the simulated abundances for config_1.
metadata= simu_main_meta(config_1,Liner=False)
microbiome=simu_main_microbiome(metadata,config_1)
# Skip the first six columns (the covariate columns Environment, Age, Sex,
# Noise, 1 and Group precede the abundance columns), then re-attach Group.
microbiome=microbiome.iloc[:,6:]
microbiome['Group'] =metadata['Group']
# Give both tables the same string sample ids: s0, s1, ...
metadata.index=['s'+str(i) for i in metadata.index]
microbiome.index=['s'+str(i) for i in microbiome.index]
# Print the abundance table's row and column counts (the Chinese labels read
# "rows of matrix a" / "columns of matrix a").
print('矩阵a的行:',microbiome.shape[0])
print('矩阵a的列:',microbiome.shape[1])
#microbiome.to_csv('./simuData/simu3All.csv')
# Match cases to controls on the three covariates. simu_match comes from
# miMatch.py; only `pairs` is used further down in this cell.
sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = psm.simu_match(data=metadata, target='Group', features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)
### Differential abundance: all samples ("raw") vs matched pairs ("PSM")
result = pd.DataFrame()
# NOTE(review): diff_by_rank_sum / diff_by_signed_rank are not defined in this
# notebook; presumably an unpaired rank-sum test and a paired signed-rank
# test per taxon -- confirm in miMatch.py.
res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
res.columns = [i+'(raw)' for i in res.columns]
result = pd.concat([result, res], axis=1, sort=False)
res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
res.columns = [i+'(PSM)' for i in res.columns]
result = pd.concat([result, res], axis=1, sort=False)
# Order taxa by the matched-analysis FDR, smallest first.
result = result.sort_values(['fdr(PSM)'])
result


矩阵a的行: 200
矩阵a的列: 2001


Unnamed: 0,N Control(raw),Mean Control(raw),N Case(raw),Mean Case(raw),cohen's d(raw),Fold change(raw),p-value(raw),fdr(raw),N Control(PSM),Mean Control(PSM),N Case(PSM),Mean Case(PSM),cohen's d(PSM),Fold change(PSM),p-value(PSM),fdr(PSM)
W2X880,100,1.214893,100,1.478898,3.046158,1.217307,1.530256e-32,2.236432e-29,77,1.226185,77,1.455204,3.047416,1.186773,2.125911e-25,2.125911e-22
W2X184,100,1.183391,100,1.457129,2.961755,1.231317,2.236432e-32,2.236432e-29,77,1.184092,77,1.434073,3.037700,1.211116,1.070936e-25,2.125911e-22
W2X343,100,0.874498,100,1.140560,2.599064,1.304245,6.142647e-31,2.457059e-28,77,0.867796,77,1.118086,2.700371,1.288420,5.471047e-25,3.301946e-22
W2X251,100,0.400364,100,0.643439,2.824849,1.607137,8.494698e-32,5.663132e-29,77,0.411080,77,0.619311,2.770565,1.506548,6.603892e-25,3.301946e-22
W2X530,100,0.677613,100,0.923089,2.801786,1.362266,1.578289e-31,7.891447e-29,77,0.689148,77,0.898486,2.736181,1.303764,1.505122e-24,6.020489e-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W1X64,100,0.562607,100,0.610881,0.436325,1.085804,6.003488e-03,1.138102e-02,77,0.571759,77,0.581019,0.092845,1.016195,8.481140e-01,8.498136e-01
W1X747,100,0.703654,100,0.738002,0.247208,1.048815,1.436403e-01,1.568982e-01,77,0.694145,77,0.708630,0.116345,1.020867,8.537797e-01,8.550623e-01
W1X475,100,0.652802,100,0.688986,0.280266,1.055429,9.418524e-02,1.111986e-01,77,0.647275,77,0.659230,0.100642,1.018470,8.566154e-01,8.574728e-01
W1X199,100,1.436151,100,1.478263,0.350658,1.029323,2.975052e-02,4.423869e-02,77,1.437234,77,1.448188,0.099411,1.007622,8.736671e-01,8.741042e-01


In [8]:
# Taxa called differential at p < 0.05 by the raw and by the matched analysis.
# NOTE(review): the threshold is applied to the uncorrected p-value columns
# although fdr(raw) / fdr(PSM) are available in `result` -- confirm this is
# intended.
yDiffTax=result.index[result['p-value(raw)']<0.05]
PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
print(yDiffTax)
print(PSMDiffTax)

Index(['W2X880', 'W2X184', 'W2X343', 'W2X251', 'W2X530', 'W2X31', 'W2X133',
       'W2X635', 'W2X284', 'W2X398',
       ...
       'W1X561', 'W1X766', 'W1X656', 'W1X537', 'W1X490', 'W1X932', 'W1X376',
       'W1X64', 'W1X199', 'W1X614'],
      dtype='object', length=1479)
Index(['W2X880', 'W2X184', 'W2X343', 'W2X251', 'W2X530', 'W2X31', 'W2X133',
       'W2X635', 'W2X284', 'W2X398',
       ...
       'W2X670', 'W2X296', 'W2X108', 'W2X411', 'W2X503', 'W2X748', 'W2X290',
       'W2X500', 'W2X612', 'W2X822'],
      dtype='object', length=952)


In [11]:
# Per-taxon mean and sample standard deviation (ddof=1) across all samples.
# The reductions are spelled out as column-wise DataFrame methods:
# np.mean(DataFrame) forwards axis=None, which pandas >= 2.0 reduces over both
# axes to a single scalar, so every taxon would silently get the same value.
# Features are selected by name (everything except 'Group') rather than by a
# hard-coded 0:2000 slice; assignment aligns on the taxon names in the index.
result['MicroMean']=microbiome.loc[:, microbiome.columns != 'Group'].mean(axis=0)
result['MicroStd']=microbiome.loc[:, microbiome.columns != 'Group'].std(axis=0, ddof=1)
result
result.to_csv('../simuData/abundence.csv')

Unnamed: 0,N Control(raw),Mean Control(raw),N Case(raw),Mean Case(raw),cohen's d(raw),Fold change(raw),p-value(raw),fdr(raw),N Control(PSM),Mean Control(PSM),N Case(PSM),Mean Case(PSM),cohen's d(PSM),Fold change(PSM),p-value(PSM),fdr(PSM),MicroMean,MicroStd
W2X880,100,1.214893,100,1.478898,3.046158,1.217307,1.530256e-32,2.236432e-29,77,1.226185,77,1.455204,3.047416,1.186773,2.125911e-25,2.125911e-22,1.346895,0.158069
W2X184,100,1.183391,100,1.457129,2.961755,1.231317,2.236432e-32,2.236432e-29,77,1.184092,77,1.434073,3.037700,1.211116,1.070936e-25,2.125911e-22,1.320260,0.165308
W2X343,100,0.874498,100,1.140560,2.599064,1.304245,6.142647e-31,2.457059e-28,77,0.867796,77,1.118086,2.700371,1.288420,5.471047e-25,3.301946e-22,1.007529,0.167967
W2X251,100,0.400364,100,0.643439,2.824849,1.607137,8.494698e-32,5.663132e-29,77,0.411080,77,0.619311,2.770565,1.506548,6.603892e-25,3.301946e-22,0.521901,0.149040
W2X530,100,0.677613,100,0.923089,2.801786,1.362266,1.578289e-31,7.891447e-29,77,0.689148,77,0.898486,2.736181,1.303764,1.505122e-24,6.020489e-22,0.800351,0.150924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W1X64,100,0.562607,100,0.610881,0.436325,1.085804,6.003488e-03,1.138102e-02,77,0.571759,77,0.581019,0.092845,1.016195,8.481140e-01,8.498136e-01,0.586744,0.112981
W1X747,100,0.703654,100,0.738002,0.247208,1.048815,1.436403e-01,1.568982e-01,77,0.694145,77,0.708630,0.116345,1.020867,8.537797e-01,8.550623e-01,0.720828,0.139662
W1X475,100,0.652802,100,0.688986,0.280266,1.055429,9.418524e-02,1.111986e-01,77,0.647275,77,0.659230,0.100642,1.018470,8.566154e-01,8.574728e-01,0.670894,0.130053
W1X199,100,1.436151,100,1.478263,0.350658,1.029323,2.975052e-02,4.423869e-02,77,1.437234,77,1.448188,0.099411,1.007622,8.736671e-01,8.741042e-01,1.457207,0.121639


In [10]:
# Ground truth by column position: columns 1000-1999 are treated as truly
# differential and columns 0-999 as not. With config_1 these are the second
# (MEAN_WD=0.1) and first (MEAN_WD=0.001) weight groups of 1000 species each.
TrueDiffTax=microbiome.columns[1000:2000]
TrueOtherTax=microbiome.columns[0:1000]
# Calls of the raw analysis: p < 0.05 is "differential", everything else in
# the first 2000 columns is "other".
DiffTax=result.index[result['p-value(raw)']<0.05]
OtherTax=microbiome.columns[0:2000].difference(DiffTax)
# Same split for the matched (PSM) analysis.
PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
PSMOtherTax=microbiome.columns[0:2000].difference(PSMDiffTax)
# One-row table of accuracy / precision / recall / F1 for both analyses.
score=validation_score(DiffTax,PSMDiffTax,TrueDiffTax,OtherTax,PSMOtherTax,TrueOtherTax)
score

Unnamed: 0,Accuracy,Precision,Recall,F1,psm-Accuracy,psm-Precision,psm-Recall,psm-F1
0,0.7555,0.672752,0.995,0.802743,0.976,1.0,0.952,0.97541


# Different disease weights

In [17]:
# Simulation settings: eight weight groups of 100 species each, with the
# disease-label probability MEAN_WD increasing from 0.001 to 0.1 while
# MEAN_WN stays at 0.6.
config_E_A_S_N = dict(
    N=100,
    # Environment-drivers: mean / sigma per group (0 and 1), then clip bounds
    MEAN_E0=0.45,
    MEAN_E1=0.55,
    SIGMA_E0=0.2,
    SIGMA_E1=0.2,
    MIN_E=0,
    MAX_E=1,
    # Age
    MEAN_A0=0.45,
    MEAN_A1=0.55,
    SIGMA_A0=0.2,
    SIGMA_A1=0.2,
    MIN_A=0,
    MAX_A=1,
    # Sex
    MEAN_S0=0.45,
    MEAN_S1=0.55,
    SIGMA_S0=0.2,
    SIGMA_S1=0.2,
    MIN_S=0,
    MAX_S=1,
    # Noise
    MEAN_N=0.5,
    SIGMA_N=0.2,
    MIN_N=0,
    MAX_N=1,
    # One entry per weight group, listed as (MEAN_WE, MEAN_WA, MEAN_WS, MEAN_WD).
    Weights_proportion=[
        dict(N_Species=100, MEAN_WE=we, MEAN_WA=wa, MEAN_WS=ws, MEAN_WN=0.6, MEAN_WD=wd)
        for we, wa, ws, wd in (
            (0.133, 0.133, 0.133, 0.001),
            (0.131, 0.132, 0.132, 0.005),
            (0.13, 0.13, 0.13, 0.01),
            (0.12, 0.13, 0.13, 0.02),
            (0.12, 0.12, 0.12, 0.04),
            (0.10, 0.12, 0.12, 0.06),
            (0.1, 0.1, 0.12, 0.08),
            (0.1, 0.1, 0.1, 0.1),
        )
    ],
    # Linear group shifts for Environment / Age / Sex
    LE_E=0.0,
    LE_A=0.0,
    LE_S=0.0,
)


In [10]:
#metadata=pd.read_csv('./simuData/simu3_meta.csv',index_col=0)
# Repeat the simulation 50 times and count, per taxon, in how many
# repetitions the matched (PSM) analysis calls it differential (p < 0.05).
DiffTax=[]
for seed in range(0,50):
    # Seed NumPy's global RNG with the repetition number so that every
    # repetition (and therefore the whole cell) is reproducible; previously
    # `seed` was never used.
    np.random.seed(seed)
    metadata= simu_main_meta(config_E_A_S_N,Liner=False)
    microbiome=simu_main_microbiome(metadata,config_E_A_S_N)
    # Keep only the abundance columns: the covariate columns of `metadata`
    # come first. The width is taken from `metadata` instead of a hard-coded
    # offset, and the slice is copied before a column is added to it.
    microbiome=microbiome.iloc[:,metadata.shape[1]:].copy()
    microbiome['Group'] =metadata['Group']
    metadata.index=['s'+str(i) for i in metadata.index]
    microbiome.index=['s'+str(i) for i in microbiome.index]
    #microbiome.to_csv('./simuData/simu3All.csv')
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = psm.simu_match(data=metadata, target='Group', features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)
    ### Differential abundance: all samples ("raw") vs matched pairs ("PSM")
    result = pd.DataFrame()
    res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(raw)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
    res.columns = [i+'(PSM)' for i in res.columns]
    result = pd.concat([result, res], axis=1, sort=False)
    result = result.sort_values(['fdr(PSM)'])
    PSMDiffTax=result.index[result['p-value(PSM)']<0.05]
    DiffTax.append(PSMDiffTax)
# Flatten the per-repetition hit lists into one list of taxon names
# (iterating DiffTax itself instead of assuming exactly 50 entries).
resoult = [taxon for hits in DiffTax for taxon in hits]
# Count how often each taxon was called differential.
d = {}
for word in resoult:
    d[word] = d.get(word, 0) + 1
# Sort the (taxon, count) pairs by count, most frequent first.
d_order=sorted(d.items(),key=lambda x:x[1],reverse=True)
d_order

[('W6X4', 42),
 ('W6X1', 42),
 ('W6X6', 42),
 ('W6X7', 39),
 ('W6X5', 38),
 ('W6X9', 38),
 ('W5X8', 38),
 ('W6X3', 38),
 ('W6X8', 38),
 ('W6X0', 37),
 ('W5X0', 37),
 ('W6X2', 36),
 ('W5X1', 35),
 ('W5X2', 33),
 ('W5X3', 33),
 ('W5X5', 33),
 ('W5X7', 31),
 ('W5X4', 31),
 ('W5X6', 30),
 ('W5X9', 30),
 ('W4X6', 27),
 ('W4X2', 27),
 ('W4X3', 25),
 ('W4X9', 25),
 ('W4X7', 23),
 ('W4X8', 22),
 ('W4X4', 22),
 ('W4X5', 21),
 ('W4X0', 21),
 ('W4X1', 20),
 ('W3X8', 12),
 ('W3X0', 12),
 ('W3X5', 12),
 ('W3X7', 9),
 ('W3X9', 9),
 ('W3X4', 9),
 ('W3X1', 9),
 ('W3X2', 8),
 ('W3X3', 8),
 ('W3X6', 8),
 ('W2X4', 7),
 ('W2X8', 6),
 ('W2X1', 5),
 ('W2X3', 5),
 ('W1X4', 4),
 ('W2X9', 4),
 ('W2X6', 3),
 ('W2X0', 3),
 ('W2X7', 3),
 ('W1X9', 2),
 ('W1X6', 2),
 ('W2X5', 2),
 ('W1X7', 1),
 ('W2X2', 1),
 ('W1X3', 1),
 ('W1X0', 1),
 ('W1X2', 1),
 ('W1X1', 1),
 ('W1X8', 1)]

In [38]:
# Single run of the eight-weight-group simulation (config_E_A_S_N).
metadata= simu_main_meta(config_E_A_S_N,Liner=False)
microbiome=simu_main_microbiome(metadata,config_E_A_S_N)
# Skip the first six columns (the covariate columns Environment, Age, Sex,
# Noise, 1 and Group precede the abundance columns), then re-attach Group.
microbiome=microbiome.iloc[:,6:]
microbiome['Group'] =metadata['Group']
# Give both tables the same string sample ids: s0, s1, ...
metadata.index=['s'+str(i) for i in metadata.index]
microbiome.index=['s'+str(i) for i in microbiome.index]
# Match cases to controls on the three covariates. simu_match comes from
# miMatch.py; only `pairs` is used further down in this cell.
sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = psm.simu_match(data=metadata, target='Group', features=['Environment', 'Age', 'Sex'],caliper=0.05,ratio=1)
### Differential abundance: all samples ("raw") vs matched pairs ("PSM")
result = pd.DataFrame()
# NOTE(review): diff_by_rank_sum / diff_by_signed_rank are not defined in this
# notebook; presumably an unpaired rank-sum test and a paired signed-rank
# test per taxon -- confirm in miMatch.py.
res = diff_by_rank_sum(microbiome, target='Group', features=[i for i in microbiome.columns if i!='Group'])
res.columns = [i+'(raw)' for i in res.columns]
result = pd.concat([result, res], axis=1, sort=False)
res = diff_by_signed_rank(microbiome, pairs, features=[i for i in microbiome.columns if i!='Group'])
res.columns = [i+'(PSM)' for i in res.columns]
result = pd.concat([result, res], axis=1, sort=False)
# Order taxa by the matched-analysis FDR, smallest first.
result = result.sort_values(['fdr(PSM)'])
result


Unnamed: 0,N Control(raw),Mean Control(raw),N Case(raw),Mean Case(raw),cohen's d(raw),Fold change(raw),p-value(raw),fdr(raw),N Control(PSM),Mean Control(PSM),N Case(PSM),Mean Case(PSM),cohen's d(PSM),Fold change(PSM),p-value(PSM),fdr(PSM)
W8X66,100,1.264432,100,1.510605,2.260220,1.194691,4.934373e-28,3.947499e-25,85,1.296288,85,1.512059,1.779279,1.166453,1.140260e-18,9.122079e-16
W8X57,100,0.440143,100,0.668347,2.017397,1.518479,6.521805e-25,2.608722e-22,85,0.477175,85,0.669892,1.537272,1.403872,2.567038e-15,1.026815e-12
W8X72,100,1.344577,100,1.556863,1.901819,1.157883,2.504124e-23,6.677664e-21,85,1.382446,85,1.556609,1.422513,1.125982,1.370244e-13,3.653985e-11
W7X55,100,0.549039,100,0.757380,1.826026,1.379466,1.162460e-22,1.859936e-20,85,0.585237,85,0.758886,1.364843,1.296716,8.687732e-13,1.390037e-10
W7X93,100,1.161401,100,1.369742,1.826026,1.179388,1.162460e-22,1.859936e-20,85,1.197599,85,1.371248,1.364843,1.144998,8.687732e-13,1.390037e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W3X44,100,0.937763,100,0.977019,0.302299,1.041861,7.408016e-02,9.844540e-02,85,0.978862,85,0.975560,0.022946,0.996627,8.663432e-01,8.696044e-01
W6X67,100,1.412924,100,1.461511,0.325437,1.034388,4.369368e-02,6.461173e-02,85,1.457604,85,1.466594,0.054056,1.006168,8.663434e-01,8.696044e-01
W5X4,100,1.469244,100,1.524472,0.389553,1.037590,9.908766e-03,1.860801e-02,85,1.518519,85,1.526675,0.052056,1.005371,8.835365e-01,8.857508e-01
W5X50,100,0.504880,100,0.542781,0.225759,1.075068,1.809762e-01,2.010846e-01,85,0.540578,85,0.552428,0.063624,1.021922,8.909222e-01,8.920373e-01


In [39]:
# Save the comparison table of the run above. The path is relative to the
# notebook's working directory; to_csv does not create missing directories.
result.to_csv('../simuData/DiseaseWeight/DiseaseWeight10.csv')