In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.lines as mlines
from matplotlib.font_manager import FontProperties
import seaborn as sns
from scipy.stats import norm, pearsonr, spearmanr
import scipy.stats as stats
from scipy.spatial import distance
from sklearn.feature_selection import VarianceThreshold
import cloudpickle as pickle
import warnings
warnings.filterwarnings("ignore")

# Make the notebook echo the value of every top-level expression in a cell,
# not only the final one. 'last_expr' (kept in the trailing comment) is the
# alternative setting that shows just the last expression.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #last_expr

# Data loading and normalization

In [2]:
def load_project(project):
    """Read the metadata, abundance and pathway tables of one project.

    `project` is indexed positionally: project[0] and project[1] form the
    file prefix under ../Data/ (e.g. 'AsnicarF_2017' + '_' + 'stool'), while
    project[2] and project[3] are the two `study_condition` values whose
    samples are kept.

    Returns a tuple (metadata, microbiome, pathway). The two data tables are
    transposed after reading, then restricted to the rows named by
    metadata.sample_id and to a filtered set of columns.
    """
    prefix = '../Data/' + project[0] + '_' + project[1]
    metadata = pd.read_csv(prefix + '_META.txt', sep='\t', index_col=0)
    microbiome = pd.read_csv(prefix + '_Relative.txt', sep='\t', index_col=0).T
    pathway = pd.read_csv(prefix + '_Pathway.txt', sep='\t', index_col=0).T

    # Keep only the samples that belong to one of the two requested conditions.
    condition = metadata['study_condition']
    in_either_group = (condition == project[2]) | (condition == project[3])
    metadata = metadata.loc[in_either_group, :]

    # Column names containing 's__' but no '|t__' part
    # (presumably species-level clades without strain entries -- confirm).
    species_columns = [name for name in microbiome.columns
                       if 's__' in name and '|t__' not in name]
    # Column names without '|' (presumably the unstratified pathway totals -- confirm).
    pathway_columns = [name for name in pathway.columns if '|' not in name]

    sample_ids = metadata.sample_id
    microbiome = microbiome.loc[sample_ids, species_columns]
    pathway = pathway.loc[sample_ids, pathway_columns]

    # normalize() is deliberately not applied here; the tables are returned unfiltered.
    return metadata, microbiome, pathway

def normalize(data, zero_threshold=1e-8, zero_ratio=0.5):
    """Drop the columns of `data` that are (near-)zero in too many rows.

    A value counts as zero when it is below `zero_threshold`. A column is
    kept only if its fraction of such values is strictly less than
    `zero_ratio`.
    """
    n_rows = float(data.shape[0])
    zero_fraction = (data < zero_threshold).sum() / n_rows
    keep_column = zero_fraction < zero_ratio
    return data.loc[:, keep_column]

# Read file

In [3]:

#dir_str=r"../Data/CMD/"  
#file_name=os.listdir(dir_str)
#file_dir=[os.path.join(dir_str,x) for x in file_name]
#file_dir

In [32]:
#for filename in file_name: 
#    if "2021-03-31" in filename:
#        newname = filename.replace('2021-03-31.', '')  
#    if "2021-04-02" in filename:
#        newname = filename.replace('2021-04-02.', '')  
#    if "2021-10-14" in filename: 
#        newname = filename.replace('2021-10-14.', '')
#    if "2022-04-13" in filename: 
#        newname = filename.replace('2022-04-13.', '')
#    if not(os.path.exists('../Data/'+newname)):
#        os.rename('../Data/CMD/'+filename, '../Data/'+newname)
#

In [3]:
# List every entry under ../Data/ together with its relative path.
# os.listdir() returns names in arbitrary order, so sort them to make the
# listing (and anything iterating over it) reproducible across runs and machines.
dir_str=r"../Data/"
file_name=sorted(os.listdir(dir_str))
file_dir=[os.path.join(dir_str,x) for x in file_name]
file_dir

['../Data/.Rhistory',
 '../Data/AsnicarF_2017_milk_META.txt',
 '../Data/AsnicarF_2017_milk_Pathway.txt',
 '../Data/AsnicarF_2017_milk_Relative.txt',
 '../Data/AsnicarF_2017_stool_META.txt',
 '../Data/AsnicarF_2017_stool_Pathway.txt',
 '../Data/AsnicarF_2017_stool_Relative.txt',
 '../Data/AsnicarF_2021_stool_META.txt',
 '../Data/AsnicarF_2021_stool_Pathway.txt',
 '../Data/AsnicarF_2021_stool_Relative.txt',
 '../Data/BackhedF_2015_stool_META.txt',
 '../Data/BackhedF_2015_stool_Pathway.txt',
 '../Data/BackhedF_2015_stool_Relative.txt',
 '../Data/Bengtsson-PalmeJ_2015_stool_META.txt',
 '../Data/Bengtsson-PalmeJ_2015_stool_Pathway.txt',
 '../Data/Bengtsson-PalmeJ_2015_stool_Relative.txt',
 '../Data/BritoIL_2016_oralcavity_META.txt',
 '../Data/BritoIL_2016_oralcavity_Pathway.txt',
 '../Data/BritoIL_2016_oralcavity_Relative.txt',
 '../Data/BritoIL_2016_stool_META.txt',
 '../Data/BritoIL_2016_stool_Pathway.txt',
 '../Data/BritoIL_2016_stool_Relative.txt',
 '../Data/BrooksB_2017_stool_META.txt'

# Load project dictionary

In [4]:
# Build {project id: [field, field, ...]} from the label table.
# Transposing first turns every project id into a column, so to_dict('list')
# maps each id to the list of that project's fields. load_project() reads
# fields [0] and [1] as the data-file prefix and [2], [3] as the two
# study_condition values to compare.
projects = (
    pd.read_csv('../Label/fenzu.csv', index_col=0)
    .T
    .to_dict('list')
)
projects

{'1': ['AsnicarF_2017', 'milk', 'control', nan],
 '2': ['AsnicarF_2017', 'stool', 'control', nan],
 '3': ['AsnicarF_2021', 'stool', 'control', nan],
 '4': ['BackhedF_2015', 'stool', 'control', nan],
 '5': ['Bengtsson-PalmeJ_2015', 'stool', 'control', nan],
 '6': ['BritoIL_2016', 'oralcavity', 'control', nan],
 '7': ['BritoIL_2016', 'stool', 'control', nan],
 '8': ['BrooksB_2017', 'stool', 'control', 'premature_born'],
 '9': ['Castro-NallarE_2015', 'oralcavity', 'control', 'schizofrenia'],
 '10': ['ChengpingW_2017', 'stool', 'AS', nan],
 '11': ['ChngKR_2016', 'skin', 'control', 'AD'],
 '12': ['ChuDM_2017', 'oralcavity', 'control', nan],
 '13': ['ChuDM_2017', 'stool', 'control', nan],
 '14': ['CosteaPI_2017', 'stool', 'control', nan],
 '15': ['DavidLA_2015', 'stool', 'control', 'acute_diarrhoea'],
 '16': ['DeFilippisF_2019', 'stool', 'control', nan],
 '17': ['DhakanDB_2019', 'stool', 'control', nan],
 '18_1': ['FengQ_2015', 'stool', 'control', 'adenoma'],
 '18_2': ['FengQ_2015', 'stool',

# miMatch

In [5]:
%run miMatch.py

In [6]:
# `test` is not defined anywhere in this notebook; it is presumably brought
# into the kernel namespace by the `%run miMatch.py` cell above -- confirm
# in miMatch.py.
test()

    N Control  Mean Control  N Case  Mean Case  cohen's d  Fold change  \
F1        100      5.041349     100   4.483199   0.560422     0.889286   
F2        100      1.919333     100   2.745474   0.791626     1.430431   
F3        100      2.938563     100   3.554254   0.674756     1.209521   

         p-value       fdr  
F1  1.545838e-04  0.000155  
F2  4.912068e-07  0.000001  
F3  9.921885e-06  0.000015  
    N Control  Mean Control  N Case  Mean Case  cohen's d  Fold change  \
F1         98      4.458103      98   4.527594   0.073514     1.015588   
F2         98      2.527884      98   2.725603   0.209926     1.078215   
F3         98      3.680662      98   3.537779   0.153638     0.961180   

     p-value       fdr  
F1  0.548001  0.548001  
F2  0.310651  0.504858  
F3  0.336572  0.504858  


In [7]:
def matched_result_check(sample_size, balance_stats):
    """Evaluate the two acceptance checks for one matching result.

    sample_check -- the smallest Matched/All ratio across the columns of
        `sample_size` is at least 0.5.
    p_check -- the smallest 'pvalue' among the rows of `balance_stats`
        with isMatch == 'matched' is greater than 0.05.

    Returns the pair (sample_check, p_check).
    """
    retained_fraction = sample_size.loc['Matched', :] / sample_size.loc['All', :]
    sample_check = retained_fraction.min() >= 0.5

    is_matched = balance_stats['isMatch'] == 'matched'
    matched_pvalues = balance_stats.loc[is_matched, 'pvalue']
    p_check = matched_pvalues.min() > 0.05

    return sample_check, p_check

def select_params(selector):
    """Choose the (caliper, ratio) candidate with the best balance score.

    Parameters
    ----------
    selector : list of [caliper, ratio, sample_size, balance_stats]
        One entry per candidate that was run. `sample_size` and
        `balance_stats` are passed to matched_result_check(); `balance_stats`
        must also hold at least two rows with isMatch == 'matched' and
        feature == 'distance'.

    Returns
    -------
    (best_caliper, best_ratio, best_p)
        The score of a candidate is the product of its first two matched
        'distance' p-values (the original column names suggest t-test and
        Wilcoxon -- confirm in miMatch.py). Candidates are drawn from the
        first non-empty tier:
          1. candidates passing both the sample check and the p check,
          2. candidates passing the p check,
          3. all candidates.
        Within the tier the highest score wins; on ties the later candidate
        wins. (0, 0, 0) is returned when `selector` is empty or no score is
        comparable (e.g. all NaN).

    Raises
    ------
    IndexError
        If a candidate has fewer than two matched 'distance' p-values.
    """
    # Work on plain Python values rather than a results DataFrame. This avoids
    # calling float() on a one-element Series (deprecated in pandas 2.1) and
    # lets the three tiers share a single selection loop.
    candidates = []
    for caliper, ratio, sample_size, balance_stats in selector:
        is_distance = (balance_stats['isMatch'] == 'matched') & (balance_stats['feature'] == 'distance')
        distance_p = list(balance_stats.loc[is_distance, 'pvalue'])
        sample_check, p_check = matched_result_check(sample_size, balance_stats)
        score = distance_p[0] * distance_p[1]
        candidates.append((caliper, ratio, score, bool(sample_check), bool(p_check)))

    passes_both = [c for c in candidates if c[3] and c[4]]
    passes_p = [c for c in candidates if c[4]]
    pool = passes_both or passes_p or candidates

    best_caliper, best_ratio, best_p = 0, 0, 0
    for caliper, ratio, score, _, _ in pool:
        # '>=' keeps the original tie-breaking (later candidate wins) and
        # skips NaN scores, for which the comparison is False.
        if score >= best_p:
            best_caliper, best_ratio, best_p = caliper, ratio, float(score)
    return best_caliper, best_ratio, best_p

In [8]:
# Batch propensity-score matching over the projects listed in `project_bf`.
#
# NOTE: `project_bf` is defined in the `key_bf = [...]` cell further down this
# notebook; that cell must be executed before this one. `run_miMatch`,
# `diff_by_rank_sum` and `diff_by_signed_rank` are not defined in the notebook
# and are presumably provided by `%run miMatch.py`.

def _difference_table(data, pairs):
    """Combine the unmatched ('(raw)') and matched ('(PSM)') difference
    statistics for every feature column of `data` side by side, sorted by the
    matched FDR column."""
    features = [col for col in data.columns if col != 'Group']
    raw = diff_by_rank_sum(data, target='Group', features=features)
    raw.columns = [col + '(raw)' for col in raw.columns]
    psm = diff_by_signed_rank(data, pairs, features=features)
    psm.columns = [col + '(PSM)' for col in psm.columns]
    return pd.concat([raw, psm], axis=1, sort=False).sort_values(['fdr(PSM)'])


ps_result = []
print(projects.keys())
for pi in project_bf.keys():
    project = projects[pi]
    # Matching needs two conditions; skip projects that list only one.
    if pd.isnull(project[2]) or pd.isnull(project[3]):
        continue
    metadata, microbiome, pathway = load_project(project)
    # Group label: 0 for the first condition (project[2]), 1 otherwise.
    group = [0 if i == project[2] else 1 for i in metadata['study_condition']]
    microbiome['Group'] = group
    pathway['Group'] = group
    output_dir = '../Temp/'+pi+'/'

    ### select params
    selector = []
    for caliper in (0.1, 0.15, 0.2, 0.25):
        for ratio in (1, 2, 3):
            params = [('output', 'output_dir', output_dir), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(ratio))]
            try:
                sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_miMatch(pathway, target='Group', params=params, is_pca=True)
                # Validation only: a candidate whose result tables cannot be
                # checked is dropped here instead of failing in select_params.
                matched_result_check(sample_size, balance_stats)
            except Exception as exc:
                # Still best-effort (the candidate is skipped), but the failure
                # is reported and KeyboardInterrupt is no longer swallowed.
                print(pi, 'caliper', caliper, 'ratio', ratio, 'failed:', repr(exc))
                continue
            selector.append([caliper, ratio, sample_size, balance_stats])
    best_caliper, best_ratio, best_p = select_params(selector)
    if best_ratio == 0:
        # select_params returns (0, 0, 0) when no candidate could be selected;
        # matching with caliper 0 / ratio 0 would be meaningless.
        print(pi, project[0], project[2], project[3], 'skipped: no usable caliper/ratio candidate')
        continue

    ### run with best params
    params = [('output', 'output_dir', output_dir), ('psm', 'caliper', str(best_caliper)), ('psm', 'ratio', str(best_ratio))]
    sample_size, match_drop_unmatched, match, pairs, sum_matched, balance_stats = run_miMatch(pathway, target='Group', params=params, is_pca=True)
    # Report the checks for the parameters actually chosen, not for whichever
    # candidate happened to be tried last during the search.
    sample_check, p_check = matched_result_check(sample_size, balance_stats)
    print(pi, project[0], project[2], project[3], best_caliper, best_ratio, sample_check, p_check)
    ps_result.append([pi, project[0], project[2], project[3], best_caliper, best_ratio])

    ### difference microbiome
    result = _difference_table(microbiome, pairs)
    result.to_csv(output_dir+'Microbiome_difference.csv')

    ### differential pathway
    result = _difference_table(pathway, pairs)
    result.to_csv(output_dir+'Pathway_difference.csv')

    ### raw data
    metadata.to_csv(output_dir+'Metadata.csv')
    microbiome.to_csv(output_dir+'Microbiome.csv')
    pathway.to_csv(output_dir+'Pathway.csv')

ps_result = pd.DataFrame(ps_result, columns=['ID', 'Pname', 'PControl', 'PCase', 'Caliper', 'Ratio'])
ps_result.to_csv('../Temp/PSM_check(0.25_2).csv')

dict_keys(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18_1', '18_2', '18_3', '19', '20', '21', '22', '23', '24_1', '24_2', '24_3', '25', '26', '27', '28_1', '28_2', '28_3', '29', '30', '31', '32', '33', '34', '35', '36', '37_1', '37_2', '37_3', '38', '39', '40_1', '40_2', '40_3', '41', '42', '43', '44', '45', '46', '47', '48_1', '48_2', '48_3', '48_4', '48_5', '48_6', '49_1', '49_2', '49_3', '50_1', '50_2', '50_3', '51', '52', '53', '54_1', '54_2', '54_3', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81_1', '81_2', '81_3', '82', '83', '84_1', '84_2', '84_3', '84_4', '84_5', '84_6', '84_7', '84_8', '84_9', '84_10', '84_11', '84_12', '84_13', '84_14', '84_15', '84_16', '84_17', '85', '86', '87', '88', '89', '90', '91', '92_1', '92_2', '92_3', '93_1', '93_2', '93_3', '93_4', '93_5', '93_6', '94', '95', '96', '97', '98',

# Test part: step-by-step run on a single project

In [9]:
# Manual walkthrough of the per-project preparation for one project id.
# NOTE: this rebinds pi / project / metadata / microbiome / pathway, the same
# names the batch-matching loop assigns, so whichever cell ran last wins.
pi='08'
project = projects[pi]
metadata, microbiome, pathway = load_project(project)
# Group label: 0 when study_condition equals project[2], 1 otherwise.
microbiome['Group'] = [0 if i == project[2] else 1 for i in metadata['study_condition']]
pathway['Group'] = [0 if i == project[2] else 1 for i in metadata['study_condition']]
# Both previews are rendered because ast_node_interactivity is set to 'all'
# in the first cell.
microbiome.head(2)
pathway.head(2)

Unnamed: 0,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Klebsiella|s__Klebsiella_pneumoniae,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Klebsiella|s__Klebsiella_variicola,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Klebsiella|s__Klebsiella_quasipneumoniae,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Lactococcus|s__Lactococcus_lactis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Staphylococcaceae|g__Staphylococcus|s__Staphylococcus_epidermidis,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_faecalis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_agalactiae,k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Veillonella|s__Veillonella_dispar,k__Bacteria|p__Firmicutes|c__Tissierellia|o__Tissierellales|f__Peptoniphilaceae|g__Finegoldia|s__Finegoldia_magna,...,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Kosakonia|s__Kosakonia_sp_S29,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_turicensis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_hirae,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_glucuronolyticum,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Morganellaceae|g__Proteus|s__Proteus_mirabilis,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Propionibacteriales|f__Propionibacteriaceae|g__Cutibacteriu
m|s__Propionibacterium_namnetense,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales|f__Malasseziaceae|g__Malassezia|s__Malassezia_restricta,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micrococcales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micrococcales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_aloeverae,Group
N2_031_008G1,71.74142,19.43325,8.80403,0.01135,0.00995,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
N2_031_010G1,71.6604,18.55608,9.7741,0.0,0.00625,0.00317,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


Unnamed: 0,UNMAPPED,UNINTEGRATED,PWY0-1586: peptidoglycan maturation (meso-diaminopimelate containing),PWY-1042: glycolysis IV (plant cytosol),PWY-7111: pyruvate fermentation to isobutanol (engineered),"PWY-7013: L-1,2-propanediol degradation",PWY-621: sucrose degradation III (sucrose invertase),PWY-6936: seleno-amino acid biosynthesis,SER-GLYSYN-PWY: superpathway of L-serine and glycine biosynthesis I,ASPASN-PWY: superpathway of L-aspartate and L-asparagine biosynthesis,...,PWY-6565: superpathway of polyamine biosynthesis III,PWY-6572: chondroitin sulfate degradation I (bacterial),PWY-7312: dTDP-D-&beta;-fucofuranose biosynthesis,PWY-6478: GDP-D-glycero-&alpha;-D-manno-heptose biosynthesis,"PWY-6992: 1,5-anhydrofructose degradation",PWY66-201: nicotine degradation IV,PWY-7626: bacilysin biosynthesis,PWY-6876: isopropanol biosynthesis,PWY-3502: superpathway of NAD biosynthesis in eukaryotes,Group
N2_031_008G1,0.063191,0.855767,0.001267,0.001178,0.000841,0.000792,0.000671,0.000628,0.000627,0.000616,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
N2_031_010G1,0.060563,0.858861,0.001246,0.001153,0.000857,0.000841,0.000654,0.000601,0.000662,0.000589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [7]:
# Build, by hand, the miMatch parameter list for a single (caliper, ratio)
# candidate. Requires `pi` from the cell above.
# Each entry is a 3-tuple whose last element is passed as a string
# (presumably section / option / value overrides -- confirm in miMatch.py).
selector = []
caliper, ratio=(0.1, 1)
params = [('output', 'output_dir', '../Temp/'+pi+'/'), ('psm', 'caliper', str(caliper)), ('psm', 'ratio', str(ratio))]
params

[('output', 'output_dir', '../Temp/08/'),
 ('psm', 'caliper', '0.1'),
 ('psm', 'ratio', '1')]

In [6]:
# Subset of project ids to process; the batch-matching loop iterates over
# project_bf, so this cell has to be run before that loop.
key_bf = ['97', '98', '99_1', '99_2', '99_3', '100', '101']
project_bf = {key: projects[key] for key in key_bf}
print(project_bf)

{'97': ['YuJ_2015', 'stool', 'control', 'CRC'], '98': ['ZeeviD_2015', 'stool', 'control', nan], '99_1': ['ZellerG_2014', 'stool', 'control', 'adenoma'], '99_2': ['ZellerG_2014', 'stool', 'control', 'CRC'], '99_3': ['ZellerG_2014', 'stool', 'CRC', 'adenoma'], '100': ['ZhuF_2020', 'stool', 'control', 'schizofrenia'], '101': ['NagySzakalD_2017', 'stool', 'control', 'ME/CFS']}
