In [42]:
import os
import pandas as pd
import biom
import qiime2
from qiime2.plugins import (taxa)
from tqdm import tqdm


# Project layout, resolved relative to the current working directory.
base_dir = os.getcwd()
data_dir, results_dir = (os.path.join(base_dir, folder) for folder in ('data', 'results'))
merged_dir = os.path.join(results_dir, 'merged')

# Output folders below merged_dir:
#   merged_composition_dir             - one 'level-<n>.csv' composition table per taxonomic level
#   merged_composition_by_bodysite_dir - one '<rank>_composition.csv' per rank, averaged per body-site class
merged_composition_dir, merged_composition_by_bodysite_dir = (
    os.path.join(merged_dir, folder)
    for folder in ('merged_composition', 'merged_composition_by_bodysite')
)

In [43]:
# Load the merged feature table and the taxonomy assigned to its features.
merged_table_clean = qiime2.Artifact.load(os.path.join(merged_dir, 'merged_table_clean.qza'))
merged_taxonomy_class = qiime2.Artifact.load(os.path.join(merged_dir, 'merged_taxonomy_class.qza'))

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before the loop starts writing into it.
os.makedirs(merged_composition_dir, exist_ok=True)

# Rank prefix looked for in the collapsed feature labels, indexed by collapse
# level. Slots 0 and 1 are placeholders: the loop only visits levels 2..7.
tax_list = ['', '', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
for i in tqdm(range(2, 8)):
    # Merge the features of the table that share the same taxonomy at level i.
    collapsed_table = taxa.methods.collapse(
        merged_table_clean, merged_taxonomy_class, level=i
    ).collapsed_table

    # Transposed so that every row can be normalised by its own total.
    counts = collapsed_table.view(biom.Table).to_dataframe().T

    # Express each row as a percentage of its row total. A row that sums to 0
    # produces NaN from the division; those cells are reset to 0.
    row_sums = counts.sum(axis=1)
    composition = (counts.div(row_sums, axis=0) * 100).fillna(0)

    # Keep only the labels that contain this level's rank prefix, then drop
    # the columns that are zero in every row. The percentages were computed
    # before this filtering, so they stay relative to the unfiltered total.
    composition = composition.filter(like=tax_list[i])
    composition = composition.loc[:, (composition != 0).any(axis=0)]

    composition.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
print('STEP 4  Done!')


  0%|          | 0/6 [00:00<?, ?it/s]

  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
  temp.to_csv(os.path.join(merged_composition_dir, 'level-' + str(i) + '.csv'))
100%|██████████| 6/6 [00:04<00:00,  1.45it/s]

STEP 4  Done!





In [44]:
# Taxonomic rank name -> collapse level used in the 'level-<n>.csv' file names.
microbiota_taxonomy_type_dict = {'Phylum': 2, 'Class': 3, 'Order': 4, 'Family': 5, 'Genus': 6, 'Species': 7}
merged_sample_metadata_df = pd.read_csv(os.path.join(data_dir, 'sample_metadata.txt'), sep='\t')
# NOTE(review): bodysite_list is not used in this cell, and it is built from
# 'BodySite' while the grouping below uses 'BodysiteClass' - confirm which
# column is intended.
bodysite_list = merged_sample_metadata_df['BodySite'].unique().tolist()

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before the loop starts writing into it.
os.makedirs(merged_composition_by_bodysite_dir, exist_ok=True)

for key, value in microbiota_taxonomy_type_dict.items():
    level_df = pd.read_csv(os.path.join(merged_composition_dir, 'level-' + str(value) + '.csv'))

    # The first CSV column is the saved index and is used as the sample ID.
    sample_id_col = level_df.columns[0]
    # Keep labels that start at the domain rank ('d__') and do not end in a
    # bare '__', i.e. whose last rank actually carries a name.
    taxon_cols = [
        col for col in level_df.columns[1:]
        if col.startswith('d__') and not col.endswith('__')
    ]

    # Shorten 'd__...;x__Name' to 'Name': last ';'-separated rank, minus its
    # three-character prefix.
    # NOTE(review): taxa from different lineages that share the same last-rank
    # name end up as columns with identical headers - confirm this is acceptable.
    taxa_df = level_df.loc[:, [sample_id_col] + taxon_cols]
    taxa_df.columns = ['SampleID'] + [col.split(';')[-1][3:] for col in taxon_cols]

    # Attach each sample's body-site class; samples missing on either side are
    # dropped by the inner join.
    merged_df = pd.merge(
        merged_sample_metadata_df[['SampleID', 'BodysiteClass']],
        taxa_df,
        on='SampleID',
        how='inner',
    )

    # Mean composition per body-site class.
    composition_df = (
        merged_df
        .drop(columns='SampleID')
        .groupby(['BodysiteClass'])
        .mean()
        .reset_index()
    )
    composition_df.to_csv(os.path.join(merged_composition_by_bodysite_dir, key + '_composition.csv'), index=False)
