# Explorative Data Analysis (EDA) for demographic information and distribution

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pickle as pkl
from sklearn.model_selection import train_test_split
from pathlib import Path

## Exploration of the meta data for demographic information

In [2]:
# load data that includes meta information
merged_df = pd.read_csv('../../data/raw/healthy_subset_df_with_meta.csv')

  merged_df = pd.read_csv('../../data/healthy_subset_df_with_meta.csv')


In [5]:
# see the number of unique values in the 'subject_id' and 'study_name' columns
print(f"The data contains samples from {len(merged_df['subject_id'].unique())} unique subjects")
print(f"The data contains samples from {len(merged_df['study_name'].unique())} different studies")

The data contains samples from 4845 unique subjects
The data contains samples from 47 different studies


In [6]:
# get descriptive statistics of the age column 
print(merged_df['age'].describe())
print(f"The mean age of the participants was {merged_df['age'].mean():.2f} years (SD = {merged_df['age'].std():.2f}).")

count    5502.000000
mean       46.144311
std        15.656810
min        18.000000
25%        32.000000
50%        46.000000
75%        58.000000
max        91.000000
Name: age, dtype: float64
The mean age of the participants was 46.14 years (SD = 15.66).


In [8]:
# different genders
print(len(merged_df['gender'].unique()))
print(merged_df['gender'].unique())

3
['male' 'female' nan]


In [11]:
# gender distribution including NaN values
gender_distribution = merged_df['gender'].value_counts(normalize=True, dropna=False) * 100
gender_distribution_df = pd.DataFrame(gender_distribution).reset_index()
gender_distribution_df.columns = ['gender', 'percentage']
total_row = pd.DataFrame([{'gender': 'Total', 'percentage': 100, 'n': merged_df['gender'].notna().sum()}])
gender_distribution_df = pd.concat([total_row, gender_distribution_df], ignore_index=True)
gender_distribution_df['n'] = (gender_distribution_df['percentage'] / 100 * merged_df['gender'].count()).round()
print(gender_distribution_df)


   gender  percentage       n
0   Total  100.000000  5425.0
1  female   57.161032  3101.0
2    male   41.439477  2248.0
3     NaN    1.399491    76.0


## Exploration of the data distribution

In [8]:
root = Path.cwd().parents[1]
file_path = root / "data" / "raw" / "healthy_subset_df.csv"
healthy_subset_df = pd.read_csv(file_path)

# identify lactobacillus columns
lacto_columns = [col for col in healthy_subset_df.columns if 'lactobacillus' in col.lower()]

# count rows where at least one lactobacillus column has a value of 0.01 or above
rows_with_lacto_above_threshold = healthy_subset_df[lacto_columns].ge(0.01).any(axis=1).sum()

# count how many columns are in lacto_columns
num_lacto_columns = len(lacto_columns)

# calculate the total sum across all lactobacillus columns
total_lacto_sum = healthy_subset_df[lacto_columns].sum().sum()

# filter for numeric columns only
numeric_df = healthy_subset_df.select_dtypes(include='number')

# calculate the total sum across all numeric columns
total_sum = numeric_df.sum().sum()

# calculate the percentage of the total abundance that lactobacillus columns correspond to
lacto_percentage_of_total = (total_lacto_sum / total_sum) * 100

# calculate the percentage each lactobacillus column contributes to the total lactobacillus sum
column_percentages = healthy_subset_df[lacto_columns].sum() / total_lacto_sum * 100

# count how many columns correspond to less than 0.1% of the total lactobacillus sum
lacto_columns_below_0_1_percent = (column_percentages < 0.1).sum()

print(f"Number of rows with at least one lactobacillus column value of 0.01 or above: {rows_with_lacto_above_threshold}")
print(f"Number of columns in lacto_columns: {num_lacto_columns}")
print(f"Number of lactobacillus columns with less than 0.1% of the total lactobacillus abundance: {lacto_columns_below_0_1_percent}")
print(f"Percentage of total abundance that lactobacillus species correspond to: {lacto_percentage_of_total:.2f}%")


Number of rows with at least one lactobacillus column value of 0.01 or above: 2700
Number of columns in lacto_columns: 83
Number of lactobacillus columns with less than 0.1% of the total lactobacillus abundance: 56
Percentage of total abundance that lactobacillus species correspond to: 0.01%


In [4]:
# print column names
for col in healthy_subset_df.columns:
    print(col)

sample_id
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_vulgatus
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_stercoris
k__Bacteria|p__Firmicutes|c__Negativicutes|o__Acidaminococcales|f__Acidaminococcaceae|g__Acidaminococcus|s__Acidaminococcus_intestini
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae|g__Eubacterium|s__Eubacterium_sp_CAG_38
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Tannerellaceae|g__Parabacteroides|s__Parabacteroides_distasonis
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_faecis
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Lachnospiraceae_unclassified|s__Eubacterium_rectale
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_uniformis
k__Bacteria|p__Pr

In [5]:
# display df
healthy_subset_df

Unnamed: 0,sample_id,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_vulgatus,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_stercoris,k__Bacteria|p__Firmicutes|c__Negativicutes|o__Acidaminococcales|f__Acidaminococcaceae|g__Acidaminococcus|s__Acidaminococcus_intestini,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae|g__Eubacterium|s__Eubacterium_sp_CAG_38,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Tannerellaceae|g__Parabacteroides|s__Parabacteroides_distasonis,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_faecis,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Lachnospiraceae_unclassified|s__Eubacterium_rectale,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_uniformis,k__Bacteria|p__Proteobacteria|c__Proteobacteria_unclassified|o__Proteobacteria_unclassified|f__Proteobacteria_unclassified|g__Proteobacteria_unclassified|s__Proteobacteria_bacterium_CAG_139,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Staphylococcaceae|g__Staphylococcus|s__Staphylococcus_vitulinus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_crustorum,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Klebsiella|s__Klebsiella_quasivariicola,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Alkaliphilus|s__Alkaliphilus_oremlandii,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Paenibacillaceae|g__Paenibacillus|s__Paenibacillus_polysaccharolyticus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_frumenti,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_falsenii,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_avium,k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Alcaligenaceae|g__Oligella|s__Oligella_ureolytica,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Paenibacillaceae|g__Paenibacillus|s__Paenibacillus_sp_7884_2
1,SAMEA7041133,28.73852,26.47003,4.20975,4.00703,3.50675,3.06530,2.98546,2.88462,2.68337,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
2,SAMEA7041134,1.23552,0.00000,0.00000,0.29849,0.74520,3.22400,5.01580,0.29423,0.00000,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
3,SAMEA7041135,2.26906,0.99832,2.13706,0.06362,1.35183,0.94276,3.96864,5.71884,0.00028,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
4,SAMEA7041136,4.47529,0.61157,0.00000,0.03127,0.48709,0.00000,4.53838,1.65956,0.00491,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5,SAMEA7041137,0.00000,0.00000,0.00000,0.72104,1.20537,3.29252,2.47252,5.42743,0.00234,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5498,wHAXPI034920-8,25.60556,45.20126,0.00000,3.38668,1.53275,0.00618,0.00240,4.07668,2.33042,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5499,wHAXPI034921-9,16.16002,0.18648,0.00000,1.04148,0.49642,0.66276,1.53953,12.19125,0.74415,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5500,wHAXPI034922-11,84.97335,0.19644,0.00000,1.75105,0.00933,0.00827,0.00315,0.04653,3.64185,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5501,wHAXPI034923-12,5.81158,0.55622,0.00000,0.67390,0.65899,0.03672,0.38077,1.66165,0.00698,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0


In [6]:
# get dimensions of healthy_subset_df. I subtract 1 because the first column is the sample ID 
print(f"The data frame contains {healthy_subset_df.shape[0]} samples and {healthy_subset_df.shape[1]-1} genes.")

The data frame contains 5502 samples and 1478 genes.


In [7]:
# samples as rows, genes as columns
display(healthy_subset_df)

Unnamed: 0,sample_id,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_vulgatus,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_stercoris,k__Bacteria|p__Firmicutes|c__Negativicutes|o__Acidaminococcales|f__Acidaminococcaceae|g__Acidaminococcus|s__Acidaminococcus_intestini,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Eubacteriaceae|g__Eubacterium|s__Eubacterium_sp_CAG_38,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Tannerellaceae|g__Parabacteroides|s__Parabacteroides_distasonis,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_faecis,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Lachnospiraceae_unclassified|s__Eubacterium_rectale,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_uniformis,k__Bacteria|p__Proteobacteria|c__Proteobacteria_unclassified|o__Proteobacteria_unclassified|f__Proteobacteria_unclassified|g__Proteobacteria_unclassified|s__Proteobacteria_bacterium_CAG_139,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Staphylococcaceae|g__Staphylococcus|s__Staphylococcus_vitulinus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_crustorum,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Klebsiella|s__Klebsiella_quasivariicola,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Alkaliphilus|s__Alkaliphilus_oremlandii,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Paenibacillaceae|g__Paenibacillus|s__Paenibacillus_polysaccharolyticus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_frumenti,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_falsenii,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_avium,k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Alcaligenaceae|g__Oligella|s__Oligella_ureolytica,k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Paenibacillaceae|g__Paenibacillus|s__Paenibacillus_sp_7884_2
1,SAMEA7041133,28.73852,26.47003,4.20975,4.00703,3.50675,3.06530,2.98546,2.88462,2.68337,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
2,SAMEA7041134,1.23552,0.00000,0.00000,0.29849,0.74520,3.22400,5.01580,0.29423,0.00000,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
3,SAMEA7041135,2.26906,0.99832,2.13706,0.06362,1.35183,0.94276,3.96864,5.71884,0.00028,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
4,SAMEA7041136,4.47529,0.61157,0.00000,0.03127,0.48709,0.00000,4.53838,1.65956,0.00491,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5,SAMEA7041137,0.00000,0.00000,0.00000,0.72104,1.20537,3.29252,2.47252,5.42743,0.00234,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5498,wHAXPI034920-8,25.60556,45.20126,0.00000,3.38668,1.53275,0.00618,0.00240,4.07668,2.33042,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5499,wHAXPI034921-9,16.16002,0.18648,0.00000,1.04148,0.49642,0.66276,1.53953,12.19125,0.74415,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5500,wHAXPI034922-11,84.97335,0.19644,0.00000,1.75105,0.00933,0.00827,0.00315,0.04653,3.64185,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0
5501,wHAXPI034923-12,5.81158,0.55622,0.00000,0.67390,0.65899,0.03672,0.38077,1.66165,0.00698,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,0


In [8]:
# extract indices for columns that include the word 'Lactobacillus' (case-insensitive)
lacto_index = [i for i, s in enumerate(healthy_subset_df.columns) if 'lactobacillus' in s.lower()]

In [9]:
# calculate number of samples (rows) in healthy_subset_df.columns in which all 83 lactobacillus row values are 0
num_zero_lacto = healthy_subset_df.iloc[:,lacto_index].eq(0).all(axis=1).sum()
print(f"Across all {healthy_subset_df.shape[0]} samples, there are {num_zero_lacto} samples that have 0 counts for all 83 Lactobacillus species. This corresponds to {num_zero_lacto/(healthy_subset_df.shape[0])*100:.4f}% of the samples.")

Across all 5502 samples, there are 2103 samples that have 0 counts for all 83 Lactobacillus species. This corresponds to 38.2225% of the samples.


In [10]:
# print descriptive status for all 83 Lactobacillus species
lacto_df = healthy_subset_df.iloc[:,lacto_index]
display(lacto_df.describe())

# print the number of unique values for each Lactobacillus species
unique_vals = lacto_df.nunique()
print(f"The number of unique values for each Lactobacillus species is as follows: \n{unique_vals}")

Unnamed: 0,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_rogosae,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_ruminis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_delbrueckii,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_brevis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_acidophilus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_reuteri,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_paragasseri,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_gasseri,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_rhamnosus,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_pantheris,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_kisonensis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_rapi,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_xiangfangensis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_manihotivorans,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_murinus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_intestinalis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_dextrinicus,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_crustorum,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_frumenti
count,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,...,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0,5502.0
mean,0.021327,0.139363,0.020942,0.000238,0.00717,0.006679,0.016677,0.010084,0.003074,0.003176,...,0.0,0.0,0.0,0.0,5.816067e-08,2e-06,0.0,0.000579,3e-06,0.0
std,0.068771,0.832285,0.183655,0.006764,0.074638,0.189679,0.688821,0.236043,0.062446,0.076103,...,0.0,0.0,0.0,0.0,4.314095e-06,9.4e-05,0.0,0.024908,0.000214,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.00937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.50641,18.15348,6.06072,0.41698,3.18034,10.11799,40.24085,11.47367,2.54116,5.26019,...,0.0,0.0,0.0,0.0,0.00032,0.00569,0.0,1.63868,0.01586,0.0


The number of unique values for each Lactobacillus species is as follows: 
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_rogosae         1732
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_ruminis          685
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_delbrueckii      493
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_brevis            24
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_acidophilus      299
                                                                                                                              ... 
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_murinus            5
k__Bacte