# Application to 16S rRNA amplicon analysis of the Finnish children

## Environment settings
```sh
# Working directory: every relative path used below is resolved from here.
# NOTE(review): "Bac2fFeature" looks like a typo of "Bac2Feature" — confirm against the repository root name.
cd Bac2fFeature/scripts/07_application_gutmicrobiome

# Output directories.
# The results directory is included because the plotting cells save
# fig4a-fig4d.pdf there, and saving fails when the directory is missing.
directories=(
  "../../data/t1d"
  "../../data/t1d/qiime2"
  "../../data/t1d_samples"
  "../../results/07_application_gutmicrobiome"
)

# `mkdir -p` creates missing parents and is a no-op for directories that
# already exist, so no explicit existence check is needed.
mkdir -p -- "${directories[@]}"
```

## DADA2 using Qiime2 pipeline
Note:  
The number of forward reads and the number of reverse reads in some samples were different and using all samples resulted in a DADA2 error.  
We therefore used 654 samples where the number of forward reads and the number of reverse reads were equal.
```sh
conda activate qiime2-2023.5

# Every QIIME 2 artifact of this analysis is written to and read from this directory.
qiime2_dir="../../data/t1d/qiime2"

# Import (replace <path_to_data_dir> in Manifest_study_sample.csv with the absolute path of the data directory).
# The artifact is written into ${qiime2_dir} because the summarize and DADA2 steps read it from there.
qiime tools import \
  --type 'SampleData[PairedEndSequencesWithQuality]' \
  --input-path Manifest_study_sample.csv \
  --output-path "${qiime2_dir}/demux_study_sample.qza" \
  --input-format PairedEndFastqManifestPhred33
qiime demux summarize \
  --i-data "${qiime2_dir}/demux_study_sample.qza" \
  --o-visualization "${qiime2_dir}/demux_study_sample.qzv"

# DADA2
qiime dada2 denoise-paired \
  --i-demultiplexed-seqs "${qiime2_dir}/demux_study_sample.qza" \
  --p-trunc-len-f 0 \
  --p-trunc-len-r 160 \
  --p-n-threads 4 \
  --o-table "${qiime2_dir}/table_study_sample.qza" \
  --o-denoising-stats "${qiime2_dir}/stats_study_sample.qza" \
  --o-representative-sequences "${qiime2_dir}/reps_sequences_study_sample.qza" \
  --verbose

# Export representative sequences and the feature table.
# (dna-sequences.fasta is the input of Bac2Feature; feature-table.biom is converted to TSV below.)
qiime tools export \
  --input-path "${qiime2_dir}/reps_sequences_study_sample.qza" \
  --output-path "${qiime2_dir}/"
qiime tools export \
  --input-path "${qiime2_dir}/table_study_sample.qza" \
  --output-path "${qiime2_dir}/"
biom convert \
  -i "${qiime2_dir}/feature-table.biom" \
  -o "${qiime2_dir}/feature_table_study_sample.tsv" \
  --to-tsv

# Pre-trained SILVA 138 classifier
wget -P "${qiime2_dir}" https://data.qiime2.org/2023.7/common/silva-138-99-nb-classifier.qza

# Classifying taxonomy
qiime feature-classifier classify-sklearn \
  --i-classifier "${qiime2_dir}/silva-138-99-nb-classifier.qza" \
  --i-reads "${qiime2_dir}/reps_sequences_study_sample.qza" \
  --o-classification "${qiime2_dir}/taxonomy_study_sample.qza"

# Export the taxonomy (taxonomy.tsv). This must run after classify-sklearn,
# which is the step that creates taxonomy_study_sample.qza.
qiime tools export \
  --input-path "${qiime2_dir}/taxonomy_study_sample.qza" \
  --output-path "${qiime2_dir}/"

# Barplot
qiime taxa barplot \
  --i-table "${qiime2_dir}/table_study_sample.qza" \
  --i-taxonomy "${qiime2_dir}/taxonomy_study_sample.qza" \
  --m-metadata-file "${qiime2_dir}/qiime_metadata.tsv" \
  --o-visualization "${qiime2_dir}/taxa_barplot.qzv"
# Please obtain the file "level-7.csv" from the QIIME 2 viewer and copy it to ../../data/t1d/qiime2/level-7.csv
```

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib
import matplotlib.pyplot as plt

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Global matplotlib style shared by every figure in this notebook.
matplotlib.rcParams.update({
    'font.family': 'Arial',
    'font.sans-serif': ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    "figure.figsize": [4, 3],
    "font.size": 10,
    "axes.labelcolor": "#000000",
    "axes.linewidth": 1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Qualitative colormaps; their `.colors` lists are indexed by position when plotting.
cmap1 = plt.cm.tab20
cmap2 = plt.cm.Set3

from scipy.stats import pearsonr
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

In [None]:
# Fig. 4a: mean relative abundance of the most abundant families per age bin.

# Per-sample taxon table downloaded from the QIIME 2 viewer (taxa barplot, level 7).
taxa = pd.read_csv('../../data/t1d/qiime2/level-7.csv', sep=',')

# Normalize: turn each sample (row) into relative abundances over the taxon columns.
# DataFrame.div(..., axis=0) aligns the row sums on the index; indexing a Series
# with [:, None] is not supported by recent pandas releases.
taxa_cols = [c for c in taxa.columns if 'd__' in c]
taxa[taxa_cols] = taxa[taxa_cols].div(taxa[taxa_cols].sum(axis=1), axis=0)

# Melt to long format: one row per (sample, taxon column).
taxa_melt = taxa.melt(id_vars=['index'], value_vars=taxa_cols, var_name='taxa', value_name='abundance')

# Split the ';'-separated lineage into one column per rank.
taxa_split = taxa_melt['taxa'].str.split(';', expand=True)
taxa_split.columns = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

taxa_melt_split = pd.concat([taxa_melt, taxa_split], axis=1)

tax_level = 'family'
tax_num = 7

# Top `tax_num` taxa at `tax_level`, ranked by abundance summed over all samples.
tax_sum = taxa_melt_split[[tax_level, 'abundance']].groupby(tax_level).sum()
tax_sum = tax_sum.reset_index().sort_values(by='abundance', ascending=False)
top_tax = list(tax_sum[tax_level].head(tax_num))

# Everything outside the top taxa is pooled into 'others'.
taxa_melt_split['major_tax'] = taxa_melt_split[tax_level].where(
    taxa_melt_split[tax_level].isin(top_tax), 'others')

# Abundance per (sample, major taxon). Only the numeric 'abundance' column is
# aggregated; summing the lineage string columns is meaningless.
# groupby already returns the rows sorted by its keys, so no extra sort is needed here.
tax_level_abd = taxa_melt_split.groupby(['index', 'major_tax'])['abundance'].sum().reset_index()

tax_level_abd_age = pd.merge(tax_level_abd, taxa[['index', 'Age_at_Collection']], how='left', on='index')
tax_level_abd_age.sort_values(by=['Age_at_Collection', 'index'], inplace=True)
tax_level_abd_age.reset_index(inplace=True, drop=True)

# Binning the age at collection: left-closed bins of `bin_width` days; the last
# edge is max + 1 so that the oldest sample falls inside the final bin.
bin_width = 100
age_max = tax_level_abd_age['Age_at_Collection'].max()
bin_max = age_max + 1
bin_age = list(np.arange(0, age_max, bin_width)) + [bin_max]
tax_level_abd_age['age_category'] = pd.cut(tax_level_abd_age['Age_at_Collection'], bin_age, right=False)

# Mean abundance of each major taxon within each age bin.
# observed=False keeps empty age bins so that bins and bar positions stay aligned.
tax_age = (
    tax_level_abd_age[['age_category', 'major_tax', 'abundance']]
    .groupby(['age_category', 'major_tax'], observed=False)
    .mean()
)
tax_age.reset_index(inplace=True)

# Plot: one stacked bar per age bin, anchored at the bin's left edge.
fig, ax = plt.subplots(figsize=(4, 2))

age_list = list(tax_age['age_category'].unique())
tax_list = list(tax_age['major_tax'].unique())

bar_width = 90  # narrower than the bin so that neighbouring bars do not touch
xs = bin_age
for x, c in zip(xs, age_list):
    tax_age_cat = tax_age[tax_age['age_category'] == c]
    # Running top of the stack. Named `bottom` rather than `sum` so the
    # built-in sum() is not shadowed for the rest of the notebook.
    bottom = 0.0
    for i, tax_name in enumerate(tax_list):
        abd = np.array(tax_age_cat[tax_age_cat['major_tax'] == tax_name]['abundance'])
        color = 'grey' if tax_name == 'others' else cmap2.colors[i]
        ax.bar(x, abd, bottom=bottom, width=bar_width, color=color, align='edge')
        bottom = bottom + abd

# The bars of the first age bin are created in `tax_list` order, so the labels
# given here are matched to them in the same order.
ax.legend([t.replace('f__', '') for t in tax_list],
          bbox_to_anchor=(1.03, -0.05), loc='lower left', borderaxespad=0, ncol=1, frameon=False)

ax.set_ylim(0, 1)
ax.xaxis.set_major_locator(MultipleLocator(200))
ax.xaxis.set_minor_locator(MultipleLocator(100))

ax.set_xlabel('Age at Collection (days)')
ax.set_ylabel('Relative abundance')

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

plt.savefig("../../results/07_application_gutmicrobiome/fig4a.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)

## Trait prediction using Bac2Feature
```sh
conda activate bac2feature_experiment
# Predict traits for the exported representative sequences (phylogeny-based
# method) and also report NSTI values.
bac2feature \
  -s ../../data/t1d/qiime2/dna-sequences.fasta \
  -o ../../data/t1d/predicted_trait.tsv \
  -m phylogeny \
  --ref_dir ../../data/ref_bac2feature/phylogeny \
  --ref_trait ../../data/ref_bac2feature/trait_bac2feature.tsv \
  --intermediate_dir ../../data/t1d/intermediate_dir \
  --calculate_NSTI
```

In [None]:
# Bac2Feature output: the cells below read, for each trait <t>, the columns
# "<t>" and "<t>_nsti", keyed by "sequence".
predicted_trait_path = "../../data/t1d/predicted_trait.tsv"
pred = pd.read_csv(predicted_trait_path, sep="\t")

# Per-trait phylogenetic-distance cutoff; the "cor_0.5" column is exposed as "threshold".
threshold_phylodistance = pd.read_csv(
    "../../data/trait_autocorrelations/threshold_phylodistance.tsv",
    sep="\t",
    index_col=0,
).rename(columns={"cor_0.5": "threshold"})

# Feature table (header=1 skips the first line of the biom-converted TSV);
# the first column is renamed so it can be joined on "sequence".
table = pd.read_csv(
    "../../data/t1d/qiime2/feature_table_study_sample.tsv",
    sep="\t",
    header=1,
).rename(columns={"#OTU ID": "sequence"})

# Metadata from the paper, every column kept as string.
metadata = pd.read_csv("../../data/t1d/qiime2/metadata_study_sample.tsv", sep="\t", dtype=str)

# Taxonomy: split the ';'-separated "Taxon" string into one column per rank.
# Only ';' is removed, so any whitespace following it stays in the values
# (the phylum filter used later is ' p__Firmicutes', with a leading space).
taxonomy = pd.read_csv('../../data/t1d/qiime2/taxonomy.tsv', sep='\t')
rank_names = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
taxonomy_split = taxonomy['Taxon'].str.split(';', expand=True)
taxonomy_split.columns = rank_names

# Put the feature identifier in front and name it "sequence" like in `table`.
taxonomy_split = pd.concat([taxonomy['Feature ID'], taxonomy_split], axis=1).rename(
    columns={"Feature ID": "sequence"})

In [None]:
def normalize_by_phylum(table, taxonomy_split, p=None):
    """Convert a feature table to per-sample relative abundances.

    Parameters
    ----------
    table : pandas.DataFrame
        Feature table whose first column is the feature identifier; it must
        contain a "sequence" column, and every other column is treated as one
        sample.
    taxonomy_split : pandas.DataFrame
        Taxonomy with a "sequence" column and a "phylum" column.
    p : str or None
        When given, only features whose "phylum" equals `p` are kept before
        normalizing. When None, no phylum restriction is applied.

    Returns
    -------
    pandas.DataFrame
        Same columns as `table`, restricted to features present in
        `taxonomy_split` (and in phylum `p`, if given), with every sample
        column divided by its own sum. A sample column whose sum is 0 after
        filtering becomes NaN (0/0).
    """
    sample_cols = table.columns[1:]
    merged = table.merge(taxonomy_split, how='inner', on='sequence')
    if p is not None:
        merged = merged.loc[merged['phylum'] == p]
    merged = merged.reset_index(drop=True)
    # Divide each sample column by its total over the retained features.
    column_totals = merged[sample_cols].sum(axis=0)
    merged[sample_cols] = merged[sample_cols].div(column_totals, axis='columns')
    return merged[table.columns]

# Display label for each trait column, used as the in-panel title of the plots.
titles = {
    'cell_diameter': 'Cell diameter',
    'cell_length': 'Cell length',
    'doubling_h': 'Doubling time',
    'growth_tmp': 'Growth temp.',
    'optimum_tmp': 'Optimum temp.',
    'optimum_ph': 'Optimum pH',
    'genome_size': 'Genome size',
    'gc_content': 'GC content',
    'coding_genes': 'Coding genes',
    'rRNA16S_genes': 'rRNA16S genes',
    'tRNA_genes': 'tRNA genes',
    'gram_stain': 'Gram stain',
    'sporulation': 'Sporulation',
    'motility': 'Motility',
    'range_salinity': 'Halophiles',
    'facultative_respiration': 'Facultatives',
    'anaerobic_respiration': 'Anaerobes',
    'aerobic_respiration': 'Aerobes',
    'mesophilic_range_tmp': 'Mesophiles',
    'thermophilic_range_tmp': 'Thermophiles',
    'psychrophilic_range_tmp': 'Psychrophiles',
    'bacillus_cell_shape': 'Bacillus',
    'coccus_cell_shape': 'Coccus',
    'filament_cell_shape': 'Filament',
    'coccobacillus_cell_shape': 'Coccobacillus',
    'vibrio_cell_shape': 'Vibrio',
    'spiral_cell_shape': 'Spiral',
}

### Continuous traits

In [None]:
# Fig. 4b: community-weighted means (CWM) of continuous traits against age.
nt = ["doubling_h", "genome_size", "gc_content", "coding_genes", "optimum_tmp", "growth_tmp", "rRNA16S_genes", "tRNA_genes"]

weighted_nt = dict()
is_first = True

p = None # Not limiting within phylum

# Calculate community weighted mean
for t in nt:
    nsti_col = t + "_nsti"
    # Filtering: keep only features whose NSTI for this trait is below the trait's threshold.
    x = pd.merge(table, pred[["sequence", nsti_col]], how="inner", on="sequence")
    filtered_table = x[x[nsti_col] < threshold_phylodistance.loc[t, "threshold"]].drop(nsti_col, axis=1).copy()

    # Normalize to relative abundance over the retained features
    filtered_table = normalize_by_phylum(filtered_table, taxonomy_split, p)
    # Melt table: one row per (feature, sample)
    melt_table = filtered_table.melt(id_vars="sequence", value_name="abundance", var_name="sample")
    # Merging takes several seconds
    melt_trait = pd.merge(melt_table, pred[["sequence", t]], how="inner", on="sequence")
    melt_trait["weighted_nt"] = melt_trait["abundance"] * melt_trait[t]

    # CWM of the trait = sum over features of (relative abundance * trait value)
    cwm_nt = melt_trait[["sample", "weighted_nt"]].groupby(by="sample").sum()
    cwm_nt = cwm_nt.reset_index()
    cwm_nt = cwm_nt.sort_values(by="sample")

    # Skip traits for which no feature passed the filter
    if len(cwm_nt) == 0:
        continue
    if is_first:
        is_first = False
        # Sample order is taken from the first trait; every trait is sorted by
        # "sample" above, so the value lists line up with it.
        weighted_nt["sample"] = list(cwm_nt["sample"])
    weighted_nt[t] = list(cwm_nt["weighted_nt"])

cwn_nt_concat = pd.DataFrame(weighted_nt)
cwm_nt_metadata = pd.merge(cwn_nt_concat, metadata, how="inner", left_on="sample", right_on="G_id")

# Metadata is read as strings; age is needed as a number for plotting and binning.
cwm_nt_metadata["Age_at_Collection"] = cwm_nt_metadata["Age_at_Collection"].astype(int)

bin_width = 100
bin_max = 1200
bin_list = list(np.arange(start=0, stop=bin_max+1, step=bin_width))
bin_mid_list = list(np.arange(start=bin_width/2, stop=bin_max+1, step=bin_width))
cwm_nt_metadata['cut'] = pd.cut(cwm_nt_metadata['Age_at_Collection'], bins=bin_list)

# Visualization
matplotlib.rcParams['axes.xmargin'] = 0.05
matplotlib.rcParams['axes.ymargin'] = 0.05

trait_list = ['doubling_h', 'optimum_tmp', 'genome_size', 'gc_content', 'rRNA16S_genes', 'tRNA_genes']
# Unit shown next to the y axis. The degree sign is written as an escape so the
# label cannot be corrupted by a file-encoding mismatch.
yunit_dict = {'doubling_h': '(log[h])', 'optimum_tmp': '(\u00b0C)',
              'genome_size': '(Mb)', 'gc_content': '(%)', 'rRNA16S_genes': '(#)', 'tRNA_genes': '(#)'}

col = 2
row = math.ceil(len(trait_list) / col)
fig, axes = plt.subplots(row, col, figsize=(2*col, 2*row))

for t, ax in zip(trait_list, axes.flatten()):
    x = cwm_nt_metadata["Age_at_Collection"]
    y = cwm_nt_metadata[t]

    if t == 'genome_size': # Convert to Mb
        y = y / 1e+6

    bins = 10
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)

    # Set color based on density: shift by the minimum count and scale by the total count
    hist = (hist - hist.min()) / hist.sum()

    # Bin index of every point; the clip keeps points on the outermost edge
    # inside the valid index range [0, bins - 1].
    x_indices = np.clip(np.digitize(x, xedges) - 1, 0, bins - 1)
    y_indices = np.clip(np.digitize(y, yedges) - 1, 0, bins - 1)

    colors = hist[x_indices, y_indices]

    scatter = ax.scatter(x, y, c=colors, cmap='Oranges', s=20, edgecolors='grey', linewidths=0.1)

    # Add 20% headroom so the in-panel title does not overlap the points
    ymin, ymax = ax.get_ylim()
    ax.set_ylim(ymin, ymax + (ymax - ymin) * 0.20)

    ax.text(0.05, 0.95, titles[t], transform=ax.transAxes, verticalalignment='top')
    ax.xaxis.set_major_locator(MultipleLocator(300))
    ax.tick_params('x', pad = 3)
    ax.tick_params('y', pad = 1)

    ax.text(-0.01, 0.97, yunit_dict[t], ha='right', transform=ax.transAxes)

plt.savefig("../../results/07_application_gutmicrobiome/fig4b.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)

### Categorical traits

In [None]:
# Fig. 4c: community-weighted means (CWM) of categorical traits against age.
ct = [
    'gram_stain', 'sporulation', 'motility', 'anaerobic_respiration',
    'mesophilic_range_tmp', 'thermophilic_range_tmp',
    'bacillus_cell_shape', 'coccus_cell_shape', 'filament_cell_shape', 'spiral_cell_shape',
]

weighted_ct = {}
is_first = True

p = None  # Not limiting within phylum

for t in ct:
    nsti_col = t + "_nsti"
    # Keep only features whose NSTI for this trait is below the trait's threshold
    x = table.merge(pred[["sequence", nsti_col]], how="inner", on="sequence")
    below_threshold = x[nsti_col] < threshold_phylodistance.loc[t, "threshold"]
    filtered_table = x[below_threshold].drop(nsti_col, axis=1).copy()

    # Relative abundance over the retained features
    filtered_table = normalize_by_phylum(filtered_table, taxonomy_split, p)
    # Long format: one row per (feature, sample)
    melt_table = filtered_table.melt(id_vars="sequence", value_name="abundance", var_name="sample")
    # Attach the predicted trait value of each feature
    melt_trait = melt_table.merge(pred[["sequence", t]], how="inner", on="sequence")

    melt_trait["weighted_ct"] = melt_trait["abundance"] * melt_trait[t]

    # CWM per sample, ordered by sample name
    cwm_ct = (
        melt_trait[["sample", "weighted_ct"]]
        .groupby(by="sample")
        .sum()
        .reset_index()
        .sort_values(by="sample")
    )
    if len(cwm_ct) == 0:
        continue
    if is_first:
        weighted_ct["sample"] = cwm_ct["sample"].tolist()
        is_first = False
    weighted_ct[t] = cwm_ct["weighted_ct"].tolist()

cwn_ct_concat = pd.DataFrame(weighted_ct)
cwm_ct_metadata = cwn_ct_concat.merge(metadata, how="inner", left_on="sample", right_on="G_id")
cwm_ct_metadata["Age_at_Collection"] = cwm_ct_metadata["Age_at_Collection"].astype(int)

bin_width = 100
bin_max = 1200
bin_list = list(np.arange(0, bin_max + 1, bin_width))
bin_mid_list = list(np.arange(bin_width / 2, bin_max + 1, bin_width))
cwm_ct_metadata['cut'] = pd.cut(cwm_ct_metadata['Age_at_Collection'], bins=bin_list)

# Visualization
matplotlib.rcParams['axes.xmargin'] = 0.05
matplotlib.rcParams['axes.ymargin'] = 0.05

trait_list = ['gram_stain', 'sporulation', 'motility', 'anaerobic_respiration', 'bacillus_cell_shape', 'coccus_cell_shape']

col = 2
row = math.ceil(len(trait_list) / col)
fig, axes = plt.subplots(row, col, figsize=(2 * col, 2 * row))

for t, ax in zip(trait_list, axes.flatten()):
    x = cwm_ct_metadata["Age_at_Collection"]
    y = cwm_ct_metadata[t] * 100  # fraction -> percent

    bins = 10
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)

    # Point color = density of its 2-D bin (shifted by the minimum count, scaled by the total)
    hist = (hist - hist.min()) / hist.sum()

    # Bin index of every point, clamped to the valid range [0, bins - 1]
    x_indices = np.clip(np.digitize(x, xedges) - 1, 0, bins - 1)
    y_indices = np.clip(np.digitize(y, yedges) - 1, 0, bins - 1)

    colors = hist[x_indices, y_indices]

    scatter = ax.scatter(x, y, c=colors, cmap='Oranges', s=15, edgecolors='grey', linewidths=0.1)

    # Fixed percent axis with headroom above 100 for the in-panel title
    ax.set_ylim(-5, 125)

    ax.text(0.05, 0.95, titles[t], transform=ax.transAxes, verticalalignment='top')

    ax.xaxis.set_major_locator(MultipleLocator(300))
    ax.tick_params('x', pad=3)
    ax.tick_params('y', pad=1)
    ax.text(-0.01, 0.97, '(%)', ha='right', transform=ax.transAxes)

plt.savefig("../../results/07_application_gutmicrobiome/fig4c.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)

### Categorical traits within Firmicutes

In [None]:
# Categorical traits for which a community-weighted mean is computed.
ct = [
    'gram_stain',
    'sporulation',
    'motility',
    'anaerobic_respiration',
    'mesophilic_range_tmp',
    'thermophilic_range_tmp',
    'bacillus_cell_shape',
    'coccus_cell_shape',
    'filament_cell_shape',
    'spiral_cell_shape',
]

def above_abd_thres(df, focal_col, thres_col, thres):
    """Return ``df[focal_col]``, or NaN when ``df[thres_col]`` is below ``thres``.

    Intended for row-wise use (``DataFrame.apply(..., axis=1)``), where ``df``
    is a single row.

    Parameters
    ----------
    df : mapping-like (e.g. a pandas Series for one row)
    focal_col : key of the value to return
    thres_col : key of the value compared against the threshold
    thres : values strictly below this are masked with NaN
    """
    value = df[focal_col]
    return np.nan if df[thres_col] < thres else value

# Fig. 4d: community-weighted means (CWM) of categorical traits within Firmicutes.
# The leading space matches the phylum values produced by splitting "Taxon" on ';'.
p = ' p__Firmicutes' # Limiting within phylum Firmicutes

weighted_ct = dict()
is_first = True

for t in ct:
    nsti_col = t + "_nsti"
    # Keep only features whose NSTI for this trait is below the trait's threshold
    x = pd.merge(table, pred[["sequence", nsti_col]], how="inner", on="sequence")
    filtered_table = x[x[nsti_col] < threshold_phylodistance.loc[t, "threshold"]].drop(nsti_col, axis=1).copy()

    # Normalize within the phylum: each sample is rescaled over the retained Firmicutes features
    filtered_table = normalize_by_phylum(filtered_table, taxonomy_split, p)
    # Melt table: one row per (feature, sample)
    melt_table = filtered_table.melt(id_vars="sequence", value_name="abundance", var_name="sample")
    # Merge the predicted trait value of each feature
    melt_trait = pd.merge(melt_table, pred[["sequence", t]], how="inner", on="sequence")
    melt_trait["weighted_ct"] = melt_trait["abundance"] * melt_trait[t]

    cwm_ct = melt_trait[["sample", "abundance", "weighted_ct"]].groupby(by="sample").sum()
    cwm_ct = cwm_ct.reset_index()
    # Mask samples whose summed abundance is below the threshold.
    # NOTE(review): abundances are normalized within the phylum before this point, so the
    # per-sample sum is presumably ~1 (or 0 when the sample has no retained feature);
    # confirm that a 1% cutoff on this quantity is what was intended.
    cwm_ct['weighted_ct'] = cwm_ct.apply(
        lambda sample_row: above_abd_thres(sample_row, focal_col='weighted_ct', thres_col='abundance', thres=0.01), axis=1)
    cwm_ct = cwm_ct.sort_values(by="sample")
    # Skip traits for which no feature passed the filter
    if len(cwm_ct) == 0:
        continue
    if is_first:
        is_first = False
        weighted_ct["sample"] = list(cwm_ct["sample"])
    weighted_ct[t] = list(cwm_ct["weighted_ct"])

cwn_ct_concat = pd.DataFrame(weighted_ct)
cwm_ct_metadata = pd.merge(cwn_ct_concat, metadata, how="inner", left_on="sample", right_on="G_id")

cwm_ct_metadata["Age_at_Collection"] = cwm_ct_metadata["Age_at_Collection"].astype(int)
bin_width = 100
bin_max = 1200
bin_list = list(np.arange(start=0, stop=bin_max+1, step=bin_width))
bin_mid_list = list(np.arange(start=bin_width/2, stop=bin_max+1, step=bin_width))
cwm_ct_metadata['cut'] = pd.cut(cwm_ct_metadata['Age_at_Collection'], bins=bin_list)

# Visualization
matplotlib.rcParams['axes.xmargin'] = 0.05
matplotlib.rcParams['axes.ymargin'] = 0.05

trait_list = ['bacillus_cell_shape', 'coccus_cell_shape']

col = 2
row = math.ceil(len(trait_list) / col)
fig, axes = plt.subplots(row, col, figsize=(2*col, 2*row))

for t, ax in zip(trait_list, axes.flatten()):
    # Samples masked above carry NaN. np.histogram2d cannot derive bin edges
    # from data containing NaN, so only complete (age, trait) pairs are used;
    # NaN points would not be drawn by scatter anyway.
    valid = cwm_ct_metadata[["Age_at_Collection", t]].dropna()
    x = valid["Age_at_Collection"]
    y = valid[t] * 100

    bins = 10
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)

    # Set color based on density: shift by the minimum count and scale by the total count
    hist = (hist - hist.min()) / hist.sum()

    # Bin index of every point, clamped to the valid range [0, bins - 1]
    x_indices = np.clip(np.digitize(x, xedges) - 1, 0, bins - 1)
    y_indices = np.clip(np.digitize(y, yedges) - 1, 0, bins - 1)

    colors = hist[x_indices, y_indices]

    scatter = ax.scatter(x, y, c=colors, cmap='Oranges', s=20, edgecolors='grey', linewidths=0.1)

    # Fixed percent axis with headroom above 100 for the in-panel title
    ax.set_ylim(-5, 125)
    ax.text(0.05, 0.95, titles[t], transform=ax.transAxes, verticalalignment='top')
    ax.xaxis.set_major_locator(MultipleLocator(300))
    ax.tick_params('x', pad = 3)
    ax.tick_params('y', pad = 1)
    ax.text(-0.01, 0.97, '(%)', ha='right', transform=ax.transAxes)

plt.savefig("../../results/07_application_gutmicrobiome/fig4d.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)