# Application to 16S rRNA amplicon analysis of the lake and soil microbiome

## Environment settings
```sh
# Working Directory
cd Bac2fFeature/scripts/07_application_gutmicrobiome
```

In [None]:
import json
import os
import math

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Global matplotlib styling, applied in one call so that all settings used by
# the figures below are visible in a single place.
matplotlib.rcParams.update({
    'font.family': 'Arial',
    # Fallback order for the sans-serif family.
    'font.sans-serif': ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    'figure.figsize': [4, 3],
    'font.size': 10,
    "axes.labelcolor": "#000000",
    "axes.linewidth": 1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})

# Colormap handles kept as notebook-level names (not referenced by the cells
# visible in this file).
cmap1, cmap2 = plt.cm.tab20, plt.cm.Set3

## Lake microbiome

### DADA2 parameters
```sh
qiime dada2 denoise-paired --i-demultiplexed-seqs ../../data/2023-10-05/demux_cutadapt_SRP136143.qza --p-trunc-len-f 240 --p-trunc-len-r 240 --p-min-overlap 20  --o-representative-sequences ../../data/2023-10-05/reps_seqs_cutadapt_SRP136143.qza --o-table ../../data/2023-10-05/table_cutadapt_SRP136143.qza --o-denoising-stats ../../data/2023-10-05/denoising_stats_cutadapt_SRP136143.qza --p-n-threads 40
```

### Trait prediction
```sh
bac2feature -s asv_seqs_PRJEB27579.fasta -o predicted_trait_PRJEB27579.tsv -m phylogeny --ref_dir ../../data/ref_bac2feature/phylogeny/ --ref_table ../../data/ref_bac2feature.tsv --intermediate_dir intermediate_dir --threads 20 --calculate_NSTI
```

In [None]:
# Load the inputs for the lake dataset (PRJEB27579): predicted traits per
# sequence, the feature (count) table, and the sample metadata.
trait = pd.read_csv("predicted_trait_PRJEB27579.tsv", sep="\t")
# header=1: column names are taken from the second line of the file
# (presumably the first line is the biom-export comment line - confirm).
feature_table = pd.read_csv("table_PRJEB27579.tsv", sep="\t", header=1)
metadata_path = "metadata_PRJEB27579.tsv"
metadata = pd.read_csv(metadata_path, sep="\t")
metadata = metadata.rename(columns={"sampleid": "sample"})

# Normalize every sample column so that it sums to 1 (relative abundance).
# The normalized table is assembled from a new float frame instead of being
# written back with `feature_table.iloc[:, 1:] = ...`: assigning floats into
# integer-typed count columns is an incompatible-dtype setitem, which pandas
# has deprecated.
abundance = feature_table.iloc[:, 1:]
feature_table = pd.concat(
    [feature_table.iloc[:, :1], abundance / abundance.sum(axis=0)],
    axis=1,
)
# Rename the id column so it matches the key used by the trait table.
feature_table = feature_table.rename(columns={"#OTU ID": "sequence"})

# Long format: one row per (sequence, sample) pair.
feature_table = feature_table.melt(
    id_vars="sequence",
    value_name="abundance",
    var_name="sample",
)

# Merge feature table with trait; sequences without a prediction are dropped
# by the inner join.
feature_trait = pd.merge(feature_table, trait, how="inner", on="sequence")

# Per-trait cutoff table, indexed by trait name; the "cor_0.5" column is
# exposed as "threshold".
threshold_phylodistance = pd.read_csv("../../data/trait_autocorrelations/threshold_phylodistance.tsv", sep= "\t", index_col=0)
threshold_phylodistance.rename(columns={"cor_0.5": "threshold"}, inplace=True)

In [None]:
# Trait analysed in the lake section: anaerobic respiration.
t = "anaerobic_respiration"


def rm_spurious_prediction(df, threshold_phylodistance, t):
    """Return the abundance of one row, or NaN if its prediction is rejected.

    Parameters
    ----------
    df : mapping-like row (e.g. one row passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys ``t + "_nsti"`` and ``"abundance"``.
    threshold_phylodistance : pandas.DataFrame
        Indexed by trait name, with a ``"threshold"`` column.
    t : str
        Trait name.

    Returns
    -------
    ``numpy.nan`` when the row's ``t + "_nsti"`` value is strictly greater
    than the trait's threshold, otherwise the row's ``"abundance"`` value.
    """
    cutoff = threshold_phylodistance.loc[t, "threshold"]
    return np.nan if df[t + "_nsti"] > cutoff else df["abundance"]

# Blank out (NaN) the abundance of every row whose prediction distance for
# trait `t` exceeds the trait-specific threshold. This gives the same result
# as applying rm_spurious_prediction to each row, but as one vectorized
# operation instead of a Python-level call per row of the melted table.
# Rows with a missing distance compare False and therefore keep their
# abundance, exactly as in the row-wise version.
nsti_threshold = threshold_phylodistance.loc[t, "threshold"]
feature_trait["fil_abd"] = feature_trait["abundance"].mask(
    feature_trait[t+"_nsti"] > nsti_threshold)

# Abundance weighted by the predicted trait value.
feature_trait[t+"_fil_abd"] = feature_trait["fil_abd"] * feature_trait[t]

# Normalization: divide the weighted sum by the abundance that survived the
# filter, so each sample's value is relative to its retained community.
# NOTE(review): a row with a valid abundance but a missing trait value would
# count in the denominator and not in the numerator - confirm whether the
# prediction table can contain such rows.
ratio = feature_trait.groupby("sample", as_index=False)["fil_abd"].sum()
ratio["inv"] = 1 / ratio["fil_abd"]
anaerobic_abd = feature_trait.groupby("sample", as_index=False)[t+"_fil_abd"].sum()

anaerobic_abd_normed = pd.merge(anaerobic_abd, ratio, how="inner", on="sample")
anaerobic_abd_normed[t+"_fil_abd_normed"] = anaerobic_abd_normed[t+"_fil_abd"] * anaerobic_abd_normed["inv"]

# Attach sample metadata (samples missing from either table are dropped).
anaerobic_abd_meta = pd.merge(anaerobic_abd_normed, metadata, how="inner", on="sample")

In [None]:
# Figure 5a: fraction of the community not predicted as anaerobic
# (1 - normalized anaerobic fraction) against oxygen, one marker per lake.
fig, ax = plt.subplots(1, 1, figsize=(3, 2.5))

# (lake name, marker, edge colour)
lake_styles = [
    ("Greifensee", "D", "tab:blue"),
    ("Lake Zug", "s", "tab:orange"),
    ("Lake Lugano", "^", "tab:green"),
    ("Rotsee", "o", "tab:red"),
]
for g, m, c in lake_styles:
    lake = anaerobic_abd_meta[anaerobic_abd_meta["Group"] == g]
    x = lake["Oxygen"]
    y = 1 - lake[t+"_fil_abd_normed"]
    # NOTE(review): oxygen is drawn shifted by +1 although the axis is linear
    # and labelled as plain oxygen - presumably a leftover from a log-scaled
    # version of the plot; confirm before changing.
    # label=g ties each legend entry to its own artist instead of relying on
    # the order in which the scatters were added.
    ax.scatter(x + 1, y, marker=m, s=40, alpha=1, edgecolor=c, facecolor="None", label=g)

# "\u00b5" is the micro sign; written as an escape so the label cannot be
# corrupted by a file-encoding round trip (it previously rendered as "Âµ").
ax.set_xlabel("Oxygen (\u00b5mol/L)", fontsize=14)
ax.set_ylabel("Fraction of\n oxygen tolerant spp.", fontsize=14)

ax.set_title("Lake water", fontsize=14)
ax.legend()

# Rank correlation over all lakes together; samples with NaN are omitted.
spearman_corr, p_value = spearmanr(anaerobic_abd_meta["Oxygen"], 1 - anaerobic_abd_meta[t+"_fil_abd_normed"], nan_policy='omit')
print(f"Spearman correlation coefficient: {spearman_corr:.3f}, p-value: {p_value:.2e}")

plt.savefig("../../results/08_application_soil_lake_microbiome/fig5a.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)

## Soil microbiome

### DADA2 parameters
```sh
qiime dada2 denoise-paired --i-demultiplexed-seqs ../../data/2023-10-05/demux_cutadapt_SRP136143.qza --p-trunc-len-f 240 --p-trunc-len-r 240 --p-min-overlap 20  --o-representative-sequences ../../data/2023-10-05/reps_seqs_cutadapt_SRP136143.qza --o-table ../../data/2023-10-05/table_cutadapt_SRP136143.qza --o-denoising-stats ../../data/2023-10-05/denoising_stats_cutadapt_SRP136143.qza --p-n-threads 40
```

### Trait prediction
```sh
bac2feature -s asv_seqs_SRP136143.fasta -o predicted_trait_SRP136143.tsv -m phylogeny --ref_dir ../../data/ref_bac2feature/phylogeny/ --ref_table ../../data/ref_bac2feature.tsv --intermediate_dir intermediate_dir --threads 20 --calculate_NSTI
```

In [None]:
# Load the inputs for the soil dataset (SRP136143): predicted traits per
# sequence, the feature (count) table, and the sample metadata.
predicted_trait_path = "predicted_trait_SRP136143.tsv"
pred = pd.read_csv(predicted_trait_path, sep="\t")

# Feature table
# header=1: column names are taken from the second line of the file
# (presumably the first line is the biom-export comment line - confirm).
table = pd.read_csv("table_SRP136143.tsv", sep="\t", header=1)
# Normalize every sample column so that it sums to 1 (relative abundance).
# The normalized table is assembled from a new float frame instead of being
# written back with `table.iloc[:, 1:] = ...`: assigning floats into
# integer-typed count columns is an incompatible-dtype setitem, which pandas
# has deprecated.
abundance = table.iloc[:, 1:]
table = pd.concat([table.iloc[:, :1], abundance / abundance.sum(axis=0)], axis=1)
# Rename the first column so it matches the key used by the trait table.
table.rename(columns={"#OTU ID": "sequence"}, inplace=True)
# Long format: one row per (sequence, sample) pair.
melt_table = table.melt(id_vars="sequence", value_name="abundance", var_name="sample")
# Merging takes several seconds; sequences without a prediction are dropped
# by the inner join.
feature_trait = pd.merge(melt_table, pred, how="inner", on="sequence")

# Metadata from the paper
metadata = pd.read_csv("metadata_SRP136143.csv")
# Drop the row labelled 0 (presumably a non-sample row such as a column-type
# annotation line - confirm against the metadata file).
metadata.drop(0, inplace=True)

# Per-trait cutoff table, indexed by trait name; the "cor_0.5" column is
# exposed as "threshold".
threshold_phylodistance = pd.read_csv("../../data/trait_autocorrelations/threshold_phylodistance.tsv", sep= "\t", index_col=0)
threshold_phylodistance.rename(columns={"cor_0.5": "threshold"}, inplace=True)

In [None]:
# Trait analysed in the soil section: genome size.
t = "genome_size"


def rm_spurious_prediction(df, threshold_phylodistance, t):
    """Keep a row's abundance only when its prediction passes the cutoff.

    Parameters
    ----------
    df : mapping-like row (e.g. one row passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys ``t + "_nsti"`` and ``"abundance"``.
    threshold_phylodistance : pandas.DataFrame
        Indexed by trait name, with a ``"threshold"`` column.
    t : str
        Trait name.

    Returns
    -------
    The row's ``"abundance"`` value, or ``numpy.nan`` when the row's
    ``t + "_nsti"`` value is strictly greater than the trait's threshold.
    """
    max_distance = threshold_phylodistance.loc[t, "threshold"]
    too_distant = df[t + "_nsti"] > max_distance
    if not too_distant:
        return df["abundance"]
    return np.nan

# Blank out (NaN) the abundance of every row whose prediction distance for
# trait `t` exceeds the trait-specific threshold. This gives the same result
# as applying rm_spurious_prediction to each row, but as one vectorized
# operation instead of a Python-level call per row of the melted table.
# Rows with a missing distance compare False and therefore keep their
# abundance, exactly as in the row-wise version.
nsti_threshold = threshold_phylodistance.loc[t, "threshold"]
feature_trait["fil_abd"] = feature_trait["abundance"].mask(
    feature_trait[t+"_nsti"] > nsti_threshold)

# Abundance weighted by the predicted trait value.
feature_trait[t+"_fil_abd"] = feature_trait["fil_abd"] * feature_trait[t]

# Normalization: divide the weighted sum by the abundance that survived the
# filter, giving a community-weighted mean over the retained sequences.
ratio = feature_trait.groupby("sample", as_index=False)["fil_abd"].sum()
ratio["inv"] = 1 / ratio["fil_abd"]
cwm = feature_trait.groupby("sample", as_index=False)[t+"_fil_abd"].sum()
cwm_normed = pd.merge(cwm, ratio, how="inner", on="sample")
cwm_normed[t+"_fil_abd_normed"] = cwm_normed[t+"_fil_abd"] * cwm_normed["inv"]

# Attach sample metadata. The (misspelled) name `cwm_nomred` is kept on
# purpose: the plotting cell below reads the merged table under this name.
cwm_nomred = pd.merge(cwm_normed, metadata, how="inner", left_on="sample", right_on="sample-id")

In [None]:
# Figure 5b: community-weighted mean genome size against the "SC" metadata
# column, with a linear fit and its 95% confidence band.
fig, ax = plt.subplots(figsize=(3, 2.5))

# plot cwm along with salinity gradient
x = cwm_nomred["SC"].astype(float).values
# Divide by 1e6 to match the "Mb" axis label (assumes predicted genome size
# is in base pairs - confirm).
y = cwm_nomred[t+"_fil_abd_normed"].astype(float).values / 1e6
ax.scatter(x, y, facecolor="None", edgecolors="black", s=30)

# Evaluate the fit on sorted x: a dashed line drawn through unsorted points
# retraces itself and smears the dash pattern.
fill_x = np.sort(x)

# Fitting linear model
lr = LinearRegression()
lr.fit(x.reshape(-1, 1), y.reshape(-1, 1))
ax.plot(fill_x, lr.predict(fill_x.reshape(-1, 1)), linestyle="dashed", color="grey")

# Confidence band of the fitted mean. It is taken from the model's prediction
# results rather than built by plugging the lower (resp. upper) confidence
# limits of intercept and slope into the line equation: the parameter
# intervals ignore the covariance between the two estimates, so combining
# them does not yield a confidence interval for the regression line.
lr_sm = sm.OLS(y, sm.add_constant(x)).fit()
X = sm.add_constant(fill_x)
conf_interval = lr_sm.get_prediction(X).conf_int(alpha=0.05)
y_lower = conf_interval[:, 0]
y_upper = conf_interval[:, 1]
ax.fill_between(fill_x, y_lower, y_upper, color="grey", alpha=0.2)

ax.set_title("Agricultural soil", fontsize=14)
ax.set_xlabel(r"Salt content (g/kg)", fontsize=14)
ax.set_ylabel("Average genome size (Mb)", fontsize=14)

plt.savefig("../../results/08_application_soil_lake_microbiome/fig5b.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)