# Calculating phylogenetic thresholds and applying them to EMP dataset

## Environment settings
```sh
# Working directory: every relative path in this notebook is resolved from here
cd Bac2fFeature/scripts/06_trait_autocorrelations
# Output directories: intermediate tables and the final figure
# (the figure is saved to ../../results/06_trait_autocorrelations at the end)
directories=(
    "../../data/trait_autocorrelations"
    "../../results/06_trait_autocorrelations"
)
# mkdir -p is a no-op for directories that already exist, so no -d guard is needed
for dir in "${directories[@]}"; do
  mkdir -p "$dir"
done
```

## Calculating phylogenetic distance where traits are correlated
```sh
# Switch to the Conda environment used for the R scripts
conda activate r_bac2feature
# Calculate trait autocorrelation along with phylogenetic signals
Rscript calc_autocorrelations.R
# Set the thresholds
# (presumably this produces threshold_phylodistance.tsv, which is read later - confirm)
Rscript setting_phylogenetic_signals.R
# Switch back to the default Conda environment for the rest of the analysis
conda activate bac2feature_experiment
```

## Calculate phylogenetic signals in the Earth Microbiome Project dataset

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from matplotlib.ticker import FixedLocator, FormatStrFormatter
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec

# Global Matplotlib defaults, applied in one call
matplotlib.rcParams.update({
    "font.family": "Arial",
    "font.sans-serif": ["Arial", "DejaVu Sans", "Lucida Grande", "Verdana"],
    "figure.figsize": [4, 3],
    "font.size": 10,
    "axes.labelcolor": "#000000",
    "axes.linewidth": 1.0,
    "xtick.major.width": 1.0,
    "ytick.major.width": 1.0,
})
# Qualitative colormaps kept as module-level names
cmap1, cmap2 = plt.cm.tab20, plt.cm.Set3

### Preprocessing EMP data

In [None]:
# Convert the OTU table (TSV) to FASTA
otu_table_filepath = "../../data/trait_autocorrelations/emp_deblur_150bp.subset_2k.rare_5000.tsv"
otu_fasta_filepath = "../../data/trait_autocorrelations/otu_emp_deblur_150bp.subset_2k.rare_5000.fasta"

# header=1: the column names are taken from the second line of the file
otu_table = pd.read_csv(otu_table_filepath, sep="\t", header=1)
# Sequential identifiers OTU0, OTU1, ... assigned in table row order
otu_table["#OTU No"] = [f"OTU{row_number}" for row_number in range(len(otu_table))]

def make_seqrecord(df):
    """Build a SeqRecord from one row of the OTU table.

    ``df`` is a single row: its "#OTU ID" value is wrapped in ``Seq`` and
    used as the sequence, and its "#OTU No" value becomes the record id.
    ``name`` and ``description`` are set to empty strings (presumably so
    the FASTA header carries only the id - confirm).
    """
    sequence = Seq(df["#OTU ID"])
    return SeqRecord(sequence, id=df["#OTU No"], name="", description="")
# One SeqRecord per table row
records = otu_table.apply(make_seqrecord, axis=1)

# Write all records to a single FASTA file
with open(otu_fasta_filepath, "w") as fasta_handle:
    SeqIO.write(records, fasta_handle, "fasta")

### Calculating phylogenetic distance
```sh
# Run bac2feature in phylogeny mode on the OTU FASTA written above.
# --calculate_NSTI is requested because the next step reads the
# "<trait>_nsti" columns of the result table.
bac2feature \
  -s ../../data/trait_autocorrelations/otu_emp_deblur_150bp.subset_2k.rare_5000.fasta \
  -o ../../data/trait_autocorrelations/bac2feature_result.tsv \
  -m phylogeny \
  --ref_dir ../../data/ref_bac2feature/phylogeny \
  --ref_trait ../../data/ref_bac2feature/trait_bac2feature.tsv \
  --intermediate_dir ../../data/trait_autocorrelations/intermediate_dir \
  --calculate_NSTI
```

### Calculating relative abundance of ASVs within thresholds

In [None]:
# Phylogenetic distance: bac2feature result table (one "<trait>_nsti" column per trait is read later)
b2f_result = pd.read_csv("../../data/trait_autocorrelations/bac2feature_result.tsv", sep="\t")

# EMP dataset
otu_table_filepath = "../../data/trait_autocorrelations/emp_deblur_150bp.subset_2k.rare_5000.tsv"
# header=1: the column names are taken from the second line of the file
otu_table = pd.read_csv(otu_table_filepath, sep="\t", header=1)

# Every column after the first one is treated as a sample
all_sample_list = list(otu_table.columns[1:])
# Sequential identifiers OTU0, OTU1, ... in row order; these are later merged
# against the "sequence" column of the bac2feature result
otu_table["sequence"] = [f"OTU{i}" for i in range(len(otu_table))]

metadata_filepath = "../../data/trait_autocorrelations/emp_qiime_mapping_subset_2k.tsv"
metadata = pd.read_csv(metadata_filepath, sep="\t", header=0, index_col=0)

# Keep only the samples that have a metadata row, preserving OTU-table column order.
# Indexing .loc with the full sample list would raise KeyError as soon as a
# single sample is missing from the mapping file.
known_samples = set(metadata.index)
samples_present = [sample for sample in all_sample_list if sample in known_samples]
n_missing = len(all_sample_list) - len(samples_present)
if n_missing:
    print(f"{n_missing} sample(s) without metadata were excluded")
metadata = metadata.loc[samples_present, :]
metadata = metadata.reset_index()

sample_with_metadata = list(metadata["#SampleID"])

# Per-sample relative abundance: each count divided by its sample (column) total
sample_counts = otu_table.loc[:, sample_with_metadata]
total_abundance = sample_counts.sum(axis=0)
relative_abundance = sample_counts / total_abundance
otu_relative_abundance = pd.concat([otu_table.loc[:, "sequence"], relative_abundance], axis=1)

# Long format: one row per (OTU, sample) pair
rel_abd_melt = otu_relative_abundance.melt(id_vars="sequence", var_name="#SampleID", value_name="abundance")
rel_abd_meta = pd.merge(rel_abd_melt, metadata[["#SampleID", "empo_2", "empo_3"]], how="left", on="#SampleID")

# Threshold: the "cor_0.5" column is the one used as the per-trait threshold
threshold_phylodistance = pd.read_csv("../../data/trait_autocorrelations/threshold_phylodistance.tsv", sep="\t", index_col=0)
threshold_phylodistance = threshold_phylodistance.rename(columns={"cor_0.5": "threshold"})

In [None]:
def abd_above_threshold(df, t, threshold):
    """Return the row's abundance if its NSTI for trait ``t`` is below ``threshold``.

    ``df`` is a single row exposing "abundance" and "<t>_nsti". The
    abundance is returned only when the NSTI value is strictly smaller
    than ``threshold``; otherwise (including a NaN NSTI, for which the
    comparison is false) the result is 0.
    """
    nsti_column = t + "_nsti"
    if df[nsti_column] < threshold:
        return df["abundance"]
    return 0

def _mean_abundance_within_threshold(traits, env_col, abundance, nsti_table, thresholds, sample_metadata):
    """Mean per-environment relative abundance of OTUs whose NSTI is below the trait threshold.

    Parameters
    ----------
    traits : list of str
        Trait names; each needs a "<trait>_nsti" column in ``nsti_table``
        and a row in ``thresholds``.
    env_col : str
        Column of ``sample_metadata`` used to group the samples.
    abundance : DataFrame
        Long table with "sequence", "#SampleID" and "abundance" columns.
    nsti_table : DataFrame
        Table with a "sequence" column and the "<trait>_nsti" columns.
    thresholds : DataFrame
        Indexed by trait, with a "threshold" column.
    sample_metadata : DataFrame
        Table with "#SampleID" and ``env_col`` columns.

    Returns
    -------
    DataFrame
        One row per trait (in the order given) and one column per
        ``env_col`` value.
    """
    # Loop-invariant work: zero-abundance rows never contribute to any sum
    nonzero = abundance[abundance["abundance"] > 0]
    env_by_sample = sample_metadata[["#SampleID", env_col]]

    per_trait = {}
    for t in traits:
        nsti_col = t + "_nsti"
        # Attach the trait-specific NSTI to every (OTU, sample) row
        abd_nsti = pd.merge(nonzero, nsti_table[["sequence", nsti_col]], how="left", on="sequence")
        threshold = thresholds.loc[t, "threshold"]
        # Keep the abundance only where NSTI < threshold (same rule as
        # abd_above_threshold); a NaN NSTI compares false and counts as 0
        abd_nsti["above_abd"] = abd_nsti["abundance"].where(abd_nsti[nsti_col] < threshold, 0)

        # Sum the abundance within each sample
        per_sample = abd_nsti.groupby("#SampleID", as_index=False)["above_abd"].sum()
        per_sample = pd.merge(per_sample, env_by_sample, how="left", on="#SampleID")

        # Mean over the samples of each environment type
        per_trait[t] = per_sample.groupby(env_col)["above_abd"].mean()

    # Building the frame from labelled Series aligns the environment columns
    # by name, instead of relying on the ordering of the last trait processed
    return pd.DataFrame(per_trait).T.rename_axis(index=None, columns=None)


env_col = "empo_2"

# Continuous traits
# Traits with threshold value 0 are left out
continuous_traits = ["doubling_h", "genome_size", "gc_content", "coding_genes", "optimum_tmp", "growth_tmp", "rRNA16S_genes", "tRNA_genes"]
df_above_abd_empo2_nt = _mean_abundance_within_threshold(
    continuous_traits, env_col, rel_abd_melt, b2f_result, threshold_phylodistance, metadata)

# Categorical traits
# Traits with threshold value 0 are left out
categorical_traits = ['gram_stain', 'sporulation', 'motility', 'anaerobic_respiration', 'mesophilic_range_tmp', 'thermophilic_range_tmp', 'bacillus_cell_shape', 'coccus_cell_shape', 'filament_cell_shape', 'spiral_cell_shape']
df_above_abd_empo2_ct = _mean_abundance_within_threshold(
    categorical_traits, env_col, rel_abd_melt, b2f_result, threshold_phylodistance, metadata)

# Save
df_out = pd.concat([df_above_abd_empo2_nt, df_above_abd_empo2_ct], axis=0)
df_out.reset_index(names=['trait'], inplace=True)
df_out.to_csv('../../data/trait_autocorrelations/above_abd_empo2.tsv', sep='\t', index=False)

### Visualization

In [None]:
def _draw_threshold_lollipop(ax, trait_df, labels, title):
    """Draw a horizontal lollipop plot of the per-trait thresholds on ``ax``.

    ``trait_df`` is indexed by trait and has a "threshold" column. Traits
    are plotted bottom-to-top in reverse row order, so the first row ends
    up at the top. ``labels`` maps trait names to display names.
    """
    thresholds = list(trait_df['threshold'])[::-1]
    markerline, _, _ = ax.stem(thresholds, linefmt='k-', markerfmt='ko', orientation='horizontal', basefmt='black')
    plt.setp(markerline, markersize=4)
    # Print each value just to the right of its marker
    for row, value in enumerate(thresholds):
        ax.text(value+0.2, row, f'{value: .2f}', fontsize=7,
                horizontalalignment='left', verticalalignment='center_baseline')

    ax.set_xlabel('Thresholds for \nPhylogenetic distance')
    ax.set_yticks(np.arange(trait_df.shape[0]))
    ax.set_yticklabels([labels[trait] for trait in reversed(list(trait_df.index))])
    ax.set_title(title, fontsize=10)

    # The x-axis extends to 3 while ticks stop at 2
    ax.set_xlim(0, 3)
    ax.xaxis.set_major_locator(FixedLocator(np.arange(start=0, stop=2+1, step=1)))
    ax.xaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax.xaxis.set_minor_locator(FixedLocator(np.arange(start=0, stop=2+0.25, step=0.25)))
    ax.grid(axis='x', alpha=0.1, color='grey')

    ax.spines['top'].set(color='grey', linewidth=.15)
    ax.spines['right'].set(color='grey', linewidth=.15)


def _draw_abundance_heatmap(fig, ax, trait_df, cbar_shrink):
    """Draw the trait x environment abundance heatmap on ``ax`` with its colorbar.

    Every column of ``trait_df`` except "threshold" is plotted.
    ``cbar_shrink`` is forwarded to ``fig.colorbar`` as ``shrink``.
    """
    heat = trait_df.drop(['threshold'], axis='columns')
    sns.heatmap(heat, cmap='magma', cbar=False,
                square=True, vmax=1, vmin=0, annot=True, annot_kws={"size":6}, fmt='.2f', xticklabels=1, yticklabels=1, linewidths=.5, ax=ax)

    # Label ticks from the plotted columns themselves rather than by position
    ax.set_xticklabels(heat.columns, rotation=90)
    ax.set_yticklabels([])
    ax.set_ylabel('')
    ax.tick_params(bottom=False, left=False)
    ax.tick_params(axis='x', which='major', pad=0)

    cbar = fig.colorbar(ax.collections[0], shrink=cbar_shrink, aspect=30, anchor=(1.0, 0.0), pad=-0.05)
    cbar.set_label('Relative abundance')
    cbar.ax.tick_params(labelsize=7)
    cbar.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    cbar.outline.set_linewidth(0.5)


df_out = pd.read_csv('../../data/trait_autocorrelations/above_abd_empo2.tsv', sep='\t', index_col=['trait'])

# Join the thresholds with the per-environment abundances, one row per trait
phylod_df = threshold_phylodistance.reset_index(drop=False)
empo_df = df_out.reset_index(drop=False)
phylod_empo = pd.merge(phylod_df[["trait", "threshold"]], empo_df, how='inner', on='trait')
phylod_by_trait = phylod_empo.set_index('trait')

# Row order of the continuous-trait panels
nt = ['doubling_h', 'optimum_tmp', 'tRNA_genes', 'rRNA16S_genes', 'coding_genes', 'genome_size', 'growth_tmp', 'gc_content']
phylod_empo_nt = phylod_by_trait.loc[nt, :]

# Row order of the categorical-trait panels
ct = ['mesophilic_range_tmp', 'motility', 'bacillus_cell_shape', 'thermophilic_range_tmp', 'filament_cell_shape', 'spiral_cell_shape', 'coccus_cell_shape', 'anaerobic_respiration', 'sporulation', 'gram_stain']
phylod_empo_ct = phylod_by_trait.loc[ct, :]

# Display names for the traits
titles = {
    'cell_diameter': 'Cell diameter',
    'cell_length': 'Cell length',
    'doubling_h': 'Doubling time',
    'growth_tmp': 'Growth temp.',
    'optimum_tmp': 'Optimum temp.',
    'optimum_ph': 'Optimum pH',
    'genome_size': 'Genome size',
    'gc_content': 'GC content',
    'coding_genes': 'Coding genes',
    'rRNA16S_genes': 'rRNA16S genes',
    'tRNA_genes': 'tRNA genes',
    'gram_stain': 'Gram stain',
    'sporulation': 'Sporulation',
    'motility': 'Motility',
    'range_salinity': 'Halophiles',
    'facultative_respiration': 'Facultatives',
    'anaerobic_respiration': 'Anaerobes',
    'aerobic_respiration': 'Aerobes',
    'mesophilic_range_tmp': 'Mesophiles',
    'thermophilic_range_tmp': 'Thermophiles',
    'psychrophilic_range_tmp': 'Psychrophiles',
    'bacillus_cell_shape': 'Bacillus',
    'coccus_cell_shape': 'Coccus',
    'filament_cell_shape': 'Filament',
    'coccobacillus_cell_shape': 'Coccobacillus',
    'vibrio_cell_shape': 'Vibrio',
    'spiral_cell_shape': 'Spiral',
}

# Visualization: 2 x 2 grid, lollipop plots on the left and heatmaps on the right
figure = plt.figure(figsize=(3, 6))
gs_master = GridSpec(nrows=2, ncols=2, height_ratios=[8, 10], width_ratios=[1.5, 2], hspace=0.55, wspace=0.25)

### Lollipop plot: continuous traits
gs_1 = GridSpecFromSubplotSpec(nrows=1, ncols=1, subplot_spec=gs_master[0, 0])
axes_1 = figure.add_subplot(gs_1[:, :])
_draw_threshold_lollipop(axes_1, phylod_empo_nt, titles, f'{len(nt)} continuous traits')

### Heatmap: continuous traits
gs_2 = GridSpecFromSubplotSpec(nrows=1, ncols=1, subplot_spec=gs_master[0, 1])
axes_2 = figure.add_subplot(gs_2[:, :])
_draw_abundance_heatmap(figure, axes_2, phylod_empo_nt, cbar_shrink=0.7)

### Lollipop plot: categorical traits
gs_3 = GridSpecFromSubplotSpec(nrows=1, ncols=1, subplot_spec=gs_master[1, 0])
axes_3 = figure.add_subplot(gs_3[:, :])
_draw_threshold_lollipop(axes_3, phylod_empo_ct, titles, f'{len(ct)} categorical traits')

### Heatmap: categorical traits
gs_4 = GridSpecFromSubplotSpec(nrows=1, ncols=1, subplot_spec=gs_master[1, 1])
axes_4 = figure.add_subplot(gs_4[:, :])
_draw_abundance_heatmap(figure, axes_4, phylod_empo_ct, cbar_shrink=0.65)

# NOTE: savefig does not create the output directory; it has to exist already
plt.savefig("../../results/06_trait_autocorrelations/fig3a-3b.pdf", format="pdf", dpi=300, facecolor="white", bbox_inches="tight", pad_inches=0.1)