In [1]:
# Normal packages
import os
import sys
import lz4.frame
import numpy as np
import pandas as pd
sys.path.append("/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/scripts/postprocessing/postprocessing_scripts/")
import config

# Statistical packages
import scipy.stats as stats
from scipy.spatial import distance_matrix
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import matplotlib.patches as mpatches

import seaborn as sns
sns.set_theme(style="whitegrid")

# custom
import diversity_utils
import species_utils
import parse_midas_data
import calculate_intersample_changes

## Assessing QP stats

In [2]:
# Species IDs to analyse: one per line in species_snps.txt, blank lines ignored
species_list_path = "%sspecies_snps.txt" % (config.metadata_directory)

with open(species_list_path, "r") as species_file:
    stripped_lines = (raw_line.strip() for raw_line in species_file)
    species_list = [entry for entry in stripped_lines if entry]

# Keep only the first element of the code maps; later cells index it by species ID
species_code_map = species_utils.parse_species_code_maps()[0]


In [3]:
# Read the file named by config.accessions as an array of strings; text following "#" is ignored as a comment.
# NOTE(review): sample_list is not referenced again in the visible cells — confirm it is still needed.
sample_list = np.loadtxt(config.accessions, comments = "#", dtype = str)

In [4]:
# Tally, for every species with at least one high-coverage sample, how many samples
# are in the haploid call set ("QP") and how many high-coverage samples are not ("Not QP").
qp_array = []
for species in species_list:
    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(species)
    if len(highcoverage_samples) == 0:
        # Nothing to classify for this species
        continue
    haploid_samples = diversity_utils.calculate_haploid_samples(species, quick_and_dirty=True)
    # Build the membership set once per species rather than once per sample tested
    haploid_sample_set = set(haploid_samples)
    non_haploid_samples = [s for s in highcoverage_samples if s not in haploid_sample_set]
    number_of_haploid_samples = len(haploid_samples)
    number_of_non_haploid_samples = len(non_haploid_samples)
    qp_array.append([species, number_of_haploid_samples, "QP"])
    qp_array.append([species, number_of_non_haploid_samples, "Not QP"])

# Long format: two rows per species, one for each QP status
qp_df = pd.DataFrame(data = qp_array, columns = ["species", "sample_count", "QP"])
qp_df['sample_count'] = qp_df["sample_count"].astype(float)
# Label each row with the entry species_code_map holds for its species ID
qp_df['species_name'] = [species_code_map[species] for species in qp_df['species']]

In [5]:
# Species ranked by how many of their samples are "Not QP", largest first
not_qp_rows = qp_df["QP"].eq("Not QP")
qp_df.loc[not_qp_rows].sort_values(by="sample_count", ascending=False)

Unnamed: 0,species,sample_count,QP,species_name
33,207693,6.0,Not QP,f__Oscillospiraceae (207693)
11,100555,4.0,Not QP,g__Lawsonibacter (100555)
51,214603,4.0,Not QP,g__CAG-95 (214603)
17,203686,4.0,Not QP,g__UBA3282 (203686)
97,261672,2.0,Not QP,g__Angelakisella (261672)
...,...,...,...,...
63,217378,0.0,Not QP,g__1XD42-69 (217378)
67,229722,0.0,Not QP,g__Ruminiclostridium_E
69,231109,0.0,Not QP,g__CAG-81
71,231118,0.0,Not QP,g__Clostridium_Q


In [None]:
# Plotting
## Only species with at least this many samples in total (QP + Not QP) are drawn
threshold = 2

## Per-species totals and the "Not QP" subset
qp_df_species_total = qp_df.groupby(["species_name"])["sample_count"].sum().reset_index()
qp_df_species_notqp = qp_df.loc[qp_df["QP"] == "Not QP"]

## Restrict both tables to species at or above the threshold
species_over_threshold = qp_df_species_total.loc[
    qp_df_species_total["sample_count"] >= threshold, "species_name"
].unique()
qp_df_species_total = qp_df_species_total[qp_df_species_total["species_name"].isin(species_over_threshold)]
qp_df_species_notqp = qp_df_species_notqp[qp_df_species_notqp["species_name"].isin(species_over_threshold)]

## Bars are ordered by total sample count, largest first
species_order = qp_df_species_total.sort_values(by="sample_count", ascending=False)["species_name"].tolist()

## Plot
fig, ax = plt.subplots(figsize=(8, 14))  # tall figure: one horizontal bar per species

# The total bar is drawn first and the "Not QP" bar on top of it, so the part of the
# turquoise bar left visible corresponds to the QP samples.
bar_layers = [
    (qp_df_species_total, "turquoise", "QP"),
    (qp_df_species_notqp, "purple", "Not QP"),
]
for layer_df, layer_color, layer_label in bar_layers:
    sns.barplot(data=layer_df, x="sample_count", y="species_name", order=species_order, color=layer_color, ax=ax)

# Manual legend entries, because the two layers are plain single-colour bar plots
legend_handles = [
    mpatches.Patch(color=layer_color, label=layer_label)
    for layer_df, layer_color, layer_label in bar_layers
]
ax.legend(handles=legend_handles, title="QP status")
ax.set_xlabel("Number of Samples", fontsize=14)
ax.set_ylabel("Species", fontsize=14)

plt.yticks(rotation=0)  # keep the species labels horizontal
plt.tight_layout()
plt.show()

In [None]:
# Write the QP summary figure (the `fig` created by the plotting cell) to disk
out_path = "/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/figures/summary_stats/qp_samples.png"
fig.savefig(
    out_path,
    dpi=300,
    facecolor="white",
    bbox_inches="tight",
)


## PCoA of species

In [None]:
# Toggle: z-score every species across samples (True) or keep the relative abundances as loaded (False)
mean_normalize = False

if mean_normalize:
    print("Mean normalizing features in the data set (i.e., each species has a mean of ~0 and a variance of 1)")
else:
    print("Using raw compositional data (i.e., L1 normalization, wherein features in a sample sum to 1)")

# The loading, filtering and transposing steps are identical for both settings,
# so they are written once; only the normalization itself is conditional.

## load data
species_relabs_path = "%sspecies/species_relative_abundance.tsv" % (config.data_directory,)
species_relabs = pd.read_csv(species_relabs_path, sep = "\t").set_index("species_id")

## Remove rows, then columns, where all values are 0
species_relabs = species_relabs[~(species_relabs == 0).all(axis=1)]
species_relabs = species_relabs.loc[:, ~(species_relabs == 0).all(axis=0)]

## Rotate df so that rows are the original columns and columns are species IDs
species_relabs = species_relabs.T

if mean_normalize:
    ## mean normalization (column-wise, i.e. per species after the transpose)
    # NOTE(review): a species with zero standard deviation would yield NaN/inf here — confirm none occur.
    species_relabs = (species_relabs - species_relabs.mean())/species_relabs.std()

## numpy array format; row and column labels are kept separately for later annotation
samples = species_relabs.index
species = species_relabs.columns
species_relabs = np.array(species_relabs)


In [None]:
# calculate distances
# Pairwise Bray-Curtis dissimilarity between the rows of species_relabs
# (rows are samples after the transpose in the previous cell).
# NOTE(review): if mean_normalize is switched on, the z-scored input contains negative values,
# for which Bray-Curtis is no longer guaranteed to lie in [0, 1] — confirm that combination is intended.
distances = pairwise_distances(species_relabs, metric='braycurtis')

In [None]:
# Two-dimensional embedding of the precomputed distance matrix with scikit-learn's MDS.
# NOTE(review): scikit-learn's MDS is fitted iteratively (SMACOF) rather than by the
# eigendecomposition used in classical PCoA — confirm the "PCoA" label is what is meant.
mds_settings = dict(n_components=2, dissimilarity='precomputed', random_state=42)
mds = MDS(**mds_settings)
coordinates = mds.fit_transform(distances)

In [None]:
# Per-sample metadata lookup
sample_metadata_map = parse_midas_data.parse_sample_metadata_map()

# One row per sample: the two embedding coordinates followed by the sample's metadata
mds_df = pd.DataFrame(coordinates)
mds_df['samples'] = samples
for metadata_field in ('mouse', 'cage', 'location', 'sex'):
    mds_df[metadata_field] = mds_df['samples'].apply(
        lambda s, metadata_field=metadata_field: sample_metadata_map[s][metadata_field]
    )

# Give the coordinate columns readable names
axis_names = {0: "MDS 1", 1: "MDS 2"}
mds_df = mds_df.rename(columns = axis_names)

In [None]:
# Scatter of the two MDS axes, one point per sample, coloured by cage
fig, ax = plt.subplots(figsize=(8, 6))

scatter_args = dict(x="MDS 1", y="MDS 2", hue="cage")
sns.scatterplot(data=mds_df, ax=ax, **scatter_args)

# Put the legend outside the axes, vertically centred on the right-hand edge
ax.legend(title="cage", loc="center left", bbox_to_anchor=(1, 0.5))





In [None]:
# Write the cage-coloured ordination figure (the `fig` from the previous cell) to disk
out_path = "/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/figures/summary_stats/PCoA_cage.png"
fig.savefig(
    out_path,
    dpi=300,
    facecolor="white",
    bbox_inches="tight",
)

## Calling evolutionary changes

In [6]:
# Species IDs as returned by species_utils.load_species_list(); this rebinds the
# species_list that an earlier cell read from species_snps.txt.
species_list = species_utils.load_species_list()
# Preview the first five entries
species_list[:5]

['100042', '100110', '100111', '100113', '100118']

In [7]:
# Load metadata
# Per-sample lookup; the cells below index it as sample_metadata_map[sample]["mouse" / "cage" / "location"].
sample_metadata_map = parse_midas_data.parse_sample_metadata_map()

In [8]:
# Load species code maps
# Only the first element of the return value is kept; the cells below index it by
# species ID to fill the taxonomic_assignment column.
species_code_map = species_utils.parse_species_code_maps()[0]

In [11]:
# Build one row per (species, sample pair) holding the number of SNV changes, the number
# of opportunities and their ratio, for every species with at least two haploid samples.
change_rate_array = []
for species in species_list:
    print("Processing species %s" % (species))

    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(species)

    if len(highcoverage_samples) == 0:
        print("Species %s doesn't meet coverage requirements. Skipping to next species." % (species))
        continue

    haploid_samples = diversity_utils.calculate_haploid_samples(species, quick_and_dirty=True, quick_and_dirty_threshold = 1e-5)

    if len(haploid_samples) < 2:
        # Message corrected: the previous wording ("doesn't has less than 2") stated the opposite of the condition
        print("Species %s has fewer than 2 haploid samples. Skipping to next species." % (species))
        continue

    print("%s has multiple haploid samples!" % (species))

    intersample_change_map = calculate_intersample_changes.load_intersample_change_map(species)

    pairs = list(intersample_change_map.keys())

    for pair in pairs:

        snp_record = intersample_change_map[pair]['snps']
        opportunities = snp_record[0]  # denominator of the change rate
        changes = snp_record[2]        # one entry per changed site
        number_of_changes = len(changes)
        # A pair with no opportunities has an undefined rate; record NaN instead of dividing by zero
        change_rate = number_of_changes/opportunities if opportunities > 0 else float("nan")

        change_rate_array.append([species, pair[0], pair[1], number_of_changes, opportunities, change_rate])

change_rate_df = pd.DataFrame(data = change_rate_array, columns = ["species", "sample_1", "sample_2", "number_of_changes", "opportunities", "change_rate"])
change_rate_df = change_rate_df.sort_values(by = "number_of_changes", ascending=False)

# annotate
## Mouse, cage and gut location of each sample in the pair
## (columns are created in the order mouse_1, mouse_2, cage_1, cage_2, location_1, location_2)
for metadata_field in ("mouse", "cage", "location"):
    for sample_number in ("1", "2"):
        change_rate_df["%s_%s" % (metadata_field, sample_number)] = change_rate_df["sample_%s" % sample_number].apply(
            lambda x, metadata_field=metadata_field: sample_metadata_map[x][metadata_field]
        )

# Host orientation: "Within host" only when both the mouse and the cage match
same_host = (change_rate_df["mouse_1"] == change_rate_df["mouse_2"]) & (change_rate_df["cage_1"] == change_rate_df["cage_2"])
change_rate_df["orientation"] = np.where(same_host, "Within host", "Between host")

# Coarse graining: the three listed sites count as small intestine, everything else as large intestine
small_intestine_sites = ["Duodenum", "Jejunum", "Ileum"]
for sample_number in ("1", "2"):
    change_rate_df["coarse_location_%s" % sample_number] = change_rate_df["location_%s" % sample_number].apply(
        lambda x: "Small intestine" if x in small_intestine_sites else "Large intestine"
    )

# Taxonomic assignment
change_rate_df['taxonomic_assignment'] = change_rate_df["species"].apply(lambda x: species_code_map[x])





Processing species 100042
Species 100042 doesn't meet coverage requirements. Skipping to next species.
Processing species 100110
Species 100110 doesn't meet coverage requirements. Skipping to next species.
Processing species 100111
Species 100111 doesn't has less than 2 haploid samples. Skipping to next species
Processing species 100113
Species 100113 doesn't meet coverage requirements. Skipping to next species.
Processing species 100118
Species 100118 doesn't meet coverage requirements. Skipping to next species.
Processing species 100158
Species 100158 doesn't has less than 2 haploid samples. Skipping to next species
Processing species 100171
Species 100171 doesn't meet coverage requirements. Skipping to next species.
Processing species 100320
Species 100320 doesn't has less than 2 haploid samples. Skipping to next species
Processing species 100334
100334 has multiple haploid samples!
Processing species 100338
Species 100338 doesn't meet coverage requirements. Skipping to next species

In [12]:
# Persist the per-pair change-rate table as a tab-separated file (row index omitted).
# NOTE(review): the "5e5" suffix in the file name does not match the 1e-5
# quick_and_dirty_threshold used when the table was built — confirm which is intended.
out_path = "/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/tables/snp_change_rate_5e5.tsv"
change_rate_df.to_csv(
    out_path,
    sep="\t",
    index=False,
)

In [None]:
# Display the change-rate table built above (one row per species and sample pair)
change_rate_df

### Plotting

In [None]:
# Show "Within host" before "Between host" on the x-axis
orientation_levels = ["Within host", "Between host"]
change_rate_df['orientation'] = pd.Categorical(
    change_rate_df['orientation'], categories=orientation_levels, ordered=True
)

fig, ax = plt.subplots(figsize=(10, 6))

# Arguments shared by the box layer and the point layer
layer_args = dict(data=change_rate_df, x="orientation", y="change_rate", ax=ax)

# Box plot without outlier markers; the strip plot below draws the individual points
sns.boxplot(showfliers=False, **layer_args)

# Individual sample pairs coloured by taxon; dodge=False keeps all hues in a single
# column per orientation instead of splitting them side by side
sns.stripplot(hue="taxonomic_assignment", dodge=False, palette="dark", alpha=0.7, **layer_args)

# Legend centred underneath the axes, laid out in two columns
ax.legend(title="Taxonomic Assignment", loc="upper center", bbox_to_anchor=(0.5, -0.15), ncol=2)

# Axis labels and title
ax.set_title("Rate of SNV changes observed across conventional mouse samples.")
ax.set_xlabel("")
ax.set_ylabel("SNV change rate")

# Save the current figure at high resolution with a tight bounding box
out_path = "/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/figures/evolution/snv_change_rate.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")



## SNP changes df

In [None]:
# Build one row per individual SNP change, restricted to sample pairs whose overall
# change rate does not exceed different_strain_threshold.
snp_change_array = []
different_strain_threshold = 1e-4  # pairs with a higher change rate are skipped

for species in species_list:
    print("Processing species %s" % (species))

    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(species)

    if len(highcoverage_samples) == 0:
        print("Species %s doesn't meet coverage requirements. Skipping to next species." % (species))
        continue

    # NOTE(review): the change-rate cell above passes quick_and_dirty_threshold = 1e-5 to this call,
    # while this cell relies on the function's default — confirm the difference is intended.
    haploid_samples = diversity_utils.calculate_haploid_samples(species, quick_and_dirty=True)

    if len(haploid_samples) < 2:
        # Message corrected: the previous wording ("doesn't has less than 2") stated the opposite of the condition
        print("Species %s has fewer than 2 haploid samples. Skipping to next species." % (species))
        continue

    print("%s has multiple haploid samples!" % (species))

    intersample_change_map = calculate_intersample_changes.load_intersample_change_map(species)

    pairs = list(intersample_change_map.keys())

    for pair in pairs:

        snp_record = intersample_change_map[pair]['snps']
        opportunities = snp_record[0]  # denominator of the change rate
        changes = snp_record[2]        # one entry per changed site
        number_of_changes = len(changes)

        # Without any opportunities the rate is undefined; skip the pair instead of dividing by zero
        if not opportunities > 0:
            continue
        change_rate = number_of_changes/opportunities

        if change_rate > different_strain_threshold:
            continue

        for snp in changes:
            # The first eight fields of each record, in this order
            (gene, contig, site_pos, site_type,
             alt_depth_1, depth_1, alt_depth_2, depth_2) = (snp[field_index] for field_index in range(8))
            # Alternate-allele frequency in each of the two samples
            freq_1 = alt_depth_1/depth_1
            freq_2 = alt_depth_2/depth_2

            snp_change_array.append([species, pair[0], pair[1], opportunities, change_rate, number_of_changes, gene, contig, site_pos, site_type, freq_1, freq_2, alt_depth_1, depth_1, alt_depth_2, depth_2])

snp_change_df = pd.DataFrame(data = snp_change_array, columns = ["species", "sample_1", "sample_2", "opportunities", "change_rate", "number_of_changes", "gene", "contig", "site_pos", "site_type", "freq_1", "freq_2", "alt_depth_1", "depth_1", "alt_depth_2", "depth_2"])
snp_change_df = snp_change_df.sort_values(by = ["species", "sample_1", "sample_2", "contig", "site_pos"])

# annotate
## Mouse, cage and gut location of each sample in the pair
## (columns are created in the order mouse_1, mouse_2, cage_1, cage_2, location_1, location_2)
for metadata_field in ("mouse", "cage", "location"):
    for sample_number in ("1", "2"):
        snp_change_df["%s_%s" % (metadata_field, sample_number)] = snp_change_df["sample_%s" % sample_number].apply(
            lambda x, metadata_field=metadata_field: sample_metadata_map[x][metadata_field]
        )

# Host orientation: "Within host" only when both the mouse and the cage match
same_host = (snp_change_df["mouse_1"] == snp_change_df["mouse_2"]) & (snp_change_df["cage_1"] == snp_change_df["cage_2"])
snp_change_df["orientation"] = np.where(same_host, "Within host", "Between host")

# Coarse graining: the three listed sites count as small intestine, everything else as large intestine
small_intestine_sites = ["Duodenum", "Jejunum", "Ileum"]
for sample_number in ("1", "2"):
    snp_change_df["coarse_location_%s" % sample_number] = snp_change_df["location_%s" % sample_number].apply(
        lambda x: "Small intestine" if x in small_intestine_sites else "Large intestine"
    )

# Taxonomic assignment
snp_change_df['taxonomic_assignment'] = snp_change_df["species"].apply(lambda x: species_code_map[x])





In [None]:
# Persist the per-SNP change table as a tab-separated file (row index omitted)
out_path = "/u/project/ngarud/michaelw/Diversity-Along-Gut/ConventionalMouse/tables/snp_changes.tsv"
snp_change_df.to_csv(
    out_path,
    sep="\t",
    index=False,
)

In [None]:
# Spot-check the first SNP-change record of one hard-coded sample pair.
# NOTE(review): intersample_change_map is whatever the last iteration of the loop above loaded,
# so the species inspected depends on loop order — confirm this is intended or load the species explicitly.
intersample_change_map[('Cec_1_1','Cec_2_1')]['snps'][2][0]