# Check trained model

This notebook examines the trained model to see whether it learned patterns in the input data. Specifically, this notebook looks at:

1. Is there any structure in our latent space or just noise?
2. Does our latent space capture the clusters in our input data? Are there samples that we know should cluster together? Do we find those in the input and encoded data?

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import glob
import umap
import pandas as pd
from keras.models import load_model
from sklearn.decomposition import PCA
from plotnine import (
    ggplot,
    labs,
    geom_line,
    geom_point,
    geom_errorbar,
    aes,
    ggsave,
    theme_bw,
    theme,
    xlim,
    ylim,
    facet_wrap,
    scale_color_manual,
    guides,
    guide_legend,
    element_blank,
    element_text,
    element_rect,
    element_line,
    coords,
)
from cm_modules import paths
from ponyo import utils

# Seed passed to umap.UMAP in the embedding cells so the projections are repeatable
random_state = 123

Using TensorFlow backend.


In [2]:
# Parent of the current working directory; all project paths are built from it
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Locate and parse the VAE training configuration file
config_filename = os.path.abspath(
    os.path.join(base_dir, "test_vae_training", "config_current_vae.tsv")
)
params = utils.read_config(config_filename)

# Pull out the settings this notebook needs
dataset_name, NN_architecture, normalized_compendium_filename = (
    params["dataset_name"],
    params["NN_architecture"],
    params["normalized_compendium_filename"],
)

# Directory holding the saved model files for this dataset/architecture
NN_dir = os.path.join(base_dir, dataset_name, "models", NN_architecture)

In [3]:
# Load the normalized compendium from its tab-separated file:
# first row holds the column names, first column becomes the row index
normalized_compendium = pd.read_csv(
    normalized_compendium_filename,
    sep="\t",
    header=0,
    index_col=0,
)

In [4]:
# Preview the first rows of the loaded compendium
normalized_compendium.head()

Unnamed: 0,Bacteria Actinobacteriota Actinobacteria Bifidobacteriales Bifidobacteriaceae Bifidobacterium,Bacteria Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides,Bacteria Actinobacteriota Coriobacteriia Coriobacteriales Coriobacteriaceae Collinsella,Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Agathobacter,Bacteria Firmicutes Negativicutes Veillonellales-Selenomonadales Selenomonadaceae Megamonas,Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Blautia,Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Faecalibacterium,Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Anaerostipes,Bacteria Bacteroidota Bacteroidia Bacteroidales Prevotellaceae Prevotella,Bacteria Firmicutes Bacilli Lactobacillales Streptococcaceae Streptococcus,...,Bacteria Actinobacteriota Acidimicrobiia Microtrichales Ilumatobacteraceae NA,Bacteria Verrucomicrobiota Verrucomicrobiae Pedosphaerales Pedosphaeraceae ADurb.Bin063-1,Bacteria Proteobacteria Alphaproteobacteria Caulobacterales Caulobacteraceae PMMR1,Bacteria Bacteroidota Bacteroidia Flavobacteriales Cryomorphaceae NA,Bacteria Bacteroidota Bacteroidia Flavobacteriales Flavobacteriaceae Pseudofulvibacter,Bacteria Proteobacteria Alphaproteobacteria Rickettsiales Rickettsiaceae NA,Bacteria Bacteroidota Bacteroidia Flavobacteriales Flavobacteriaceae Gelidibacter,Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae Ideonella,Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Xanthobacteraceae Rhodoplanes,Bacteria Proteobacteria Alphaproteobacteria Sphingomonadales Sphingomonadaceae Rhizorhapis
PRJDB5310_DRR077057,0.001496,0.000948,0.0,0.0,0.0,0.0,0.004278,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRJDB5310_DRR077058,0.001169,0.000702,0.0,0.0,0.0,4.6e-05,0.002383,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRJDB5310_DRR077059,0.0,0.000572,0.0,0.001081,0.0,0.001694,0.002962,0.001032,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRJDB5310_DRR077060,0.003358,0.0,0.004034,0.000877,0.0,0.000561,0.001506,0.000465,0.0,0.00062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRJDB5310_DRR077061,0.000188,0.000751,0.001546,0.000529,0.003158,0.000671,0.001037,0.001376,0.0,0.000112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Drop outliers found manually
samples_to_drop = [
    "PRJEB34610_ERR3561806",
    "PRJEB34610_ERR3561830",
    "PRJNA297268_SRR2568180",
]
# errors="ignore" keeps this cell idempotent: because the result is assigned back
# to the same name, re-running the cell would otherwise raise a KeyError once the
# samples have already been removed.
normalized_compendium = normalized_compendium.drop(samples_to_drop, errors="ignore")

### Plot UMAP of normalized data

In [6]:
# Two-component PCA reducer.
# NOTE(review): the pca.fit(...) calls in the embedding cells are commented out in
# favour of UMAP, so this object appears to be unused -- confirm before removing.
pca = PCA(n_components=2)

In [9]:
# Sanity check: show the container type of the compendium
type(normalized_compendium)

pandas.core.frame.DataFrame

In [7]:
# Embed the normalized compendium with UMAP, seeded via random_state.
# The "PCA" in the variable names is historical: an earlier version called
# pca.fit(normalized_compendium) here, and the plotting cell still uses these names.
model = umap.UMAP(random_state=random_state)
model.fit(normalized_compendium)

# Project the same samples into the fitted embedding
compendium_PCAencoded = model.transform(normalized_compendium)

# Wrap the two embedding coordinates in a frame indexed by sample id
compendium_PCAencoded_df = pd.DataFrame(
    compendium_PCAencoded,
    index=normalized_compendium.index,
    columns=["1", "2"],
)

TypeError: a bytes-like object is required, not 'list'

In [None]:
# Plot the two-dimensional embedding of the normalized compendium.
# The coordinates are produced by umap.UMAP (the PCA fit is commented out),
# so the axes and title are labelled as UMAP rather than PCA.
fig = ggplot(compendium_PCAencoded_df, aes(x="1", y="2"))
# Semi-transparent points so dense regions remain readable
fig += geom_point(alpha=0.2)
fig += labs(x="UMAP 1", y="UMAP 2", title="UMAP normalized compendium")
fig += theme_bw()
fig += theme(
    legend_title_align="center",
    plot_background=element_rect(fill="white"),
    legend_key=element_rect(fill="white", colour="white"),
    legend_title=element_text(family="sans-serif", size=15),
    legend_text=element_text(family="sans-serif", size=12),
    plot_title=element_text(family="sans-serif", size=15),
    axis_text=element_text(family="sans-serif", size=12),
    axis_title=element_text(family="sans-serif", size=15),
)
# Draw legend keys fully opaque even though the points themselves are transparent
fig += guides(colour=guide_legend(override_aes={"alpha": 1}))

print(fig)

### Plot encoded data

In [None]:
# Load VAE models
def _find_model_file(pattern):
    """Return the path of the file in NN_dir whose name matches ``pattern``.

    Matches are sorted so the selection is deterministic when several files
    match the pattern.

    Raises:
        FileNotFoundError: if no file in NN_dir matches ``pattern``. This
            replaces the bare IndexError that indexing an empty glob result
            would produce.
    """
    matches = sorted(glob.glob(os.path.join(NN_dir, pattern)))
    if not matches:
        raise FileNotFoundError(
            "No file matching '{}' found in {}".format(pattern, NN_dir)
        )
    return matches[0]


model_encoder_file = _find_model_file("*_encoder_model.h5")
weights_encoder_file = _find_model_file("*_encoder_weights.h5")
model_decoder_file = _find_model_file("*_decoder_model.h5")
weights_decoder_file = _find_model_file("*_decoder_weights.h5")

# Load saved models
loaded_model = load_model(model_encoder_file)
loaded_decode_model = load_model(model_decoder_file)

# Load the separately saved weights into the models just read
loaded_model.load_weights(weights_encoder_file)
loaded_decode_model.load_weights(weights_decoder_file)

In [None]:
# Run the whole normalized compendium through the loaded encoder as one batch
# to obtain its latent-space representation
compendium_encoded = loaded_model.predict_on_batch(normalized_compendium)

# Keep the sample ids as the row index of the encoded frame
compendium_encoded_df = pd.DataFrame(
    compendium_encoded,
    index=normalized_compendium.index,
)

In [None]:
# Embed the encoded compendium with UMAP, seeded via random_state.
# The "PCA" in the variable names is historical: an earlier version called
# pca.fit(compendium_encoded_df) here, and the plotting cell still uses these names.
model = umap.UMAP(random_state=random_state)
model.fit(compendium_encoded_df)

# Project the same samples into the fitted embedding
latent_compendium_PCAencoded = model.transform(compendium_encoded_df)

# Wrap the two embedding coordinates in a frame indexed by sample id
latent_compendium_PCAencoded_df = pd.DataFrame(
    latent_compendium_PCAencoded,
    columns=["1", "2"],
    index=compendium_encoded_df.index,
)

In [None]:
# Plot the two-dimensional embedding of the encoded compendium.
# The coordinates are produced by umap.UMAP (the PCA fit is commented out),
# so the axes and title are labelled as UMAP rather than PCA.
fig = ggplot(latent_compendium_PCAencoded_df, aes(x="1", y="2"))
# Semi-transparent points so dense regions remain readable
fig += geom_point(alpha=0.2)
fig += labs(x="UMAP 1", y="UMAP 2", title="UMAP encoded normalized compendium")
fig += theme_bw()
fig += theme(
    legend_title_align="center",
    plot_background=element_rect(fill="white"),
    legend_key=element_rect(fill="white", colour="white"),
    legend_title=element_text(family="sans-serif", size=15),
    legend_text=element_text(family="sans-serif", size=12),
    plot_title=element_text(family="sans-serif", size=15),
    axis_text=element_text(family="sans-serif", size=12),
    axis_title=element_text(family="sans-serif", size=15),
)
# Draw legend keys fully opaque even though the points themselves are transparent
fig += guides(colour=guide_legend(override_aes={"alpha": 1}))

print(fig)