# Validate Ersilia Compound embeddings

In [1]:
from pathlib import Path
import random

from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from umap import UMAP
import pandas as pd
import torch

from compound_embedding.fs_mol.utils.protonet_utils import PrototypicalNetworkTrainer
from compound_embedding.utils import smiles_to_proto_input
from compound_embedding.fs_mol.data import FSMolDataset, DataFold

from eosce.models import ErsiliaCompoundEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


## Using the test assays for comparison

In [2]:
# Resolve the FS-Mol (Mordred) data directory against the current working directory.
fs_mol_dir = Path().parent.joinpath("../scratch/fs-mol-mordred").absolute()

# Open the dataset and expose the tasks of the TEST fold as an iterable.
dataset = FSMolDataset.from_directory(str(fs_mol_dir), num_workers=0)
task_iter = dataset.get_task_reading_iterable(DataFold.TEST)

In [3]:
# Collect the compounds of the first N_TEST_TASKS test assays into one frame with
# a "smiles" column and a "<assay name>-True/False" label column.
N_TEST_TASKS = 5

# Create the iterator once: calling iter(task_iter) inside the loop would build a
# fresh iterator on every pass instead of advancing through the tasks.
task_stream = iter(task_iter)

# Accumulate plain tuples and build the DataFrame once at the end; growing a frame
# with pd.concat per sample copies the whole frame on every row.
records = []
# zip with range() first stops after N_TEST_TASKS tasks (or earlier if the fold
# has fewer) without pulling an extra task from the stream.
for _, task in zip(range(N_TEST_TASKS), task_stream):
    print(task.name)
    for sample in task.samples:
        activity = "True" if sample.bool_label else "False"
        records.append((sample.smiles, f"{task.name}-{activity}"))

test_df = pd.DataFrame(records, columns=["smiles", "label"])

CHEMBL2219366
CHEMBL2219059
CHEMBL2219352
CHEMBL1614423
CHEMBL663407


## Generate embeddings

In [None]:
# Checkpoint of the fully trained ProtoNet, resolved relative to the working directory.
protonet_checkpoint = Path().parent.joinpath(
    "../scratch/train_protonet/FSMol_protonet_2023-02-04_22-26-17/fully_trained.pt"
)
protonet_model = PrototypicalNetworkTrainer.build_from_model_file(protonet_checkpoint, device="cuda")
protonet_model.eval()
protonet_model.to("cuda")

# Feed the prepared SMILES inputs through the model's `fc` attribute with
# gradient tracking switched off.
test_smiles = test_df["smiles"].to_list()
with torch.no_grad():
    proto_inputs = smiles_to_proto_input(test_smiles, "cuda", 1)
    proto_fps = protonet_model.fc(proto_inputs)

# Move the result to the CPU as a NumPy array and join it onto the smiles/label frame.
proto_fps = proto_fps.to("cpu").numpy()
proto_fps_df = test_df.join(pd.DataFrame(proto_fps))

In [5]:
# Embed the same compounds with the Ersilia Compound Embeddings model.
efp_model = ErsiliaCompoundEmbeddings()
efp_input_smiles = test_df["smiles"].to_list()
efp_fps = efp_model.transform(efp_input_smiles)

# Join the embedding columns onto the smiles/label frame (matched on the index).
efp_fps_df = test_df.join(pd.DataFrame(efp_fps))

## Generate plots

In [6]:
# Project the ProtoNet embeddings (every column after smiles/label) to 2-D with t-SNE.
tsne = TSNE(n_components=2, random_state=0)
proto_features = proto_fps_df.iloc[:, 2:].to_numpy()
proto_tsne_projections = tsne.fit_transform(proto_features)
proto_tsne_projections_df = test_df.join(pd.DataFrame(proto_tsne_projections))

# Scatter the two projected components, coloured by the assay/activity label.
proto_tsne_fig = px.scatter(
    proto_tsne_projections_df,
    x=0,
    y=1,
    color="label",
    labels={"color": "label"},
)
proto_tsne_fig.show()

In [7]:
# Project the EFP embeddings (every column after smiles/label) to 2-D with t-SNE.
tsne = TSNE(n_components=2, random_state=0)
efp_features = efp_fps_df.iloc[:, 2:].to_numpy()
efp_tsne_projections = tsne.fit_transform(efp_features)
efp_tsne_projections_df = test_df.join(pd.DataFrame(efp_tsne_projections))

# Scatter the two projected components, coloured by the assay/activity label.
efp_tsne_fig = px.scatter(
    efp_tsne_projections_df,
    x=0,
    y=1,
    color="label",
    labels={"color": "label"},
)
efp_tsne_fig.show()

In [8]:
# Project the raw ProtoNet embedding array to 2-D with UMAP (random init, fixed seed).
umap = UMAP(n_components=2, init="random", random_state=0)
proto_umap_projections = umap.fit_transform(proto_fps)
proto_umap_projections_df = test_df.join(pd.DataFrame(proto_umap_projections))

# Scatter the two projected components, coloured by the assay/activity label.
proto_umap_fig = px.scatter(
    proto_umap_projections_df,
    x=0,
    y=1,
    color="label",
    labels={"color": "label"},
)
proto_umap_fig.show()

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [9]:
# Project the raw EFP embedding array to 2-D with UMAP (random init, fixed seed).
umap = UMAP(n_components=2, init="random", random_state=0)
efp_umap_projections = umap.fit_transform(efp_fps)
efp_umap_projections_df = test_df.join(pd.DataFrame(efp_umap_projections))

# Scatter the two projected components, coloured by the assay/activity label.
efp_umap_fig = px.scatter(
    efp_umap_projections_df,
    x=0,
    y=1,
    color="label",
    labels={"color": "label"},
)
efp_umap_fig.show()

## Generate comparative plot

In [10]:
def add_subplot(fig_container: go.Figure, fig: go.Figure, *args, **kwargs) -> None:
    """Copy every trace of ``fig`` into ``fig_container``.

    Args:
        fig_container: Figure that receives the traces.
        fig: Figure whose ``"data"`` traces are copied, in order.
        *args: Forwarded unchanged to ``fig_container.add_trace`` for each trace.
        **kwargs: Forwarded unchanged to ``fig_container.add_trace`` for each trace.
    """
    # Snapshot the traces first so the source sequence is read in full before
    # anything is added to the container.
    for source_trace in list(fig["data"]):
        fig_container.add_trace(source_trace, *args, **kwargs)

In [11]:
# 2x2 grid: top row holds the ProtoNet figures, bottom row the EFP figures;
# left column is t-SNE, right column is UMAP.
final_compare_fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=("TSNE Proto", "UMAP Proto", "TSNE EFP", "UMAP EFP"),
)

# (figure, row, col) for each panel, in the same order the titles are listed.
panel_layout = (
    (proto_tsne_fig, 1, 1),
    (proto_umap_fig, 1, 2),
    (efp_tsne_fig, 2, 1),
    (efp_umap_fig, 2, 2),
)
for panel_fig, panel_row, panel_col in panel_layout:
    add_subplot(final_compare_fig, panel_fig, panel_row, panel_col)

final_compare_fig.update_layout(height=1200)
final_compare_fig.show()

## Generate embedding correlations

In [15]:
import numpy as np

def corr2_coeff(A, B):
    """Correlate every row of ``A`` with every row of ``B``.

    Both inputs are 2-D arrays with the same number of columns. Entry ``[i, j]``
    of the returned array is the correlation coefficient between ``A[i]`` and
    ``B[j]``, so the result has shape ``(len(A), len(B))``.
    """
    # Centre each row on its own mean.
    a_centered = A - A.mean(axis=1, keepdims=True)
    b_centered = B - B.mean(axis=1, keepdims=True)

    # Per-row sum of squared deviations.
    a_sumsq = np.square(a_centered).sum(axis=1)
    b_sumsq = np.square(b_centered).sum(axis=1)

    # Cross products of the centred rows, scaled by the product of their norms.
    cross_products = np.dot(a_centered, b_centered.T)
    norm_products = np.sqrt(a_sumsq[:, None] * b_sumsq[None, :])
    return cross_products / norm_products

In [16]:
# Correlate the first 100 ProtoNet embeddings against the first 100 EFP embeddings.
n_compared = 100
out = corr2_coeff(proto_fps[:n_compared, :], efp_fps[:n_compared, :])

In [17]:
# Render the ProtoNet-vs-EFP correlation matrix as a heatmap.
fig = px.imshow(img=out)
fig.show()

The correlation between the ProtoNet and Ersilia embeddings is poor; the model needs to be re-trained.

## Sample grid embeddings

In [55]:
# Grid-form embeddings for the same compounds (transform called with grid=True).
grid_fps = efp_model.transform(test_df["smiles"].to_list(), grid=True)

# Sample with a dedicated seeded generator so the displayed grids are identical
# on every run; seed 0 mirrors the random_state=0 used for the t-SNE/UMAP
# projections. The sample size is capped so that fewer than N_GRID_SAMPLES
# embeddings does not raise ValueError.
N_GRID_SAMPLES = 10
grid_candidates = list(grid_fps)
sampled_grid_fps = random.Random(0).sample(
    grid_candidates, min(N_GRID_SAMPLES, len(grid_candidates))
)

In [56]:
# Stack the sampled grids into one array and facet along its first axis,
# wrapping to a new row after every five facets.
sampled_grid_stack = np.asarray(sampled_grid_fps)
px.imshow(sampled_grid_stack, facet_col=0, facet_col_wrap=5)