### Visualise annotated features with SMILES
#### Scatter plot and PCA


First, load the dataframe containing the features and their SMILES annotations:

In [None]:
import pandas as pd
import plotly.express as px
import molplotly

# Build the working table in a single chain:
#   1. read the annotated feature matrix from disk,
#   2. expose the "CSI_predictions_smiles" column under the name "smiles",
#   3. keep only the rows that actually carry a SMILES string,
#   4. replace every remaining missing value with 0.
DF = (
    pd.read_csv('results/annotations/commercial_std/annotated_FeatureMatrix_commercial_std.csv')
    .rename(columns={"CSI_predictions_smiles": "smiles"})
    .dropna(subset=['smiles'])
    .fillna(0)
)
DF

In [None]:
# Index the table by (smiles, CSI_predictions_name) and drop the m/z,
# retention-time and formula columns from the resulting view.
DF_T = (
    DF.set_index(["smiles", "CSI_predictions_name"])
    .drop(columns=["m/z", "RT (s)", "CSI_predictions_formula"])
)
DF_T

In [None]:
# Columns of DF that are plotted pairwise in the scatter matrix; the same
# list is passed to molplotly as the caption columns.
features = [
    'GermicidinB',
    'Kanamycin',
    'Tetracycline',
    'Thiostreptone',
    'Globomycin',
    'Ampicillin',
    'Apramycin',
    'GermicidinA',
]

# Pairwise scatter plots of the selected columns.
fig_matrix = px.scatter_matrix(
    DF,
    dimensions=features,
    title='Scatter matrix of standards',
    width=1800,
    height=1500,
)

# Wrap the figure in a molplotly app, using the "smiles" column for the
# structures and "CSI_predictions_name" for the titles.
app_matrix = molplotly.add_molecules(
    fig=fig_matrix,
    df=DF,
    smiles_col='smiles',
    title_col='CSI_predictions_name',
    caption_cols=features,
    width=200,
    show_coords=False,
)

# Only show informative lower triangle
fig_matrix.update_traces(showupperhalf=False, diagonal_visible=False)

# Serve the app inline in the notebook.
app_matrix.run_server(mode='inline', port=8700, height=1000)

In [None]:
# Plotting copy of DF with a fresh 0..n-1 index and without the formula
# column. `drop=True` discards the old index directly instead of first
# materialising it as an "index" column and dropping it afterwards.
DF_treat = DF.reset_index(drop=True).drop(columns=["CSI_predictions_formula"])

# One point per row: retention time on x, m/z on y.
fig_scatter = px.scatter(DF_treat,
                         x="RT (s)",
                         y="m/z",
                         title='Scatter plot of m/z vs RT',
                         width=1200,
                         height=800)

# NOTE: no diagonal reference line is drawn here. The two axes hold different
# quantities (retention time vs m/z), so a y = x "perfect prediction" line
# would not mean anything on this plot.

fig_scatter.show()

In [None]:
# Retitle the RT vs m/z scatter for the molecule-annotated version.
fig_scatter.update_layout(title='Scatter plot w smiles')

# Hand molplotly the same dataframe the figure was built from (DF_treat), so
# the figure's points and the rows used for the SMILES / title lookup are
# guaranteed to come from one table.
app_scatter_with_captions = molplotly.add_molecules(fig=fig_scatter,
                                                    df=DF_treat,
                                                    smiles_col='smiles',
                                                    title_col='CSI_predictions_name',
                                                    show_coords=True)

# Serve the app inline in the notebook.
app_scatter_with_captions.run_server(mode='inline', port=8002, height=1000)

In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.decomposition import PCA


def smi_to_fp(smi, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smi: SMILES string of the molecule.
        radius: Morgan fingerprint radius handed to RDKit. Defaults to 2.
        n_bits: Length of the fingerprint bit vector. Defaults to 1024.

    Returns:
        A 1-D ``numpy.int8`` array filled by RDKit with the fingerprint bits.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending string rather than inside the fingerprint call.
        raise ValueError(f"Could not parse SMILES: {smi!r}")

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    # Start from an empty array: ConvertToNumpyArray resizes the destination
    # to the fingerprint length before filling it.
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Fingerprint every SMILES in DF_treat and collect the results in one array.
esol_fps = np.array(list(map(smi_to_fp, DF_treat['smiles'])))

# Project the fingerprints (one row of 1024 values per molecule) onto their
# first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(esol_fps.reshape(-1, 1024))

# Store both component scores next to the original columns for plotting.
DF_treat['PCA-1'], DF_treat['PCA-2'] = components[:, 0], components[:, 1]

In [None]:
# Scatter of the two PCA columns of DF_treat, coloured by m/z.
fig_pca = px.scatter(
    DF_treat,
    x="PCA-1",
    y="PCA-2",
    color="m/z",
    title='PCA w smiles',
    width=1200,
    height=800,
)

# Wrap the PCA figure in a molplotly app, using the "smiles" column for the
# structures and "CSI_predictions_name" for the titles.
app_pca = molplotly.add_molecules(
    fig=fig_pca,
    df=DF_treat,
    smiles_col='smiles',
    title_col='CSI_predictions_name',
    show_coords=False,
)

# Serve the app inline in the notebook.
app_pca.run_server(mode='inline', port=8006, height=850)