# Chem Space (No Scaffold)
### By: Francisco Feitosa

### Import Libraries

In [None]:
import os
import time
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import rdDepictor
from rdkit.Chem import MACCSkeys
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.PandasTools import ChangeMoleculeRendering

import mols2grid
from tqdm.auto import tqdm
from ipywidgets import widgets
from typing import List

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#import umap as umap
#import hdbscan

import matplotlib.pyplot as plt
import seaborn as sns
IPythonConsole.ipython_useSVG=True 
from IPython.display import SVG

#Bokeh library for plotting
import json
from bokeh.plotting import figure, show, output_notebook, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.transform import factor_cmap
from bokeh.transform import transform
from bokeh.transform import LinearColorMapper
from bokeh.models import ColorBar
from bokeh.palettes import PiYG
from bokeh.plotting import figure, output_file, save
output_notebook()

In [2]:
# Register tqdm with pandas so that `progress_apply` (used below when parsing
# SMILES) is available and shows a progress bar.
tqdm.pandas()

### Setup

In [3]:
# Input CSV file. NOTE: this is an absolute, machine-specific Windows path;
# point it at your local copy of the dataset before running the notebook.
fname = r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Skin\DADOS\GHS\curated_binary_GHS.csv"

In [None]:
# Load the dataset and print its column / dtype / non-null summary.
df = pd.read_csv(fname)
df.info()

In [None]:
# Rename the 'Outcome' column to 'ACTIVITY', the name used by the rest of the notebook.
df=df.rename(columns={'Outcome':'ACTIVITY'})
df.info()

In [None]:
# Keep only the two columns needed downstream: the structure and its label.
df1=df[['SMILES','ACTIVITY']]
df1.info()

In [None]:
# Per-column flag: True if the column contains at least one null value.
df1.isnull().any()

In [None]:
# Drop rows that have no ACTIVITY label.
df1=df1.dropna(subset=['ACTIVITY'])
df1.info()

In [None]:
# Parse every SMILES string into an RDKit molecule (with a tqdm progress bar).
# The SMILES are read from df1 itself, so only the rows that survived the
# ACTIVITY filter are parsed and no cross-frame index alignment is involved.
# Chem.MolFromSmiles returns None for SMILES it cannot parse.
df1['mol'] = df1.SMILES.progress_apply(Chem.MolFromSmiles)

In [None]:
# Report which columns contain nulls (e.g. a 'mol' entry that failed to parse).
# Only the last expression of a notebook cell is displayed automatically, so
# this result has to be printed explicitly to be visible.
print(df1.isnull().any())
df1.info()

### Fingerprints

In [11]:
# Fingerprint settings used by the next cell and in the plot titles / file names.
# fp_type: one of "ECFP", "FCFP" or "MACCS" (the values tested in the next cell).
# radii / bits: passed as `radius` / `nBits` to the Morgan fingerprint calls;
# they are not used by the MACCS branch.
fp_type = "ECFP"
radii = 2
bits = 1024

In [12]:
# Compute one fingerprint per molecule for the configured `fp_type` and stack
# them into the array `fps`.
# This is a single dispatch on the string value: looping over `fp_type` would
# iterate over its characters and recompute the same fingerprints each time.
if fp_type == "ECFP":
    # Morgan fingerprint with atom-identity invariants.
    fps = np.array([AllChem.GetMorganFingerprintAsBitVect(x, radius = radii, nBits = bits, useFeatures=False) for x in df1['mol']])
elif fp_type == "FCFP":
    # Morgan fingerprint with feature-based invariants.
    fps = np.array([AllChem.GetMorganFingerprintAsBitVect(x, radius = radii, nBits = bits, useFeatures=True) for x in df1['mol']])
elif fp_type == "MACCS":
    # MACCS keys; `radii` and `bits` are not used here.
    fps = np.array([MACCSkeys.GenMACCSKeys(x) for x in df1['mol']])
else:
    # Fail here with a clear message instead of leaving `fps` undefined.
    raise ValueError(f"Unsupported fp_type {fp_type!r}; expected 'ECFP', 'FCFP' or 'MACCS'")
    

### Calculate PCA

In [None]:
%%time
# Project the fingerprint array onto its first two principal components and
# keep the coordinates both as an array (X_pca) and as a DataFrame (pca_df).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(fps)
pca_df = pd.DataFrame(X_pca, columns= ['X_PCA','Y_PCA'])
pca_df.info()

### Calculate TSNE

In [None]:
%%time
# Embed the fingerprint array with t-SNE (default settings, fixed random seed)
# and keep the two resulting coordinates as an array (tsne) and a DataFrame (tsne_df).
tsne = TSNE(random_state=0).fit_transform(fps)
tsne_df = pd.DataFrame(tsne, columns= ['X_TSNE', 'Y_TSNE'])

tsne_df.info()

### Calculate UMAP

In [None]:
"""%%time
umap_model = umap.UMAP(metric = "euclidean",
                      n_neighbors = 25,
                      n_components = 2,
                      low_memory = False,
                      min_dist = 0.9)
X_umap = umap_model.fit_transform(fps)
umap_df = pd.DataFrame(X_umap, columns= ['X_UMAP', 'Y_UMAP'])"""

### Prepare molecules to print

In [16]:
def _prepareMol(mol,kekulize):
    """Return a copy of `mol` that is ready to be drawn.

    The copy is rebuilt from the molecule's binary form, so all changes made
    here are applied to the copy and not to `mol`.

    Args:
        mol: RDKit molecule to prepare.
        kekulize: if true, try to kekulize the copy; when kekulization fails
            a fresh, non-kekulized copy is used instead.

    Returns:
        The prepared copy, with 2D coordinates computed if it had no conformer.
    """
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Best effort: kekulization failed, so start again from an
            # untouched copy. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit propagate.
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        # No coordinates yet: generate a 2D depiction layout.
        rdDepictor.Compute2DCoords(mc)
    return mc

def moltosvg(mol,molSize=(450,200),kekulize=True,drawer=None,**kwargs):
    """Draw `mol` and return the picture as an IPython `SVG` object.

    Args:
        mol: RDKit molecule to draw.
        molSize: (width, height) used when a new SVG drawer has to be created.
        kekulize: forwarded to `_prepareMol`.
        drawer: optional existing drawer; when None a new
            `rdMolDraw2D.MolDraw2DSVG` of size `molSize` is created.
        **kwargs: forwarded to the drawer's `DrawMolecule`.

    Returns:
        `SVG` wrapping the drawing text with every 'svg:' substring removed.
    """
    prepared = _prepareMol(mol, kekulize)
    canvas = drawer if drawer is not None else rdMolDraw2D.MolDraw2DSVG(molSize[0], molSize[1])
    canvas.DrawMolecule(prepared, **kwargs)
    canvas.FinishDrawing()
    markup = canvas.GetDrawingText().replace('svg:', '')
    return SVG(markup)

In [17]:
# Render every molecule once and keep the raw SVG markup (the `.data` of each
# SVG object); plot_int_map feeds this list to the plot's hover tooltips.
svgs = [moltosvg(m).data for m in df1.mol]

In [None]:
# Replace the index with a fresh default one (the previous index is kept as a
# column) so that df1 can be concatenated column-wise with pca_df / tsne_df.
df1=df1.reset_index()
df1.head()

In [None]:
# Side-by-side table: the molecule data plus the PCA and t-SNE coordinates.
df_merge = pd.concat([df1, pca_df, tsne_df], axis=1)
df_merge.head()

### Plot Map

In [None]:
# Colour scale for ACTIVITY. `low` / `high` are the smallest / largest
# activity in the data, as LinearColorMapper expects. The palette is reversed
# so that the highest activity is drawn with the first PiYG[9] colour and the
# lowest activity with the last one.
color_mapper=LinearColorMapper(palette=PiYG[9][::-1],
                               low=df1.ACTIVITY.min(), high=df1.ACTIVITY.max())

In [None]:
# Constant column: plot_int_map shows it as the "Frequency" tooltip value and
# uses it (multiplied by 6) as the marker size.
df1['s'] = 1
df1

In [None]:
def plot_int_map(metodology):
    """Build, display and return an interactive Bokeh scatter plot of an embedding.

    Besides its argument, the function reads the notebook-level names `df1`
    (columns `s` and `ACTIVITY`), `svgs`, `color_mapper`, `fp_type`, `radii`
    and `bits`.

    Args:
        metodology: 2D array indexed as `[:, 0]` (x) and `[:, 1]` (y); it is
            combined element-wise with the columns of `df1` and with `svgs`.

    Returns:
        The Bokeh figure, which is also displayed with `show()`.
    """
    source = ColumnDataSource(data=dict(x=metodology[:,0], y=metodology[:,1], freq = df1.s, desc= df1.ACTIVITY,
                                    svgs=svgs, s=df1.s*6, c=df1.ACTIVITY))
    ChangeMoleculeRendering(renderer='PNG')

    # Tooltip: the molecule drawing (raw SVG, hence `{safe}`) followed by the
    # frequency and activity values of the hovered point.
    hover = HoverTool(tooltips="""
        <div>
            <div>@svgs{safe}
            </div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">Frequency @freq</span>
            </div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">Activity @desc</span>
            </div>
        </div>
        """)

    interactive_map = figure(width = 1000, height=1000, tools=['reset,box_zoom,wheel_zoom,zoom_in,zoom_out,pan',hover],
           title=f"Chemical Space {fp_type} Radius: {radii} bits: {bits}")

    # `scatter` draws circle markers by default; `circle(size=...)` is
    # deprecated in recent Bokeh releases.
    interactive_map.scatter('x', 'y',
                            source=source,
                            size='s',
                            color=transform('c', color_mapper),
                            fill_alpha=0.5)

    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0,0), title ='Activity')
    interactive_map.add_layout(color_bar,'left')
    show(interactive_map)
    return interactive_map

In [None]:
# Interactive map of the PCA coordinates (displayed and kept for saving).
pca_map=plot_int_map(X_pca)

In [None]:
# Interactive map of the t-SNE coordinates (displayed and kept for saving).
tsne_map=plot_int_map(tsne)

In [None]:
#umap_map= plot_int_map(X_umap)

In [None]:
# Save the maps as standalone HTML files for later viewing. The file names
# encode the fingerprint type, radius and bit count.
#output_file("./umap " + str(fp_type) + " " + str(radii) + " " + str(bits) + ".html")
#save(umap_map)

# t-SNE map
output_file(f"./tsne {fp_type} {radii} {bits}.html")
save(tsne_map)

# PCA map
output_file(f"./pca {fp_type} {radii} {bits}.html")
save(pca_map)
