In [2]:
import os
import urllib.parse

import pandas as pd
import deepchem as dc
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, PyMol, rdFMCS
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from deepchem import metrics
from IPython.display import Image, display
from rdkit.Chem.Draw import SimilarityMaps
import tensorflow as tf

  from pandas.core.computation.check import NUMEXPR_INSTALLED
No normalization for AvgIpc. Feature removed!
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/home/ubuntu/.local/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ubuntu/.local/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [None]:
# Retrieve every compound recorded for the selected target from the database.
from sqlalchemy import create_engine
from dotenv import dotenv_values
from sqlalchemy import text

# The connection URL is read from a local .env file rather than written into the notebook.
config = dotenv_values('database_url.env')
url = config['DATABASE_URL']

engine = create_engine(url, echo=False)
# Alternative target kept for quick switching:
#target_id = 'ENSG00000120217'
target_id = 'ENSG00000198900'

# Bind the target id as a parameter instead of formatting it into the SQL text,
# so quoting/escaping is handled by the database driver and a value containing
# a quote character can neither break nor alter the statement.
query = text(
    "SELECT * FROM target_to_compounds WHERE target_ensemble_id = :target;"
).bindparams(target=target_id)

with engine.begin() as conn:
    target_to_compounds_df = pd.read_sql(query, conn)

#display(target_to_compounds_df)

In [None]:
# Featurize the compounds, derive the pIC50 label and build a reproducible
# 70/30 train/test split wrapped as DeepChem NumpyDatasets.

SPLIT_SEED = 0  # fixed seed so the random split is the same on every run
# NOTE(review): divisor applied to 'standard_value' before taking -log10.
# Its meaning/units are not visible in this notebook — confirm before reuse.
IC50_DIVISOR = 108000

# Work on a copy so re-running this cell never mutates the frame loaded from SQL.
compound_dataset = target_to_compounds_df.copy()
smiles = compound_dataset['smiles']

IC50 = compound_dataset['standard_value']

# Whole-molecule graph features (used for training and evaluation).
featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
compound_dataset['featurized'] = featurizer.featurize(smiles)
# Per-atom fragment features (used later for atomic contribution analysis).
featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
compound_dataset['frag_featurized'] = featurizer.featurize(smiles)

compound_dataset['divided values'] = compound_dataset['standard_value'].astype(float).div(IC50_DIVISOR)
compound_dataset['pIC50'] = np.log10(compound_dataset['divided values'].astype(float)).mul(-1)
# Positional row id, used below to take the complement of the training sample.
compound_dataset['number'] = list(range(0, len(compound_dataset)))
#display(compound_dataset.head(5))

training_dataset = compound_dataset.sample(frac=0.7, random_state=SPLIT_SEED)

#training_dataset.featurized[0].n_feat

# Everything that was not sampled for training becomes the test set.
testing_dataset = compound_dataset[~compound_dataset['number'].isin(training_dataset['number'])]
assert len(training_dataset) + len(testing_dataset) == len(compound_dataset)

numpy_training_dataset = dc.data.NumpyDataset(X=training_dataset['featurized'], y=training_dataset['pIC50'].astype(float), ids=training_dataset['smiles'])
numpy_testing_dataset = dc.data.NumpyDataset(X=testing_dataset['featurized'], y=testing_dataset['pIC50'].astype(float), ids=testing_dataset['smiles'])

# Build the RDKit molecules for the test set directly from the in-memory SMILES
# instead of reading 'smiles.csv': that file is only written by a later cell, so
# reading it here fails on a fresh kernel or silently picks up a stale split.
# Each molecule is named by its own SMILES via the "_Name" property, which is
# what the CSV's 'name' column provided.
mols = []
for test_smiles in testing_dataset['smiles']:
    mol = Chem.MolFromSmiles(test_smiles)
    if mol is None:  # unparsable SMILES are skipped, as before
        continue
    mol.SetProp("_Name", test_smiles)
    mols.append(mol)
dataset = numpy_testing_dataset


In [None]:
# Write the held-out SMILES to 'smiles.csv' with two identical columns,
# 'smiles' and 'name', so every molecule is named by its own SMILES string.
just_smiles_df = testing_dataset[['smiles']].copy()
smiles = just_smiles_df['smiles'].tolist()
just_smiles_df['name'] = just_smiles_df['smiles']
just_smiles_df.to_csv('smiles.csv', index=False)

In [None]:
# Set the XLA_FLAGS environment variable for this kernel, passing /usr/lib/cuda
# as the value of --xla_gpu_cuda_data_dir.
# NOTE(review): the path is machine-specific — confirm it matches the local CUDA install.
%env XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/lib/cuda

In [None]:
# Build a single-task graph-convolution regressor and fit it on the training split.
graph_conv_settings = {
    "n_tasks": 1,
    "mode": "regression",
    "dropout": 0.2,
    "dense_layer_size": 10,
}
model = dc.models.GraphConvModel(**graph_conv_settings)
model.fit(numpy_training_dataset, nb_epoch=10)

In [None]:
# Score the fitted model on the held-out split: show labels and predictions,
# then the mean squared error and the R^2 score.
test_dataset = numpy_testing_dataset
pred = model.predict(test_dataset)
print(test_dataset.y, pred, sep="\n")

mse = metrics.mean_squared_error(y_true=test_dataset.y, y_pred=pred)
r2 = metrics.r2_score(y_true=test_dataset.y, y_pred=pred)
print(mse, r2, sep="\n")

In [None]:
# Wrap the per-atom fragment features of the held-out molecules in a dataset
# that reuses the test-set ids, then flatten it.
frag_dataset = dc.data.NumpyDataset(X=testing_dataset['frag_featurized'], y=None, w=None, ids=test_dataset.ids)
# get_shape is a method: it must be called, otherwise only the bound-method
# repr is printed instead of the dataset shapes.
print(frag_dataset.get_shape())
tr = dc.trans.FlatteningTransformer(frag_dataset)  # flatten dataset and add ids to each fragment
frag_dataset = tr.transform(frag_dataset)
print(frag_dataset.get_shape())

In [None]:
# Predictions for the intact molecules, indexed by dataset id (the SMILES string).
pred = pd.DataFrame(
    model.predict(test_dataset),
    index=test_dataset.ids,
    columns=["Molecule"],
)
display(pred)

# Predictions for the flattened fragment dataset, indexed by its ids.
pred_frags = pd.DataFrame(
    model.predict(frag_dataset),
    index=frag_dataset.ids,
    columns=["Fragment"],
)
print(pred_frags)

# Join the two tables on their index, then take
# contribution = whole-molecule prediction - fragment prediction.
df = pd.merge(pred_frags, pred, left_index=True, right_index=True)
df["Contrib"] = df["Molecule"] - df["Fragment"]
display(df)

In [None]:
# Print a PubChem PUG REST depiction (PNG) URL for every predicted molecule.
# SMILES contain characters that are significant inside a URL ('#' starts a
# fragment, '/' and '\\' split the path, '+' and '%' are escape syntax), so the
# SMILES is percent-encoded before it is placed in the path.
# NOTE(review): confirm against the PUG REST docs that encoded stereo SMILES
# resolve in the path form; otherwise pass the SMILES as a URL argument or via POST.
for molecule in pred.index:
    encoded_smiles = urllib.parse.quote(str(molecule), safe='')
    print('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{molecule}/PNG'.format(molecule=encoded_smiles))

In [None]:
def vis_contribs(mols, df, smi_or_sdf = "smi"):
    """Draw one atomic-contribution similarity map per molecule.

    Parameters
    ----------
    mols : iterable of RDKit molecules
        With ``smi_or_sdf="smi"`` each molecule must carry a ``_Name`` property,
        which is used as the lookup key into ``df``.
    df : pandas.DataFrame
        Must have a ``"Contrib"`` column and be indexed by molecule key, one
        row per atom of that molecule.
    smi_or_sdf : {"smi", "sdf"}
        Format of the file the dataset was created from. It determines the
        order of atoms and therefore how contributions are mapped to atoms.

    Returns
    -------
    list
        One result of ``SimilarityMaps.GetSimilarityMapFromWeights`` per molecule.

    Raises
    ------
    ValueError
        If ``smi_or_sdf`` is neither ``"smi"`` nor ``"sdf"``. Without this check
        an empty weight map would be passed on to the drawing call.
    """
    if smi_or_sdf not in ("smi", "sdf"):
        raise ValueError(
            "smi_or_sdf must be 'smi' or 'sdf', got {!r}".format(smi_or_sdf)
        )

    maps = []
    for mol in mols:
        # input format of file, which was used to create dataset determines the order of atoms,
        # so we take it into account for correct mapping!
        if smi_or_sdf == "smi":
            key = mol.GetProp("_Name")
        else:
            key = Chem.MolToSmiles(mol)

        # Look the contributions up once per molecule and index them by
        # position. np.atleast_1d turns both a Series (several rows) and a
        # scalar (a single row) into a 1-D array, so indexing never depends on
        # pandas' deprecated integer-position fallback for label-indexed Series.
        contribs = np.atleast_1d(df.loc[key, "Contrib"])

        if smi_or_sdf == "smi":
            # n-th contribution is stored under the n-th value yielded by CanonicalRankAtoms.
            wt = {
                rank: contribs[n]
                for n, rank in enumerate(Chem.rdmolfiles.CanonicalRankAtoms(mol))
            }
        else:
            # Contributions are taken in heavy-atom index order.
            wt = {idx: contribs[idx] for idx in range(mol.GetNumHeavyAtoms())}

        maps.append(SimilarityMaps.GetSimilarityMapFromWeights(mol, wt))
    return maps


In [None]:
# Draw the contribution maps; the molecules were parsed from SMILES, hence 'smi'.
vis_contribs(mols=mols, df=df, smi_or_sdf='smi')