In [None]:
# Export the running torch version to the shell environment so the pip command
# below can pick the torch-scatter wheel index that matches it.
import os
import torch
os.environ["TORCH_VERSION"] = torch.__version__

# -f points pip at the wheel index page for this exact torch build.
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-$TORCH_VERSION.html
# NOTE(review): torchdrug is unpinned; the recorded run below resolved to
# torchdrug 0.1.2.post1 — consider pinning for reproducibility.
!pip install torchdrug

Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 12.0 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Collecting torchdrug
  Downloading torchdrug-0.1.2.post1-py3-none-any.whl (191 kB)
[K     |████████████████████████████████| 191 kB 12.8 MB/s 
Collecting ninja
  Downloading ninja-1.10.2.3-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (108 kB)
[K     |████████████████████████████████| 108 kB 31.4 MB/s 
[?25hCollecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.2.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 1.4 MB/s 
Installing collected packages: rdkit-pypi, ninja, torchdrug
Successfully installed ninja-1.10.2.3 rdkit-pypi-2021.9.2.

In [None]:
from torchdrug import data,core, models, tasks
from torch import nn, optim
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive; the dataset and checkpoints are read from / written to
# paths under /content/drive in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###**Load dataset**

In [None]:
# Project folder on the mounted Drive. Later cells concatenate file names onto
# this string directly, so the trailing slash is required.
path = '/content/drive/MyDrive/bio_project/zinc_chembl/'

# Read the raw molecule table and preview its first rows.
df = pd.read_csv(f"{path}chembl_zinc.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,smiles,logP,qed
0,0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506,0.702012
1,1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137,0.928975
2,2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682
3,3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944
4,4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027


###**Cleaned dataset**

In [None]:
# Remove surrounding whitespace from every SMILES string (the raw preview above
# shows rows ending in "\n"). The vectorised string accessor replaces the
# Python-level loop and temporary list.
# NOTE: only the in-memory frame is cleaned; the dataset cell below re-reads
# the CSV from disk, so it does not see this change.
df['smiles'] = df['smiles'].str.strip()
df.head()

Unnamed: 0.1,Unnamed: 0,smiles,logP,qed
0,0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,5.0506,0.702012
1,1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1,3.1137,0.928975
2,2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682
3,3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944
4,4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027


In [None]:
# Build the TorchDrug molecule dataset directly from the CSV on Drive.
# Target names have to match the CSV header, which spells the column `qed`
# (lowercase, see the preview above); the previous 'QED' matched no column.
chembl_zinc_dataset = data.MoleculeDataset()
chembl_zinc_dataset.load_csv(
    path+'chembl_zinc.csv',
    smiles_field='smiles',
    target_fields=['logP', 'qed'],
    kekulize=True,
    node_feature="symbol",
)

###**Defined Model**

In [None]:
# Pretraining stage: a 4-layer RGCN backbone wrapped in a GCPN generation task
# trained with the "nll" criterion on the ChEMBL/ZINC molecules.
model = models.RGCN(
    input_dim=chembl_zinc_dataset.node_feature_dim,
    hidden_dims=[256, 256, 256, 256],
    num_relation=chembl_zinc_dataset.num_bond_type,
    batch_norm=False,
)

task = tasks.GCPNGeneration(
    model,
    chembl_zinc_dataset.atom_types,
    max_edge_unroll=12,
    max_node=38,
    criterion="nll",
)

# No validation/test split is supplied; the whole dataset is the training set.
optimizer = optim.Adam(task.parameters(), lr=1e-5)
solver = core.Engine(
    task,
    train_set=chembl_zinc_dataset,
    valid_set=None,
    test_set=None,
    optimizer=optimizer,
    gpus=(0,),
    batch_size=32,
    log_interval=1,  # logs every batch; the recorded output below is truncated as a result
)

# Train, then persist the checkpoint next to the dataset on Drive.
solver.train(num_epoch=5)
solver.save(path + 'gcpn_chembl_zinc_5epoch.pkl')

10:09:33   Preprocess training set
10:09:42   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
10:09:42   Epoch 0 begin




10:10:01   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
10:10:01   edge acc: 0.243243
10:10:01   edge loss: 1.09291
10:10:01   node1 acc: 0.10296
10:10:01   node1 loss: 2.25574
10:10:01   node2 acc: 0.0836551
10:10:01   node2 loss: 2.94758
10:10:01   stop acc: 0.039555
10:10:01   stop bce loss: 0.670351
10:10:01   total loss: 6.96658
10:10:01   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
10:10:01   edge acc: 0.27446
10:10:01   edge loss: 1.09073
10:10:01   node1 acc: 0.106734




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11:24:58   node1 loss: 1.239
11:24:58   node2 acc: 0.729352
11:24:58   node2 loss: 0.833095
11:24:58   stop acc: 0.891331
11:24:58   stop bce loss: 0.268945
11:24:58   total loss: 2.61728
11:24:59   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:24:59   edge acc: 0.871094
11:24:59   edge loss: 0.305706
11:24:59   node1 acc: 0.470052
11:24:59   node1 loss: 1.27417
11:24:59   node2 acc: 0.746094
11:24:59   node2 loss: 0.798909
11:24:59   stop acc: 0.89
11:24:59   stop bce loss: 0.278021
11:24:59   total loss: 2.6568
11:24:59   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:24:59   edge acc: 0.897561
11:24:59   edge loss: 0.25957
11:24:59   node1 acc: 0.480488
11:24:59   node1 loss: 1.32666
11:24:59   node2 acc: 0.706098
11:24:59   node2 loss: 0.90324
11:24:59   stop acc: 0.84507
11:24:59   stop bce loss: 0.226434
11:24:59   total loss: 2.7159
11:24:59   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:24:59   edge acc: 0.883838
11:24:59   edge loss: 0.282819
11:24



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11:29:54   stop acc: 0.830325
11:29:54   stop bce loss: 0.270858
11:29:54   total loss: 2.61791
11:29:54   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:29:54   edge acc: 0.87069
11:29:54   edge loss: 0.285899
11:29:54   node1 acc: 0.476601
11:29:54   node1 loss: 1.2867
11:29:54   node2 acc: 0.710591
11:29:54   node2 loss: 0.910198
11:29:54   stop acc: 0.856635
11:29:54   stop bce loss: 0.221867
11:29:54   total loss: 2.70467
11:29:54   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:29:54   edge acc: 0.881432
11:29:54   edge loss: 0.255905
11:29:54   node1 acc: 0.501119
11:29:54   node1 loss: 1.28197
11:29:54   node2 acc: 0.717002
11:29:54   node2 loss: 0.879996
11:29:54   stop acc: 0.844492
11:29:54   stop bce loss: 0.264032
11:29:54   total loss: 2.68191
11:29:54   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
11:29:54   edge acc: 0.885642
11:29:54   edge loss: 0.258613
11:29:54   node1 acc: 0.532402
11:29:54   node1 loss: 1.14446
11:29:54   node2 acc: 0.7344

###**Reinforcement**

In [None]:
# Fine-tuning stage: same RGCN architecture as pretraining, but the GCPN task
# is configured with property tasks ('qed', 'plogp') and a ('ppo', 'nll')
# criterion pair.
model = models.RGCN(
    input_dim=chembl_zinc_dataset.node_feature_dim,
    hidden_dims=[256, 256, 256, 256],
    num_relation=chembl_zinc_dataset.num_bond_type,
    batch_norm=False,
)

task = tasks.GCPNGeneration(
    model,
    chembl_zinc_dataset.atom_types,
    max_edge_unroll=12,
    max_node=38,
    task=('qed', 'plogp'),
    criterion=('ppo', 'nll'),
    reward_temperature=1,
    agent_update_interval=3,
    gamma=0.9,
)

optimizer = optim.Adam(task.parameters(), lr=1e-5)
solver = core.Engine(
    task,
    train_set=chembl_zinc_dataset,
    valid_set=None,
    test_set=None,
    optimizer=optimizer,
    gpus=(0,),
    batch_size=32,
    log_interval=10,
)

# Start from the pretrained weights written by the pretraining cell; the
# optimizer state is deliberately not restored.
solver.load(path + 'gcpn_chembl_zinc_5epoch.pkl', load_optimizer=False)

# Fine-tune, then save under a separate file name so the pretrained
# checkpoint is kept.
solver.train(num_epoch=5)
solver.save(path + 'gcpn_chembl_zinc_5epoch_reinforcement.pkl')


###**Generate**

In [None]:
# Rebuild the same model/task/solver layout as the pretraining cell so the
# saved checkpoint can be restored into it.
model = models.RGCN(input_dim=chembl_zinc_dataset.node_feature_dim,
                    num_relation=chembl_zinc_dataset.num_bond_type,
                    hidden_dims=[256, 256, 256, 256], batch_norm=False)

task = tasks.GCPNGeneration(model, chembl_zinc_dataset.atom_types, max_edge_unroll=12,
                            max_node=38, criterion="nll")


optimizer = optim.Adam(task.parameters(), lr=1e-5)
solver = core.Engine(task, chembl_zinc_dataset, None, None, optimizer,
                     gpus=(0,), batch_size=32, log_interval=1)

# FIX: this used to call solver.save(...), which overwrote the pretrained
# checkpoint with the freshly initialised (untrained) weights and then sampled
# from that untrained model. Restore the trained weights instead.
# NOTE: if the old version of this cell has already been run, the file on Drive
# holds untrained weights and the pretraining cell must be re-run first.
# To sample from the RL-tuned model, point this at
# 'gcpn_chembl_zinc_5epoch_reinforcement.pkl' instead.
solver.load(path+'gcpn_chembl_zinc_5epoch.pkl', load_optimizer=False)

# Sample molecules and convert them to SMILES strings for the analysis below.
results = task.generate(num_sample=100, max_resample=5)
all_smiles = results.to_smiles()

16:22:52   Preprocess training set
16:22:52   Save checkpoint to /content/drive/MyDrive/bio_project/zinc_chembl/gcpn_chembl_zinc_5epoch.pkl




16:22:53   4 / 100 molecules are invalid even after 5 resampling
16:22:53   2 / 96 molecules are invalid even after 5 resampling
16:22:54   6 / 94 molecules are invalid even after 5 resampling
16:22:54   7 / 88 molecules are invalid even after 5 resampling
16:22:54   9 / 81 molecules are invalid even after 5 resampling
16:22:55   4 / 64 molecules are invalid even after 5 resampling
16:22:55   2 / 39 molecules are invalid even after 5 resampling
16:22:55   1 / 19 molecules are invalid even after 5 resampling


###**Analyze the result**

In [None]:
# Install Miniconda into /usr/local and pull RDKit from the `rdkit` conda channel.
# NOTE(review): the first cell's pip log already shows rdkit-pypi being
# installed alongside torchdrug, so this step may be redundant — confirm
# before keeping it.
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
# -b: batch mode (no prompts), -f: tolerate an existing prefix, -p: install prefix.
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
! conda install -c rdkit rdkit -y
import sys
# Make the conda-installed packages importable from the running kernel.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

--2021-11-27 16:20:39--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’


2021-11-27 16:20:40 (152 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0


In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [None]:
def calculate_logp_qed(smiles):
  """Compute logP and QED for each SMILES string.

  logP comes from RDKit's ``Descriptors.MolLogP`` and QED from
  ``Chem.QED.weights_max``.

  Args:
    smiles: iterable of SMILES strings.

  Returns:
    A tuple ``(logP, qed)`` of two lists with exactly one entry per input
    string, in input order. Strings that RDKit cannot parse yield NaN in both
    lists, so the results stay aligned with ``smiles`` (previously such
    entries were dropped, which left the lists shorter than the input and
    broke row-wise pairing with the original SMILES).
  """
  # NOTE(review): `Chem.QED` is reached as an attribute of `rdkit.Chem`, which
  # presumably relies on the QED submodule having been imported elsewhere —
  # confirm. Also confirm that `weights_max` (rather than the default QED
  # weighting) matches how the dataset's `qed` column was computed.
  logP = []
  qed = []
  for smile in smiles:
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
      # Unparseable SMILES: keep a placeholder so indices still line up.
      logP.append(float("nan"))
      qed.append(float("nan"))
      continue
    logP.append(Descriptors.MolLogP(mol))
    qed.append(Chem.QED.weights_max(mol))
  return (logP, qed)

In [None]:
# Score the generated molecules and collect SMILES plus properties in one table.
logP, qed = calculate_logp_qed(all_smiles)
# The column dict is deliberately NOT called `data`: that name is the
# torchdrug `data` module imported near the top of the notebook, and rebinding
# it would break `data.MoleculeDataset()` when earlier cells are re-run.
generated_columns = {'smiles': all_smiles, 'logP': logP, 'qed': qed}
df2 = pd.DataFrame(data=generated_columns)

In [None]:
# Preview the first rows of the generated-molecule table.
df2.head()

Unnamed: 0,smiles,logP,qed
0,C#CC(P)=C=C,1.1635,0.212592
1,C=C=CC(C)I,2.151,0.297962
2,C=CC=C=CC,1.9036,0.32291
3,C=C=C(C)CC,2.1276,0.44971
4,C=C(C)C#CC,1.5858,0.410933


In [None]:
# Summary statistics of the numeric columns (logP, qed) of the generated set.
df2.describe()

Unnamed: 0,logP,qed
count,56.0,56.0
mean,1.738436,0.401445
std,0.505482,0.069279
min,0.2562,0.212592
25%,1.4144,0.328917
50%,1.81125,0.411639
75%,2.1276,0.450373
max,2.6086,0.524302


###**Export**

In [None]:
# Write the generated molecules next to the input data on Drive.
# index=False keeps the positional index out of the file; writing it produces
# an unnamed leading column, which is presumably how the input CSV ended up
# with its `Unnamed: 0` / `Unnamed: 0.1` columns.
df2.to_csv(path+'zinc_chembl_output.csv', index=False)