<a href="https://colab.research.google.com/github/satishgaurav/Molecule-Classification/blob/master/Copy_of_chemprop_colab_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip install chemprop
!pip install rdkit-pypi  # should be included in above after Chemprop v1.6 release

import chemprop
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

Collecting chemprop
  Downloading chemprop-1.6.1-py3-none-any.whl (166 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pandas-flavor>=0.2.0 (from chemprop)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Collecting tensorboardX>=2.0 (from chemprop)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting typed-argument-parser>=1.6.1 (from chemprop)
  Downloading typed-argument-parser-1.9.0.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdkit>=2020.03.1.0 (from chemprop)
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━

In [2]:
# Display the value of every top-level expression in a cell, not only the last
# one (several cells below rely on this to show both .head() and .tail()).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Load the HIV dataset; HIV.csv must be present in the working directory.
hiv_df = pd.read_csv("HIV.csv")
hiv_df.head()

Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0


In [4]:
# Summary statistics of the numeric column. HIV_active only takes the values
# 0 and 1, so its mean is the fraction of molecules labelled active.
hiv_df.describe()

Unnamed: 0,HIV_active
count,41127.0
mean,0.035086
std,0.184001
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [5]:
# List the distinct labels that occur in the target column.
unique_values = pd.unique(hiv_df['HIV_active'])
print(f"Unique values in 'HIV_active': {unique_values}")

Unique values in 'HIV_active': [0 1]


In [6]:
# Distinct SMILES strings and how many of them there are; a count equal to the
# number of rows means no molecule string is duplicated.
unique_values = pd.unique(hiv_df['smiles'])
print(f"Unique values in 'smiles': {unique_values}")
print(f"length of uniqe value: {unique_values.size}")

Unique values in 'smiles': ['CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)=[O+]2'
 'C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3)CC(c3ccccc3)=[O+]2)[O+]=C(c2ccccc2)C1'
 'CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21' ...
 'Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C)CC4C3C2=O)cc1'
 'Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C)CC4C3C2=O)c1'
 'CCCCCC=C(c1cc(Cl)c(OC)c(-c2nc(C)no2)c1)c1cc(Cl)c(OC)c(-c2nc(C)no2)c1']
length of uniqe value: 41127


In [7]:
# Sanity check: select rows whose 'HIV_active' label is neither 0 nor 1.
# An empty result means the target column is strictly binary.
filtered_df = hiv_df[~hiv_df['HIV_active'].isin([0, 1])]
filtered_df

Unnamed: 0,smiles,activity,HIV_active


In [8]:
# Keep every row labelled active ('HIV_active' equal to 1).
hiv_df_filtered_active = hiv_df.loc[hiv_df['HIV_active'].eq(1)]
hiv_df_filtered_active

Unnamed: 0,smiles,activity,HIV_active
11,O=C(O)Cc1ccc(SSc2ccc(CC(=O)O)cc2)cc1,CM,1
16,NNP(=S)(NN)c1ccccc1,CM,1
80,O=Nc1ccc(O)c(N=O)c1O,CM,1
203,Oc1ccc(Cl)cc1C(c1cc(Cl)ccc1O)C(Cl)(Cl)Cl,CM,1
234,NNC(=O)c1ccccc1SSc1ccccc1C(=O)NN,CM,1
...,...,...,...
41090,Cc1cn(COCCCOCC(=O)c2ccccc2)c(=O)[nH]c1=O,CM,1
41092,Cc1cn(C2CC3C(COC(CCC[Se]c4ccccc4)N3O)O2)c(=O)[...,CM,1
41093,Cc1cn(C2CC3C(COC(CCCC[Se]c4ccccc4)N3O)O2)c(=O)...,CM,1
41098,Cc1cn(C2CC3C(COC(CC[Se]C#N)N3O)O2)c(=O)[nH]c1=O,CM,1


In [13]:
# Down-sample the inactive class ('HIV_active' equal to 0) to 1,500 rows so the
# training set is not dominated by inactives.
# random_state is fixed so that the sample -- and therefore HIV_2.csv and the
# model trained on it -- is the same on every Restart & Run All.
hiv_df_filtered_inactive = hiv_df[hiv_df['HIV_active'] == 0]
hiv_df_filtered_inactive = hiv_df_filtered_inactive.sample(n=1500, axis=0, random_state=42)
hiv_df_filtered_inactive

Unnamed: 0,smiles,activity,HIV_active
6634,O=C1CCC(N2C(=O)C3C4C=CC(CC4)C3C2=O)C(=O)N1,CI,0
21772,NNC(=O)c1cc(=NNC(=O)CC(=O)Nc2ccccc2Cl)c2ccccc2o1,CI,0
39001,COC(=O)c1cc2c3c(c1)Oc1cc(C(=O)OC)cc4c1C3c1c(cc...,CI,0
4110,O=C(N=NC(=O)Oc1ccccc1)Oc1ccccc1,CI,0
24671,CC1(C)OCC(C2OC(N3OC(CI)CC3c3ccccc3)C3OC(C)(C)O...,CI,0
...,...,...,...
11266,NC(CCSC(F)F)C(=O)O,CI,0
17651,O=S1(=O)c2ccccc2N=C(c2ccccc2Cl)CC1c1ccc(F)cc1,CI,0
26177,CC(=S)NC=C1C(=O)N(c2ccccc2)N=C1C,CI,0
23973,COc1ccc(S(=O)c2ccc(OC)cc2)cc1,CI,0


In [14]:
# Stack the actives on top of the sampled inactives; ignore_index gives the
# combined frame a fresh 0..n-1 index.
hiv_df_sampled = pd.concat(
    (hiv_df_filtered_active, hiv_df_filtered_inactive),
    ignore_index=True,
)
hiv_df_sampled

Unnamed: 0,smiles,activity,HIV_active
0,O=C(O)Cc1ccc(SSc2ccc(CC(=O)O)cc2)cc1,CM,1
1,NNP(=S)(NN)c1ccccc1,CM,1
2,O=Nc1ccc(O)c(N=O)c1O,CM,1
3,Oc1ccc(Cl)cc1C(c1cc(Cl)ccc1O)C(Cl)(Cl)Cl,CM,1
4,NNC(=O)c1ccccc1SSc1ccccc1C(=O)NN,CM,1
...,...,...,...
2938,NC(CCSC(F)F)C(=O)O,CI,0
2939,O=S1(=O)c2ccccc2N=C(c2ccccc2Cl)CC1c1ccc(F)cc1,CI,0
2940,CC(=S)NC=C1C(=O)N(c2ccccc2)N=C1C,CI,0
2941,COc1ccc(S(=O)c2ccc(OC)cc2)cc1,CI,0


In [15]:
# Randomly shuffle rows (frac=1 returns all rows in random order; the seed is fixed).
# NOTE: this rebinds hiv_df_sampled, so re-running only this cell shuffles the
# already-shuffled frame again; re-run from the concat cell to get the same order.
hiv_df_sampled = hiv_df_sampled.sample(frac=1, random_state=42)

In [16]:
# Preview the shuffled frame; the index still shows each row's pre-shuffle position.
hiv_df_sampled.head()

Unnamed: 0,smiles,activity,HIV_active
840,C[N+](C)(C)C.O=C(Nc1ccc(C=Cc2ccc(NC(=O)c3cc(S(...,CM,1
1037,COC(=O)Cc1cc(O)c(CC=C(C)CCC=C(C)C(O)C(=O)C=C(C...,CM,1
2399,CC(=O)C(=Cc1cccc(Cl)c1)C(=O)c1ccccc1,CI,0
678,Cc1cc(C)c(S(=O)(O)=[OH+])c(C)c1.N[S+]1Cc2nc3cc...,CM,1
196,CCOc1cc(C2=NN3C(=S)NNC3=NN2)ccc1O,CM,1


In [17]:
# Persist the balanced, shuffled sample; index=False keeps the shuffled index
# out of the file.
hiv_df_sampled.to_csv('HIV_2.csv', index=False)
# The 'activity' column is still written: an earlier `.drop(['activity'], axis=1)`
# at this point was left disabled.
# Read the file back to confirm what was written (first and last rows).
hiv_df_sampled_2 = pd.read_csv("HIV_2.csv")
hiv_df_sampled_2.head()
hiv_df_sampled_2.tail()

Unnamed: 0,smiles,activity,HIV_active
0,C[N+](C)(C)C.O=C(Nc1ccc(C=Cc2ccc(NC(=O)c3cc(S(...,CM,1
1,COC(=O)Cc1cc(O)c(CC=C(C)CCC=C(C)C(O)C(=O)C=C(C...,CM,1
2,CC(=O)C(=Cc1cccc(Cl)c1)C(=O)c1ccccc1,CI,0
3,Cc1cc(C)c(S(=O)(O)=[OH+])c(C)c1.N[S+]1Cc2nc3cc...,CM,1
4,CCOc1cc(C2=NN3C(=S)NNC3=NN2)ccc1O,CM,1


Unnamed: 0,smiles,activity,HIV_active
2938,COc1cccc(C2c3cc4c(cc3OC3COC(=O)C32)OCO4)c1OC,CI,0
2939,COP(=O)(C=Cc1cc(C(=O)OCc2ccccc2)n(S(=O)(=O)c2c...,CM,1
2940,O=C(CCc1ccc(O)cc1)NCCc1ccc(O)cc1,CM,1
2941,CCc1c(Cc2cc(C)cc(C)c2)n(COCCCO)c(=O)[nH]c1=O,CA,1
2942,CCSCCCCCCCCCCC(=O)OCC1OC(n2cc(C)c(=O)[nH]c2=O)...,CA,1


In [21]:
# Chemprop training configuration, written as command-line style tokens and
# parsed into a TrainArgs object. See the Chemprop 1.x documentation for the
# exact meaning and defaults of each flag.
arguments = [
    '--data_path', 'HIV_2.csv',                         # CSV written by the previous cell
    '--dataset_type', 'classification',                 # binary target (HIV_active)
    '--save_dir', 'test_checkpoints_multimolecule',     # checkpoints reused by the prediction cells
    '--epochs', '30',
    '--save_smiles_splits',
    '--quiet',
    '--batch_size', '64',
    '--ignore_columns', 'activity',                     # 'activity' is in the CSV but must not be a target
    '--depth', '5',
    '--hidden_size', '300'
]

args = chemprop.args.TrainArgs().parse_args(arguments)

In [None]:
# Train and evaluate with Chemprop; returns the mean and standard deviation of
# the score across folds. The saved run shows a single fold ("Fold 0").
# This is the expensive cell of the notebook.
mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

2943it [00:00, 160783.57it/s]
100%|██████████| 2943/2943 [00:00<00:00, 102512.49it/s]
100%|██████████| 2943/2943 [00:01<00:00, 2585.24it/s]
Fold 0
2943it [00:00, 210980.51it/s]
  0%|          | 0/30 [00:00<?, ?it/s]
  0%|          | 0/37 [00:00<?, ?it/s][A
  3%|▎         | 1/37 [00:01<00:48,  1.36s/it][A
  5%|▌         | 2/37 [00:01<00:31,  1.12it/s][A
  8%|▊         | 3/37 [00:02<00:24,  1.38it/s][A
 11%|█         | 4/37 [00:03<00:26,  1.25it/s][A
 14%|█▎        | 5/37 [00:04<00:26,  1.23it/s][A
 16%|█▌        | 6/37 [00:05<00:25,  1.21it/s][A
 19%|█▉        | 7/37 [00:05<00:21,  1.37it/s][A
 22%|██▏       | 8/37 [00:06<00:20,  1.43it/s][A
 24%|██▍       | 9/37 [00:06<00:18,  1.52it/s][A
 27%|██▋       | 10/37 [00:07<00:18,  1.44it/s][A
 30%|██▉       | 11/37 [00:08<00:17,  1.50it/s][A
 32%|███▏      | 12/37 [00:08<00:16,  1.54it/s][A
 35%|███▌      | 13/37 [00:09<00:14,  1.62it/s][A
 38%|███▊      | 14/37 [00:10<00:14,  1.58it/s][A
 41%|████      | 15/37 [00:11<00:17, 

In [23]:
# Show the cross-validation result as (mean, std); a std of 0.0 is what a
# single-fold run produces.
mean_score, std_score

(0.8494890914112123, 0.0)

In [24]:
# Load the BBBP dataset; BBBP.csv must be present in the working directory.
bp_df = pd.read_csv("BBBP.csv")
bp_df.head()

Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [25]:
# Last rows of BBBP, to see the extent of the index and the 'num' column.
bp_df.tail()

Unnamed: 0,num,name,p_np,smiles
2045,2049,licostinel,1,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl
2046,2050,ademetionine(adenosyl-methionine),1,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...
2047,2051,mesocarb,1,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...
2048,2052,tofisoline,1,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...
2049,2053,azidamfenicol,1,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...


In [26]:
# Write a SMILES-only copy of BBBP for prediction: the id, name and label
# columns are removed and the index is not written.
bp_df.drop(columns=['num', 'name', 'p_np']).to_csv('BBBP_2.csv', index=False)

In [27]:
# Read the SMILES-only file back and show its first and last rows.
bp_df_2 = pd.read_csv("BBBP_2.csv")
bp_df_2.head()
bp_df_2.tail()

Unnamed: 0,smiles
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


Unnamed: 0,smiles
2045,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl
2046,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...
2047,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...
2048,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...
2049,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...


In [28]:
# Score the BBBP molecules with the checkpoint(s) stored under
# test_checkpoints_multimolecule and write the results to BBBP_preds.csv.
# NOTE(review): that directory is the save_dir of the HIV training cell, so the
# predicted column is HIV_active -- these are HIV-activity predictions for BBBP
# molecules, not BBBP predictions.
arguments = [
    '--test_path', 'BBBP_2.csv',
    '--preds_path', 'BBBP_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


2050it [00:00, 155975.82it/s]
100%|██████████| 2050/2050 [00:00<00:00, 97967.61it/s]

Validating SMILES



[19:10:29] Explicit valence for atom # 1 N, 4, is greater than permitted
[19:10:29] Explicit valence for atom # 6 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 6 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 11 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 12 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[19:10:30] Explicit valence for atom # 5 N, 4, is greater than permitted


Test size = 2,039


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".



  0%|          | 0/41 [00:00<?, ?it/s][A
  2%|▏         | 1/41 [00:02<01:49,  2.74s/it][A
  5%|▍         | 2/41 [00:02<00:49,  1.26s/it][A
  7%|▋         | 3/41 [00:03<00:32,  1.16it/s][A
 10%|▉         | 4/41 [00:03<00:21,  1.70it/s][A
 12%|█▏        | 5/41 [00:03<00:17,  2.12it/s][A
 15%|█▍        | 6/41 [00:04<00:13,  2.62it/s][A
 17%|█▋        | 7/41 [00:04<00:10,  3.31it/s][A
 20%|█▉        | 8/41 [00:04<00:09,  3.43it/s][A
 22%|██▏       | 9/41 [00:04<00:09,  3.50it/s][A
 24%|██▍       | 10/41 [00:04<00:07,  4.19it/s][A
 27%|██▋       | 11/41 [00:05<00:08,  3.60it/s][A
 29%|██▉       | 12/41 [00:05<00:07,  4.06it/s][A
 32%|███▏      | 13/41 [00:05<00:06,  4.23it/s][A
 34%|███▍      | 14/41 [00:05<00:05,  4.65it/s][A
 37%|███▋      | 15/41 [00:05<00:05,  4.61it/s][A
 39%|███▉      | 16/41 [00:06<00:05,  4.52it/s][A
 41%|████▏     | 17/41 [00:06<00:07,  3.41it/s][A
 44%|████▍     | 18/41 [00:06<00:07,  3.28it/s][A
 46%|████▋     | 19/41 [00:07<00:09,  2.39it/s]

Saving predictions to BBBP_preds.csv
Elapsed time = 0:00:15





In [29]:
# Load the predictions Chemprop wrote for the BBBP molecules.
bp_preds_df = pd.read_csv("BBBP_preds.csv")
bp_preds_df.head()

Unnamed: 0,smiles,HIV_active
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.1584082841873169
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,0.3171484768390655
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,0.2872501015663147
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,0.123438648879528
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,0.6773192286491394


In [30]:
# NOTE(review): this cell repeats the previous one verbatim. Re-running it
# resets bp_preds_df to the raw file contents, discarding any later filtering.
bp_preds_df = pd.read_csv("BBBP_preds.csv")
bp_preds_df.head()

Unnamed: 0,smiles,HIV_active
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.1584082841873169
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,0.3171484768390655
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,0.2872501015663147
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,0.123438648879528
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,0.6773192286491394


In [31]:
# Last rows of the BBBP predictions.
bp_preds_df.tail()

Unnamed: 0,smiles,HIV_active
2045,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl,0.026406118646264
2046,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...,0.1512627899646759
2047,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...,0.2714560031890869
2048,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...,0.73172926902771
2049,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...,0.257811039686203


In [32]:
# describe() reports count/unique/top/freq here instead of numeric statistics
# because the prediction column also holds the text "Invalid SMILES", which
# makes pandas read it as strings.
bp_preds_df.describe()

Unnamed: 0,smiles,HIV_active
count,2050,2050
unique,2050,2007
top,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,Invalid SMILES
freq,1,11


In [33]:
# Remove the rows whose prediction is the text "Invalid SMILES".
# .copy() makes the result an independent frame: later cells assign columns on
# bp_preds_df, which on a bare boolean-mask slice raises pandas'
# SettingWithCopyWarning.
bp_preds_df = bp_preds_df[bp_preds_df['HIV_active'] != "Invalid SMILES"].copy()
bp_preds_df.describe()

Unnamed: 0,smiles,HIV_active
count,2039,2039.0
unique,2039,2006.0
top,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.0545984283089637
freq,1,3.0


In [34]:
# Convert the prediction column from strings to floats. assign() builds a new
# frame instead of writing into bp_preds_df in place, so this does not raise
# SettingWithCopyWarning when bp_preds_df is a filtered slice.
bp_preds_df = bp_preds_df.assign(HIV_active=bp_preds_df['HIV_active'].astype(float))

In [35]:
# Binary call: 1 when the predicted probability is strictly above 0.8, else 0.
bp_preds_df['HIV_active_2'] = (bp_preds_df['HIV_active'] > 0.8).astype('int64')
bp_preds_df.head()


Unnamed: 0,smiles,HIV_active,HIV_active_2
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.158408,0
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,0.317148,0
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,0.28725,0
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,0.123439,0
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,0.677319,0


In [36]:
# Numeric summary after the cast; the mean of HIV_active_2 is the fraction of
# molecules called active at the 0.8 cut-off.
bp_preds_df.describe()

Unnamed: 0,HIV_active,HIV_active_2
count,2039.0,2039.0
mean,0.377255,0.153016
std,0.314573,0.360091
min,1.1e-05,0.0
25%,0.087381,0.0
50%,0.293052,0.0
75%,0.64571,0.0
max,0.999701,1.0


In [37]:
# Filter rows where 'target_column' is equal to 1
bp_preds_df_filtered = bp_preds_df[bp_preds_df['HIV_active_2'] == 1]
bp_preds_df_filtered


Unnamed: 0,smiles,HIV_active,HIV_active_2
6,CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@...,0.918768,1
11,CC1=CN([C@H]2C[C@H](F)[C@@H](CO)O2)C(=O)NC1=O,0.997787,1
36,CN(C)c1cc(C2=NC(N)=NN2)ccn1,0.900462,1
37,CN1CCN(CCCN2c3ccccc3Sc4ccc(Cl)cc24)CC1,0.823085,1
45,ClC1=CC=C(C2=C1)SC3=C(N2CCCNC)C=CC=C3,0.970116,1
...,...,...,...
1993,C1=C(C(N)=O)N=N[N]1CC2=C(C=CC=C2F)F,0.990616,1
2011,[C@]14([C@](OC(=O)CC)([C@@H](CC1C3[C@@](F)(C2(...,0.914451,1
2029,[C@H]13N([C@H](CC(C1)NC(C2=CC=CC(=C2OC)OC)=O)C...,0.869529,1
2033,C(C(C(CC)C)C(N)=O)C,0.964841,1


In [39]:
# SMILES strings of the predicted actives, as a plain Python list for isin().
smiles_to_check = list(bp_preds_df_filtered['smiles'])

In [40]:
# Leakage check: were any of the predicted actives part of the training sample?
# isin() compares SMILES as exact strings, so the same molecule written with a
# different SMILES spelling would not be found.
hiv_df_sampled_2[hiv_df_sampled_2['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active


In [41]:
# Same exact-string lookup against the full HIV dataset, which also shows the
# known label of any match.
hiv_df[hiv_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active
1046,CCC(C)C(CC)C(=O)NC(N)=O,CI,0


In [42]:
# Map the predicted actives back to the original BBBP rows to see their names.
bp_df[bp_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,num,name,p_np,smiles
6,7,rolitetracycline,1,CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@...
11,12,alovudine,1,CC1=CN([C@H]2C[C@H](F)[C@@H](CO)O2)C(=O)NC1=O
36,37,11a,0,CN(C)c1cc(C2=NC(N)=NN2)ccn1
37,38,prochlorperazine,1,CN1CCN(CCCN2c3ccccc3Sc4ccc(Cl)cc24)CC1
45,46,Nor-1-chlorpromazine,1,ClC1=CC=C(C2=C1)SC3=C(N2CCCNC)C=CC=C3
...,...,...,...,...
1993,1997,rufinamide,1,C1=C(C(N)=O)N=N[N]1CC2=C(C=CC=C2F)F
2011,2015,ticabesone(ticabesone-propionate),1,[C@]14([C@](OC(=O)CC)([C@@H](CC1C3[C@@](F)(C2(...
2029,2033,tropapride,1,[C@H]13N([C@H](CC(C1)NC(C2=CC=CC(=C2OC)OC)=O)C...
2033,2037,valnoctamide,1,C(C(C(CC)C)C(N)=O)C


In [45]:
# Load the substances file (zinc_id + smiles); substances.csv must be present
# in the working directory.
sub_df = pd.read_csv("substances.csv")
sub_df.head()

Unnamed: 0,zinc_id,smiles
0,ZINC000000000027,N[C@@H](CCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
1,ZINC000016090786,N[C@H](CCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
2,ZINC000001763088,N[C@H](CCCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
3,ZINC000002033385,N[C@@H](CCCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
4,ZINC000000001673,N[C@@H](Cc1ccc(N(CCCl)CCCl)cc1)C(=O)O


In [46]:
# Last rows of the substances table.
sub_df.tail()

Unnamed: 0,zinc_id,smiles
46,ZINC000196349655,O=C(O)CCSc1ccc(N(CCCl)CCCl)cc1
47,ZINC000064454242,N=NCCCc1ccc(N(CCCl)CCCl)cc1
48,ZINC000005161807,O=C(O)C/C=C/c1ccc(N(CCCl)CCCl)cc1
49,ZINC000001682294,O=C(O)CCOc1ccc(N(CCCl)CCCl)cc1
50,ZINC000079564304,O=C(O)CNC(=O)c1ccc(N(CCCl)CCCl)cc1


In [47]:
# Column dtypes and non-null counts, to confirm there are no missing SMILES.
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   zinc_id  51 non-null     object
 1   smiles   51 non-null     object
dtypes: object(2)
memory usage: 944.0+ bytes


In [48]:
# Score the molecules in substances.csv with the trained checkpoint(s) and write
# the results to substances_preds.csv. The file has an extra zinc_id column, so
# the SMILES column is named explicitly.
arguments = [
    '--test_path', 'substances.csv',
    '--preds_path', 'substances_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


51it [00:00, 58445.22it/s]
100%|██████████| 51/51 [00:00<00:00, 82178.07it/s]


Validating SMILES
Test size = 51


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  2.80it/s][A
100%|██████████| 1/1 [00:01<00:00,  1.15s/it]

Saving predictions to substances_preds.csv
Elapsed time = 0:00:01





In [49]:
# Load the FDA-approved compounds file (zinc_id + smiles); fda_approved.csv must
# be present in the working directory.
fda_df = pd.read_csv("fda_approved.csv")
fda_df.head()

Unnamed: 0,zinc_id,smiles
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...
4,ZINC000000008492,Oc1cccc2cccnc12


In [50]:
# Score the FDA-approved compounds with the trained checkpoint(s) and write the
# results to fda_approved_preds.csv.
arguments = [
    '--test_path', 'fda_approved.csv',
    '--preds_path', 'fda_approved_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


892it [00:00, 168179.41it/s]
100%|██████████| 892/892 [00:00<00:00, 135300.13it/s]

Validating SMILES





Test size = 892


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:02<00:36,  2.17s/it][A
 22%|██▏       | 4/18 [00:02<00:06,  2.07it/s][A
 50%|█████     | 9/18 [00:02<00:01,  5.19it/s][A
 89%|████████▉ | 16/18 [00:02<00:00, 10.96it/s][A
100%|██████████| 1/1 [00:03<00:00,  3.72s/it]

Saving predictions to fda_approved_preds.csv
Elapsed time = 0:00:04





In [51]:
# Load the predictions Chemprop wrote for the FDA-approved compounds.
fda_preds_df = pd.read_csv("fda_approved_preds.csv")
fda_preds_df.head()

Unnamed: 0,zinc_id,smiles,HIV_active
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O,0.181465
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0.350455
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1,0.197745
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...,0.365962
4,ZINC000000008492,Oc1cccc2cccnc12,0.188779


In [52]:
# Drop rows whose prediction is the text "Invalid SMILES", make the prediction
# column numeric and add a binary call at a 0.8 probability cut-off.
# .copy() makes the filtered frame independent, so the two column assignments
# below do not raise pandas' SettingWithCopyWarning.
fda_preds_df = fda_preds_df[fda_preds_df['HIV_active'] != "Invalid SMILES"].copy()
fda_preds_df.describe()
fda_preds_df['HIV_active'] = fda_preds_df['HIV_active'].astype(float)
# Vectorised comparison: 1 when strictly above 0.8, otherwise 0.
fda_preds_df['HIV_active_2'] = (fda_preds_df['HIV_active'] > 0.8).astype('int64')
fda_preds_df.head()

Unnamed: 0,HIV_active
count,892.0
mean,0.314126
std,0.154811
min,0.008633
25%,0.211279
50%,0.269859
75%,0.38151
max,0.978216


Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O,0.181465,0
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0.350455,0
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1,0.197745,0
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...,0.365962,0
4,ZINC000000008492,Oc1cccc2cccnc12,0.188779,0


In [53]:
# Keep only the FDA-approved compounds called active ('HIV_active_2' equal to 1).
fda_preds_df_filtered = fda_preds_df.loc[fda_preds_df['HIV_active_2'].eq(1)]
fda_preds_df_filtered

Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
47,ZINC000003813010,O=c1[nH]c(=O)n([C@H]2C[C@H](O)[C@@H](CO)O2)cc1F,0.929885,1
81,ZINC000003818726,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,0.899193,1
94,ZINC000068153186,CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2...,0.881334,1
165,ZINC000003831490,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...,0.919778,1
181,ZINC000003806262,OC[C@H]1O[C@@H](n2cnc3c2N=CNC[C@H]3O)C[C@@H]1O,0.856103,1
677,ZINC000000012346,Nc1ccn([C@@H]2CS[C@H](CO)O2)c(=O)n1,0.854706,1
701,ZINC000003813061,CCCCNc1cc(C(=O)O)cc(S(N)(=O)=O)c1Oc1ccccc1,0.843766,1
715,ZINC000169289767,Cc1cc(-c2ccc(/N=N/c3c(S(=O)(=O)O)cc4cc(S(=O)(=...,0.978216,1
819,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O,0.950161,1
846,ZINC000096272772,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c...,0.809559,1


In [55]:
# SMILES strings of the predicted-active FDA compounds, printed in full
# (the table view truncates long strings).
# NOTE(review): the saved output of this cell lists 11 SMILES while the saved
# filtered table shows 10 rows, so the two cells were evidently run against
# different state. Re-run the notebook top to bottom before relying on it.
smiles_to_check = fda_preds_df_filtered['smiles'].to_list()
print(f"smiles to check: {smiles_to_check}")

smiles to check: ['O=c1[nH]c(=O)n([C@H]2C[C@H](O)[C@@H](CO)O2)cc1F', 'O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO', 'CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2F)c(-c2ccnc(N)n2)s1', 'O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O', 'OC[C@H]1O[C@@H](n2cnc3c2N=CNC[C@H]3O)C[C@@H]1O', 'Nc1ccn([C@@H]2CS[C@H](CO)O2)c(=O)n1', 'CCCCNc1cc(C(=O)O)cc(S(N)(=O)=O)c1Oc1ccccc1', 'Cc1cc(-c2ccc(/N=N/c3c(S(=O)(=O)O)cc4cc(S(=O)(=O)O)cc(N)c4c3O)c(C)c2)ccc1/N=N/c1c(S(=O)(=O)O)cc2cc(S(=O)(=O)O)cc(N)c2c1O', 'Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O', 'Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c(OC(C)C)cc1C1CCNCC1', 'Cc1cn([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO)O2)c(=O)[nH]c1=O']


In [56]:
# Leakage check against the training sample (exact SMILES string match only).
hiv_df_sampled_2[hiv_df_sampled_2['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active


In [57]:
# Same exact-string lookup against the full HIV dataset.
hiv_df[hiv_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active


In [58]:
# Same exact-string lookup against the BBBP dataset.
bp_df[bp_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,num,name,p_np,smiles


In [59]:
# Map the predicted actives back to the FDA table to recover their ZINC ids.
fda_df[fda_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,zinc_id,smiles
47,ZINC000003813010,O=c1[nH]c(=O)n([C@H]2C[C@H](O)[C@@H](CO)O2)cc1F
81,ZINC000003818726,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO
94,ZINC000068153186,CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2...
165,ZINC000003831490,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...
181,ZINC000003806262,OC[C@H]1O[C@@H](n2cnc3c2N=CNC[C@H]3O)C[C@@H]1O
677,ZINC000000012346,Nc1ccn([C@@H]2CS[C@H](CO)O2)c(=O)n1
701,ZINC000003813061,CCCCNc1cc(C(=O)O)cc(S(N)(=O)=O)c1Oc1ccccc1
715,ZINC000169289767,Cc1cc(-c2ccc(/N=N/c3c(S(=O)(=O)O)cc4cc(S(=O)(=...
819,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O
846,ZINC000096272772,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c...


In [121]:
!wget https://zinc15.docking.org/substances/subsets/named.csv

--2024-03-10 05:58:28--  https://zinc15.docking.org/substances/subsets/named.csv
Resolving zinc15.docking.org (zinc15.docking.org)... 169.230.75.4
Connecting to zinc15.docking.org (zinc15.docking.org)|169.230.75.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘named.csv.1’

named.csv.1             [ <=>                ]   9.28K  --.-KB/s    in 0.04s   

2024-03-10 05:58:29 (242 KB/s) - ‘named.csv.1’ saved [9499]



In [123]:
# Load and preview the ZINC "named" subset. This needs a file called exactly
# named.csv in the working directory; the saved run failed with
# FileNotFoundError because no file of that name existed when the cell ran.
zinc_df = pd.read_csv("named.csv")
zinc_df.head()
zinc_df.tail()

FileNotFoundError: [Errno 2] No such file or directory: 'named.csv'

In [72]:
# Score the ZINC "named" subset with the trained checkpoint(s) and write the
# results to named_preds.csv.
# NOTE(review): the execution counts (In [72] here, In [121] / In [123] for the
# download and read cells) show these cells were run out of order; the saved
# output here comes from an earlier copy of named.csv.
arguments = [
    '--test_path', 'named.csv',
    '--preds_path', 'named_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


100it [00:00, 59764.95it/s]
100%|██████████| 100/100 [00:00<00:00, 66905.47it/s]


Validating SMILES
Test size = 100


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.59it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.01it/s]

Saving predictions to named_preds.csv
Elapsed time = 0:00:01





In [75]:
# Load the ZINC predictions, drop rows whose prediction is the text
# "Invalid SMILES", make the prediction column numeric and add a binary call.
zinc_preds_df = pd.read_csv("named_preds.csv")
zinc_preds_df.head()
# .copy() makes the filtered frame independent, so the column assignments below
# do not raise pandas' SettingWithCopyWarning.
zinc_preds_df = zinc_preds_df[zinc_preds_df['HIV_active'] != "Invalid SMILES"].copy()
zinc_preds_df.describe()
zinc_preds_df['HIV_active'] = zinc_preds_df['HIV_active'].astype(float)
# NOTE(review): the cut-off here is 0.5, whereas the BBBP and FDA cells use 0.8;
# confirm the difference is intentional before comparing hit counts.
zinc_preds_df['HIV_active_2'] = (zinc_preds_df['HIV_active'] > 0.5).astype('int64')
zinc_preds_df.head()

Unnamed: 0,zinc_id,smiles,HIV_active
0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,0.359023
1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,0.395612
2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,0.376613
3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,0.419707
4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,0.459242


Unnamed: 0,HIV_active
count,100.0
mean,0.410988
std,0.03622
min,0.312791
25%,0.392779
50%,0.407688
75%,0.434701
max,0.503582


Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,0.359023,0
1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,0.395612,0
2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,0.376613,0
3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,0.419707,0
4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,0.459242,0


In [77]:
# Keep only the ZINC compounds called active ('HIV_active_2' equal to 1).
zinc_preds_df_filtered = zinc_preds_df.loc[zinc_preds_df['HIV_active_2'].eq(1)]
zinc_preds_df_filtered

Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
72,ZINC000001680645,Nc1cccc2cc(S(=O)(=O)O)ccc12,0.503582,1


In [44]:
# Colab-only: mount Google Drive at /content/drive so files can be copied out
# of the runtime. This import is unavailable outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
!mkdir '/content/drive/My Drive/Chemprop_Backup_HIV/'

In [70]:
# Show the working directory that the relative CSV paths in this notebook resolve against.
!pwd

/content


In [46]:
# List the working directory to see which CSVs and checkpoint folders exist.
!ls -al

total 2748
drwxr-xr-x 1 root root    4096 Mar 10 19:17 .
drwxr-xr-x 1 root root    4096 Mar 10 18:47 ..
-rw-r--r-- 1 root root  107579 Mar 10 19:10 BBBP_2.csv
-rw-r--r-- 1 root root  148743 Mar 10 19:10 BBBP.csv
-rw-r--r-- 1 root root  149692 Mar 10 19:10 BBBP_preds.csv
drwxr-xr-x 4 root root    4096 Mar  7 14:31 .config
drwx------ 6 root root    4096 Mar 10 19:17 drive
-rw-r--r-- 1 root root  174928 Mar 10 18:51 HIV_2.csv
-rw-r--r-- 1 root root 2193844 Mar 10 18:49 HIV.csv
drwxr-xr-x 2 root root    4096 Mar 10 18:49 .ipynb_checkpoints
drwxr-xr-x 1 root root    4096 Mar  7 14:32 sample_data
drwxr-xr-x 3 root root    4096 Mar 10 19:03 test_checkpoints_multimolecule


In [48]:
# Copy the CSV files in /content to the Drive backup folder.
# NOTE(review): the glob matches *.csv only, so the trained model under
# test_checkpoints_multimolecule is not backed up -- confirm that is intended.
!cp -r /content/*.csv "/content/drive/My Drive/Chemprop_Backup_HIV/"