# Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Mar 11 09:40:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [None]:
!pip install chemprop
!pip install rdkit-pypi  # should be included in above after Chemprop v1.6 release

import chemprop
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

Collecting chemprop
  Downloading chemprop-1.6.1-py3-none-any.whl (166 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/166.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m122.9/166.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pandas-flavor>=0.2.0 (from chemprop)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Collecting tensorboardX>=2.0 (from chemprop)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting typed-argument-parser>=1.6.1 (from chemprop)
  Downloading typed-argument-parser-1.9.0.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m6.8 MB/s

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
hiv_df = pd.read_csv("HIV.csv")
hiv_df.head()

Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0


In [None]:
hiv_df.describe()

Unnamed: 0,HIV_active
count,41127.0
mean,0.035086
std,0.184001
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [None]:
unique_values = hiv_df['HIV_active'].unique()
print(f"Unique values in 'HIV_active': {unique_values}")

Unique values in 'HIV_active': [0 1]


In [None]:
unique_values = hiv_df['smiles'].unique()
print(f"Unique values in 'smiles': {unique_values}")
print(f"length of uniqe value: {len(unique_values)}")

Unique values in 'smiles': ['CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)=[O+]2'
 'C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3)CC(c3ccccc3)=[O+]2)[O+]=C(c2ccccc2)C1'
 'CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21' ...
 'Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C)CC4C3C2=O)cc1'
 'Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C)CC4C3C2=O)c1'
 'CCCCCC=C(c1cc(Cl)c(OC)c(-c2nc(C)no2)c1)c1cc(Cl)c(OC)c(-c2nc(C)no2)c1']
length of uniqe value: 41127


In [None]:
# Filter rows where 'your_column' is not equal to 1 or 0
filtered_df = hiv_df[(hiv_df['HIV_active'] != 1) & (hiv_df['HIV_active'] != 0)]
filtered_df

Unnamed: 0,smiles,activity,HIV_active


In [None]:
# Filter rows where 'target_column' is equal to 1h
hiv_df_filtered_active = hiv_df[hiv_df['HIV_active'] == 1]
hiv_df_filtered_active

Unnamed: 0,smiles,activity,HIV_active
11,O=C(O)Cc1ccc(SSc2ccc(CC(=O)O)cc2)cc1,CM,1
16,NNP(=S)(NN)c1ccccc1,CM,1
80,O=Nc1ccc(O)c(N=O)c1O,CM,1
203,Oc1ccc(Cl)cc1C(c1cc(Cl)ccc1O)C(Cl)(Cl)Cl,CM,1
234,NNC(=O)c1ccccc1SSc1ccccc1C(=O)NN,CM,1
...,...,...,...
41090,Cc1cn(COCCCOCC(=O)c2ccccc2)c(=O)[nH]c1=O,CM,1
41092,Cc1cn(C2CC3C(COC(CCC[Se]c4ccccc4)N3O)O2)c(=O)[...,CM,1
41093,Cc1cn(C2CC3C(COC(CCCC[Se]c4ccccc4)N3O)O2)c(=O)...,CM,1
41098,Cc1cn(C2CC3C(COC(CC[Se]C#N)N3O)O2)c(=O)[nH]c1=O,CM,1


In [None]:
# Filter rows where 'target_column' is equal to 1h
hiv_df_filtered_inactive = hiv_df[hiv_df['HIV_active'] == 0]
hiv_df_filtered_inactive = hiv_df_filtered_inactive.sample(n=1500, axis=0, random_state=42)
hiv_df_filtered_inactive

Unnamed: 0,smiles,activity,HIV_active
2428,O=C1c2ccccc2-c2nc3ccccc3nc21,CI,0
6197,O=C(CSc1cc(-c2ccc(Cl)cc2)s[s+]1)c1ccccc1,CI,0
17138,O=C(C=Nc1ccccc1C(=O)O)c1ccco1,CI,0
12261,CCCCCCCCCCCCCCCCCC[N+](C)(C)Cc1ccc(C[N+](C)(C)...,CI,0
3588,N#CSC1CCCCCCC1SC#N,CI,0
...,...,...,...
18477,CC(=O)OC1(C#N)CC2OC1C1C2N1C(=O)OC(C)(C)C,CI,0
1189,CCOC(=O)C1Cc2cc(C)c(C)cc2N(C)C1=O,CI,0
36657,CCOC(=O)N1CCN(c2ccc3c(C)cc(C)nc3n2)CC1,CI,0
27919,CN(C)C=Nc1ccc2c3c(cccc13)-c1ccccc1-2,CI,0


In [None]:
hiv_df_sampled = pd.concat([hiv_df_filtered_active, hiv_df_filtered_inactive], axis=0, ignore_index=True)
hiv_df_sampled

Unnamed: 0,smiles,activity,HIV_active
0,O=C(O)Cc1ccc(SSc2ccc(CC(=O)O)cc2)cc1,CM,1
1,NNP(=S)(NN)c1ccccc1,CM,1
2,O=Nc1ccc(O)c(N=O)c1O,CM,1
3,Oc1ccc(Cl)cc1C(c1cc(Cl)ccc1O)C(Cl)(Cl)Cl,CM,1
4,NNC(=O)c1ccccc1SSc1ccccc1C(=O)NN,CM,1
...,...,...,...
2938,CC(=O)OC1(C#N)CC2OC1C1C2N1C(=O)OC(C)(C)C,CI,0
2939,CCOC(=O)C1Cc2cc(C)c(C)cc2N(C)C1=O,CI,0
2940,CCOC(=O)N1CCN(c2ccc3c(C)cc(C)nc3n2)CC1,CI,0
2941,CN(C)C=Nc1ccc2c3c(cccc13)-c1ccccc1-2,CI,0


In [None]:
# Randomly shuffle rows
hiv_df_sampled = hiv_df_sampled.sample(frac=1, random_state=42)

In [None]:
hiv_df_sampled.head()

Unnamed: 0,smiles,activity,HIV_active
240,Cc1cc2c(c(=O)o1)C1=S(SC(c3ccccc3)=C1)S2,CM,1
2325,N#CN1CCC=C(c2cc3ccccc3[nH]2)C1,CI,0
1676,CCC1SC(C)C(=O)NC1=O,CI,0
1952,O=C1CC2(CCN(Cc3ccccc3)CC2)CC(=O)N1,CI,0
677,CC(=O)OC1SC(c2c(F)cccc2F)n2c1nc1ccccc12,CM,1


In [None]:
hiv_df_sampled.to_csv('HIV_2.csv', index=False)
# .drop(['activity'], axis=1).
hiv_df_sampled_2 = pd.read_csv("HIV_2.csv")
hiv_df_sampled_2.head()
hiv_df_sampled_2.tail()

Unnamed: 0,smiles,activity,HIV_active
0,Cc1cc2c(c(=O)o1)C1=S(SC(c3ccccc3)=C1)S2,CM,1
1,N#CN1CCC=C(c2cc3ccccc3[nH]2)C1,CI,0
2,CCC1SC(C)C(=O)NC1=O,CI,0
3,O=C1CC2(CCN(Cc3ccccc3)CC2)CC(=O)N1,CI,0
4,CC(=O)OC1SC(c2c(F)cccc2F)n2c1nc1ccccc12,CM,1


Unnamed: 0,smiles,activity,HIV_active
2938,O=C(CS)Nc1cccc(O)c1,CI,0
2939,O=C(Nc1ccc(N=Nc2ccc(S(=O)(=O)O)cc2)cc1)c1ccc(N...,CM,1
2940,NC(=O)CCN(CCC(N)=O)CCC(N)=O,CI,0
2941,Cn1nc2ccccc2cc1=O,CI,0
2942,C[n+]1c(C=NNC(=O)c2ccc(C(=O)NN=Cc3cn4ccccc4[n+...,CI,0


In [None]:
arguments = [
    '--data_path', 'HIV.csv',
    '--dataset_type', 'classification',
    '--save_dir', 'test_checkpoints_multimolecule',
    '--epochs', '30',
    '--save_smiles_splits',
    '--quiet',
    '--batch_size', '64',
    '--ignore_columns', 'activity',
    '--depth', '5',
    '--hidden_size', '1600'
]

args = chemprop.args.TrainArgs().parse_args(arguments)

In [None]:
mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 69%|██████▊   | 354/515 [01:25<00:42,  3.82it/s][A
 69%|██████▉   | 355/515 [01:25<00:37,  4.24it/s][A
 69%|██████▉   | 357/515 [01:25<00:25,  6.14it/s][A
 70%|██████▉   | 359/515 [01:25<00:20,  7.71it/s][A
 70%|███████   | 361/515 [01:25<00:23,  6.49it/s][A
 70%|███████   | 362/515 [01:26<00:38,  3.94it/s][A
 70%|███████   | 363/515 [01:26<00:34,  4.37it/s][A
 71%|███████   | 365/515 [01:26<00:25,  5.90it/s][A
 71%|███████   | 366/515 [01:27<00:23,  6.43it/s][A
 71%|███████▏  | 368/515 [01:28<00:42,  3.49it/s][A
 72%|███████▏  | 370/515 [01:28<00:36,  3.94it/s][A
 72%|███████▏  | 371/515 [01:28<00:35,  4.04it/s][A
 72%|███████▏  | 373/515 [01:28<00:26,  5.43it/s][A
 73%|███████▎  | 375/515 [01:28<00:21,  6.51it/s][A
 73%|███████▎  | 376/515 [01:29<00:27,  5.00it/s][A
 73%|███████▎  | 377/515 [01:29<00:24,  5.57it/s][A
 73%|███████▎  | 378/515 [01:30<00:36,  3.75it/s][A
 74%|███████▎  | 379/515 [01:30<00

In [14]:
mean_score, std_score

(0.7778557998980004, 0.0)

In [15]:
bp_df = pd.read_csv("BBBP.csv")
bp_df.head()

Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [16]:
bp_df.tail()

Unnamed: 0,num,name,p_np,smiles
2045,2049,licostinel,1,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl
2046,2050,ademetionine(adenosyl-methionine),1,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...
2047,2051,mesocarb,1,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...
2048,2052,tofisoline,1,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...
2049,2053,azidamfenicol,1,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...


In [17]:
bp_df.drop(['num', 'name', 'p_np'], axis=1).to_csv('BBBP_2.csv', index=False)

In [18]:
bp_df_2 = pd.read_csv("BBBP_2.csv")
bp_df_2.head()
bp_df_2.tail()

Unnamed: 0,smiles
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


Unnamed: 0,smiles
2045,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl
2046,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...
2047,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...
2048,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...
2049,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...


In [19]:
arguments = [
    '--test_path', 'BBBP_2.csv',
    '--preds_path', 'BBBP_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


2050it [00:00, 226021.85it/s]
100%|██████████| 2050/2050 [00:00<00:00, 146406.77it/s]

Validating SMILES



[07:04:30] Explicit valence for atom # 1 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 6 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 6 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 11 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 12 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted
[07:04:30] Explicit valence for atom # 5 N, 4, is greater than permitted


Test size = 2,039


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/41 [00:00<?, ?it/s][A
  2%|▏         | 1/41 [00:02<01:21,  2.04s/it][A
 10%|▉         | 4/41 [00:02<00:22,  1.63it/s][A
 22%|██▏       | 9/41 [00:03<00:08,  3.57it/s][A
 29%|██▉       | 12/41 [00:03<00:06,  4.39it/s][A
 37%|███▋      | 15/41 [00:03<00:04,  6.19it/s][A
 41%|████▏     | 17/41 [00:04<00:04,  5.47it/s][A
 49%|████▉     | 20/41 [00:04<00:03,  6.22it/s][A
 56%|█████▌    | 23/41 [00:04<00:02,  8.27it/s][A
 61%|██████    | 25/41 [00:05<00:02,  7.12it/s][A
 68%|██████▊   | 28/41 [00:05<00:01,  8.08it/s][A
 80%|████████  | 33/41 [00:05<00:00, 10.42it/s][A
 88%|████████▊ | 36/41 [00:05<00:00, 12.54it/s][A
100%|██████████| 1/1 [00:06<00:00,  6.40s/it]

Saving predictions to BBBP_preds.csv
Elapsed time = 0:00:07





In [20]:
bp_preds_df = pd.read_csv("BBBP_preds.csv")
bp_preds_df.head()

Unnamed: 0,smiles,HIV_active
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.0214727353304624
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,0.0014868687139824
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,0.0369284190237522
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,0.0009061084710992
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,0.0067095081321895


In [21]:
bp_preds_df.tail()

Unnamed: 0,smiles,HIV_active
2045,C1=C(Cl)C(=C(C2=C1NC(=O)C(N2)=O)[N+](=O)[O-])Cl,0.0041548567824065
2046,[C@H]3([N]2C1=C(C(=NC=N1)N)N=C2)[C@@H]([C@@H](...,0.0151471029967069
2047,[O+]1=N[N](C=C1[N-]C(NC2=CC=CC=C2)=O)C(CC3=CC=...,0.0044944756664335
2048,C1=C(OC)C(=CC2=C1C(=[N+](C(=C2CC)C)[NH-])C3=CC...,0.0531546957790851
2049,[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-]...,0.004394699819386


In [22]:
bp_preds_df.describe()

Unnamed: 0,smiles,HIV_active
count,2050,2050
unique,2050,1994
top,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,Invalid SMILES
freq,1,11


In [23]:
bp_preds_df = bp_preds_df[bp_preds_df['HIV_active'] != "Invalid SMILES"]
bp_preds_df.describe()

Unnamed: 0,smiles,HIV_active
count,2039,2039.0
unique,2039,1993.0
top,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.0003313531633466
freq,1,3.0


In [24]:
bp_preds_df['HIV_active'] = bp_preds_df['HIV_active'].astype(float)

In [25]:
bp_preds_df['HIV_active_2'] = bp_preds_df['HIV_active'].apply(lambda x: 1 if x > 0.8 else 0)
bp_preds_df.head()


Unnamed: 0,smiles,HIV_active,HIV_active_2
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,0.021473,0
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,0.001487,0
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,0.036928,0
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,0.000906,0
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,0.00671,0


In [26]:
bp_preds_df.describe()

Unnamed: 0,HIV_active,HIV_active_2
count,2039.0,2039.0
mean,0.0259817,0.001962
std,0.06617616,0.044259
min,4.265374e-10,0.0
25%,0.003987846,0.0
50%,0.01050125,0.0
75%,0.02310448,0.0
max,0.8798995,1.0


In [27]:
# Filter rows where 'target_column' is equal to 1
bp_preds_df_filtered = bp_preds_df[bp_preds_df['HIV_active_2'] == 1]
bp_preds_df_filtered


Unnamed: 0,smiles,HIV_active,HIV_active_2
11,CC1=CN([C@H]2C[C@H](F)[C@@H](CO)O2)C(=O)NC1=O,0.846002,1
289,OC[C@@H]1CC[C@@H](O1)n2cnc3C(=O)N=CNc23,0.828147,1
319,CC1=CN([C@@H]2O[C@H](CO)C=C2)C(=O)NC1=O,0.852836,1
346,NC1=NC(=O)N(C=C1)[C@H]2CC[C@@H](CO)O2,0.8799,1


In [28]:
smiles_to_check = bp_preds_df_filtered['smiles'].to_list()

In [29]:
hiv_df_sampled_2[hiv_df_sampled_2['smiles'].isin(smiles_to_check)]

NameError: name 'hiv_df_sampled_2' is not defined

In [30]:
hiv_df[hiv_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active


In [31]:
bp_df[bp_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,num,name,p_np,smiles
11,12,alovudine,1,CC1=CN([C@H]2C[C@H](F)[C@@H](CO)O2)C(=O)NC1=O
289,291,Didanosine,0,OC[C@@H]1CC[C@@H](O1)n2cnc3C(=O)N=CNc23
319,321,Stavudine,1,CC1=CN([C@@H]2O[C@H](CO)C=C2)C(=O)NC1=O
346,348,Zalcitabine,1,NC1=NC(=O)N(C=C1)[C@H]2CC[C@@H](CO)O2


In [32]:
bp_df_final = pd.merge(bp_df[bp_df['smiles'].isin(smiles_to_check)], bp_preds_df_filtered, on='smiles' )
bp_df_final

Unnamed: 0,num,name,p_np,smiles,HIV_active,HIV_active_2
0,12,alovudine,1,CC1=CN([C@H]2C[C@H](F)[C@@H](CO)O2)C(=O)NC1=O,0.846002,1
1,291,Didanosine,0,OC[C@@H]1CC[C@@H](O1)n2cnc3C(=O)N=CNc23,0.828147,1
2,321,Stavudine,1,CC1=CN([C@@H]2O[C@H](CO)C=C2)C(=O)NC1=O,0.852836,1
3,348,Zalcitabine,1,NC1=NC(=O)N(C=C1)[C@H]2CC[C@@H](CO)O2,0.8799,1


In [33]:
bp_df_final.to_csv('HIV_result.csv', index=False)

In [35]:
sub_df = pd.read_csv("substances.csv")
sub_df.head()

Unnamed: 0,zinc_id,smiles
0,ZINC000000000027,N[C@@H](CCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
1,ZINC000016090786,N[C@H](CCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
2,ZINC000001763088,N[C@H](CCCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
3,ZINC000002033385,N[C@@H](CCCc1ccc(N(CCCl)CCCl)cc1)C(=O)O
4,ZINC000000001673,N[C@@H](Cc1ccc(N(CCCl)CCCl)cc1)C(=O)O


In [36]:
sub_df.tail()

Unnamed: 0,zinc_id,smiles
46,ZINC000196349655,O=C(O)CCSc1ccc(N(CCCl)CCCl)cc1
47,ZINC000064454242,N=NCCCc1ccc(N(CCCl)CCCl)cc1
48,ZINC000005161807,O=C(O)C/C=C/c1ccc(N(CCCl)CCCl)cc1
49,ZINC000001682294,O=C(O)CCOc1ccc(N(CCCl)CCCl)cc1
50,ZINC000079564304,O=C(O)CNC(=O)c1ccc(N(CCCl)CCCl)cc1


In [37]:
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   zinc_id  51 non-null     object
 1   smiles   51 non-null     object
dtypes: object(2)
memory usage: 944.0+ bytes


In [38]:
arguments = [
    '--test_path', 'substances.csv',
    '--preds_path', 'substances_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


51it [00:00, 39141.72it/s]
100%|██████████| 51/51 [00:00<00:00, 69586.70it/s]


Validating SMILES
Test size = 51


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  2.27it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.26it/s]

Saving predictions to substances_preds.csv
Elapsed time = 0:00:01





In [41]:
fda_df = pd.read_csv("fda_approved.csv")
fda_df.head()

Unnamed: 0,zinc_id,smiles
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...
4,ZINC000000008492,Oc1cccc2cccnc12


In [42]:
arguments = [
    '--test_path', 'fda_approved.csv',
    '--preds_path', 'fda_approved_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


892it [00:00, 84347.53it/s]
100%|██████████| 892/892 [00:00<00:00, 63923.58it/s]

Validating SMILES





Test size = 892


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:02<00:34,  2.05s/it][A
 11%|█         | 2/18 [00:02<00:15,  1.03it/s][A
 22%|██▏       | 4/18 [00:02<00:05,  2.46it/s][A
 50%|█████     | 9/18 [00:02<00:01,  6.87it/s][A
 67%|██████▋   | 12/18 [00:02<00:00,  9.41it/s][A
100%|██████████| 1/1 [00:03<00:00,  3.30s/it]

Saving predictions to fda_approved_preds.csv
Elapsed time = 0:00:04





In [43]:
fda_preds_df = pd.read_csv("fda_approved_preds.csv")
fda_preds_df.head()

Unnamed: 0,zinc_id,smiles,HIV_active
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O,0.000114
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0.021717
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1,0.020179
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...,0.021397
4,ZINC000000008492,Oc1cccc2cccnc12,0.00681


In [44]:
fda_preds_df = fda_preds_df[fda_preds_df['HIV_active'] != "Invalid SMILES"]
fda_preds_df.describe()
fda_preds_df['HIV_active'] = fda_preds_df['HIV_active'].astype(float)
fda_preds_df['HIV_active_2'] = fda_preds_df['HIV_active'].apply(lambda x: 1 if x > 0.8 else 0)
fda_preds_df.head()

Unnamed: 0,HIV_active
count,892.0
mean,0.03120089
std,0.07911301
min,2.711503e-08
25%,0.004847819
50%,0.01207062
75%,0.02594892
max,0.8708455


Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O,0.000114,0
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0.021717,0
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1,0.020179,0
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...,0.021397,0
4,ZINC000000008492,Oc1cccc2cccnc12,0.00681,0


In [45]:
# Filter rows where 'target_column' is equal to 1
fda_preds_df_filtered = fda_preds_df[fda_preds_df['HIV_active_2'] == 1]
fda_preds_df_filtered

Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
321,ZINC000013597823,O=c1[nH]cnc2c1ncn2[C@H]1CC[C@@H](CO)O1,0.870845,1
819,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O,0.844089,1


In [46]:
smiles_to_check = fda_preds_df_filtered['smiles'].to_list()
print(f"smiles to check: {smiles_to_check}")

smiles to check: ['O=c1[nH]cnc2c1ncn2[C@H]1CC[C@@H](CO)O1', 'Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O']


In [47]:
hiv_df_sampled_2[hiv_df_sampled_2['smiles'].isin(smiles_to_check)]

NameError: name 'hiv_df_sampled_2' is not defined

In [48]:
hiv_df[hiv_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,smiles,activity,HIV_active


In [49]:
bp_df[bp_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,num,name,p_np,smiles


In [50]:
fda_df[fda_df['smiles'].isin(smiles_to_check)]

Unnamed: 0,zinc_id,smiles
321,ZINC000013597823,O=c1[nH]cnc2c1ncn2[C@H]1CC[C@@H](CO)O1
819,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O


In [51]:
fda_df_final = pd.merge(fda_df[fda_df['smiles'].isin(smiles_to_check)], fda_preds_df_filtered, on='smiles' )
fda_df_final

Unnamed: 0,zinc_id_x,smiles,zinc_id_y,HIV_active,HIV_active_2
0,ZINC000013597823,O=c1[nH]cnc2c1ncn2[C@H]1CC[C@@H](CO)O1,ZINC000013597823,0.870845,1
1,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O,ZINC000000137884,0.844089,1


In [52]:
fda_df_final.to_csv('fda_approved_result.csv', index=False)

In [None]:
# !wget https://zinc15.docking.org/substances/subsets/named.csv

--2024-03-10 05:58:28--  https://zinc15.docking.org/substances/subsets/named.csv
Resolving zinc15.docking.org (zinc15.docking.org)... 169.230.75.4
Connecting to zinc15.docking.org (zinc15.docking.org)|169.230.75.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘named.csv.1’

named.csv.1             [ <=>                ]   9.28K  --.-KB/s    in 0.04s   

2024-03-10 05:58:29 (242 KB/s) - ‘named.csv.1’ saved [9499]



In [53]:
zinc_df = pd.read_csv("named.csv")
zinc_df.head()
zinc_df.tail()

Unnamed: 0,zinc_id,smiles
0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...
1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...
2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...
3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl
4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...


Unnamed: 0,zinc_id,smiles
22959,ZINC000015253718,C/C(=C\CO)CC/C=C(\C)CCC[C@@H](C)CCC[C@H](C)CCC...
22960,ZINC000043888360,CCCCCCCCCCCCCCCCCCCCC[C@@H](O)C[C@@H](O)CCCCC
22961,ZINC000096006009,Cc1noc(NS(=O)(=O)c2ccc(N)cc2)c1C
22962,ZINC000150375318,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)OC[C@@H](COC(=O...
22963,ZINC000085558850,CCCCC/C=C\C/C=C\C/C=C\CCCCC(=O)OC[C@H](CO[P@@]...


In [54]:
arguments = [
    '--test_path', 'named.csv',
    '--preds_path', 'named_preds.csv',
    '--checkpoint_dir', 'test_checkpoints_multimolecule',
    '--smiles_columns', 'smiles'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)

Loading training args
Setting molecule featurization parameters to default.
Loading data


34600it [00:00, 94636.51it/s] 
100%|██████████| 34600/34600 [00:00<00:00, 69449.30it/s]


Validating SMILES




Test size = 34,600


  0%|          | 0/1 [00:00<?, ?it/s]

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda



  0%|          | 0/692 [00:00<?, ?it/s][A
  0%|          | 1/692 [00:01<20:50,  1.81s/it][A
  0%|          | 2/692 [00:05<32:02,  2.79s/it][A
  0%|          | 3/692 [00:05<19:20,  1.68s/it][A
  1%|          | 5/692 [00:05<08:58,  1.28it/s][A
  1%|          | 8/692 [00:06<04:37,  2.46it/s][A
  1%|▏         | 9/692 [00:06<05:38,  2.02it/s][A
  1%|▏         | 10/692 [00:07<06:29,  1.75it/s][A
  2%|▏         | 11/692 [00:07<05:21,  2.12it/s][A
  2%|▏         | 13/692 [00:08<03:32,  3.19it/s][A
  2%|▏         | 16/692 [00:08<02:23,  4.71it/s][A
  2%|▏         | 17/692 [00:08<02:33,  4.39it/s][A
  3%|▎         | 18/692 [00:09<03:57,  2.84it/s][A
  3%|▎         | 19/692 [00:09<03:26,  3.26it/s][A
  3%|▎         | 21/692 [00:09<02:31,  4.44it/s][A
  3%|▎         | 24/692 [00:10<01:46,  6.29it/s][A
  4%|▎         | 25/692 [00:10<01:50,  6.05it/s][A
  4%|▍         | 26/692 [00:11<02:57,  3.75it/s][A
  4%|▍         | 27/692 [00:11<02:39,  4.16it/s][A
  4%|▍         | 29/692 [0

Saving predictions to named_preds.csv
Elapsed time = 0:02:50


In [55]:
zinc_preds_df = pd.read_csv("named_preds.csv")
zinc_preds_df.head()
zinc_preds_df = zinc_preds_df[zinc_preds_df['HIV_active'] != "Invalid SMILES"]
zinc_preds_df.describe()
zinc_preds_df['HIV_active'] = zinc_preds_df['HIV_active'].astype(float)
zinc_preds_df['HIV_active_2'] = zinc_preds_df['HIV_active'].apply(lambda x: 1 if x > 0.8 else 0)
zinc_preds_df.head()

Unnamed: 0,zinc_id,smiles,HIV_active
0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,0.034702
1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,0.021076
2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,0.006336
3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,0.000131
4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,0.28804


Unnamed: 0,HIV_active
count,34600.0
mean,0.03863435
std,0.0608922
min,2.786825e-10
25%,0.009784357
50%,0.02413954
75%,0.04112185
max,0.9241325


Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,0.034702,0
1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,0.021076,0
2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,0.006336,0
3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,0.000131,0
4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,0.28804,0


In [61]:
# Filter rows where 'target_column' is equal to 1
zinc_preds_df_filtered = zinc_preds_df[zinc_preds_df['HIV_active_2'] == 1]
zinc_preds_df_filtered

Unnamed: 0,zinc_id,smiles,HIV_active,HIV_active_2
486,ZINC000000137884,Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O,0.844089,1
664,ZINC000000039906,Nc1ccn([C@H]2CC[C@@H](CO)O2)c(=O)n1,0.8799,1
1673,ZINC000005551645,CC(C)=CCN1Cc2c(Cl)ccc3[nH]c(=S)n(c23)C[C@@H]1C,0.857422,1
2491,ZINC000001611085,Nc1nc(=O)n([C@H]2CC[C@H](CO)O2)cc1F,0.890377,1
7718,ZINC000043313038,CCCCCCCCCCCCSC[C@@H](CO[P@@](=O)(O)OC[C@H]1O[C...,0.835573,1
8880,ZINC000003809864,CCCc1cc(=O)oc2c3c(c4c(c12)OC(C)(C)CC4)O[C@@H](...,0.843889,1
11356,ZINC000016952419,Nc1nc(=O)n([C@H]2CC[C@@H](CO)O2)cc1F,0.894337,1
13087,ZINC000032016993,Nc1ccn([C@@H]2CC[C@H](CO[P@@](=O)(O)O[P@](=O)(...,0.869442,1
15124,ZINC000015042997,CC[C@H]1CC[C@@H](CC)O1,0.872187,1
15334,ZINC000013516800,Nc1ccn([C@H]2CC[C@@H](CO[P@@](=O)(O)O[P@@](=O)...,0.867748,1


In [62]:
zinc_preds_df_filtered.to_csv('zinc_final_result.csv', index=False)

In [58]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [63]:
!mkdir '/content/drive/My Drive/Chemprop_Backup_HIV_ALL_DATA/'

In [66]:
!pwd

/content


In [67]:
!ls -al

total 9612
drwxr-xr-x 1 root root    4096 Mar 11 07:15 .
drwxr-xr-x 1 root root    4096 Mar 11 05:35 ..
-rw-r--r-- 1 root root  107579 Mar 11 07:04 BBBP_2.csv
-rw-r--r-- 1 root root  148743 Mar 11 05:38 BBBP.csv
-rw-r--r-- 1 root root  152469 Mar 11 07:04 BBBP_preds.csv
drwxr-xr-x 4 root root    4096 Mar  7 14:31 .config
drwx------ 6 root root    4096 Mar 11 07:15 drive
-rw-r--r-- 1 root root   61865 Mar 11 07:07 fda_approved.csv
-rw-r--r-- 1 root root   80502 Mar 11 07:07 fda_approved_preds.csv
-rw-r--r-- 1 root root     242 Mar 11 07:11 fda_approved_result.csv
-rw-r--r-- 1 root root 2193844 Mar 11 05:37 HIV.csv
-rw-r--r-- 1 root root     359 Mar 11 07:06 HIV_result.csv
-rw-r--r-- 1 root root 3160219 Mar 11 07:12 named.csv
-rw-r--r-- 1 root root 3873366 Mar 11 07:15 named_preds.csv
drwxr-xr-x 1 root root    4096 Mar  7 14:32 sample_data
-rw-r--r-- 1 root root    2767 Mar 11 07:06 substances.csv
-rw-r--r-- 1 root root    3896 Mar 11 07:06 substances_preds.csv
drwxr-xr-x 3 root root    

In [68]:
!cp -r /content/*.csv "/content/drive/My Drive/Chemprop_Backup_HIV_ALL_DATA/"

In [69]:
!cp -r /content/test_checkpoints_multimolecule/ "/content/drive/My Drive/Chemprop_Backup_HIV_ALL_DATA/"