In [1]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# `src` package imported by later cells can be resolved.
# Assumes the notebook's working directory sits one level below the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Computing properties for the Training Dataset

In [3]:
from src.property_utils import compute_and_save_properties

# Run the property pipeline on the training split. Per this cell's recorded
# output, the call writes both the normalization stats JSON and the
# property-augmented CSV.
train_csv = "../data/processed_5l/train.csv"
train_properties_csv = "../data/processed_5l/train_properties_for_rl.csv"
property_stats_json = "../data/processed_5l/property_stats.json"
df = compute_and_save_properties(train_csv, train_properties_csv, property_stats_json)

# Preview the first five rows of the returned frame.
print(df.head(5))


Computing Properties: 100%|██████████| 278937/278937 [08:45<00:00, 530.47it/s]


 Saved normalization stats to: ../data/processed_5l/property_stats.json
 Saved property-augmented dataset to: ../data/processed_5l/train_properties_for_rl.csv
 Valid molecules processed: 278937 / 278937
                                              smiles  \
0  [H]c1nc2nc([H])c(-c3c([H])c([H])c([H])c([H])c3...   
1  [H]c1nc2c([H])c([H])c([H])c([H])c2nc1C([H])(C(...   
2  [H]OC([H])(C([H])([H])C(=O)N([H])[C@]([H])(C(=...   
3  [H]c1c([H])c(C2([H])OC3=C(C2=O)N([H])c2c([H])c...   
4  [H]N1C(=O)C(C([H])([H])[H])(C([H])([H])C([H])(...   

                                           canonical       QED       SAS  \
0                          c1ccc(-c2cnc3nccnc3n2)cc1  0.646067  0.132024   
1  CCOC(=O)C(C(=NNC(=O)OC)C(=O)Nc1ccc(C(C)=O)cc1)...  0.226345  0.247179   
2  CC(=O)N[C@@H](CO)C(=O)N[C@@H](CCC(N)=O)C(=O)N[...  0.046497  0.386380   
3      Cc1cc2c(cc1C)NC1=C(N2)OC(c2ccc3c(c2)OCO3)C1=O  0.876894  0.258461   
4                          CC1(CCCC2(C)CCNC2=O)OCCO1  0.832792  0.322644   

   

# Encoding the Validation Dataset (from precomputed properties)

In [None]:
from src.data_utils import build_vocabulary, save_vocab, prepare_encoded_dataset
import pandas as pd

# Load the validation split (with its precomputed properties) for inspection.
# It is bound to `val_df`: the earlier name `train_df` mislabeled validation
# rows and would overwrite any real training frame held under that name.
val_df = pd.read_csv("../data/processed_5l/val_properties.csv")

# The vocabulary is deliberately NOT rebuilt in this cell; the existing
# vocab_5l.json is passed to the encoder below instead.
# NOTE(review): presumably that vocabulary was built from the training split,
# so validation shares the same token ids — confirm.

# Encode the validation split and save the resulting tensor dataset.
# max_len=128 matches the 128 columns seen in the encoded training tensor.
prepare_encoded_dataset(
    "../data/processed_5l/val_properties.csv",
    "../data/processed_5l/vocab_5l.json",
    "../data/processed_5l/val_encoded.pt",
    max_len=128,
)

 Saved encoded tensor dataset to ../data/processed_5l/val_encoded.pt


In [6]:
import torch

# Load the encoded training tensor from disk.
# `weights_only=True` restricts unpickling to tensors and plain containers, so
# a tampered .pt file cannot execute arbitrary code on load. The file holds a
# plain torch.Tensor (see the printed type), so nothing is lost.
# NOTE(review): the stray source-line echo in this cell's recorded output looks
# like the torch.load FutureWarning about the `weights_only` default — confirm.
train_encoded = torch.load(
    "../data/processed_5l/train_encoded.pt",
    weights_only=True,
)
print(type(train_encoded))

<class 'torch.Tensor'>


  train_encoded = torch.load("../data/processed_5l/train_encoded.pt")


In [7]:
# Take row 0 of the encoded training tensor (the first molecule) and print it.
# The recorded output is a 1-D integer tensor ending in a run of zeros —
# presumably padding up to the fixed sequence length; confirm against the vocab.
sample = train_encoded[0]
print(sample)


tensor([ 1, 51, 11, 51, 51, 51,  4,  7, 51, 12, 51, 61, 51, 13, 61, 51, 51, 61,
        51, 13, 61, 12,  5, 51, 51, 11, 69,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])


In [8]:
# Report the dimensions of the encoded training tensor.
print(train_encoded.size())

torch.Size([278937, 128])


# Computing properties for the Test Dataset

In [3]:
from src.property_utils import compute_and_save_properties

# Run the property pipeline on the test split and write the augmented CSV.
# NOTE(review): unlike the training call, no stats-JSON path is passed here and
# the recorded output reports no saved stats. Confirm that the test split is
# normalized with the training statistics rather than with its own.
test_csv = "../data/processed_5l/test.csv"
test_properties_csv = "../data/processed_5l/test_properties.csv"
df = compute_and_save_properties(test_csv, test_properties_csv)

# Preview the first five rows of the returned frame.
print(df.head(5))


Computing Properties: 100%|██████████| 15497/15497 [00:26<00:00, 580.65it/s]


 Saved property-augmented dataset to: ../data/processed_5l/test_properties.csv
 Valid molecules processed: 15497 / 15497
                                              smiles  \
0  [H]OC1([H])N2C([H])([H])C([H])([H])C([H])([H])...   
1                                [H]O[127Te](=O)O[H]   
2  [H]c1c([H])c(N([H])C(=O)c2c([H])c(C([H])([H])[...   
3  [H]OC([H])(C1([H])C(=O)OC([H])(C([H])(OC(=O)c2...   
4  [H]C([H])([H])N1C([H])([H])C2([H])N(C(=O)C(=O)...   

                                           canonical       QED       SAS  \
0                     O=C1CCCC2C3CC(CN12)C1CCCCN1C3O  0.751900  0.451792   
1                                      O=[127Te](O)O  0.456496  0.644136   
2                   Cc1ccc(N)c(C(=O)Nc2ccc(Br)cc2)c1  0.878111  0.078334   
3  O=C(OCC(OC(=O)c1ccccc1)C1CC(C(O)C2CCCCC2)C(=O)...  0.485748  0.327618   
4                       CCOC(=O)C(=O)N1C2CCC1CN(C)C2  0.481748  0.363960   

       LogP      TPSA     MolWt  
0  0.610796  0.029694  0.085454  
1  0.580357  0.03