In [1]:
'''import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
import random

# Define the path to the SDF file
sdf_file = 'chembl_34_sdf\chembl_34.sdf'

# Define batch size
batch_size = 1000  # Adjust this number based on your hardware capability

# Function to process molecules in batches
def process_molecules(supplier, batch_size):
    mols = []
    for i, mol in enumerate(supplier):
        if mol is not None:
            Chem.SanitizeMol(mol)
            mols.append(mol)
        if (i+1) % batch_size == 0:
            yield mols
            mols = []
    yield mols  # Yield remaining molecules

# Load the SDF file in chunks and process in batches
suppl = Chem.SDMolSupplier(sdf_file, sanitize=False)
data = []
for mols_batch in process_molecules(suppl, batch_size):
    # Randomly sample from the batch if needed
    sampled_mols = random.sample(mols_batch, min(len(mols_batch), 100))
    
    # Convert Mol objects to SMILES strings and get ChEMBL ID
    for mol in mols_batch:
        smiles = Chem.MolToSmiles(mol)
        #chembl_id = mol.GetProp('_Name')  # Get the ChEMBL ID
        data.append(smiles) #, chembl_id))
    
# Create a DataFrame with SMILES strings and ChEMBL IDs
df_smiles = pd.DataFrame(data, columns=['SMILES']) #, 'ChEMBL_ID'])
    
# Export the DataFrame with SMILES strings and ChEMBL IDs to a CSV file
df_smiles.to_csv('sampled_molecules_smiles.csv', index=False)
data = []  # Clear the data list for the next batch
'''

"import pandas as pd\nfrom rdkit import Chem\nfrom rdkit.Chem import PandasTools\nimport random\n\n# Define the path to the SDF file\nsdf_file = 'chembl_34_sdf\\chembl_34.sdf'\n\n# Define batch size\nbatch_size = 1000  # Adjust this number based on your hardware capability\n\n# Function to process molecules in batches\ndef process_molecules(supplier, batch_size):\n    mols = []\n    for i, mol in enumerate(supplier):\n        if mol is not None:\n            Chem.SanitizeMol(mol)\n            mols.append(mol)\n        if (i+1) % batch_size == 0:\n            yield mols\n            mols = []\n    yield mols  # Yield remaining molecules\n\n# Load the SDF file in chunks and process in batches\nsuppl = Chem.SDMolSupplier(sdf_file, sanitize=False)\ndata = []\nfor mols_batch in process_molecules(suppl, batch_size):\n    # Randomly sample from the batch if needed\n    sampled_mols = random.sample(mols_batch, min(len(mols_batch), 100))\n    \n    # Convert Mol objects to SMILES strings and ge

In [6]:
import pandas as pd
from rdkit import Chem,rdBase
import random
from tqdm import tqdm

# Define the path to the SDF file
sdf_file = 'chembl_34_sdf/chembl_34.sdf'

# Define the total number of molecules to sample
total_samples = 100000
rdBase.DisableLog('rdApp.warning')
# Function to process molecules in batches and sample randomly
def process_molecules_random_sample(supplier, total_samples):
    mols = []
    total_mols = sum(1 for _ in supplier)  
    supplier.reset() 
    
    sample_indices = set(random.sample(range(total_mols), total_samples))
    print("Starting molecule processing...")
    with ProgressBar(max_value=total_samples, prefix='Sampling Molecules: ') as bar:
        with rdBase.BlockLogs():  # Temporarily block RDKit logs
            for i, mol in enumerate(suppl):
                if i in sample_indices:
                    if mol is not None:
                        Chem.RemoveStereochemistry(mol)
                        try:
                            Chem.SanitizeMol(mol)
                            mols.append(mol)
                            bar.update(len(mols))  
                        except Exception as e:
                            print(f"Error sanitizing molecule {i}: {e}")
                if len(mols) >= total_samples:
                    break
            
# Load the SDF file and process molecules
suppl = Chem.SDMolSupplier(sdf_file, sanitize=False)
sampled_mols = process_molecules_random_sample(suppl, total_samples)

# Convert Mol objects to SMILES strings
data = []
for mol in sampled_mols:
    smiles = Chem.MolToSmiles(mol)
    data.append(smiles)

# Create a DataFrame with SMILES strings
df_smiles = pd.DataFrame(data, columns=['SMILES'])

# Save the DataFrame to a CSV file
df_smiles.to_csv('sampled_100k_molecules_smiles.csv', index=False)


KeyboardInterrupt: 

In [None]:
import pandas as pd
import dask.dataframe as dd
from rdkit import Chem
from rdkit.Chem import Descriptors
from dask.distributed import Client

# Initialize Dask client
client = Client()

# Function to compute descriptors for a SMILES string
def compute_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        descriptor_names = [desc[0] for desc in Descriptors.descList[:]]
        descriptor_values = [desc[1](mol) for desc in Descriptors.descList[:]]
        return pd.Series(descriptor_values, index=descriptor_names)
    else:
        return pd.Series([None]*len(Descriptors.descList[:]), index=[desc[0] for desc in Descriptors.descList[:]])

# Function to filter and compute descriptors for each partition
def process_partition(partition):
    partition = partition.dropna(subset=['SMILES'])
    descriptors = partition['SMILES'].apply(compute_descriptors)
    return pd.concat([partition, descriptors], axis=1)

# Read data as Dask DataFrame
dask_df = dd.read_csv('sampled_100k_molecules_smiles.csv')

# Filter out invalid SMILES and compute descriptors in parallel
processed_dask_df = dask_df.map_partitions(process_partition)

# Compute and save the result
result_df = processed_dask_df.compute()
result_df.to_csv('processed_molecule_data.csv', index=False)

# Close the Dask client
client.close()


Perhaps you already have a cluster running?
Hosting the HTTP server on port 51497 instead


In [None]:
import pandas as pd

# Load the processed data
molecule_data_df = pd.read_csv('processed_molecule_data.csv')

# Print the columns to check the available descriptors
print(molecule_data_df.columns)

Index(['SMILES', 'MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex',
       'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=211)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras as keras

# Load the processed data
molecule_data_df = pd.read_csv('processed_molecule_data.csv')
print(molecule_data_df.isnull().sum())
# Prepare the feature matrix (exclude the SMILES column and the target column)
X = molecule_data_df.drop(columns=['SMILES', 'MaxAbsEStateIndex'])  # Adjust the target column name as needed
y = molecule_data_df['MaxAbsEStateIndex']  # Assuming 'MolWt' is the target descriptor
# Print the shapes of X and y to ensure they are not empty
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


SMILES               0
MaxAbsEStateIndex    0
MaxEStateIndex       0
MinAbsEStateIndex    0
MinEStateIndex       0
                    ..
fr_thiazole          0
fr_thiocyan          0
fr_thiophene         0
fr_unbrch_alkane     0
fr_urea              0
Length: 211, dtype: int64
X shape: (1, 209)
y shape: (1,)


ValueError: With n_samples=1, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Define a simple neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)

Epoch 1/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 17.1514 - mae: 1.8310 - val_loss: 8.8248e-04 - val_mae: 0.0129
Epoch 2/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 3.8503e-04 - mae: 0.0087 - val_loss: 3.5363e-04 - val_mae: 0.0051
Epoch 3/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.8213e-04 - mae: 0.0056 - val_loss: 2.6967e-04 - val_mae: 0.0015
Epoch 4/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 2.3289e-04 - mae: 0.0070 - val_loss: 2.6172e-04 - val_mae: 0.0023
Epoch 5/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 994us/step - loss: 2.2337e-04 - mae: 0.0053 - val_loss: 2.5348e-04 - val_mae: 0.0019
Epoch 6/50
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 913us/step - loss: 2.6368e-04 - mae: 0.0067 - val_loss: 4.5198e-04 - val_mae: 0.0102
Epoch 7/50
[1m2000/2000[0

KeyboardInterrupt: 

In [None]:
# Evaluate the model
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'\nFinal evaluation on test set: Loss = {loss}, MAE = {mae}')

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1477 - mae: 0.2272

Final evaluation on test set: Loss = 0.14496897161006927, MAE = 0.2273031324148178


In [None]:
# Predict on the test set
y_pred = model.predict(X_test)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 753us/step


In [None]:
# Print predicted vs actual values (optional, for a quick check)
print("\nSample predictions:")
for actual, predicted in zip(y_test, y_pred):
    print(f"Actual MolWt: {actual}, Predicted MolWt: {predicted}")


Sample predictions:
Actual MolWt: 420.4610000000002, Predicted MolWt: [420.50806]
Actual MolWt: 244.298, Predicted MolWt: [244.27133]
Actual MolWt: 397.3970000000002, Predicted MolWt: [397.4864]
Actual MolWt: 384.8630000000001, Predicted MolWt: [384.47208]
Actual MolWt: 453.9710000000001, Predicted MolWt: [453.55276]
Actual MolWt: 265.382, Predicted MolWt: [265.2452]
Actual MolWt: 328.42, Predicted MolWt: [328.348]
Actual MolWt: 279.33899999999994, Predicted MolWt: [279.32056]
Actual MolWt: 436.55200000000025, Predicted MolWt: [436.44455]
Actual MolWt: 393.4660000000002, Predicted MolWt: [393.48727]
Actual MolWt: 179.61, Predicted MolWt: [179.34969]
Actual MolWt: 450.5600000000002, Predicted MolWt: [450.53564]
Actual MolWt: 363.874, Predicted MolWt: [363.46155]
Actual MolWt: 136.14999999999998, Predicted MolWt: [136.04355]
Actual MolWt: 291.35, Predicted MolWt: [291.3472]
Actual MolWt: 540.6350000000003, Predicted MolWt: [540.65875]
Actual MolWt: 267.336, Predicted MolWt: [267.31613]
