In [1]:
import sys, os, time, warnings

# TensorFlow reads TF_CPP_MIN_LOG_LEVEL when it is first loaded, so the variable
# has to be in the environment before DeepChem (which pulls TensorFlow in) is
# imported. Setting it after the imports leaves the INFO/WARNING start-up
# messages in the cell output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
import deepchem as dc
from pyspark.sql import SparkSession
from rdkit import Chem

# Silence Python-level warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Start Spark session
spark = (
    SparkSession.builder
    .appName("Solubility Prediction with Spark")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)
print("✅ Spark session started")


2025-05-03 18:49:54.982497: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-03 18:49:55.429559: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-05-03 18:49:55.429581: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-05-03 18:49:55.485757: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:49:56.665560: W tensorflow/stream_executor/platform/de

✅ Spark session started


In [2]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType

def load_and_clean_data(path):
    """Read a CSV with Spark and keep only rows whose SMILES string RDKit can parse.

    Uses the module-level ``spark`` session.

    Args:
        path: Location of a CSV file with a header row containing at least the
            columns ``smiles`` and ``measured log solubility in mols per litre``.

    Returns:
        A Spark DataFrame with exactly those two columns, restricted to rows
        whose ``smiles`` value is a string that ``Chem.MolFromSmiles`` parses.
    """
    def _is_parseable(smi):
        # Empty CSV cells arrive as None (and schema inference may produce
        # non-string values). MolFromSmiles expects a string, so anything else
        # is treated as invalid instead of raising inside the UDF and failing
        # the whole Spark job.
        return isinstance(smi, str) and Chem.MolFromSmiles(smi) is not None

    is_valid_udf = udf(_is_parseable, BooleanType())
    df = spark.read.csv(path, header=True, inferSchema=True)
    df = df.withColumn("valid", is_valid_udf(col("smiles"))).filter(col("valid"))
    return df.select("smiles", "measured log solubility in mols per litre")

# Location of the ESOL CSV. Defaults to the original path; set the
# ESOL_CSV_PATH environment variable to run the notebook on another machine.
ESOL_CSV_PATH = os.environ.get("ESOL_CSV_PATH", "/home/mounir/Downloads/esol.csv")

# Time the Spark read + SMILES filter + conversion to a pandas DataFrame.
start_load = time.time()
sdf = load_and_clean_data(ESOL_CSV_PATH)
data_df = sdf.toPandas()
end_load = time.time()
print(f"✅ Data loaded and cleaned using Spark in {end_load - start_load:.2f} seconds")


[Stage 2:>                                                          (0 + 1) / 1]

✅ Data loaded and cleaned using Spark in 6.22 seconds


                                                                                

In [3]:
# Turn every SMILES string into the graph representation GraphConvModel consumes.
featurizer = dc.feat.ConvMolFeaturizer()
mols = data_df['smiles'].apply(Chem.MolFromSmiles)
raw_features = featurizer.featurize(mols)


def _is_featurized(feat):
    """Return True when ``feat`` is a usable featurization result.

    A failed molecule may be reported either as ``None`` or as an empty NumPy
    array (the placeholder DeepChem's featurizers append on failure), so both
    are rejected.
    """
    if feat is None:
        return False
    if isinstance(feat, np.ndarray) and feat.size == 0:
        return False
    return True


# Keep features and labels aligned by filtering both through the same positions.
valid_idx = [i for i, f in enumerate(raw_features) if _is_featurized(f)]
features = [raw_features[i] for i in valid_idx]
labels = data_df.iloc[valid_idx]['measured log solubility in mols per litre'].values

dataset = dc.data.NumpyDataset(X=np.array(features), y=labels)


In [4]:
# Random 80/20 train/validation split of the featurized dataset, with a fixed seed.
splitter = dc.splits.RandomSplitter()
train_set, valid_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=42,
)


In [5]:
from itertools import product

def spark_grid_search_graphconv(param_grid):
    """Score every GraphConvModel hyperparameter combination as a Spark job.

    Reads the module-level ``spark`` session and the ``train_set`` /
    ``valid_set`` datasets as they are bound when the function is called; both
    datasets are broadcast to the workers.

    Args:
        param_grid: Mapping with the keys ``batch_size``, ``graph_conv_layers``,
            ``dense_layer_size`` and ``dropout``, each an iterable of candidate
            values. The Cartesian product of the four is evaluated.

    Returns:
        A list with one tuple ``(r2, batch_size, graph_conv_layers,
        dense_layer_size, dropout)`` per combination, where ``r2`` is the
        Pearson R² on the validation set after 30 epochs. A combination whose
        training or evaluation raises is reported with ``r2 == -inf``.
    """
    grid = list(product(
        param_grid['batch_size'],
        param_grid['graph_conv_layers'],
        param_grid['dense_layer_size'],
        param_grid['dropout']
    ))
    sc = spark.sparkContext
    grid_rdd = sc.parallelize(grid)

    bc_train = sc.broadcast(train_set)
    bc_valid = sc.broadcast(valid_set)

    def evaluate_model(config):
        bs, layers, dense, drop = config
        model = dc.models.GraphConvModel(
            1, mode='regression',
            batch_size=bs,
            graph_conv_layers=layers,
            dense_layer_size=dense,
            dropout=drop
        )
        try:
            model.fit(bc_train.value, nb_epoch=30)
            r2 = model.evaluate(bc_valid.value, [dc.metrics.Metric(dc.metrics.pearson_r2_score)])['pearson_r2_score']
            return (r2, bs, layers, dense, drop)
        except Exception as exc:
            # Keep the best-effort behaviour (a failed config scores -inf) but
            # report why it failed, and let KeyboardInterrupt/SystemExit through.
            print(f"GraphConv config {config} failed: {exc!r}", file=sys.stderr)
            return (-float('inf'), bs, layers, dense, drop)

    try:
        return grid_rdd.map(evaluate_model).collect()
    finally:
        # Release the broadcast copies once the job is done (or has failed).
        bc_train.unpersist()
        bc_valid.unpersist()

# Candidate GraphConvModel hyperparameters (2 x 2 x 2 x 1 = 8 combinations).
params = {
    'batch_size': [32, 16],
    'graph_conv_layers': [[128, 128], [256, 256]],
    'dense_layer_size': [256, 128],
    'dropout': [0.0]
}

start_grid = time.time()
results = spark_grid_search_graphconv(params)
end_grid = time.time()

# Failed configurations come back with a score of -inf, and a NaN score cannot
# be ranked reliably by max(). Only finite scores compete, and the cell stops
# here if none exist rather than passing a failed config on to final training.
gcm_scored = [r for r in results if np.isfinite(r[0])]
if not gcm_scored:
    raise RuntimeError(
        "GraphConv grid search produced no usable score; every configuration failed."
    )

best = max(gcm_scored, key=lambda x: x[0])
print(f"\n✅ Best Spark Grid Search Config:")
print(f"R²: {best[0]:.4f}\nBatch size: {best[1]}, Layers: {best[2]}, Dense: {best[3]}, Dropout: {best[4]}")
print(f"⏱️ Grid search completed in {end_grid - start_grid:.2f} seconds")


2025-05-03 18:50:12.761709: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:50:12.761709: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:50:12.761738: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:50:12.761738: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:50:12.761762: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r


✅ Best Spark Grid Search Config:
R²: 0.8564
Batch size: 32, Layers: [256, 256], Dense: 256, Dropout: 0.0
⏱️ Grid search completed in 163.67 seconds


                                                                                

In [8]:
# Rebuild GraphConvModel from the winning grid-search tuple
# (r2, batch_size, graph_conv_layers, dense_layer_size, dropout).
gcm_best_kwargs = dict(
    mode='regression',
    batch_size=best[1],
    graph_conv_layers=best[2],
    dense_layer_size=best[3],
    dropout=best[4],
)
model_gcm = dc.models.GraphConvModel(1, **gcm_best_kwargs)

# Train for 100 epochs on the training split and report the wall-clock time.
start_train = time.time()
model_gcm.fit(train_set, nb_epoch=100)
end_train = time.time()
print(f"✅ Final model trained in {end_train - start_train:.2f} seconds")


✅ Final model trained in 85.04 seconds


In [9]:
# RMSE, MAE and Pearson R², each averaged over tasks with np.mean.
metrics = [
    dc.metrics.Metric(score_fn, np.mean)
    for score_fn in (
        dc.metrics.rms_score,
        dc.metrics.mae_score,
        dc.metrics.pearson_r2_score,
    )
]

# Score the final GraphConv model on both splits.
gcm_train_scores = model_gcm.evaluate(train_set, metrics)
gcm_valid_scores = model_gcm.evaluate(valid_set, metrics)

print("\n📊 GCM Evaluation Results:")
print(f"Train: {gcm_train_scores}")
print(f"Valid: {gcm_valid_scores}")



📊 GCM Evaluation Results:
Train: {'mean-rms_score': 0.28323619404949835, 'mean-mae_score': 0.22017369638386694, 'mean-pearson_r2_score': 0.9867738633445262}
Valid: {'mean-rms_score': 0.6805603958091256, 'mean-mae_score': 0.5188648514290035, 'mean-pearson_r2_score': 0.8927138127804631}


In [10]:
import time
from itertools import product
import deepchem as dc
from pyspark.sql import SparkSession

# 🧠 Step 0: Define Spark session (getOrCreate reuses a session that is already running)
spark = SparkSession.builder.appName("MPNNGridSearch").getOrCreate()

# 📌 Step 1: Define task and load ESOL dataset with WeaveFeaturizer
# The CSV location defaults to the original path; set the ESOL_CSV_PATH
# environment variable to run the notebook on another machine.
ESOL_CSV_PATH = os.environ.get("ESOL_CSV_PATH", "/home/mounir/Downloads/esol.csv")
tasks = ['measured log solubility in mols per litre']
featurizer = dc.feat.WeaveFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer)

print("🔄 Loading and featurizing ESOL dataset...")
dataset = loader.create_dataset(ESOL_CSV_PATH)

# 🔀 Step 2: Train/Validation Split
# NOTE: this rebinds `dataset`, `train_set` and `valid_set`. From here on they
# hold the Weave-featurized data, not the ConvMol data the GraphConv cells were
# run on, so re-running GraphConv cells after this one uses the wrong features.
splitter = dc.splits.RandomSplitter()
train_set, valid_set = splitter.train_test_split(dataset, frac_train=0.8, seed=0)

# 🚀 Step 3: Define Spark-based Grid Search for MPNN
def spark_grid_search_mpnn(param_grid):
    """Score every MPNNModel hyperparameter combination as a Spark job.

    Reads the module-level ``spark`` session and the ``train_set`` /
    ``valid_set`` datasets as they are bound when the function is called; both
    datasets are broadcast to the workers. The learning rate (0.0001), ``T=3``
    and ``M=5`` are fixed.

    Args:
        param_grid: Mapping with the keys ``batch_size``, ``n_atom_feat``,
            ``n_pair_feat`` and ``n_hidden``, each an iterable of candidate
            values. The Cartesian product of the four is evaluated.

    Returns:
        A list with one tuple ``(r2, batch_size, n_atom_feat, n_pair_feat,
        n_hidden)`` per combination, where ``r2`` is the Pearson R² on the
        validation set after 30 epochs. A combination whose training or
        evaluation raises is reported with ``r2 == -inf``.
    """
    grid = list(product(
        param_grid['batch_size'],
        param_grid['n_atom_feat'],
        param_grid['n_pair_feat'],
        param_grid['n_hidden']
    ))
    sc = spark.sparkContext
    grid_rdd = sc.parallelize(grid)

    # Broadcast train/valid sets
    bc_train = sc.broadcast(train_set)
    bc_valid = sc.broadcast(valid_set)

    def evaluate_mpnn(config):
        bs, atom_feat, pair_feat, hidden = config
        model = dc.models.MPNNModel(
            n_tasks=1, mode='regression',
            batch_size=bs,
            n_atom_feat=atom_feat,
            n_pair_feat=pair_feat,
            n_hidden=hidden,
            learning_rate=0.0001,
            T=3, M=5
        )
        try:
            model.fit(bc_train.value, nb_epoch=30)
            metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
            r2 = model.evaluate(bc_valid.value, [metric])['pearson_r2_score']
            return (r2, bs, atom_feat, pair_feat, hidden)
        except Exception as exc:
            # Keep the best-effort behaviour (a failed config scores -inf) but
            # report why it failed, and let KeyboardInterrupt/SystemExit through.
            print(f"MPNN config {config} failed: {exc!r}", file=sys.stderr)
            return (-float('inf'), bs, atom_feat, pair_feat, hidden)

    try:
        return grid_rdd.map(evaluate_mpnn).collect()
    finally:
        # Release the broadcast copies once the job is done (or has failed).
        bc_train.unpersist()
        bc_valid.unpersist()

# 🧪 Step 4: Run Grid Search
mpnn_params = {
    'batch_size': [32, 16],
    'n_atom_feat': [75],
    'n_pair_feat': [14],
    'n_hidden': [100]
}

print("🔍 Starting MPNN grid search...")
start_mpnn_grid = time.time()
mpnn_results = spark_grid_search_mpnn(mpnn_params)
end_mpnn_grid = time.time()

# 📈 Step 5: Report best result
# Failed configurations come back with a score of -inf, and a NaN score cannot
# be ranked reliably by max(). Only finite scores compete, and the cell stops
# here if none exist rather than passing a failed config on to final training.
mpnn_scored = [r for r in mpnn_results if np.isfinite(r[0])]
if not mpnn_scored:
    raise RuntimeError(
        "MPNN grid search produced no usable score; every configuration failed."
    )

best_mpnn = max(mpnn_scored, key=lambda x: x[0])
print(f"\n✅ Best MPNN Config (Spark):")
print(f"R²: {best_mpnn[0]:.4f}")
print(f"Params: batch={best_mpnn[1]}, atom_feat={best_mpnn[2]}, pair_feat={best_mpnn[3]}, hidden={best_mpnn[4]}")
print(f"⏱️ Grid search time: {end_mpnn_grid - start_mpnn_grid:.2f}s")

# 🏁 Step 6: Train Final Model with Best Hyperparameters
print("🚀 Training final MPNN model with best hyperparameters...")

# best_mpnn is (r2, batch_size, n_atom_feat, n_pair_feat, n_hidden); the
# remaining settings repeat the fixed values used during the grid search.
mpnn_best_kwargs = dict(
    n_tasks=1,
    mode='regression',
    batch_size=best_mpnn[1],
    n_atom_feat=best_mpnn[2],
    n_pair_feat=best_mpnn[3],
    n_hidden=best_mpnn[4],
    learning_rate=0.0001,
    T=3,
    M=5,
)
mpnn_model = dc.models.MPNNModel(**mpnn_best_kwargs)

# Train for 100 epochs on the training split and report the wall-clock time.
start_mpnn_train = time.time()
mpnn_model.fit(train_set, nb_epoch=100)
end_mpnn_train = time.time()
print(f"✅ Final MPNN model trained in {end_mpnn_train - start_mpnn_train:.2f}s")


🔄 Loading and featurizing ESOL dataset...


25/05/03 18:55:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


🔍 Starting MPNN grid search...


2025-05-03 18:55:27.691851: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:55:27.691851: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:55:27.692124: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:55:27.692447: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-03 18:55:27.694500: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r


✅ Best MPNN Config (Spark):
R²: 0.9215
Params: batch=16, atom_feat=75, pair_feat=14, hidden=100
⏱️ Grid search time: 1258.22s
🚀 Training final MPNN model with best hyperparameters...


2025-05-03 19:16:29.312665: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 175840000 exceeds 10% of free system memory.
2025-05-03 19:16:29.312979: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 175840000 exceeds 10% of free system memory.
2025-05-03 19:16:29.313045: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 175840000 exceeds 10% of free system memory.
2025-05-03 19:16:29.570381: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 175840000 exceeds 10% of free system memory.
2025-05-03 19:16:29.633542: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 175840000 exceeds 10% of free system memory.


✅ Final MPNN model trained in 2323.87s


In [11]:
# Evaluate best MPNN model: RMSE, MAE and Pearson R², each averaged with np.mean.
mpnn_metrics = [
    dc.metrics.Metric(score_fn, np.mean)
    for score_fn in (
        dc.metrics.rms_score,
        dc.metrics.mae_score,
        dc.metrics.pearson_r2_score,
    )
]

# Score the final MPNN model on both splits.
mpnn_train_scores = mpnn_model.evaluate(train_set, mpnn_metrics)
mpnn_valid_scores = mpnn_model.evaluate(valid_set, mpnn_metrics)

print("\n📊 MPNN Evaluation Results:")
print(f"Train: {mpnn_train_scores}")
print(f"Valid: {mpnn_valid_scores}")



📊 MPNN Evaluation Results:
Train: {'mean-rms_score': 0.1928989628892072, 'mean-mae_score': 0.14326998514466055, 'mean-pearson_r2_score': 0.9918172976521502}
Valid: {'mean-rms_score': 0.6183405969062387, 'mean-mae_score': 0.44768700133013506, 'mean-pearson_r2_score': 0.9119158937600114}


In [14]:
# 📊 Collect the validation scores of both models
metrics = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]


def _validation_score(scores, metric_name):
    """Look up one metric in a dict returned by ``model.evaluate``.

    The evaluation cells build their metrics with ``np.mean`` as task averager,
    so the score dicts are keyed ``'mean-<metric_name>'``. Looking up the bare
    metric name, as this cell used to, always fell through to NaN. The bare
    name is still tried as a fallback; NaN is returned only when neither key
    exists.
    """
    return scores.get(f'mean-{metric_name}', scores.get(metric_name, float('nan')))


_SUMMARY_METRICS = ('rms_score', 'mae_score', 'pearson_r2_score')

# Same shape as before: results[<model>]['mean-<metric>'] -> validation score.
results = {
    model_key: {
        f'mean-{metric_name}': _validation_score(valid_scores, metric_name)
        for metric_name in _SUMMARY_METRICS
    }
    for model_key, valid_scores in (
        ('gcm', gcm_valid_scores),
        ('mpnn', mpnn_valid_scores),
    )
}


In [15]:
from IPython.display import display, clear_output
from ipywidgets import widgets, VBox, HBox
from rdkit import Chem
from rdkit.Chem import Draw

# Each model needs the featurizer it was trained with:
# ConvMolFeaturizer for GraphConvModel, WeaveFeaturizer for MPNNModel.
featurizer_gcm, featurizer_mpnn = (
    dc.feat.ConvMolFeaturizer(),
    dc.feat.WeaveFeaturizer(),
)

def predict_with_model(smiles, selected_model):
    """Predict log solubility for one SMILES string with the chosen trained model.

    Reads the module-level featurizers (``featurizer_gcm`` / ``featurizer_mpnn``),
    trained models (``model_gcm`` / ``mpnn_model``) and the ``results`` summary;
    only the objects belonging to the selected model are touched.

    Args:
        smiles: SMILES string of the molecule.
        selected_model: ``"GraphConvModel"`` or ``"MPNNModel"``.

    Returns:
        A ``(mol, message)`` tuple. ``mol`` is the parsed RDKit molecule on
        success and ``None`` when the SMILES is invalid, the model name is
        unknown, or prediction raised. ``message`` is the text to show.
    """
    # Reject non-string and blank input before handing it to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None, "❌ Invalid SMILES"

    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms has nothing to featurize; treat it as invalid too.
    if mol is None or mol.GetNumAtoms() == 0:
        return None, "❌ Invalid SMILES"

    try:
        if selected_model == "GraphConvModel":
            featurizer, model, summary = featurizer_gcm, model_gcm, results['gcm']
        elif selected_model == "MPNNModel":
            featurizer, model, summary = featurizer_mpnn, mpnn_model, results['mpnn']
        else:
            return None, f"❌ {selected_model} is not available."

        feats = featurizer.featurize([mol])
        ds = dc.data.NumpyDataset(X=np.array(feats))
        pred = model.predict(ds)[0][0]
        mae = summary['mean-mae_score']
    except Exception as e:
        return None, f"Prediction error: {e}"

    # NaN is the only value that differs from itself. Without a usable
    # validation MAE, report the bare prediction instead of "± nan → [nan, nan]".
    if mae is None or mae != mae:
        return mol, f"✅ Predicted log solubility: {pred:.2f} (validation MAE unavailable)"

    lower, upper = pred - mae, pred + mae
    return mol, f"✅ Predicted log solubility: {pred:.2f} (± {mae:.2f}) → [{lower:.2f}, {upper:.2f}]"

def launch_predictor_interface():
    """Display the interactive predictor: SMILES box, model dropdown and Predict button.

    Clicking the button clears the output area, runs ``predict_with_model`` on
    the stripped SMILES text with the selected model, draws the molecule when
    one is returned, and prints the result message.
    """
    header = widgets.HTML("<h3 style='color:#1f77b4;'>🔬 Interactive Solubility Predictor</h3>")
    smiles_box = widgets.Text(
        value='CCO',
        description='SMILES:',
        placeholder='Enter a SMILES string',
        layout=widgets.Layout(width='60%'),
    )
    model_dropdown = widgets.Dropdown(
        options=['GraphConvModel', 'MPNNModel'],
        value='GraphConvModel',
        description='Model:',
        style={'description_width': 'initial'},
    )
    run_button = widgets.Button(description='Predict', button_style='success')
    result_area = widgets.Output()

    def handle_predict(_button):
        # Everything displayed or printed inside this context lands in the output widget.
        with result_area:
            clear_output()
            mol, message = predict_with_model(
                smiles_box.value.strip(),
                model_dropdown.value,
            )
            if mol:
                display(Draw.MolToImage(mol, size=(250, 250)))
            print(message)

    run_button.on_click(handle_predict)

    controls = HBox([smiles_box, model_dropdown, run_button])
    display(VBox([header, controls, result_area]))

launch_predictor_interface()


VBox(children=(HTML(value="<h3 style='color:#1f77b4;'>🔬 Interactive Solubility Predictor</h3>"), HBox(children…