In [36]:
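# Uncomment the line below if the packages are missing from the kernel's environment
# (%pip installs into the interpreter this notebook is running on).
# %pip install deepchem rdkit-pypi pyspark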

import time
import numpy as np
import pandas as pd
from rdkit import Chem
import deepchem as dc
from deepchem.feat import ConvMolFeaturizer
from deepchem.models import GraphConvModel
from deepchem.splits import RandomSplitter
from deepchem.models.optimizers import ExponentialDecay
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import BooleanType, DoubleType
from tqdm import tqdm
from deepchem.metrics import Metric, pearson_r2_score, mean_absolute_error

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# py4j reports every gateway socket event at INFO; with the root logger at INFO
# that buries the notebook's own messages, so keep only WARNING and above.
logging.getLogger("py4j").setLevel(logging.WARNING)

# Initialize Spark. getOrCreate() hands back an already-running session if one
# exists, so re-executing this cell does not start a second session.
spark = SparkSession.builder.appName("LogPPredictor").getOrCreate()


INFO:py4j.clientserver:Error while sending or receiving.
Traceback (most recent call last):
  File "/home/mounir/spark_logp_fixed_env/lib/python3.10/site-packages/py4j/clientserver.py", line 503, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
INFO:py4j.clientserver:Closing down clientserver connection
INFO:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/mounir/spark_logp_fixed_env/lib/python3.10/site-packages/py4j/clientserver.py", line 503, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mounir/spark_logp_fixed_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/mounir/spark_logp_fixed_en

In [37]:
class LogPPredictor:
    """Load logP data with Spark, train a DeepChem GraphConvModel, and predict.

    Attributes:
        metrics: Nested dict holding 'performance' (train_r2, test_r2,
            test_mae) and 'timing' (total_training); filled by train_model().
        featurizer: ConvMolFeaturizer used for both training and prediction.
        model: The GraphConvModel, or None until train_model() has run.
        dataset: dc.data.NumpyDataset built by load_data(), or None.
        dataset_df: pandas DataFrame of the rows kept in `dataset`
            (columns 'smiles' and 'exp'), or None.
    """

    def __init__(self):
        self.metrics = {
            'performance': {'train_r2': None, 'test_r2': None, 'test_mae': None},
            'timing': {'total_training': None}
        }
        self.featurizer = ConvMolFeaturizer()
        self.model = None
        self.dataset = None
        self.dataset_df = None

    @staticmethod
    def _is_valid_feature(feature):
        """Return True when `feature` is a usable featurization result.

        A failed molecule may be represented either as None or as an empty
        NumPy array placeholder; both are rejected here.
        """
        if feature is None:
            return False
        if isinstance(feature, np.ndarray) and feature.size == 0:
            return False
        return True

    def load_data(self, data_paths):
        """Read CSV files, keep valid rows, and featurize them into a dataset.

        Each file must have a 'smiles' column and a target column named
        either 'exp' or 'logP' (renamed to 'exp'). Rows are dropped when the
        SMILES is missing or unparsable, when the target is not numeric, or
        when the target lies outside [-10, 10].

        Args:
            data_paths: Iterable of CSV paths readable by the module-level
                `spark` session.

        Raises:
            ValueError: If a file lacks the required columns, if no file was
                given, or if no molecule survives filtering and featurization.
            Exception: Any Spark read error is logged and re-raised.
        """
        logger.info("Loading and processing data...")

        # Built once for all files. The explicit None test matters: RDKit
        # raises on a null SMILES cell, which would abort the whole Spark job.
        validate_smiles = udf(
            lambda s: s is not None and Chem.MolFromSmiles(s) is not None,
            BooleanType()
        )

        dfs = []
        for path in data_paths:
            try:
                df = spark.read.csv(path, header=True, inferSchema=True)
                target_col = next((c for c in ['exp', 'logP'] if c in df.columns), None)
                if not target_col:
                    raise ValueError("Dataset must contain 'exp' or 'logP'")
                if 'smiles' not in df.columns:
                    raise ValueError("Dataset must contain 'smiles'")
                if target_col == 'logP':
                    df = df.withColumnRenamed('logP', 'exp')
                # Fixed column order so the positional union below lines up.
                df = df.select('smiles', 'exp')

                df = df.filter(validate_smiles(col('smiles')))
                df = df.withColumn('exp', col('exp').cast(DoubleType())) \
                       .filter(col('exp').isNotNull()) \
                       .filter((col('exp') >= -10) & (col('exp') <= 10))
                dfs.append(df)
            except Exception as e:
                logger.error(f"Failed loading {path}: {e}")
                raise

        if not dfs:
            raise ValueError("No valid datasets loaded")

        combined_df = dfs[0]
        for df in dfs[1:]:
            combined_df = combined_df.union(df)

        pandas_df = combined_df.toPandas()
        mols = [Chem.MolFromSmiles(s) for s in pandas_df['smiles']]
        features = self.featurizer.featurize(mols)

        # Keep only rows whose featurization actually produced something.
        valid_indices = [i for i, f in enumerate(features) if self._is_valid_feature(f)]
        if not valid_indices:
            raise ValueError("No molecules could be featurized")
        valid_features = [features[i] for i in valid_indices]

        self.dataset = dc.data.NumpyDataset(
            X=np.array(valid_features),
            y=pandas_df['exp'].values[valid_indices]
        )
        self.dataset_df = pandas_df.iloc[valid_indices]
        logger.info("Featurization complete.")

    def train_model(self, max_epochs=200, patience=20):
        """Train the graph-conv model with early stopping and record metrics.

        The dataset is split 70/15/15 (seed 42). After every epoch the
        validation `pearson_r2_score` is computed; the model is checkpointed
        whenever it improves, and training stops once `patience` consecutive
        epochs bring no improvement. The best checkpoint is restored before
        the final train/test metrics are written to `self.metrics`.

        Args:
            max_epochs: Upper bound on the number of training epochs.
            patience: Non-improving epochs tolerated before stopping.

        Raises:
            ValueError: If load_data() has not produced a non-empty dataset.
        """
        if self.dataset is None or len(self.dataset) == 0:
            raise ValueError("No dataset to train on.")

        splitter = RandomSplitter()
        train, valid, test = splitter.train_valid_test_split(
            self.dataset, frac_train=0.7, frac_valid=0.15, frac_test=0.15, seed=42
        )

        lr_schedule = ExponentialDecay(0.001, decay_rate=0.95, decay_steps=1000)
        self.model = GraphConvModel(
            n_tasks=1, mode='regression',
            graph_conv_layers=[128, 128, 64],
            dense_layer_size=256, dropout=0.3,
            batch_size=32,
            optimizer_kwargs={'learning_rate': lr_schedule},
            batch_normalize=False
        )

        validation_metrics = [Metric(pearson_r2_score)]
        best_score = -float('inf')
        no_improve = 0
        checkpoint_saved = False
        start = time.time()

        print("🧠 Training started...")
        for epoch in tqdm(range(max_epochs), desc="Epochs"):
            # checkpoint_interval=0 turns off fit()'s automatic checkpoints.
            # restore() loads the most recent checkpoint, so the only ones on
            # disk must be the best-score saves made below; otherwise the
            # last (possibly worse) epoch would be restored instead.
            self.model.fit(train, nb_epoch=1, checkpoint_interval=0)

            val_score = self.model.evaluate(valid, validation_metrics)['pearson_r2_score']
            if val_score > best_score:
                best_score = val_score
                no_improve = 0
                self.model.save_checkpoint()
                checkpoint_saved = True
            else:
                no_improve += 1

            if no_improve >= patience:
                logger.info(f"⏹ Early stopping at epoch {epoch} (best R² = {best_score:.4f})")
                break

        # Nothing to restore if no epoch ran or the score never became
        # comparable (e.g. NaN); keep the in-memory weights in that case.
        if checkpoint_saved:
            self.model.restore()
        self.metrics['timing']['total_training'] = time.time() - start

        metrics = [Metric(pearson_r2_score), Metric(mean_absolute_error)]
        self.metrics['performance']['train_r2'] = self.model.evaluate(train, metrics)['pearson_r2_score']
        test_results = self.model.evaluate(test, metrics)
        self.metrics['performance']['test_r2'] = test_results['pearson_r2_score']
        self.metrics['performance']['test_mae'] = test_results['mean_absolute_error']

        print(f"✅ Training finished in {self.metrics['timing']['total_training']:.2f} seconds")
        print(f"Train R²: {self.metrics['performance']['train_r2']:.3f}")
        print(f"Test R²: {self.metrics['performance']['test_r2']:.3f}")
        print(f"Test MAE: {self.metrics['performance']['test_mae']:.3f}")

    def predict_logP(self, smiles):
        """Predict logP for a single SMILES string.

        Args:
            smiles: SMILES representation of the molecule.

        Returns:
            The model's prediction as a Python float.

        Raises:
            ValueError: If the model has not been trained, the SMILES cannot
                be parsed, or featurization fails.
        """
        if self.model is None:
            raise ValueError("Model is not trained; call train_model() first")
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("Invalid SMILES")
        feat = self.featurizer.featurize([mol])
        # len() instead of truthiness: bool() of a NumPy array is ambiguous.
        if len(feat) == 0 or not self._is_valid_feature(feat[0]):
            raise ValueError("Featurization failed")
        return float(self.model.predict(dc.data.NumpyDataset(X=np.array(feat)))[0][0])


In [38]:
import os

predictor = LogPPredictor()

# Directory holding the CSV inputs. Set LOGP_DATA_DIR to run the notebook on
# another machine; the default is the location the notebook was written against.
DATA_DIR = os.environ.get("LOGP_DATA_DIR", "/home/mounir/Downloads")

DATA_PATHS = [
    os.path.join(DATA_DIR, "Lipophilicity.csv"),
    os.path.join(DATA_DIR, "logP_dataset.csv")
]

predictor.load_data(DATA_PATHS)


INFO:__main__:Loading and processing data...
INFO:deepchem.feat.base_classes:Featurizing datapoint 0                         
INFO:deepchem.feat.base_classes:Featurizing datapoint 1000
INFO:deepchem.feat.base_classes:Featurizing datapoint 2000
INFO:deepchem.feat.base_classes:Featurizing datapoint 3000
INFO:deepchem.feat.base_classes:Featurizing datapoint 4000
INFO:deepchem.feat.base_classes:Featurizing datapoint 5000
INFO:deepchem.feat.base_classes:Featurizing datapoint 6000
INFO:deepchem.feat.base_classes:Featurizing datapoint 7000
INFO:deepchem.feat.base_classes:Featurizing datapoint 8000
INFO:deepchem.feat.base_classes:Featurizing datapoint 9000
INFO:deepchem.feat.base_classes:Featurizing datapoint 10000
INFO:deepchem.feat.base_classes:Featurizing datapoint 11000
INFO:deepchem.feat.base_classes:Featurizing datapoint 12000
INFO:deepchem.feat.base_classes:Featurizing datapoint 13000
INFO:deepchem.feat.base_classes:Featurizing datapoint 14000
INFO:deepchem.feat.base_classes:Featurizing

In [39]:
# Train with the method's defaults (max_epochs=200, patience=20); the metrics
# end up in predictor.metrics and are also printed by train_model() itself.
print("Training model...")
predictor.train_model()


INFO:deepchem.splits.splitters:Computing train/valid/test indices


Training model...
🧠 Training started...


INFO:deepchem.models.keras_model:Ending global_step 100: Average loss 1.57579
INFO:deepchem.models.keras_model:Ending global_step 200: Average loss 1.1828
INFO:deepchem.models.keras_model:Ending global_step 300: Average loss 0.845502
INFO:deepchem.models.keras_model:Ending global_step 400: Average loss 0.790004
INFO:deepchem.models.keras_model:Ending global_step 412: Average loss 0.752826
INFO:deepchem.models.keras_model:TIMING: model fitting took 20.249 s
INFO:deepchem.metrics.metric:computed_metrics: [0.6345631084641901]
Epochs:   0%|▏                                | 1/200 [00:22<1:14:47, 22.55s/it]INFO:deepchem.models.keras_model:Ending global_step 500: Average loss 0.763539
INFO:deepchem.models.keras_model:Ending global_step 600: Average loss 0.701614
INFO:deepchem.models.keras_model:Ending global_step 700: Average loss 0.655352
INFO:deepchem.models.keras_model:Ending global_step 800: Average loss 0.56137
INFO:deepchem.models.keras_model:Ending global_step 824: Average loss 0.5820

✅ Training finished in 423.36 seconds
Train R²: 0.847
Test R²: 0.842
Test MAE: 0.409


In [40]:
# Re-print the headline numbers that train_model() stored on the predictor.
perf = predictor.metrics['performance']
training_seconds = predictor.metrics['timing']['total_training']

print("\n=== Evaluation Metrics ===")
print(f"Train R²: {perf['train_r2']:.3f}")
print(f"Test R²: {perf['test_r2']:.3f}")
print(f"Test MAE: {perf['test_mae']:.3f}")
print(f"Training Time: {training_seconds:.2f} seconds")



=== Evaluation Metrics ===
Train R²: 0.847
Test R²: 0.842
Test MAE: 0.409
Training Time: 423.36 seconds


In [41]:
from IPython.display import display, clear_output
from ipywidgets import widgets
from rdkit import Chem
from rdkit.Chem import Draw

def predict_logP_interval_direct(smiles, model, featurizer, margin=0.5):
    """Predict a symmetric logP interval (prediction ± margin) for a SMILES.

    Args:
        smiles: SMILES string of the molecule.
        model: Trained model exposing ``predict(dataset)``.
        featurizer: Featurizer exposing ``featurize(list_of_mols)``.
        margin: Half-width of the returned interval; defaults to 0.5.
            This is a fixed band, not a statistical confidence interval.

    Returns:
        Tuple ``(low, high)`` of Python floats.

    Raises:
        ValueError: If `margin` is negative, the SMILES cannot be parsed, or
            the molecule could not be featurized.
    """
    if margin < 0:
        raise ValueError("margin must be non-negative")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string")

    features = featurizer.featurize([mol])
    # A failed featurization may come back as None or as an empty array
    # placeholder; feeding either to the model would fail far less clearly.
    first = features[0] if len(features) else None
    if first is None or (isinstance(first, np.ndarray) and first.size == 0):
        raise ValueError("Featurization failed")

    dataset = dc.data.NumpyDataset(features)
    # Convert to a plain float so callers do not receive a NumPy scalar.
    prediction = float(model.predict(dataset)[0][0])
    return prediction - margin, prediction + margin

def _interpret_logP_range(low, high):
    """Return the solubility label for a predicted logP interval.

    The checks run in order and alternate between the upper and lower bound,
    so the first matching rule wins.
    """
    if high < 0:
        return "💧 Highly hydrophilic (water-soluble)"
    if low < 1:
        return "🧪 Moderately hydrophilic"
    if high < 3:
        return "⚖️ Balanced solubility"
    if low < 5:
        return "🛢 Moderately hydrophobic"
    return "🔥 Highly hydrophobic (lipid-soluble)"


def launch_simple_predictor_interface(model, featurizer, dataset_df=None):
    """Display an ipywidgets form that predicts logP for a typed SMILES.

    Args:
        model: Trained model passed through to predict_logP_interval_direct.
        featurizer: Featurizer passed through to predict_logP_interval_direct.
        dataset_df: Optional DataFrame with 'smiles' and 'exp' columns. When
            the typed SMILES matches a row exactly (string equality, no
            canonicalization), its recorded value is shown as well.
    """
    smiles_input = widgets.Text(
        value='CCO',
        placeholder='Enter SMILES (e.g., CCO)',
        description='SMILES:',
        layout=widgets.Layout(width='80%')
    )

    predict_button = widgets.Button(
        description="Predict", button_style='info'
    )
    output = widgets.Output()

    def make_prediction(_):
        """Button callback: draw the molecule and print prediction details."""
        with output:
            clear_output(wait=True)
            smiles = smiles_input.value.strip()
            if not smiles:
                print("Please enter a SMILES string.")
                return

            # Errors are reported inside the output widget rather than raised,
            # so a bad input never breaks the interface.
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    print("Invalid SMILES string.")
                    return
                display(Draw.MolToImage(mol, size=(300, 300)))

                if dataset_df is not None:
                    # One pass over the column; the first match is reported.
                    known = dataset_df.loc[dataset_df['smiles'] == smiles, 'exp']
                    if not known.empty:
                        actual = known.iloc[0]
                        print(f"Known compound - Actual logP: {actual:.2f}")

                low, high = predict_logP_interval_direct(smiles, model, featurizer)
                print(f"\nPredicted logP range: {low:.2f} to {high:.2f}")
                print(f"Estimated logP: {(low + high)/2:.2f}")

                # Simple interpretation
                print(_interpret_logP_range(low, high))

            except Exception as e:
                print(f"Prediction error: {str(e)}")

    predict_button.on_click(make_prediction)

    display(widgets.VBox([
        widgets.HTML("<h2 style='color:#1f77b4;'>LogP Predictor Interface</h2>"),
        widgets.HTML("<p>Enter a SMILES string to estimate its logP and solubility interpretation</p>"),
        widgets.HBox([smiles_input, predict_button]),
        output
    ]))

# Run this after load_data() and train_model(): predictor.model is None until
# training has run, and predictor.dataset_df is None until data has been loaded.
launch_simple_predictor_interface(
    model=predictor.model,
    featurizer=predictor.featurizer,
    dataset_df=predictor.dataset_df
)


VBox(children=(HTML(value="<h2 style='color:#1f77b4;'>LogP Predictor Interface</h2>"), HTML(value='<p>Enter a …

In [42]:
pip list

IOStream.flush timed out
Package                      Version
---------------------------- --------------
absl-py                      2.2.2
anyio                        4.9.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.5
attrs                        25.3.0
babel                        2.17.0
beautifulsoup4               4.13.4
bleach                       6.2.0
cachetools                   5.5.2
certifi                      2025.1.31
cffi                         1.17.1
charset-normalizer           3.4.1
comm                         0.2.2
contourpy                    1.3.1
cycler                       0.12.1
debugpy                      1.8.14
decorator                    5.2.1
deepchem                     2.7.1
defusedxml                   0.7.1
exceptiongroup               1.2.2
executing                    2.2.0
fastjso