### EXCRETION - HALF LIFE

In [7]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc.single_pred.adme import ADME
from tdc import Evaluator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from time import time

In [2]:
!pip install scikit-learn-intelex



### USING INTEL EXTENSION FOR SCIKIT LEARN

In [3]:
from sklearnex import patch_sklearn

patch_sklearn()

# Patching only affects scikit-learn names that are imported AFTER patch_sklearn()
# has run. The notebook's imports cell binds RandomForestRegressor earlier, so
# re-import it here; otherwise a fresh "Restart & Run All" would benchmark the
# stock estimator in the section that is supposed to measure the extension.
from sklearn.ensemble import RandomForestRegressor

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Featurizer class definition
class Featurizer:
    """Abstract base for callables that turn a dataframe into model inputs.

    Attributes:
        y_column: Name of the target column.
        smiles_col: Name of the column holding SMILES strings (default ``'X'``).

    Any additional keyword arguments are stored as instance attributes.
    """

    def __init__(self, y_column, smiles_col='X', **kwargs):
        # Named settings first, then the free-form extras, all as attributes.
        vars(self).update({'y_column': y_column, 'smiles_col': smiles_col, **kwargs})

    def __call__(self, df):
        """Featurize ``df``; concrete subclasses must override this."""
        raise NotImplementedError

# ECFP Featurizer for molecular fingerprint generation
class ECFPFeaturizer(Featurizer):
    """Turn a dataframe of SMILES strings into Morgan fingerprints plus labels.

    Args:
        y_column: Name of the target column read for every row.
        radius: Radius passed to ``GetMorganFingerprintAsBitVect`` (default 2).
        length: Number of bits (``nBits``) of each fingerprint (default 1024).
        **kwargs: Forwarded to ``Featurizer.__init__`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Featurize every row of ``df``.

        Args:
            df: DataFrame containing ``self.smiles_col`` and ``self.y_column``.

        Returns:
            Tuple ``(fingerprints, labels)`` of numpy arrays built from the
            per-row fingerprints and target values, in row order.

        Raises:
            ValueError: If a SMILES string cannot be parsed into a molecule.
        """
        fingerprints = []
        labels = []
        # Iterate the two needed columns directly instead of materialising a
        # Series per row with iterrows().
        for idx, smiles, y in zip(df.index, df[self.smiles_col], df[self.y_column]):
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # MolFromSmiles signals a parse failure by returning None; fail
                # here with the offending row instead of deep inside RDKit.
                raise ValueError(f"Could not parse SMILES at row {idx!r}: {smiles!r}")
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


In [6]:
data = pd.read_csv('half_life_obach.csv')

# Fail early, with a clear message, if the CSV does not have the SMILES ('X')
# and target ('Y') columns that the rest of this cell relies on.
missing_columns = {'X', 'Y'} - set(data.columns)
if missing_columns:
    raise KeyError(f"half_life_obach.csv is missing expected column(s): {sorted(missing_columns)}")

# Split the data into features and target variable
X = data.drop(columns=['Y'])  # Features
y = data['Y']  # Target variable

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Re-attach the target to each split. `.assign` returns a new frame, so this
# never writes into a slice handed back by train_test_split (which can raise
# SettingWithCopyWarning on some pandas / scikit-learn combinations).
X_train = X_train.assign(Y=y_train.values)
X_valid = X_valid.assign(Y=y_valid.values)
X_test = X_test.assign(Y=y_test.values)

# The three splits together must account for every input row.
assert len(X_train) + len(X_valid) + len(X_test) == len(data)

# Featurizer to generate molecular fingerprints
featurizer = ECFPFeaturizer(y_column='Y', smiles_col='X')

# Apply featurizer to generate fingerprints
X_train_featurized, y_train_featurized = featurizer(X_train)
X_valid_featurized, y_valid_featurized = featurizer(X_valid)
X_test_featurized, y_test_featurized = featurizer(X_test)

In [8]:
def train(X_train, y_train, X_valid, y_valid):
    """Fit a RandomForestRegressor and print validation metrics and fit time.

    Args:
        X_train: Training feature matrix passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_valid: Validation feature matrix passed to ``model.predict``.
        y_valid: Validation targets the predictions are scored against.

    Returns:
        The fitted RandomForestRegressor instance.
    """
    # Set up parameters for the RandomForestRegressor
    params = {"n_estimators": 150, "random_state": 44, "n_jobs": -1}

    # Time construction + fit only; prediction and scoring are excluded.
    start = time()
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    train_time = time() - start

    # Make predictions on the validation set
    valid_predictions = model.predict(X_valid)

    # RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` keyword is deprecated in recent scikit-learn releases and
    # removed in later ones, whereas this form works on every version.
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    valid_mae = mean_absolute_error(y_valid, valid_predictions)
    valid_r2 = r2_score(y_valid, valid_predictions)

    # Print the validation results and training time
    print(f'Validation RMSE: {valid_rmse:.4f}, MAE: {valid_mae:.4f}, R2: {valid_r2:.4f}')
    print(f"Intel® extension for Scikit-learn training time: {train_time:.2f} seconds")

    return model

In [9]:
def predict(model, X_test):
    """Return ``model.predict(X_test)`` unchanged.

    Args:
        model: Any object exposing a ``predict`` method.
        X_test: Feature matrix handed to ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns for ``X_test``.
    """
    return model.predict(X_test)


In [10]:
# Train model
model = train(X_train_featurized, y_train_featurized, X_valid_featurized, y_valid_featurized)

# Make predictions on the test set
predictions = predict(model, X_test_featurized)

# Evaluate predictions on the held-out test split.
# RMSE is sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated in recent scikit-learn releases and removed in later ones.
rmse = np.sqrt(mean_squared_error(y_test_featurized, predictions))
mae = mean_absolute_error(y_test_featurized, predictions)
r2 = r2_score(y_test_featurized, predictions)

print(f'RMSE: {rmse}, MAE: {mae}, R2: {r2}')


Validation RMSE: 44.4596, MAE: 15.0276, R2: -6.1803
Intel® extension for Scikit-learn training time: 1.28 seconds
RMSE: 118.7920774330499, MAE: 35.003840561056116, R2: 0.25174990265947894


### WITHOUT USING INTEL ONEAPI EXTENSION FOR SCIKIT LEARN

In [11]:
from sklearnex import unpatch_sklearn

unpatch_sklearn()

# Undoing the patch does not rebind names that were imported earlier, so
# re-import the estimator to make sure the baseline below really trains the
# stock scikit-learn RandomForestRegressor.
from sklearn.ensemble import RandomForestRegressor

In [14]:
def train(X_train, y_train, X_valid, y_valid):
    """Fit a RandomForestRegressor for the stock scikit-learn baseline run.

    Same procedure as the accelerated run (identical hyper-parameters and
    metrics); only the timing label differs so the two outputs can be told
    apart.

    Args:
        X_train: Training feature matrix passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_valid: Validation feature matrix passed to ``model.predict``.
        y_valid: Validation targets the predictions are scored against.

    Returns:
        The fitted RandomForestRegressor instance.
    """
    # Set up parameters for the RandomForestRegressor
    params = {"n_estimators": 150, "random_state": 44, "n_jobs": -1}

    # Time construction + fit only; prediction and scoring are excluded.
    start = time()
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    train_time = time() - start

    # Make predictions on the validation set
    valid_predictions = model.predict(X_valid)

    # RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` keyword is deprecated in recent scikit-learn releases and
    # removed in later ones, whereas this form works on every version.
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    valid_mae = mean_absolute_error(y_valid, valid_predictions)
    valid_r2 = r2_score(y_valid, valid_predictions)

    # Print the validation results and training time. This section runs after
    # unpatch_sklearn(), so the timing must not be attributed to the extension.
    print(f'Validation RMSE: {valid_rmse:.4f}, MAE: {valid_mae:.4f}, R2: {valid_r2:.4f}')
    print(f"Stock scikit-learn training time: {train_time:.2f} seconds")

    return model

In [13]:
# Train model
model = train(X_train_featurized, y_train_featurized, X_valid_featurized, y_valid_featurized)

# Make predictions on the test set
predictions = predict(model, X_test_featurized)

# Evaluate predictions on the held-out test split.
# RMSE is sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated in recent scikit-learn releases and removed in later ones.
rmse = np.sqrt(mean_squared_error(y_test_featurized, predictions))
mae = mean_absolute_error(y_test_featurized, predictions)
r2 = r2_score(y_test_featurized, predictions)

print(f'RMSE: {rmse}, MAE: {mae}, R2: {r2}')


Validation RMSE: 44.4596, MAE: 15.0276, R2: -6.1803
Intel® extension for Scikit-learn training time: 6.22 seconds
RMSE: 118.7920774330499, MAE: 35.003840561056116, R2: 0.25174990265947894


### SAVING THE MODEL

In [15]:
# Persist the fitted regressor to disk so later cells can reload it for inference
with open('Excretion_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)


In [16]:
class Pre_Featurizer:
    """Abstract base for prediction-time featurizers.

    Attributes:
        y_column: Name of the target column (may be ``None``).
        smiles_col: Name of the column holding SMILES strings (default ``'Drug'``).

    Any additional keyword arguments are stored as instance attributes.
    """

    def __init__(self, y_column, smiles_col='Drug', **kwargs):
        # Named settings first, then the free-form extras, all as attributes.
        vars(self).update({'y_column': y_column, 'smiles_col': smiles_col, **kwargs})

    def __call__(self, df):
        """Featurize ``df``; concrete subclasses must override this."""
        raise NotImplementedError

# ECFP Featurizer for molecular fingerprint generation
class Pre_ECFPFeaturizer(Pre_Featurizer):
    """Morgan-fingerprint featurizer that also works on unlabeled dataframes.

    Args:
        y_column: Optional name of the target column; labels are collected only
            when it is set and present in the dataframe.
        radius: Radius passed to ``GetMorganFingerprintAsBitVect`` (default 2).
        length: Number of bits (``nBits``) of each fingerprint (default 1024).
        **kwargs: Forwarded to ``Pre_Featurizer.__init__`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column=None, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Featurize every row of ``df``.

        Args:
            df: DataFrame containing ``self.smiles_col`` and, optionally,
                ``self.y_column``.

        Returns:
            Tuple ``(fingerprints, labels)``. ``fingerprints`` is a numpy array
            built from the per-row fingerprints; ``labels`` is a numpy array of
            targets, or ``None`` when no labels were collected.

        Raises:
            ValueError: If a SMILES string cannot be parsed into a molecule.
        """
        # Whether labels exist is a property of the frame, not of each row, so
        # decide once. `is not None` (rather than truthiness) keeps falsy but
        # valid column names such as 0 usable.
        has_labels = self.y_column is not None and self.y_column in df.columns

        fingerprints = []
        labels = []
        for idx, row in df.iterrows():
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # MolFromSmiles signals a parse failure by returning None; fail
                # here with the offending row instead of deep inside RDKit.
                raise ValueError(f"Could not parse SMILES at row {idx!r}: {smiles!r}")
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)

            if has_labels:
                labels.append(row[self.y_column])

        fingerprints = np.array(fingerprints)
        labels = np.array(labels) if labels else None
        return fingerprints, labels

In [17]:
# Restore the regressor from the pickle written earlier in this notebook.
# NOTE: unpickling executes code stored in the file, so only load pickles you trust.
with open('Excretion_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

def predict_single_smiles(smiles, model, featurizer):
    """Predict the regression target for one SMILES string.

    Args:
        smiles: SMILES string of the molecule to score.
        model: Fitted regressor exposing ``predict``.
        featurizer: Callable with a ``smiles_col`` attribute that maps a
            dataframe to a ``(features, labels)`` pair.

    Returns:
        The first (and only) element of ``model.predict`` for the molecule.
    """
    # The featurizer works on dataframes, so wrap the string in a one-row frame
    # whose single column carries the name the featurizer expects.
    single_row = pd.DataFrame({featurizer.smiles_col: [smiles]})

    # Labels are irrelevant at prediction time; keep only the features.
    features, _unused_labels = featurizer(single_row)

    # Regression model: take the single predicted value directly.
    return model.predict(features)[0]

# Prediction-time featurizer; its default radius (2) and length (1024) match the
# defaults of the ECFPFeaturizer used for training. No y_column is needed here.
featurizer = Pre_ECFPFeaturizer()

# Molecule to score. Alternative input to try: CC(C)CC1=CC=C(C=C1)C(C)C(=O)O
smiles_string = "CCCCCC1=CC2=C(C3C=C(CCC3C(O2)(C)C)C)C(=C1)O"

# Score the single SMILES string with the reloaded model
predicted_value = predict_single_smiles(
    smiles_string,
    model=model,
    featurizer=featurizer,
)

print(f'Predicted Half life Value: {predicted_value}')



Predicted Half life Value: 23.83733333333333
