### TOXICITY

In [24]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tdc.single_pred.adme import ADME
from tdc import Evaluator
from rdkit import Chem
from rdkit.Chem import AllChem
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

In [1]:
!pip install scikit-learn-intelex



### USING INTEL EXTENSION FOR SCIKIT LEARN

In [2]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [9]:
from time import time

In [4]:
class Featurizer:
    def __init__(self, y_column, smiles_col='Drug', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)

    def __call__(self, df):
        raise NotImplementedError()

class ECFPFeaturizer(Featurizer):
    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        fingerprints = []
        labels = []
        for i, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


In [13]:
def train(X_train, y_train, X_valid, y_valid):
    # Model parameters
    params = {"n_estimators": 150, "random_state": 44, "n_jobs": -1}
    
    # Start the timer
    start_time = time()
    
    # Initialize and train the RandomForestClassifier model
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    
    # Calculate training time
    train_time = time() - start_time
    
    # Make predictions on the validation set
    val_predictions_proba = model.predict_proba(X_valid)[:, 1]  # Probability for class 1
    val_preds_class = model.predict(X_valid)  # Predicted class labels
    
    # Compute metrics
    auc_val = roc_auc_score(y_valid, val_predictions_proba)
    
    # Print validation metrics and training time
    print(f'Validation Accuracy: {auc_val:.4f}')
    print(f"Intel® extension for Scikit-learn training time: {train_time:.2f} s")
    
    return model

In [14]:
def predict(model, X_test):
    test_predictions_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
    test_predictions_class = model.predict(X_test)  # Predicted class labels
    return test_predictions_proba, test_predictions_class


In [15]:
data = pd.read_csv('herg_karim.tab', sep='\t')

# Split the data into features and target variable
X = data.drop(columns=['Y'])  # Features
y = data['Y']  # Target variable

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Re-add the target variable to the features for each split
X_train['Y'] = y_train.values
X_valid['Y'] = y_valid.values
X_test['Y'] = y_test.values

# Featurizer to generate molecular fingerprints
featurizer = ECFPFeaturizer(y_column='Y', smiles_col='Drug')  # Ensure the SMILES column name is correct

# Apply featurizer to generate fingerprints
X_train_featurized, y_train_featurized = featurizer(X_train)
X_valid_featurized, y_valid_featurized = featurizer(X_valid)
X_test_featurized, y_test_featurized = featurizer(X_test)

In [16]:
# Train the model
model = train(X_train_featurized, y_train_featurized, X_valid_featurized, y_valid_featurized)

# Make predictions on the test set
predictions_proba, predictions_class = predict(model, X_test_featurized)

# Evaluate on the test set
auc_test = roc_auc_score(y_test_featurized, predictions_proba)

print(f'Test Accuracy: {auc_test}')


Validation Accuracy: 0.9232
Intel® extension for Scikit-learn training time: 2.25 s
Test Accuracy: 0.9145075274906791


### WITHOUT USING INTEL ONEAPI EXTENSION FOR SCIKIT LEARN

In [17]:
from sklearnex import unpatch_sklearn

unpatch_sklearn()

In [20]:
def train(X_train, y_train, X_valid, y_valid):
    # Model parameters
    params = {"n_estimators": 150, "random_state": 44, "n_jobs": -1}
    
    # Start the timer
    start_time = time()
    
    # Initialize and train the RandomForestClassifier model
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    
    # Calculate training time
    train_time = time() - start_time 
    
    # Make predictions on the validation set
    val_predictions_proba = model.predict_proba(X_valid)[:, 1]  # Probability for class 1
    val_preds_class = model.predict(X_valid)  # Predicted class labels
    
    # Compute metrics
    auc_val = roc_auc_score(y_valid, val_predictions_proba)
    
    # Print validation metrics and training time
    print(f'Validation Accuracy: {auc_val:.4f}')
    print(f"Without Intel® extension for Scikit-learn training time: {train_time:.2f} s")
    
    return model

In [19]:
# Train the model
model = train(X_train_featurized, y_train_featurized, X_valid_featurized, y_valid_featurized)

# Make predictions on the test set
predictions_proba, predictions_class = predict(model, X_test_featurized)

# Evaluate on the test set
auc_test = roc_auc_score(y_test_featurized, predictions_proba)

print(f'Test Accuracy: {auc_test}')


Validation Accuracy: 0.9232
Without Intel® extension for Scikit-learn training time: 7.10 s
Test Accuracy: 0.9145075274906791


### SAVING THE MODEL

In [21]:
# Save the trained model to a pickle file
with open('toxicity_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [22]:
# Function to predict for a single SMILES string
def predict_single_smiles(smiles, model, featurizer):
    # Create a dataframe for the single SMILES string (since the featurizer expects a dataframe)
    df = pd.DataFrame({featurizer.smiles_col: [smiles]})

    # Featurize the SMILES string
    X_new, _ = featurizer(df)  # We don't need the labels here, so they can be ignored

    # Make prediction (predict_proba returns probabilities for each class)
    prediction_proba = model.predict_proba(X_new)[:, 1]  # Probability for class 1
    prediction_class = model.predict(X_new)  # Predicted class label

    return prediction_class[0], prediction_proba[0]

In [23]:
class Featurizer:
    def __init__(self, y_column=None, smiles_col='Drug', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        self.__dict__.update(kwargs)

    def __call__(self, df):
        raise NotImplementedError()

# ECFP Featurizer for molecular fingerprint generation
class ECFPFeaturizer(Featurizer):
    def __init__(self, y_column=None, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        fingerprints = []
        labels = []
        for i, row in df.iterrows():
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            
            # Only add label if y_column exists (not needed for single SMILES prediction)
            if self.y_column and self.y_column in df.columns:
                y = row[self.y_column]
                labels.append(y)

        fingerprints = np.array(fingerprints)
        labels = np.array(labels) if labels else None
        return fingerprints, labels


# Load the trained model from the pickle file
with open('BBBP_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Initialize the featurizer (without a y_column for prediction)
featurizer = ECFPFeaturizer(smiles_col='Drug')  # y_column not needed for predictions

# Example SMILES string for prediction
smiles_string = "CCCCCC1=CC2=C(C3C=C(CCC3C(O2)(C)C)C)C(=C1)O"  # ECC(C)CC1=CC=C(C=C1)C(C)C(=O)O 

# Make predictions for the single SMILES string
predicted_class, predicted_probability = predict_single_smiles(smiles_string, model, featurizer)

# Output the predictions
print(f'Predicted Class: {predicted_class}')
print(f'Predicted Probability for Class 1: {predicted_probability}')

Predicted Class: 1
Predicted Probability for Class 1: 0.96
