In [12]:
#generate the molecular properties of the input molecules using the RDKit, using only the properties used in the model.
import joblib
import numpy as np

from rdkit import RDLogger
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem.rdmolops import SanitizeFlags

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [16]:
desc_calc = MolecularDescriptorCalculator([x for x in [x[0] for x in Descriptors.descList]])

In [3]:
csv_directory = ['oral_abs_class',
 'hia_class',
 'crl_toxicity_class',
 'ML_input_p450-cyp3a4',
 'ames_mutagenicity_class',
 'ML_input_p450-cyp2c19',
 'hep_g2_toxicity_class',
 'nih_toxicity_class',
 'herg_blockers_class',
 'hek_toxicity_class',
 'hacat_toxicity_class',
 'ML_input_p450-cyp1a2',
 'bbb_class',
 'ML_input_p450-cyp2d6',
 'ML_input_p450-cyp2c9',]

In [14]:
def smiles_to_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    
    # Calculate molecular descriptors
    descriptor = np.array(desc_calc.CalcDescriptors(mol)).reshape(1, -1)
    descriptor[np.isinf(descriptor)] = np.nan  # Replace infinite values with NaN

    return descriptor

In [4]:
# def preprocess_new_molecule(smiles, imputer, scaler, selected_features):
#     mol = Chem.MolFromSmiles(smiles)
#     if mol is None:
#         raise ValueError(f"Invalid SMILES: {smiles}")
    
#     # Calculate molecular descriptors
#     descriptor = np.array(desc_calc.CalcDescriptors(mol)).reshape(1, -1)
#     descriptor[np.isinf(descriptor)] = np.nan  # Replace infinite values with NaN

#     # Impute missing values
#     descriptor = imputer.transform(descriptor)

#     # Scale the descriptors
#     descriptor = scaler.transform(descriptor)

#     # Select features based on variance threshold
#     descriptor = descriptor[:, selected_features]

#     return descriptor

In [6]:
mol_descriptors = []

In [10]:
smiles = "C1=CC=CC=C1"

In [46]:
for csv_name in csv_directory:
    print(f"Processing {csv_name}")
    
    # load the preprocessing objects
    imputer = joblib.load(f"data/{csv_name}/{csv_name}_imputer.pkl")
    scaler = joblib.load(f"data/{csv_name}/{csv_name}_scaler.pkl")
    selected_features = joblib.load(f"data/{csv_name}/{csv_name}_selected_features.pkl")
    variance_threshold = joblib.load(f"data/{csv_name}/{csv_name}_variance_threshold.pkl")


    descriptor = smiles_to_descriptors(smiles)

    descriptor = imputer.transform(descriptor)

    descriptor = scaler.transform(descriptor)

    descriptor = descriptor[:, selected_features]  # Select by the saved indices  

    
    mol_descriptors.append(descriptor)




Processing oral_abs_class
Processing hia_class
Processing crl_toxicity_class
Processing ML_input_p450-cyp3a4
Processing ames_mutagenicity_class
Processing ML_input_p450-cyp2c19
Processing hep_g2_toxicity_class
Processing nih_toxicity_class
Processing herg_blockers_class
Processing hek_toxicity_class
Processing hacat_toxicity_class
Processing ML_input_p450-cyp1a2
Processing bbb_class
Processing ML_input_p450-cyp2d6
Processing ML_input_p450-cyp2c9


In [None]:
testsmiles = 'CC1=CC(=C(C=C1)C(=O)O)O'

In [None]:
def make_predictions(smiles):
    results = {}

    for csv_name in csv_directory:
        try:
            # Load the preprocessing objects and model for each dataset
            imputer = joblib.load(f'./data/{csv_name}/{csv_name}_class_imputer.pkl')
            scaler = joblib.load(f'./data/{csv_name}/{csv_name}_class_scaler.pkl')
            selected_features = joblib.load(f'./data/{csv_name}/{csv_name}_class_selected_features.pkl')
            
            # model = joblib.load(f'./data/{csv_name}/{csv_name}_model.pkl')  # Assuming the trained model is saved here

            # Preprocess the new SMILES
            descriptors = preprocess_new_molecule(smiles, imputer, scaler, selected_features)

            # Make the prediction
            # prediction = model.predict(descriptors)

            # Store the result
    #         results[csv_name] = prediction[0]  # Assuming the model returns an array
    #     except Exception as e:
    #         # Handle any errors that occur during prediction
    #         results[csv_name] = f"Error: {str(e)}"

    # return results

In [None]:
#import libraries
import torch
import torch.nn.functional as F


In [49]:
class Linear_3L(torch.nn.Module):
    def __init__(self,input_dim, *args, **kwargs) -> None:
        super(Linear_3L, self).__init__(*args, **kwargs)
        
        self.Lin0 = torch.nn.Linear(input_dim, 2000)
        self.batchnorm1 = torch.nn.BatchNorm1d(2000)
        self.dropout = torch.nn.Dropout(0.75)

        self.Lin1 = torch.nn.Linear(2000, 500)
        self.batchnorm2 = torch.nn.BatchNorm1d(500)
        self.Lin2 = torch.nn.Linear(500, 10)
        self.batchnorm3 = torch.nn.BatchNorm1d(10)

        self.linout = Linear(10, 1)

    def forward(self, x):
        # x = data.x

        #L1
        out = F.relu(self.Lin0(x))
        out = F.relu(self.batchnorm1(out))
        out = self.dropout(out)
        # print('L1')

        out = F.relu(self.Lin1(out))
        out = F.relu(self.batchnorm2(out))
        out = self.dropout(out)
        # print('L2')

        out = F.relu(self.Lin2(out))
        out = F.relu(self.batchnorm3(out))
        out = self.dropout(out)
        # print('L3')

        out = torch.sigmoid(self.linout(out))
        out = out.view(-1)

        return out

NameError: name 'torch' is not defined

In [2]:
imputer = joblib.load(f'./data/{csv_name}/{csv_name}_imputer.pkl')
scaler = joblib.load(f'./data/{csv_name}/{csv_name}_scaler.pkl')
selected_features = joblib.load(f'./data/{csv_name}/{csv_name}_selected_features.pkl')

NameError: name 'csv_name' is not defined