This notebook trains random forest (RF) models on ECFP4 fingerprints and saves their test-set predictions for every dataset, split type, and fold.

-  import libraries

In [1]:
import os
import numpy as np
import pandas as pd

import random

from rdkit import Chem

from rdkit import DataStructs
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Turn off RDKit's 'rdApp.error' log channel for this session.
from rdkit import rdBase
rdBase.DisableLog('rdApp.error')

# Ignore all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below; consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings('ignore')

-  define a function to get ECFP features: ECFP4 (Morgan fingerprint with radius 2, nBits 2048) <br>

In [3]:
# Morgan fingerprint settings used by get_fp and embedded in the result file names
# (r{radius}_b{nbits}_...). Radius 2 corresponds to ECFP4.
radius = 2
# Length of the fingerprint bit vector.
nbits = 2048

In [4]:
# use fingerprints as features
def get_fp(x):
    """Return the Morgan (ECFP) fingerprint of a SMILES string as a numpy array.

    The fingerprint is computed with the module-level ``radius`` and ``nbits``
    settings (radius 2 / 2048 bits, i.e. ECFP4, as configured in this notebook).

    Parameters
    ----------
    x : str
        SMILES string of the molecule.

    Returns
    -------
    numpy.ndarray
        1-D float array holding the fingerprint bits.

    Raises
    ------
    ValueError
        If ``x`` cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(x)
    # MolFromSmiles returns None for unparsable input; RDKit's error log is
    # disabled in this notebook, so fail here with a message naming the SMILES.
    if mol is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(x))
    # bit-vector Morgan fingerprint with the configured radius and length
    features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nbits)
    # destination buffer sized to the fingerprint length
    features = np.zeros((nbits,))
    DataStructs.ConvertToNumpyArray(features_vec, features)
    return features

-  read data splits and apply RF for prediction

In [5]:
# Number of data splits (folds) evaluated per dataset; fold indices run 0..num_folds-1
num_folds = 30

# Dataset split types to evaluate: scaffold vs random
split_types = ['scaffold', 'random']

# Data folder to process, with the molecular properties associated with each:
#   benchmark: BACE, BBBP, HIV | ESOL, FreeSolv, Lipop
#   opioids:   MDR1, CYP2D6, CYP3A4 | MOR, DOR, KOR
folder = 'benchmark'

# Task setting (sub-folder of the split directory) - benchmark: benchmark; opioids: reg, cutoff6
task_setting = "benchmark"

# Molecular properties to run for each data folder.
# NOTE(review): 'Lipop' is listed among the benchmark datasets above but is not
# included here, so it is not run - confirm this is intentional.
_mol_props_by_folder = {
    'benchmark': ['BACE', 'BBBP', 'HIV', 'ESOL', 'FreeSolv'],
    'opioids': ['MDR1', 'CYP3A4', 'CYP2D6', 'MOR', 'DOR', 'KOR'],
}
if folder not in _mol_props_by_folder:
    # fail here instead of with a NameError on mol_props further down
    raise ValueError("Unknown folder {!r}; expected one of {}".format(folder, sorted(_mol_props_by_folder)))
mol_props = _mol_props_by_folder[folder]

# Dataset type of each entry in mol_props (the two lists are aligned by position).
if task_setting == 'benchmark':
    # first three benchmark datasets are classification, the remaining ones regression
    dataset_types = ['classification', 'classification', 'classification', 'regression', 'regression', 'regression']
elif task_setting == 'reg':
    dataset_types = ['regression'] * len(mol_props)
elif task_setting == 'cutoff6':
    dataset_types = ['classification'] * len(mol_props)
else:
    # fail here instead of with a NameError on dataset_types further down
    raise ValueError("Unknown task_setting {!r}; expected 'benchmark', 'reg' or 'cutoff6'".format(task_setting))

# every property needs a dataset type at the same position
if len(dataset_types) < len(mol_props):
    raise ValueError("dataset_types has {} entries but mol_props has {}".format(len(dataset_types), len(mol_props)))

In [6]:
# Train one random forest per (split type, molecular property, fold) on the
# training split and write the test-set predictions to a CSV file.

# Label stored in the 'split_type' column of the result files: 'scaffold' is
# written as 'scaffold_balanced' for later processing convenience. Any other
# split name is stored unchanged.
split_type_labels = {'scaffold': 'scaffold_balanced', 'random': 'random'}

# Location of the split files; each is read for its 'SMILES' and 'label' columns.
data_path = '../data/{folder}/{split_type}_split/{task}/{name}_{split_type}_{subset}_v{i}.csv'

for split_type in split_types:
    print(split_type)
    # mol_props and dataset_types are aligned by position
    for mol_prop, dataset_type in zip(mol_props, dataset_types):
        print(mol_prop)
        is_regression = dataset_type == 'regression'

        # all folds of this (task, property, split type) share one output directory,
        # so create it once; exist_ok makes re-running the cell safe
        out_dir = '../results/raw_predictions/RF/{task}/{mol_prop}/{split_type}'\
                  .format(mol_prop=mol_prop, split_type=split_type, task=task_setting)
        os.makedirs(out_dir, exist_ok=True)

        for fold in range(num_folds):
            print(fold)
            # The validation split is not read: nothing is tuned on it here, so it
            # would not influence the saved predictions.
            train_df = pd.read_csv(data_path.format(task=task_setting, folder=folder, split_type=split_type,
                                                    name=mol_prop, subset='train', i=fold))
            test_df = pd.read_csv(data_path.format(task=task_setting, folder=folder, split_type=split_type,
                                                   name=mol_prop, subset='test', i=fold))

            # convert to np arrays
            X_train, Y_train = train_df['SMILES'].to_numpy(), train_df['label'].to_numpy()
            X_test, Y_test = test_df['SMILES'].to_numpy(), test_df['label'].to_numpy()

            # according to Yang et al 2019: radius 2 bit size 2048 random forest trees 500
            # get the ECFP4 fingerprints, one row per molecule
            X_train_fp = np.stack([get_fp(smiles) for smiles in X_train])
            X_test_fp = np.stack([get_fp(smiles) for smiles in X_test])

            # same forest size and seed for both task types
            if is_regression:
                clf = RandomForestRegressor(n_estimators=500, random_state=42)
            else:
                clf = RandomForestClassifier(n_estimators=500, random_state=42)
            clf.fit(X_train_fp, Y_train)

            if is_regression:
                # predicted values on the test set
                preds = clf.predict(X_test_fp)
            else:
                # predicted probability of the class in column 1 of predict_proba
                # (assumes binary labels with the positive class second - TODO confirm)
                preds = clf.predict_proba(X_test_fp)[:, 1]

            # assemble the per-molecule results together with the experiment settings
            test_result_df = pd.DataFrame({'preds': preds, 'labels': Y_test, 'SMILES': X_test},
                                          columns=['preds', 'labels', 'SMILES'])
            test_result_df['mol_prop'] = mol_prop
            test_result_df['model_name'] = 'RF'
            test_result_df['split_type'] = split_type_labels.get(split_type, split_type)
            test_result_df['fold'] = fold

            # save to csv file
            test_result_df.to_csv('{out_dir}/r{radius}_b{nbits}_test_result_fold{fold}.csv'\
                                  .format(out_dir=out_dir, fold=fold, radius=radius, nbits=nbits), index=False)

    