In [1]:
pip install tqdm joblib

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd #data manipulation
from rdkit import Chem #Chemistry
from rdkit.Chem import Descriptors
import numpy as np #matrix algebra
#from lightgbm import LGBMRegressor, plot_importance #LightGBM models
from sklearn.model_selection import train_test_split #ML trainining
from sklearn.metrics import r2_score, mean_squared_error #ML stats
import seaborn as sns #Plotting
#from yellowbrick.regressor import prediction_error, ResidualsPlot

In [2]:
dfTrain = pd.read_csv("training_data.csv")
dfTest = pd.read_csv("test_data.csv")

In [3]:
smiles_stringsTrain = dfTrain['SMILES'].tolist()
smiles_stringsTest = dfTest['SMILES'].tolist()

In [4]:
smilesTrain = [Chem.MolFromSmiles(mol) for mol in smiles_stringsTrain]
smilesTest = [Chem.MolFromSmiles(mol) for mol in smiles_stringsTest]

In [5]:
descrsTrain = [Descriptors.CalcMolDescriptors(mol) for mol in smilesTrain]
descrsTest = [Descriptors.CalcMolDescriptors(mol) for mol in smilesTest]
df2Train = pd.DataFrame(descrsTrain)
df2Test = pd.DataFrame(descrsTest)

In [6]:
train_X = df2Train.bfill()
train_y = dfTrain.pIC50
test_X = df2Test.bfill()
test_y = dfTest.pIC50

In [7]:
from tqdm import tqdm
import joblib
import numpy as np

from joblib import Parallel, delayed
from tqdm.auto import tqdm

class TQDMParallel(Parallel):
    def __init__(self, *args, **kwargs):
        self._pbar = kwargs.pop('pbar', None)
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        with tqdm(total=len(self._iterable), desc=self._pbar) as self._pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        if self._pbar is not None:
            self._pbar.n = self.n_completed_tasks
            self._pbar.refresh()
        else:
            super().print_progress()

In [8]:
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [12]:
# Create the RandomForest model
regr = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

# Patch joblib's Parallel to use the custom TQDMParallel
joblib.Parallel = TQDMParallel

# Train the model
with joblib.parallel_backend('threading', n_jobs=-1):
    regr.fit(train_X, train_y)
    y_pred = regr.predict(test_X)

In [14]:
# Coefficient of determination
r2 = r2_score(test_y, y_pred)
# Root mean squared error
rmsd = mean_squared_error(test_y, y_pred)**0.5
# Bias
bias = np.mean(y_pred - test_y)
# Standard deviation of the error of prediction
sdep = np.mean(((y_pred - test_y) - np.mean(y_pred - test_y))**2)**0.5

print(r2)
print(rmsd)
print(bias)
print(sdep)
plt.plot(y_pred, test_y)
plt.show()

0.0007012504979255763
1.1717760751143211
0.1560502958152952
1.1613386566313388


NameError: name 'plt' is not defined

In [None]:
print(r2)
print(rmsd)
print(bias)
print(sdep)

In [None]:
#! /bin/usr/env python

# Random forest with nested CV

from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import numpy as np
import pandas as pd

# Number of MC CV cycles
mc_cv = 20

# Number of CV folds for hyperparameter tuning:
hyper_cv = 10

# Fraction of data to use as test set:
test_frac = 0.3

# Name of file to write all predictions to:
predictions_filename = 'RF_predictions.csv'

# Variables to save model performance statistics:
r2_sum = 0
rmsd_sum = 0
bias_sum = 0
sdep_sum = 0

# List to save individual predictions from the models trained from
# each train/test split:
all_preds = np.empty((mc_cv, len(y)), dtype=float)
all_preds[:] = np.nan

# Initialise train test split:
train_test_split = ShuffleSplit(mc_cv, test_size=test_frac)

# Monte Carlo CV:
for n, [train_idx, test_idx] in enumerate(train_test_split.split(train_X)):

    # Separate data into training and test sets:
    # Have to use ".iloc" if x and y are pandas DataFrames and Series objects,
    # if they are just numpy arrays remove ".iloc".
    x_train = x.iloc[train_idx]
    x_test = x.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    # Centre and scale all x features to have mean=0 and var=1:
    # (Not required for random forest, but important for some other ML methods)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    rf = RandomForestRegressor()

    # Train RF model:
    rf.fit(x_train, y_train)

    # Use trained RF model to predict y data for the test set:
    y_pred = rf.predict(x_test)

    # Assess performace of model based on predictions:

    # Coefficient of determination
    r2 = r2_score(y_test, y_pred)
    # Root mean squared error
    rmsd = mean_squared_error(y_test, y_pred)**0.5
    # Bias
    bias = np.mean(y_pred - y_test)
    # Standard deviation of the error of prediction
    sdep = np.mean(((y_pred - y_test) - np.mean(y_pred - y_test))**2)**0.5

    # Save running sum of results:
    r2_sum += r2
    rmsd_sum += rmsd
    bias_sum += bias
    sdep_sum += sdep

    # Save individual predictions:
    all_preds[n,test_idx] = y_pred

# Average results over resamples:
r2_av = r2_sum/mc_cv
rmsd_av = rmsd_sum/mc_cv
bias_av = bias_sum/mc_cv
sdep_av = sdep_sum/mc_cv

# Write average results to a file:
results_file = open(results_filename, 'w')
results_file.write('r2: {:.3f}\n'.format(r2_av))
results_file.write('rmsd: {:.3f}\n'.format(rmsd_av))
results_file.write('bias: {:.3f}\n'.format(bias_av))
results_file.write('sdep: {:.3f}\n'.format(sdep_av))
results_file.close()

# Save all individual predictions to file:
predictions_file = open(predictions_filename, 'w')
# Write header:
predictions_file.write(','.join([str(i) for i in y.index]) + '\n')
# Write individual predictions from each MC CV cycle:
for n in range(mc_cv):
    predictions_file.write(','.join([str(p) if not np.isnan(p) else '' for p in all_preds[n]]) + '\n')
predictions_file.close()