# Training regression models

This notebook contains the code to train the regression models and reproduce our results. To learn more about training with [chemprop](https://github.com/chemprop/chemprop), refer to the [chemprop documentation](https://chemprop.readthedocs.io/en/latest/index.html).

In [None]:
import pathlib

import chemprop
import matplotlib.pyplot as plt

For each ADMET group, we list its regression targets together with the model hyperparameters used for each target.

To learn more about the hyperparameters, refer to the chemprop documentation on [train arguments](https://chemprop.readthedocs.io/en/latest/args.html#train-arguments).

In [None]:
# Regression targets, grouped by ADMET category.
# Layout: {ADMET group: {dataset name: hyperparameters or None}}.
# Each hyperparameter key is forwarded to chemprop as a `--<key>` train
# argument by the training cell; a value of None means no extra arguments
# are added for that dataset.
regression_data = {
    "Absorption": {
        "Caco-2": dict(
            depth=5, dropout=0.25,
            ffn_hidden_size=800, ffn_num_layers=2, hidden_size=800,
        ),
        "Solubility": dict(
            depth=2, dropout=0.0,
            ffn_hidden_size=300, ffn_num_layers=3, hidden_size=300,
        ),
        "Lipophilicity": dict(
            depth=3, dropout=0.05,
            ffn_hidden_size=2000, ffn_num_layers=3, hidden_size=2000,
        ),
    },
    "Distribution": {
        "PPBR": dict(
            depth=6, dropout=0.25,
            ffn_hidden_size=500, ffn_num_layers=3, hidden_size=500,
        ),
        "VDss": dict(
            depth=2, dropout=0.3,
            ffn_hidden_size=300, ffn_num_layers=3, hidden_size=300,
        ),
    },
    # No regression targets for metabolism
    "Metabolism": {},
    "Excretion": {
        "Clearance-Hepatocyte": dict(
            depth=2, dropout=0.25,
            ffn_hidden_size=600, ffn_num_layers=3, hidden_size=600,
        ),
        "Clearance-Microsome": dict(
            depth=3, dropout=0.05,
            ffn_hidden_size=1100, ffn_num_layers=3, hidden_size=1100,
        ),
        "Half-Life": dict(
            depth=3, dropout=0.35,
            ffn_hidden_size=1100, ffn_num_layers=3, hidden_size=1100,
        ),
    },
    "Toxicity": {
        # Hyperparameters for LD50 will be added later
        "LD50": None,
    },
}

# Root directory of the input datasets; the training loop reads
# <data_root>/<ADMET group>/<dataset name>.csv from it.
data_root = pathlib.Path("../Datasets")
# Output directory; the training loop saves each dataset's results in a
# sub-directory named after the dataset.
train_results = pathlib.Path("TrainResults")
# parents=True keeps this cell working if train_results is changed to a
# nested path whose parent directories do not exist yet.
train_results.mkdir(parents=True, exist_ok=True)

The following functions compute the Spearman correlation between true and predicted target values, and plot the predicted values and the residuals against the true values.

In [None]:
def get_spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between true and predicted values.

    Args:
        y_true: True target values. Must provide a
            ``.corr(other, method=...)`` method (presumably a pandas
            Series - confirm against the caller).
        y_pred: Predicted target values, passed as the ``other`` argument.

    Returns:
        The result of ``y_true.corr(y_pred, method='spearman')``.
    """
    spearman_rho = y_true.corr(y_pred, method='spearman')
    return spearman_rho

def get_true_vs_pred_plot(y_true, y_pred):
    """Scatter-plot predicted values against true values.

    Args:
        y_true: True target values, drawn on the x-axis.
        y_pred: Predicted target values, drawn on the y-axis.

    Returns:
        The newly created matplotlib figure (created with figsize 10 x 10).
    """
    figure, axes = plt.subplots(figsize=(10, 10))
    axes.scatter(y_true, y_pred)
    axes.set(xlabel='True values', ylabel='Predicted values')
    return figure

def get_residuals_plot(y_true, y_pred):
    """Scatter-plot residuals against true values.

    The residual is computed as ``y_true - y_pred``, so the inputs must
    support element-wise subtraction.

    Args:
        y_true: True target values, drawn on the x-axis.
        y_pred: Predicted target values.

    Returns:
        The newly created matplotlib figure (created with figsize 10 x 10).
    """
    residuals = y_true - y_pred
    figure, axes = plt.subplots(figsize=(10, 10))
    axes.scatter(y_true, residuals)
    axes.set(xlabel='True values', ylabel='Residuals')
    return figure

In [None]:
# Training settings shared by every dataset.
EPOCH_NUM = 100   # Number of training epochs
NUM_FOLDS = 1     # Number of cross validation folds
BATCH_SIZE = 256  # Decrease if running out of memory, or increase if GPU is underutilized

for admet_group, datasets in regression_data.items():
    for dataset_name, hyperparams in datasets.items():
        # Input CSV and per-dataset output directory.
        dataset_path = data_root / admet_group / f"{dataset_name}.csv"
        save_dir = train_results / dataset_name

        # You can modify the following parameters
        train_args_list = [
            # Path to data CSV file
            '--data_path', str(dataset_path),
            # Dataset type
            '--dataset_type', 'regression',
            # Directory where model checkpoints will be saved
            '--save_dir', str(save_dir),
            # Number of training epochs
            '--epochs', str(EPOCH_NUM),
            # Number of cross validation folds
            '--num_folds', str(NUM_FOLDS),
            # Batch size
            '--batch_size', str(BATCH_SIZE),
            # Additional metrics to compute
            '--extra_metrics', 'r2', 'mse', 'mae',
            # Save train and val smiles splits
            '--save_smiles_splits',
            # Name of the column containing SMILES strings to pass as input to model. Can be list of columns
            '--smiles_columns', 'Drug',
            # Name of the column containing targets to predict. Can be list of columns
            '--target_columns', 'Y',
        ]

        # Append each hyperparameter as a `--<name> <value>` pair; datasets
        # whose hyperparameters are None contribute no extra arguments.
        tuned = {} if hyperparams is None else hyperparams
        train_args_list += [
            token
            for param, value in tuned.items()
            for token in (f'--{param}', str(value))
        ]

        args = chemprop.args.TrainArgs().parse_args(train_args_list)

        # The returned pair is unpacked as (mean_score, std_score).
        mean_score, std_score = chemprop.train.cross_validate(
            args=args,
            train_func=chemprop.train.run_training,
        )

        # Stop after the first dataset of the first group. Remove both
        # `break` statements to train on every dataset in regression_data.
        break
    break