This notebook demonstrates model preparation and training using the *lyso_project* module.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from xgboost import XGBClassifier

from collections import Counter

In [None]:
# Make the lyso_project package importable from this notebook.

import sys

# Register the package location on the module search path
# (replace the placeholder with the real directory).
sys.path.append("/path/to/lyso_project")

# Do not write .pyc bytecode files for modules imported from here on.
sys.dont_write_bytecode = True

In [None]:
import lyso_project
from lyso_project import descriptor_maker, preprocess_cp_dataset, utils

*preprocess_cp_dataset* is used to filter the internal dataset (training and time-split data). For further details, please refer to the **Materials and Methods** section of the paper.

In [None]:
# Load the pre-processed training set from its Parquet file into a DataFrame,
# then inspect it as a quick sanity check.

df_train = pd.read_parquet('df_training.parquet')

df_train.info()  # printed summary of the frame
df_train.head()  # last expression -> first rows rendered as the cell output

In [None]:
# Bar chart of the number of rows per Lyso_Class label in the full training
# set, used to eyeball the class balance before modelling.
ax = df_train['Lyso_Class'].value_counts().plot(kind='bar')
ax.set_title("Class separation - Full dataset")
# Label both axes so the figure can be read without the surrounding code.
ax.set_xlabel("Lyso_Class")
ax.set_ylabel("Number of samples")
plt.show()

In [None]:
# Target vector for model training and cross-validation: an independent
# (deep) copy of the Lyso_Class column of the training frame.
Y_train = df_train['Lyso_Class'].copy(deep=True)

# Number of labels, shown as the cell output.
len(Y_train)

In [None]:
# Estimate the scale_pos_weight value as
# (number of class-0 samples) / (number of class-1 samples).

# Count how many samples carry each class label.
counter = Counter(Y_train)
n_negative, n_positive = counter[0], counter[1]

# Counter returns 0 for a missing key, so an absent (or differently encoded)
# positive class would otherwise surface as a bare ZeroDivisionError.
if n_positive == 0:
    raise ValueError(
        "Cannot estimate scale_pos_weight: no samples labelled 1 in Y_train "
        "(labels found: %s)" % (sorted(counter, key=str),)
    )

estimate = n_negative / n_positive
print('Estimate: %.3f' % estimate)

In [None]:
# Morgan fingerprints of radius 2 with 4096 bits for the training set.
# bit_info=True makes the call return a second value (kept in bi_train_2);
# this bit info must be saved to perform the X-FP analysis.

X_train_fp_rad2, bi_train_2 = descriptor_maker.fetch_fp_from_df(
    df_train,
    radius=2,
    bit_info=True,
    n_bits=4096,
)

In [None]:
# Morgan fingerprints of radius 3 with 4096 bits for the training set.
# bit_info=True makes the call return a second value (kept in bi_train_3);
# this bit info must be saved to perform the X-FP analysis.

X_train_fp_rad3, bi_train_3 = descriptor_maker.fetch_fp_from_df(
    df_train,
    radius=3,
    bit_info=True,
    n_bits=4096,
)

In [None]:
# Morgan fingerprints of radius 4 with 4096 bits for the training set.
# bit_info=True makes the call return a second value (kept in bi_train_4);
# this bit info must be saved to perform the X-FP analysis.

X_train_fp_rad4, bi_train_4 = descriptor_maker.fetch_fp_from_df(
    df_train,
    radius=4,
    bit_info=True,
    n_bits=4096,
)

In [None]:
# Print the shape of each Morgan fingerprint matrix (radius 2, 3, 4 in order).
for fp_matrix in (X_train_fp_rad2, X_train_fp_rad3, X_train_fp_rad4):
    print(fp_matrix.shape)

In [None]:
# Read the custom descriptor names, one per line, from a text file.
# Trailing whitespace/newlines are stripped, and blank lines are skipped so
# that no empty descriptor name ends up in the list.
with open("understandable_descriptors.txt") as reader:
    custom_descriptors = [line.rstrip() for line in reader if line.strip()]

# Number of descriptor names that were read.
print(len(custom_descriptors))

In [None]:
# RDKit descriptor table for the training set, computed for the names in
# custom_descriptors with the add_logP / add_bpKa1 / add_bpKa2 options
# switched on.

X_train_rdkit = descriptor_maker.rdkit_descriptor_calculator(
    df_train,
    add_logP=True,
    add_bpKa1=True,
    add_bpKa2=True,
    custom_descriptor_list=custom_descriptors,
)

X_train_rdkit.info()  # printed summary of the descriptor table
X_train_rdkit.head()  # last expression -> first rows rendered as the cell output

In [None]:
# Same RDKit descriptor calculation as for X_train_rdkit, but with the
# add_logP / add_bpKa1 / add_bpKa2 options switched off.

X_train_rdkit_without_bpka_logP = descriptor_maker.rdkit_descriptor_calculator(
    df_train,
    add_logP=False,
    add_bpKa1=False,
    add_bpKa2=False,
    custom_descriptor_list=custom_descriptors,
)

X_train_rdkit_without_bpka_logP.info()  # printed summary of the descriptor table
X_train_rdkit_without_bpka_logP.head()  # last expression -> rendered as the cell output

In [None]:
# Avalon fingerprints for the training set (library defaults are used, since
# only the DataFrame is passed).
X_train_avalon = descriptor_maker.fetch_avalon_fp_from_df(df_train)

# Shape of the resulting fingerprint matrix.
print(X_train_avalon.shape)

In [None]:
# MACCS fingerprints for the training set.

X_train_maccs = descriptor_maker.fetch_maccs_fingerprint_from_df(df_train)

# Shape of the resulting fingerprint matrix.
print(X_train_maccs.shape)

In [None]:
# Output location for the log files. The directory must already exist.

save_path = "path/to/logs"  # placeholder: replace "path/to/logs" with the desired path
save_file = "logs_1"  # passed as `filename` when the text logs are written

One model object is created per input type, and stratified k-fold cross-validation (CV) is performed.

Note: If you also include a test set during the model training process, additional model metrics are calculated and saved.

In [None]:
from lyso_project.utils import ModelRunner

In [None]:
# Model 1: Morgan fingerprints, radius 2.
clf1 = ModelRunner("Morgan_FP_radius2")
clf1.model_input(X_train=X_train_fp_rad2, Y_train=Y_train)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf1.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf1.save_model(path=save_path)
clf1.write_txt_log(filename=save_file, path=save_path)
clf1.write_toml_logs(path=save_path)

In [None]:
# Model 2: Morgan fingerprints, radius 3.
clf2 = ModelRunner("Morgan_FP_radius3")
clf2.model_input(X_train=X_train_fp_rad3, Y_train=Y_train)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf2.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf2.save_model(path=save_path)
clf2.write_txt_log(filename=save_file, path=save_path)
clf2.write_toml_logs(path=save_path)

In [None]:
# Model 3: Morgan fingerprints, radius 4.
clf3 = ModelRunner("Morgan_FP_radius4")
clf3.model_input(X_train=X_train_fp_rad4, Y_train=Y_train)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf3.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf3.save_model(path=save_path)
clf3.write_txt_log(filename=save_file, path=save_path)
clf3.write_toml_logs(path=save_path)

In [None]:
# Model 4: selected RDKit descriptors computed with the logP / bpKa options on.
# NOTE(review): the run name mentions only logP and bpKa1, yet X_train_rdkit
# is built with add_bpKa2=True as well -- confirm the name is intended.
clf4 = ModelRunner("Select_RDKit_desc_with_logP_bpKa1")

# scaled=False is passed explicitly for the descriptor input; the fingerprint
# models leave this argument at its default.
clf4.model_input(X_train=X_train_rdkit, Y_train=Y_train, scaled=False)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf4.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf4.save_model(path=save_path)
clf4.write_txt_log(filename=save_file, path=save_path)
clf4.write_toml_logs(path=save_path)

In [None]:
# Model 5: selected RDKit descriptors computed with the logP / bpKa options off.
clf5 = ModelRunner("Select_RDKit_desc")

# scaled=False is passed explicitly for the descriptor input; the fingerprint
# models leave this argument at its default.
clf5.model_input(X_train=X_train_rdkit_without_bpka_logP, Y_train=Y_train, scaled=False)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf5.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf5.save_model(path=save_path)
clf5.write_txt_log(filename=save_file, path=save_path)
clf5.write_toml_logs(path=save_path)

In [None]:
# Model 6: Avalon fingerprints.
clf6 = ModelRunner("Avalon_fp")
clf6.model_input(X_train=X_train_avalon, Y_train=Y_train)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf6.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf6.save_model(path=save_path)
clf6.write_txt_log(filename=save_file, path=save_path)
clf6.write_toml_logs(path=save_path)

In [None]:
# Model 7: MACCS fingerprints.
clf7 = ModelRunner("MACCS")
clf7.model_input(X_train=X_train_maccs, Y_train=Y_train)

# Build the model with the class-imbalance weight estimated earlier,
# 5 parallel jobs and 5 cross-validation folds.
clf7.make_model(scale_pos_weight=estimate, n_jobs=5, folds_cv=5)

# Save the model and write the text and TOML logs under save_path.
clf7.save_model(path=save_path)
clf7.write_txt_log(filename=save_file, path=save_path)
clf7.write_toml_logs(path=save_path)

A plot of the models' cross-validation performance is displayed. If a path is provided, the plot is also saved.

If a test set is provided, an additional plot of the models' performance is also prepared.

In [None]:
# Read the TOML logs found under save_path (the directory the models above
# logged to) and plot the models' performance from them.
from lyso_project import toml_log_reader_with_plotter as tp

tp.toml_reader_from_dict_plus_plotter(path_in=save_path)