Skip to content

Commit

Permalink
DeepCALLC
Browse files Browse the repository at this point in the history
Merge logic of CALLC with DeepLC.
  • Loading branch information
RobbinBouwmeester committed Jan 10, 2022
1 parent f21f464 commit 675f5ab
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 7,834 deletions.
40 changes: 36 additions & 4 deletions deeplc/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
from deeplc._exceptions import CalibrationError, DeepLCError
from tensorflow.keras.models import load_model

from deeplc.trainl3 import train_en

# "Custom" activation function
lrelu = lambda x: tf.keras.activations.relu(x, alpha=0.1, max_value=20.0)

Expand Down Expand Up @@ -178,7 +180,8 @@ def __init__(self,
write_library=False,
use_library=None,
reload_library=False,
pygam_calibration=False
pygam_calibration=False,
deepcallc_mod=False,
):

# if a config file is defined overwrite standard parameters
Expand Down Expand Up @@ -243,6 +246,13 @@ def __init__(self,
if self.pygam_calibration:
from pygam import LinearGAM, s

self.deepcallc_mod = deepcallc_mod

if self.deepcallc_mod:
self.write_library=False
self.use_library=None
self.reload_library=False

def __str__(self):
return("""
_____ _ _____
Expand Down Expand Up @@ -531,6 +541,8 @@ def make_preds_core(self,
# TODO this is madness! Only allow dicts to come through this function...
if isinstance(self.model, dict):
ret_preds = []
if self.deepcallc_mod:
deepcallc_x = {}
for m_group_name,m_name in self.model.items():
try:
X
Expand All @@ -545,6 +557,8 @@ def make_preds_core(self,
uncal_preds = []
pass




if self.write_library:
try:
Expand All @@ -568,6 +582,8 @@ def make_preds_core(self,
p2 = list(self.calibration_core([LIBRARY[ri+"|"+m_name] for ri in rem_idents],self.calibrate_dict[m_name],self.calibrate_min[m_name],self.calibrate_max[m_name]))
ret_preds2.append(p2)

deepcallc_x[m_name] = p

ret_preds = np.array([sum(a)/len(a) for a in zip(*ret_preds)])
ret_preds2 = np.array([sum(a)/len(a) for a in zip(*ret_preds2)])
elif not mod_name:
Expand Down Expand Up @@ -770,6 +786,10 @@ def make_preds_core(self,
del mod
except UnboundLocalError:
logger.debug("Variable mod not defined, so will not be deleted")


if self.deepcallc_mod and isinstance(self.model, dict):
ret_preds_shape = self.deepcallc_model.predict(pd.DataFrame(deepcallc_x))

return ret_preds_shape

Expand Down Expand Up @@ -1065,6 +1085,7 @@ def calibrate_preds(self,
-------
"""

if isinstance(self.model, str):
self.model = [self.model]

Expand Down Expand Up @@ -1124,7 +1145,11 @@ def calibrate_preds(self,
correction_factor=correction_factor,
mod_name=m)
m_name = m.split("/")[-1]
m_group_name = "_".join(m_name.split("_")[:-1])

if self.deepcallc_mod:
m_group_name = "deepcallc"
else:
m_group_name = "_".join(m_name.split("_")[:-1])

try:
pred_dict[m_group_name][m] = preds
Expand Down Expand Up @@ -1158,8 +1183,11 @@ def calibrate_preds(self,
(perf / len(preds)))

if perf < best_perf:
m_group_name = "_".join(m.split("_")[:-1]).split("/")[-1]
# TODO is deepcopy really required?
if self.deepcallc_mod:
m_group_name = "deepcallc"
else:
m_group_name = "_".join(m.split("_")[:-1]).split("/")[-1]
# TODO is deepcopy really required?

best_calibrate_dict = copy.deepcopy(mod_calibrate_dict[m_group_name])
best_calibrate_min = copy.deepcopy(mod_calibrate_min_dict[m_group_name])
Expand All @@ -1173,6 +1201,10 @@ def calibrate_preds(self,
self.calibrate_max = best_calibrate_max
self.model = best_model

if self.deepcallc_mod:
self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]),seq_df["tr"])


logger.debug("Model with the best performance got selected: %s" %(best_model))


Expand Down
95 changes: 95 additions & 0 deletions deeplc/trainl3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Robbin Bouwmeester
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This code is used to train retention time predictors and store
predictions from a CV procedure for further analysis.
This project was made possible by MASSTRPLAN. MASSTRPLAN received funding
from the Marie Sklodowska-Curie EU Framework for Research and Innovation
Horizon 2020, under Grant Agreement No. 675132.
"""

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from scipy.stats import uniform
from numpy import arange
from scipy.stats import pearsonr

from operator import itemgetter
from numpy import median
from collections import Counter

def train_en(X,y,n_jobs=16,cv=None):
    """
    Function that trains Layer 3 of CALLC (elastic net)

    The hyperparameters are selected with an exhaustive grid search that is
    scored on the negative mean absolute error. The best setting is refit
    on all of X and y and that fitted model is returned.

    Parameters
    ----------
    X : pd.DataFrame
        feature matrix; the DeepLC caller passes one column of predictions
        per lower-layer model
    y : pd.Series
        vector with observed retention times
    n_jobs : int
        number of jobs to spawn for the grid search
    cv : sklearn.model_selection.KFold
        cv object handed to GridSearchCV; None lets scikit-learn use its
        default splitting strategy

    Returns
    -------
    sklearn.linear_model.ElasticNet
        elastic net model trained in Layer 3, fitted on all of X and y
    """
    # Base regularisation strengths, listed without repeats so the grid
    # search does not fit the exact same candidate more than once.
    alphas = [0.01, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0,
              1000000.0, 1000000000]
    # Densify the grid: add every value halved, then every value (including
    # the halved ones) divided by three.
    alphas.extend([a / 2 for a in alphas])
    alphas.extend([a / 3 for a in alphas])
    # Order-preserving de-duplication; keeps the candidate order (and thus
    # the tie-breaking of the grid search) stable.
    alphas = list(dict.fromkeys(alphas))

    # NOTE: 'normalize' is intentionally not part of the grid. It was
    # deprecated in scikit-learn 1.0 and removed in 1.2; its former value
    # here (False) was the default, so omitting it keeps the behaviour on
    # old releases and avoids an invalid-parameter error on new ones.
    params = {
        'alpha': alphas,
        'l1_ratio': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'copy_X': [True],
        # Only non-negative weights: every input column can contribute to
        # the blend but never be subtracted from it.
        'positive': [True],
        'fit_intercept': [True, False],
    }

    grid = GridSearchCV(
        ElasticNet(),
        params,
        cv=cv,
        scoring='neg_mean_absolute_error',
        verbose=0,
        n_jobs=n_jobs,
        refit=True,
    )
    grid.fit(X, y)

    # With refit=True the search already holds a fresh ElasticNet fitted on
    # the full data with the best parameters, so no second manual fit (or
    # extra cross-validated prediction pass) is needed.
    return grid.best_estimator_

0 comments on commit 675f5ab

Please sign in to comment.