DeepCALLC

Merge logic of CALLC with DeepLC.
compomics · Jan 10, 2022 · 675f5ab · 675f5ab
1 parent f21f464
commit 675f5ab
Show file tree

Hide file tree

Showing 5 changed files with 206 additions and 7,834 deletions.
diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py
@@ -43,6 +43,8 @@
 from deeplc._exceptions import CalibrationError, DeepLCError
 from tensorflow.keras.models import load_model
 
+from deeplc.trainl3 import train_en
+
 # "Custom" activation function
 lrelu = lambda x: tf.keras.activations.relu(x, alpha=0.1, max_value=20.0)
 
@@ -178,7 +180,8 @@ def __init__(self,
                  write_library=False,
                  use_library=None,
                  reload_library=False,
-                 pygam_calibration=False
+                 pygam_calibration=False,
+                 deepcallc_mod=False,
                  ):
 
         # if a config file is defined overwrite standard parameters
@@ -243,6 +246,13 @@ def __init__(self,
         if self.pygam_calibration:
             from pygam import LinearGAM, s
 
+        self.deepcallc_mod = deepcallc_mod
+
+        if self.deepcallc_mod:
+            self.write_library=False
+            self.use_library=None
+            self.reload_library=False
+
     def __str__(self):
         return("""
   _____                  _      _____
@@ -531,6 +541,8 @@ def make_preds_core(self,
                 # TODO this is madness! Only allow dicts to come through this function...
                 if isinstance(self.model, dict):
                     ret_preds = []
+                    if self.deepcallc_mod:
+                        deepcallc_x = {}
                     for m_group_name,m_name in self.model.items():
                         try:
                             X
@@ -545,6 +557,8 @@ def make_preds_core(self,
                             uncal_preds = []
                             pass
 
+
+
 
                         if self.write_library:
                             try:
@@ -568,6 +582,8 @@ def make_preds_core(self,
                         p2 = list(self.calibration_core([LIBRARY[ri+"|"+m_name] for ri  in rem_idents],self.calibrate_dict[m_name],self.calibrate_min[m_name],self.calibrate_max[m_name]))
                         ret_preds2.append(p2)
 
+                        deepcallc_x[m_name] = p
+
                     ret_preds = np.array([sum(a)/len(a) for a in zip(*ret_preds)])
                     ret_preds2 = np.array([sum(a)/len(a) for a in zip(*ret_preds2)])
                 elif not mod_name:
@@ -770,6 +786,10 @@ def make_preds_core(self,
             del mod
         except UnboundLocalError:
             logger.debug("Variable mod not defined, so will not be deleted")
+
+
+        if self.deepcallc_mod and isinstance(self.model, dict):
+            ret_preds_shape = self.deepcallc_model.predict(pd.DataFrame(deepcallc_x))
 
         return ret_preds_shape
 
@@ -1065,6 +1085,7 @@ def calibrate_preds(self,
         -------
 
         """
+
         if isinstance(self.model, str):
             self.model = [self.model]
 
@@ -1124,7 +1145,11 @@ def calibrate_preds(self,
                                     correction_factor=correction_factor,
                                     mod_name=m)
             m_name = m.split("/")[-1]
-            m_group_name = "_".join(m_name.split("_")[:-1])
+
+            if self.deepcallc_mod:
+                m_group_name = "deepcallc"
+            else:
+                m_group_name = "_".join(m_name.split("_")[:-1])
 
             try:
                 pred_dict[m_group_name][m] = preds
@@ -1158,8 +1183,11 @@ def calibrate_preds(self,
                     (perf / len(preds)))
 
             if perf < best_perf:
-                m_group_name =  "_".join(m.split("_")[:-1]).split("/")[-1]
-                # TODO is deepcopy really required?
+                if self.deepcallc_mod:
+                    m_group_name = "deepcallc"
+                else:
+                    m_group_name = "_".join(m.split("_")[:-1]).split("/")[-1]
+                    # TODO is deepcopy really required?
 
                 best_calibrate_dict = copy.deepcopy(mod_calibrate_dict[m_group_name])
                 best_calibrate_min = copy.deepcopy(mod_calibrate_min_dict[m_group_name])
@@ -1173,6 +1201,10 @@ def calibrate_preds(self,
         self.calibrate_max = best_calibrate_max
         self.model = best_model
 
+        if self.deepcallc_mod:
+            self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]),seq_df["tr"])
+
+
         logger.debug("Model with the best performance got selected: %s" %(best_model))
 
 

diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py
@@ -0,0 +1,95 @@
+"""
+Robbin Bouwmeester
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+This code is used to train retention time predictors and store
+predictions from a CV procedure for further analysis.
+
+This project was made possible by MASSTRPLAN. MASSTRPLAN received funding 
+from the Marie Sklodowska-Curie EU Framework for Research and Innovation 
+Horizon 2020, under Grant Agreement No. 675132.
+"""
+
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.linear_model import ElasticNet
+from sklearn.metrics import mean_absolute_error
+from sklearn.feature_selection import SelectFromModel
+from sklearn.model_selection import cross_val_predict
+from sklearn.model_selection import KFold
+from sklearn.base import clone
+from sklearn.model_selection import GridSearchCV
+from scipy.stats import randint
+from scipy.stats import uniform
+from numpy import arange
+from scipy.stats import pearsonr
+
+from operator import itemgetter
+from numpy import median
+from collections import Counter
+
+def train_en(X,y,n_jobs=16,cv=None):
+    """
+    Function that trains Layer 3 of CALLC (elastic net)
+    
+    Parameters
+    ----------
+    X : pd.DataFrame
+        dataframe with molecular descriptors
+    y : pd.Series
+        vector with observed retention times
+    n_jobs : int
+        number of jobs to spawn
+    cv : sklearn.model_selection.KFold
+        cv object
+    
+    Returns
+    -------
+    sklearn.linear_model.ElasticNet
+        elastic net model trained in Layer 3
+    list
+        list with predictions
+    list
+        list with features used to train Layer 3
+    """
+    preds = []
+
+    model = ElasticNet()
+    crossv_mod = clone(model)
+    ret_mod = clone(model)
+
+    set_reg = [0.01,1.0,10.0,100.0,1000.0,10000.0,10000.0,100000.0,1000000.0,1000000000,1000000]
+    set_reg.extend([x/2 for x in set_reg])
+    set_reg.extend([x/3 for x in set_reg])
+
+    params = {
+       'alpha': set_reg,
+       'l1_ratio' : [0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
+       'copy_X':[True],
+       'normalize' : [False],
+       'positive' : [True],
+       'fit_intercept'  : [True,False]
+    }
+
+    grid = GridSearchCV(model, params,cv=cv,scoring='neg_mean_absolute_error',verbose=0,n_jobs=n_jobs,refit=True)
+    grid.fit(X,y)
+
+    cv_pred = cv
+    crossv_mod.set_params(**grid.best_params_)
+    preds = cross_val_predict(crossv_mod, X=X, y=y, cv=cv_pred, n_jobs=n_jobs, verbose=0)
+
+    ret_mod.set_params(**grid.best_params_)
+    ret_mod.fit(X,y)
+
+    coef_indexes = [i for i,coef in enumerate(ret_mod.coef_) if coef > 0.0]
+
+    return ret_mod