In [None]:
# ===============================================================
# ===============================================================
# ===      NOTEBOOK FOR RANDOM FOREST AND MLP TRAINING        ===
# ===             WITH F1 SCORES AND AUC SCORES               ===
# ===============================================================
# ===============================================================

__date__ = '30-Jan-22'
__author__ = 'jeremy charlier'
__revised__ = '26-Mar-22'

"""comments:
the encoder class is inherited from
[1]: "CRISPR-Net: A Recurrent Convolutional Network Quantifies
CRISPR Off-Target Activities with Mismatches and Indels", J. Lin et al
https://onlinelibrary.wiley.com/doi/epdf/10.1002/advs.201903562
"""

import sys
path_to_module = 'MODULE_PATH'   # root directory of the project (drive directory)
# Make the project root and its python source folder importable.
# Each entry is appended at most once, so re-running this cell does not
# keep growing sys.path with duplicates.
for _extra_path in (
    path_to_module,
    path_to_module+'/code/'):   # location of python source code
  if _extra_path not in sys.path:
    sys.path.append(_extra_path)

In [None]:
import random
random.seed(42)
import pickle as pkl
import numpy as np
import pandas
import pandas as pd
from sklearn.metrics import (
  classification_report, roc_auc_score,
  confusion_matrix, f1_score,
  roc_curve, precision_score, recall_score,
  auc, average_precision_score,
  precision_recall_curve, accuracy_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from imblearn.under_sampling import RandomUnderSampler as rus
#
from sklearn.metrics.pairwise import cosine_similarity as cos_dist
from sklearn.metrics.pairwise import euclidean_distances as euc_dist
from sklearn.metrics.pairwise import manhattan_distances as man_dist
#
from transferlearning_datapipeline import datapipeline
from transferlearning_modelpipeline import modelpipeline
import transferlearning_utils as tlrn
#
p = print

In [None]:
# ---    TRANSFER LEARNING LOOP WITH RESULTS    ---
# ---   AS BUNCH AND ESTIMATORS AS PARAMETERS   ---
# -------------------------------------------------

def _undersampleTrain(itrain, xtrain, ytrain, sampfloat):
  """Return the (x, y) training split to fit on, under-sampled if requested."""
  # No ratio requested, or dataset index 0: train on the split unchanged.
  if sampfloat is None or itrain == 0:
    return xtrain, ytrain
  # Dataset index 1 (CIRCLE-SEQ) always uses its own fixed ratio,
  # every other dataset uses the caller-supplied one.
  ratio = 0.2 if itrain == 1 else sampfloat #0.05,
  return rus(
    sampling_strategy = ratio,
    random_state = 0
  ).fit_resample(xtrain, ytrain)
# END FUNCTION _undersampleTrain

def _scoreModel(mdl):
  """Return (roc auc, f1, average precision) for a predicted modelpipeline."""
  rocauc = roc_auc_score(
    mdl.y_test,
    mdl.yscore[:,1]
  )
  f1 = f1_score(
    mdl.y_test,
    mdl.ypred,
    pos_label = 1
  )
  avgprec = average_precision_score(
    mdl.y_test,
    mdl.yscore[:,1],
    pos_label = 1
  )
  return rocauc, f1, avgprec
# END FUNCTION _scoreModel

def transferlearningPipelineBatch(
    estimators,
    datanms,
    # datafcts,
    encodeddata = None, sampfloat = None, verbose = False):
  """Transfer learning pipeline with Batch Training.

  For every dataset index `itrain` and every estimator in `estimators`:
  split the dataset with `tlrn.dataSplitRF`, optionally under-sample the
  training part, train and predict through `modelpipeline`, and score the
  result. The trained estimator is then evaluated, without re-training, on
  the full data of every other dataset (the transfer learning scores).

  Parameters
  ----------
  estimators : sequence
    One estimator per batch run; `len(estimators)` is the batch size.
  datanms : sequence of str
    Dataset names. Only the length and the last 30 characters of each
    name (for verbose logging) are used.
  encodeddata : sequence
    Encoded datasets, indexed like `datanms`. Each item is passed to
    `tlrn.dataSplitRF` and must expose `.data` and `.target`.
  sampfloat : float or None
    `sampling_strategy` for the random under-sampler. `None` disables
    under-sampling. Dataset index 0 is never under-sampled and dataset
    index 1 always uses 0.2, whatever value is given here.
  verbose : bool
    When True, print per-step progress; when False, only the start/end
    banners are printed.

  Returns
  -------
  sklearn.utils.Bunch
    `modelaucs`, `modelf1s`, `modelavgprecs` with shape
    (len(datanms), batch) and `transfrlrngaucs`, `transfrlrngf1s`,
    `transfrlrngavgprecs` with shape (len(datanms), len(datanms)-1, batch).

  Raises
  ------
  TypeError
    If `encodeddata` is None while `datanms` is not empty.
  """
  ndata = len(datanms)
  if ndata and encodeddata is None:
    raise TypeError(
      "encodeddata must be a sequence of encoded datasets, got None")
  if not verbose:
    p("\n --- transfer learning pipeline in progress ... --- \n")
  #
  batch = len(estimators) # batch nbr. for training
  mdlaucs = np.zeros((ndata,batch)) # model auc batch training
  tlaucs = np.zeros((ndata,ndata-1,batch)) # tl aucs
  mdlf1s = np.zeros((ndata,batch)) # model f1 batch training
  tlf1s = np.zeros((ndata,ndata-1,batch)) # tl f1 scores
  mdlavgprecs = np.zeros((ndata,batch)) # model avg prec
  tlavgprecs = np.zeros((ndata,ndata-1,batch)) # tl avg prec
  for itrain in range(ndata):
    data = encodeddata[itrain]
    for ibatch in range(batch):
      xtrain, xtest, ytrain, ytest = tlrn.dataSplitRF(data)
      # NOTE(review): the same estimator object is reused for every
      # dataset; presumably modelTrain refits it from scratch -- confirm.
      estimator = estimators[ibatch]
      # Always defined, including when sampfloat is None (previously the
      # resampled names were left unassigned in that case).
      xtrainres, ytrainres = _undersampleTrain(
        itrain, xtrain, ytrain, sampfloat)
      if verbose:
        p("\n!!! MODEL TRAINING ON %s !!!\n" % datanms[itrain][-30:])
      mdl = modelpipeline(
        estimator,
        xtrainres, ytrainres,
        xtest, ytest, verbose
      ).modelTrain()
      if verbose:
        p("--- end of model training ---")
      mdl = mdl.modelPredict()
      (mdlaucs[itrain,ibatch],
       mdlf1s[itrain,ibatch],
       mdlavgprecs[itrain,ibatch]) = _scoreModel(mdl)
      # TRANSFER LEARNING LOOP
      # cntj indexes the "other dataset" axis, which skips itrain itself
      others = (j for j in range(ndata) if j != itrain)
      for cntj, jtrain in enumerate(others):
        datanmtl = datanms[jtrain]
        if verbose:
          p("\n!!! TRANSFER LEARNING FOR %s !!! \n" % datanmtl[-30:])
          p("--- transfer learning data import ---")
        datatl = encodeddata[jtrain]
        xtldata = tlrn.reshapeArr(datatl.data)
        ytldata = pd.Series(datatl.target)
        if verbose:
          tlrn.printClassImbalance(ytldata)
          p("--- end of transfer learning data import ---")
          p("--- transfer learning model metrics ---")
        # The already-trained estimator is only used for prediction here:
        # the whole target dataset fills both the train and test slots.
        mdltl = modelpipeline(
          mdl.estimator,
          xtldata, ytldata,
          xtldata, ytldata,
          verbose
        ).modelPredict()
        (tlaucs[itrain, cntj, ibatch],
         tlf1s[itrain, cntj, ibatch],
         tlavgprecs[itrain, cntj, ibatch]) = _scoreModel(mdltl)
      # ENDFOR
    # ENDFOR
    mdlauc = mdlaucs[itrain].sum()/batch
    if verbose:
      p('\nbatch avg auc', mdlauc)
  # ENDFOR
  if verbose:
    p('batch aucs', mdlaucs)
  else:
    p("\n --- transfer learning pipeline done --- \n")
  return Bunch(
    modelaucs = mdlaucs, transfrlrngaucs = tlaucs,
    modelf1s = mdlf1s, transfrlrngf1s = tlf1s,
    modelavgprecs = mdlavgprecs, transfrlrngavgprecs = tlavgprecs)
# END FUNCTION transferlearningPipelineBatch

In [None]:
# Load the encoded datasets: either from the cached pickle, or by running
# the data pipeline again.
is_read_pkl_encoded_data = True
if is_read_pkl_encoded_data:
  # NOTE(security): pickle.load can execute arbitrary code -- only load a
  # file that was produced by this project.
  # The context manager closes the file even if unpickling fails.
  with open(path_to_module+'/data/encoded_data.pkl', 'rb') as f:
    encdata = pkl.load(f)
else:
  encdata = datapipeline()

In [None]:
# ---   MLP EXPERIMENTS   ---
# ---------------------------

# === NEW SECTION 220326 ===
from sklearn.neural_network import MLPClassifier as MLP
# === END SECTION 220326 ===
# switches for the optional correlation reports printed after training
is_postmetrics = False
is_cos = True
is_euc = False
is_man = False
is_roc_auc = True
is_f1 = True
is_avgprec = True
#
# dataset names, in the same order as the items of encdata
lbls=[
  'listgarten_elevation_cd33.csv',
  'CIRCLE_seq_10gRNA_wholeDataset.csv',
  'SITE-Seq_offTarget_wholeDataset.csv',
  'listgarten_elevation_guideseq.csv',
  'Listgarten_22gRNA_wholeDataset.csv',
  'Kleinstiver_5gRNA_wholeDataset.csv',
  'listgarten_elevation_hmg.csv',
  'guideseq.csv'
]
# cos dist for 10k iterations
# (one row and one column per entry of lbls)
cosdist = np.array([
  [0.9881, 0.5514, 0.5392, 0.5483, 0.5674, 0.5329, 0.5373, 0.4995],
  [0.5699, 0.8768, 0.6228, 0.8371, 0.5786, 0.5704, 0.5886, 0.8004],
  [0.5506, 0.631, 0.8855, 0.6256, 0.5674, 0.5685, 0.5829, 0.5676],
  [0.5643, 0.8672, 0.6354, 0.8854, 0.5937, 0.5755, 0.6156, 0.8231],
  [0.5662, 0.5672, 0.5642, 0.5654, 0.8784, 0.5738, 0.5447, 0.5025],
  [0.5819, 0.5931, 0.5865, 0.5971, 0.598, 0.8972, 0.5736, 0.5437],
  [0.5763, 0.5887, 0.5893, 0.5932, 0.5663, 0.5483, 0.9662, 0.5304],
  [0.5794, 0.8879, 0.636, 0.8899, 0.5937, 0.5738, 0.6375, 1.0]
])
ests = [
  # === NEW SECTION 220326 ===
  MLP(
    hidden_layer_sizes=(50,), #100 previous | 100, 50, 25
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter = 1000,
    # random.seed(42) does not seed numpy/scikit-learn; fix the seed so
    # the run is reproducible, as done for the random forest estimators
    random_state = 0)
  # === END SECTION 220326 ===
]
tlbunch = transferlearningPipelineBatch(
  ests,
  lbls,
  encodeddata = encdata,
  sampfloat = .8,
  verbose = False
)
if is_postmetrics:
  p('\n--- METRICS POST-PROCESSING ---\n')
  aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
  # (enabled?, name used in the printed header, aggregated scores)
  _metric_reports = (
    (is_roc_auc, 'ROC AUC', aggauc),
    (is_f1, 'F1 SCORE', aggf1),
    (is_avgprec, 'AVG PREC SCORE', aggap))
  # NOTE(review): cos_dist / euc_dist / man_dist are the scikit-learn
  # pairwise *functions* imported at the top, whereas other cells pass a
  # precomputed matrix (e.g. cosdist) to corrpipeline -- confirm which one
  # corrpipeline expects before switching is_postmetrics on.
  _distance_reports = (
    (is_cos, 'COSINE', cos_dist),
    (is_euc, 'EUCLIDEAN', euc_dist),
    (is_man, 'MANHATTAN', man_dist))
  for _dist_on, _dist_name, _dist in _distance_reports:
    if not _dist_on:
      continue
    for _metric_on, _metric_name, _agg in _metric_reports:
      if _metric_on:
        p('\n--- CORRELATION %s FOR %s DISTANCE ---\n'
          % (_metric_name, _dist_name))
        tlrn.corrpipeline(lbls, _dist, np.asarray(_agg))


 --- transfer learning pipeline in progress ... --- 


 --- transfer learning pipeline done --- 



In [None]:
# for 0.2 samp ratio in circle seq with MLP
# Correlation between the cosine matrix and the aggregated ROC AUC, then
# the aggregated average precision. transferlearning_utils is already
# imported as `tlrn` in the import cell, so no second alias is needed.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggauc))
print()
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggap))

listgarten_elevation_cd33.csv      : 0.370
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.326
SITE-Seq_offTarget_wholeDataset.csv: 0.848
listgarten_elevation_guideseq.csv  : 0.087
Listgarten_22gRNA_wholeDataset.csv : 0.616
Kleinstiver_5gRNA_wholeDataset.csv : 0.755
listgarten_elevation_hmg.csv       : 0.318
guideseq.csv                       : 0.353

all correlation                    : 0.459

listgarten_elevation_cd33.csv      : 0.979
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.469
SITE-Seq_offTarget_wholeDataset.csv: 0.521
listgarten_elevation_guideseq.csv  : -0.130
Listgarten_22gRNA_wholeDataset.csv : -0.121
Kleinstiver_5gRNA_wholeDataset.csv : -0.204
listgarten_elevation_hmg.csv       : -0.096
guideseq.csv                       : 0.086

all correlation                    : 0.188


In [None]:
# Round each aggregated score table to 3 decimals and keep it as an array.
aggauc, aggap, aggf1 = (
  np.asarray(np.round(scores, 3)) for scores in (aggauc, aggap, aggf1))
# For every row of the cosine matrix, print the matching auc and avg prec
# rows, then the cosine row itself, each followed by its rounded mean.
for irow, cos_row in enumerate(cosdist):
  for metric_label, metric_table in (
      ('\nauc =', aggauc),
      ('avg prec = ', aggap)):
    metric_row = metric_table[irow]
    p(metric_label, metric_row, '|', np.round(metric_row.mean(), 3))
  p('cos sim = ', cos_row, '|', np.round(cos_row.mean(), 3))


auc = [0.921 0.804 0.67  0.879 0.952 0.923 0.863 0.62 ] | 0.829
avg prec =  [0.905 0.046 0.05  0.011 0.005 0.009 0.05  0.118] | 0.149
cos sim =  [0.9354 0.5383 0.5229 0.5357 0.5511 0.5174 0.5279 0.5007] | 0.579

auc = [0.982 0.648 0.61  0.994 0.925 0.916 0.793 0.66 ] | 0.816
avg prec =  [0.568 0.56  0.036 0.326 0.002 0.014 0.015 0.098] | 0.202
cos sim =  [0.8514 0.5617 0.5973 0.8079 0.5629 0.5545 0.5697 0.7954] | 0.663

auc = [0.977 0.693 0.744 0.704 0.708 0.792 0.775 0.816] | 0.776
avg prec =  [0.667 0.655 0.042 0.007 0.001 0.002 0.026 0.298] | 0.212
cos sim =  [0.8692 0.5396 0.6031 0.5991 0.5518 0.5572 0.5668 0.5556] | 0.605

auc = [0.986 0.519 0.779 0.493 0.872 0.941 0.932 0.597] | 0.765
avg prec =  [0.203 0.471 0.041 0.03  0.001 0.032 0.154 0.086] | 0.127
cos sim =  [0.8623 0.5583 0.8436 0.62   0.5738 0.5579 0.5866 0.8199] | 0.678

auc = [0.926 0.673 0.777 0.542 0.82  0.831 0.59  0.347] | 0.688
avg prec =  [0.01  0.6   0.036 0.022 0.004 0.005 0.011 0.048] | 0.092
cos sim =  [0.844

In [None]:
# for 0.2 samp ratio in circle seq
# Correlation between the cosine matrix and the aggregated ROC AUC, then
# the aggregated average precision. transferlearning_utils is already
# imported as `tlrn` in the import cell, so no second alias is needed.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggauc))
print()
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggap))

listgarten_elevation_cd33.csv      : 0.767
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.121
SITE-Seq_offTarget_wholeDataset.csv: 0.850
listgarten_elevation_guideseq.csv  : 0.126
Listgarten_22gRNA_wholeDataset.csv : 0.834
Kleinstiver_5gRNA_wholeDataset.csv : 0.704
listgarten_elevation_hmg.csv       : 0.423
guideseq.csv                       : 0.326

all correlation                    : 0.519

listgarten_elevation_cd33.csv      : 0.984
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.116
SITE-Seq_offTarget_wholeDataset.csv: 0.424
listgarten_elevation_guideseq.csv  : -0.251
Listgarten_22gRNA_wholeDataset.csv : -0.142
Kleinstiver_5gRNA_wholeDataset.csv : -0.212
listgarten_elevation_hmg.csv       : -0.091
guideseq.csv                       : 0.166

all correlation                    : 0.124


In [None]:
# for 0.05 samp ratio in circle seq
# Correlation between the cosine matrix and the aggregated ROC AUC, then
# the aggregated average precision. transferlearning_utils is already
# imported as `tlrn` in the import cell, so no second alias is needed.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggauc))
print()
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggap))

listgarten_elevation_cd33.csv      : 0.467
CIRCLE_seq_10gRNA_wholeDataset.csv : -0.011
SITE-Seq_offTarget_wholeDataset.csv: 0.761
listgarten_elevation_guideseq.csv  : 0.083
Listgarten_22gRNA_wholeDataset.csv : 0.747
Kleinstiver_5gRNA_wholeDataset.csv : 0.771
listgarten_elevation_hmg.csv       : 0.359
guideseq.csv                       : 0.294

all correlation                    : 0.434

listgarten_elevation_cd33.csv      : 0.989
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.145
SITE-Seq_offTarget_wholeDataset.csv: 0.462
listgarten_elevation_guideseq.csv  : -0.227
Listgarten_22gRNA_wholeDataset.csv : -0.136
Kleinstiver_5gRNA_wholeDataset.csv : -0.192
listgarten_elevation_hmg.csv       : -0.079
guideseq.csv                       : 0.147

all correlation                    : 0.139


In [None]:
# postprocmetrics / corrpipeline live in transferlearning_utils (imported
# as tlrn); the unqualified names are not defined in this notebook and
# fail with a NameError on a fresh kernel.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggauc))
p()
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggap))

listgarten_elevation_cd33.csv      : 0.781
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.156
SITE-Seq_offTarget_wholeDataset.csv: 0.763
listgarten_elevation_guideseq.csv  : 0.215
Listgarten_22gRNA_wholeDataset.csv : 0.871
Kleinstiver_5gRNA_wholeDataset.csv : 0.610
listgarten_elevation_hmg.csv       : 0.520
guideseq.csv                       : 0.293

all correlation                    : 0.526

listgarten_elevation_cd33.csv      : 0.984
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.130
SITE-Seq_offTarget_wholeDataset.csv: 0.267
listgarten_elevation_guideseq.csv  : -0.261
Listgarten_22gRNA_wholeDataset.csv : -0.141
Kleinstiver_5gRNA_wholeDataset.csv : -0.234
listgarten_elevation_hmg.csv       : -0.104
guideseq.csv                       : 0.098

all correlation                    : 0.092


In [None]:
# Aggregated ROC AUC rounded to 3 decimals.
# NOTE(review): the original called `_urnd`, which is not defined anywhere
# in this notebook; np.round(..., 3) reproduces the 3-decimal display and
# matches the rounding used elsewhere here -- confirm this was the intent.
np.asarray(np.round(aggauc, 3))

array([[0.866, 0.758, 0.68 , 0.686, 0.622, 0.633, 0.604, 0.513],
       [0.895, 0.61 , 0.709, 0.948, 0.699, 0.682, 0.627, 0.248],
       [0.911, 0.634, 0.725, 0.632, 0.657, 0.642, 0.722, 0.427],
       [0.917, 0.493, 0.724, 0.631, 0.591, 0.621, 0.776, 0.345],
       [0.797, 0.597, 0.592, 0.501, 0.572, 0.575, 0.423, 0.429],
       [0.785, 0.514, 0.569, 0.629, 0.655, 0.611, 0.7  , 0.706],
       [0.734, 0.434, 0.647, 0.533, 0.763, 0.53 , 0.557, 0.316],
       [0.915, 0.543, 0.366, 0.55 , 0.261, 0.466, 0.503, 0.312]])

In [None]:
# Aggregated average precision rounded to 3 decimals.
# NOTE(review): the original called `_urnd`, which is not defined anywhere
# in this notebook; np.round(..., 3) reproduces the 3-decimal display and
# matches the rounding used elsewhere here -- confirm this was the intent.
np.asarray(np.round(aggap, 3))

array([[0.822, 0.029, 0.037, 0.002, 0.   , 0.001, 0.007, 0.073],
       [0.381, 0.54 , 0.101, 0.056, 0.   , 0.001, 0.008, 0.045],
       [0.322, 0.569, 0.068, 0.003, 0.   , 0.001, 0.017, 0.095],
       [0.026, 0.458, 0.041, 0.031, 0.   , 0.001, 0.015, 0.052],
       [0.001, 0.549, 0.016, 0.017, 0.001, 0.001, 0.005, 0.069],
       [0.004, 0.478, 0.015, 0.03 , 0.002, 0.001, 0.012, 0.164],
       [0.017, 0.432, 0.023, 0.02 , 0.004, 0.   , 0.001, 0.05 ],
       [0.321, 0.493, 0.01 , 0.021, 0.001, 0.   , 0.001, 0.004]])

In [None]:
# corrpipeline lives in transferlearning_utils (imported as tlrn); the
# unqualified name is not defined in this notebook.
tlrn.corrpipeline(lbls, cosdist, np.asarray(aggauc))

listgarten_elevation_cd33.csv      : 0.903
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.166
SITE-Seq_offTarget_wholeDataset.csv: 0.064
listgarten_elevation_guideseq.csv  : -0.658
Listgarten_22gRNA_wholeDataset.csv : -0.099
Kleinstiver_5gRNA_wholeDataset.csv : 0.302
listgarten_elevation_hmg.csv       : -0.282

all correlation                    : 0.057


In [None]:
# ---   FOR RANDOM FORESTS   ---
# ------------------------------

# Dataset names for the random forest experiments (7 entries).
# NOTE: this rebinds `lbls` and `cosdist`, which the MLP cell above defined
# with 8 entries; cells run after this one see the 7-entry versions.
lbls=[
  'listgarten_elevation_cd33.pkl',
  'CIRCLE_seq_10gRNA_wholeDataset.csv',
  'SITE-Seq_offTarget_wholeDataset.csv',
  'listgarten_elevation_guideseq.pkl',
  'Listgarten_22gRNA_wholeDataset.csv',
  'Kleinstiver_5gRNA_wholeDataset.csv',
  'listgarten_elevation_hmg.pkl'
]

# Hard-coded 7x7 matrices passed to corrpipeline together with lbls.
# presumably entry [i][j] relates dataset i to dataset j in lbls order,
# and the values were computed offline -- TODO confirm and record how.
# cosine
cosdist=np.array([
  [0.9319, 0.531, 0.5185, 0.5306, 0.5472, 0.5092, 0.5231],
  [0.5551, 0.843, 0.5909, 0.806, 0.5544, 0.5481, 0.5561],
  [0.5353, 0.6065, 0.8542, 0.6102, 0.5443, 0.5531, 0.554],
  [0.5505, 0.8384, 0.6093, 0.8582, 0.5665, 0.5567, 0.5796],
  [0.548, 0.5479, 0.5441, 0.5485, 0.8495, 0.5546, 0.5293],
  [0.572, 0.5688, 0.5639, 0.5738, 0.5765, 0.8668, 0.5559],
  [0.5638, 0.5632, 0.5711, 0.5727, 0.5406, 0.5253, 0.9092]])
# euclidean
eucdist=np.array([
  [1.3977, 5.2809, 5.4279, 5.341, 5.2283, 5.4269, 5.2436],
  [5.1996, 3.2476, 5.3034, 3.5988, 5.5554, 5.6006, 5.4666],
  [5.3715, 5.1515, 3.0817, 5.1959, 5.6527, 5.6003, 5.5161],
  [5.2901, 3.314, 5.2003, 3.1697, 5.5135, 5.592, 5.3902],
  [5.2459, 5.6058, 5.6767, 5.6207, 3.0845, 5.5407, 5.623],
  [5.1016, 5.454, 5.5316, 5.4568, 5.4171, 2.9346, 5.4315],
  [5.0977, 5.4297, 5.442, 5.4455, 5.5744, 5.6849, 2.269]])
# manhattan
mandist=np.array([
  [3.536, 28.088, 29.552, 28.504, 27.48, 29.68, 27.448],
  [27.012, 10.992, 28.588, 13.564, 31.012, 31.492, 29.868],
  [28.84, 27.46, 10.368, 27.976, 32.128, 31.528, 30.512],
  [27.864, 11.324, 27.72, 10.232, 30.44, 31.312, 28.664],
  [27.576, 31.4, 32.264, 31.672, 10.664, 30.824, 31.552],
  [26.0319, 29.8526, 30.7291, 29.8645, 29.3865, 9.3147, 29.9044],
  [26.032, 29.772, 29.784, 29.424, 31.288, 32.344, 6.28]])

In [None]:
# Random forest batch run with sampfloat=.2: five forests that differ only
# by their seed.
ests=[
  RandomForestClassifier(n_estimators=10, random_state=seed)
  for seed in (0, 10, 20, 30, 40)]
tlbunch=transferlearningPipelineBatch(
  ests, lbls,
  # nmfcts,
  encodeddata=encdata, sampfloat=.2, verbose=False)
#
p('\n--- METRICS POST-PROCESSING ---\n')
# postprocmetrics / corrpipeline live in transferlearning_utils (imported
# as tlrn); the unqualified names are not defined in this notebook.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
# One correlation report per (metric, distance matrix) pair, metrics first.
for _metric_name, _agg in (
    ('ROC AUC', aggauc),
    ('F1 SCORE', aggf1),
    ('AVG PREC SCORE', aggap)):
  for _dist_name, _dist in (
      ('COSINE', cosdist),
      ('EUCLIDEAN', eucdist),
      ('MANHATTAN', mandist)):
    p('\n--- CORRELATION %s FOR %s DISTANCE ---\n'
      % (_metric_name, _dist_name))
    tlrn.corrpipeline(lbls, _dist, np.asarray(_agg))


 --- transfer learning pipeline in progress ... --- 


 --- transfer learning pipeline done --- 


--- METRICS POST-PROCESSING ---


--- CORRELATION ROC AUC FOR COSINE DISTANCE ---

listgarten_elevation_cd33.pkl      : 0.903
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.166
SITE-Seq_offTarget_wholeDataset.csv: 0.026
listgarten_elevation_guideseq.pkl  : -0.665
Listgarten_22gRNA_wholeDataset.csv : -0.083
Kleinstiver_5gRNA_wholeDataset.csv : 0.188
listgarten_elevation_hmg.pkl       : -0.377

all correlation                    : 0.023

--- CORRELATION ROC AUC FOR EUCLIDEAN DISTANCE ---

listgarten_elevation_cd33.pkl      : -0.897
CIRCLE_seq_10gRNA_wholeDataset.csv : -0.231
SITE-Seq_offTarget_wholeDataset.csv: -0.171
listgarten_elevation_guideseq.pkl  : 0.591
Listgarten_22gRNA_wholeDataset.csv : -0.032
Kleinstiver_5gRNA_wholeDataset.csv : -0.320
listgarten_elevation_hmg.pkl       : 0.322

all correlation                    : -0.105

--- CORRELATION ROC AUC FOR MANHATTAN DISTANCE ---

listgarten_e

In [None]:
# Random forest batch run with sampfloat=0.7, reusing the `ests` list
# defined in an earlier cell.
tlbunch=transferlearningPipelineBatch(
  ests, lbls,
  # nmfcts,
  encodeddata=encdata, sampfloat=0.7, verbose=False)
p('\n--- METRICS POST-PROCESSING ---\n')
# postprocmetrics / corrpipeline live in transferlearning_utils (imported
# as tlrn); the unqualified names are not defined in this notebook.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
# One correlation report per (metric, distance matrix) pair, metrics first.
for _metric_name, _agg in (
    ('ROC AUC', aggauc),
    ('F1 SCORE', aggf1),
    ('AVG PREC', aggap)):
  for _dist_name, _dist in (
      ('COSINE', cosdist),
      ('EUCLIDEAN', eucdist),
      ('MANHATTAN', mandist)):
    p('\n--- CORRELATION %s FOR %s DISTANCE ---\n'
      % (_metric_name, _dist_name))
    tlrn.corrpipeline(lbls, _dist, np.asarray(_agg))


 --- transfer learning pipeline in progress ... --- 


 --- transfer learning pipeline done --- 


--- METRICS POST-PROCESSING ---


--- CORRELATION ROC AUC FOR COSINE DISTANCE ---

listgarten_elevation_cd33.pkl      : 0.903
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.169
SITE-Seq_offTarget_wholeDataset.csv: 0.107
listgarten_elevation_guideseq.pkl  : -0.623
Listgarten_22gRNA_wholeDataset.csv : -0.030
Kleinstiver_5gRNA_wholeDataset.csv : 0.405
listgarten_elevation_hmg.pkl       : -0.020

all correlation                    : 0.13

--- CORRELATION ROC AUC FOR EUCLIDEAN DISTANCE ---

listgarten_elevation_cd33.pkl      : -0.897
CIRCLE_seq_10gRNA_wholeDataset.csv : -0.238
SITE-Seq_offTarget_wholeDataset.csv: -0.247
listgarten_elevation_guideseq.pkl  : 0.547
Listgarten_22gRNA_wholeDataset.csv : -0.072
Kleinstiver_5gRNA_wholeDataset.csv : -0.525
listgarten_elevation_hmg.pkl       : -0.030

all correlation                    : -0.209

--- CORRELATION ROC AUC FOR MANHATTAN DISTANCE ---

listgarten_e

In [None]:
# Random forest batch run with sampfloat=.8: five forests that differ only
# by their seed.
ests=[
  RandomForestClassifier(n_estimators=10, random_state=seed)
  for seed in (0, 10, 20, 30, 40)]
tlbunch=transferlearningPipelineBatch(
  ests, lbls,
  # nmfcts,
  encodeddata=encdata, sampfloat=.8, verbose=False)
#
p('\n--- METRICS POST-PROCESSING ---\n')
# postprocmetrics / corrpipeline live in transferlearning_utils (imported
# as tlrn); the unqualified names are not defined in this notebook.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
# One correlation report per (metric, distance matrix) pair, metrics first.
for _metric_name, _agg in (
    ('ROC AUC', aggauc),
    ('F1 SCORE', aggf1),
    ('AVG PREC SCORE', aggap)):
  for _dist_name, _dist in (
      ('COSINE', cosdist),
      ('EUCLIDEAN', eucdist),
      ('MANHATTAN', mandist)):
    p('\n--- CORRELATION %s FOR %s DISTANCE ---\n'
      % (_metric_name, _dist_name))
    tlrn.corrpipeline(lbls, _dist, np.asarray(_agg))


 --- transfer learning pipeline in progress ... --- 


 --- transfer learning pipeline done --- 


--- METRICS POST-PROCESSING ---


--- CORRELATION ROC AUC FOR COSINE DISTANCE ---

listgarten_elevation_cd33.pkl      : 0.903
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.160
SITE-Seq_offTarget_wholeDataset.csv: 0.063
listgarten_elevation_guideseq.pkl  : -0.644
Listgarten_22gRNA_wholeDataset.csv : -0.147
Kleinstiver_5gRNA_wholeDataset.csv : 0.083
listgarten_elevation_hmg.pkl       : -0.167

all correlation                    : 0.036

--- CORRELATION ROC AUC FOR EUCLIDEAN DISTANCE ---

listgarten_elevation_cd33.pkl      : -0.897
CIRCLE_seq_10gRNA_wholeDataset.csv : -0.226
SITE-Seq_offTarget_wholeDataset.csv: -0.213
listgarten_elevation_guideseq.pkl  : 0.575
Listgarten_22gRNA_wholeDataset.csv : 0.035
Kleinstiver_5gRNA_wholeDataset.csv : -0.210
listgarten_elevation_hmg.pkl       : 0.105

all correlation                    : -0.119

--- CORRELATION ROC AUC FOR MANHATTAN DISTANCE ---

listgarten_el

In [None]:
# Random forest batch run with sampfloat=1.0: five forests that differ only
# by their seed.
ests=[
  RandomForestClassifier(n_estimators=10, random_state=seed)
  for seed in (0, 10, 20, 30, 40)]
tlbunch=transferlearningPipelineBatch(
  ests, lbls,
  # nmfcts,
  encodeddata=encdata, sampfloat=1.0, verbose=False)
#
p('\n--- METRICS POST-PROCESSING ---\n')
# postprocmetrics / corrpipeline live in transferlearning_utils (imported
# as tlrn); the unqualified names are not defined in this notebook.
aggauc, aggf1, aggap = tlrn.postprocmetrics(tlbunch)
# One correlation report per (metric, distance matrix) pair, metrics first.
for _metric_name, _agg in (
    ('ROC AUC', aggauc),
    ('F1 SCORE', aggf1),
    ('AVG PREC SCORE', aggap)):
  for _dist_name, _dist in (
      ('COSINE', cosdist),
      ('EUCLIDEAN', eucdist),
      ('MANHATTAN', mandist)):
    p('\n--- CORRELATION %s FOR %s DISTANCE ---\n'
      % (_metric_name, _dist_name))
    tlrn.corrpipeline(lbls, _dist, np.asarray(_agg))


 --- transfer learning pipeline in progress ... --- 


 --- transfer learning pipeline done --- 


--- METRICS POST-PROCESSING ---


--- CORRELATION ROC AUC FOR COSINE DISTANCE ---

listgarten_elevation_cd33.pkl      : 0.903
CIRCLE_seq_10gRNA_wholeDataset.csv : 0.165
SITE-Seq_offTarget_wholeDataset.csv: -0.031
listgarten_elevation_guideseq.pkl  : -0.637
Listgarten_22gRNA_wholeDataset.csv : 0.016
Kleinstiver_5gRNA_wholeDataset.csv : 0.347
listgarten_elevation_hmg.pkl       : -0.164

all correlation                    : 0.086

--- CORRELATION ROC AUC FOR EUCLIDEAN DISTANCE ---

listgarten_elevation_cd33.pkl      : -0.897
CIRCLE_seq_10gRNA_wholeDataset.csv : -0.235
SITE-Seq_offTarget_wholeDataset.csv: -0.117
listgarten_elevation_guideseq.pkl  : 0.563
Listgarten_22gRNA_wholeDataset.csv : -0.129
Kleinstiver_5gRNA_wholeDataset.csv : -0.459
listgarten_elevation_hmg.pkl       : 0.091

all correlation                    : -0.169

--- CORRELATION ROC AUC FOR MANHATTAN DISTANCE ---

listgarten_e

In [None]:
# ROC AUC of each trained model as stored by transferlearningPipelineBatch:
# one row per dataset in lbls, one column per estimator in ests.
tlbunch.modelaucs

array([[0.8623256 , 0.86733438, 0.85691753, 0.85895765, 0.86642671],
       [0.92504152, 0.92483331, 0.92277581, 0.92510014, 0.92400786],
       [0.91039654, 0.91331647, 0.91621001, 0.90756085, 0.90816857],
       [0.91888079, 0.92332277, 0.92075563, 0.91621555, 0.92371416],
       [0.82200419, 0.76815874, 0.71171611, 0.83967649, 0.86724759],
       [0.76006192, 0.70802206, 0.82928719, 0.73274314, 0.67000659],
       [0.728033  , 0.64683262, 0.71263025, 0.65180491, 0.66932683]])