In [2]:
# ===============================================================
# ===============================================================
# ===      NOTEBOOK TO HIGHLIGHT HOW TO USE THE ENCODED       ===
# ===     BUNCH DATA AND TRAIN A RANDOM FOREST CLASSIFIER     ===
# ===============================================================
# ===============================================================
__date__= '19-Feb-23'
__author__='jeremy charlier'
__revised__='19-Feb-23'
#
import pickle as pkl
import numpy as np
from sklearn.model_selection import train_test_split as ttSplit
from sklearn.metrics import (
  classification_report, roc_auc_score,
  confusion_matrix, f1_score,
  roc_curve, precision_score, recall_score,
  auc, average_precision_score, 
  precision_recall_curve, accuracy_score)
from sklearn.ensemble import RandomForestClassifier as RF
P = print
#
class ModelPipeline():
  def __init__(self, estimator, xtrain, ytrain, xtest, ytest):
    self.estimator=estimator
    self.x_train=xtrain
    self.y_train=ytrain
    self.x_test=xtest
    self.y_test=ytest
    self.ypred=np.zeros((len(self.y_test)))
    self.yscore=np.zeros((len(self.y_test),2))
  # end of function __init__
  #
  def __getstate__(self):
    return self.__dict__.copy()
  # end of function __getstate__
  #
  def brierScore(y_test, yscore):
    """Compute the Brier score (0 = best, 1 = worst). 
    Parameters
    ----------
    y_test : array-like
      true target series
    yscore : array-like
      predicted scores
    Returns
    -------
    bscore : float 
      Brier score
    """
    bscore=(1/len(y_test))
    bscore*=np.sum(np.power(yscore[:,1]-y_test, 2))
    return bscore
  # end of function brierScore
  #
  def dispConfMatrixAsArray(y_test, ypred, disp=True):
    """Display and return the confusion matrix as array.
    Parameters
    ----------
    y_test : array-like
      true target series
    ypred : array-like
      predicted target series
    disp : boolean
      diplay the confusion matrix
    Returns
    -------
    confmatrix : array-like
      pandas dataframe of the confusion matrix
    """
    confmatrix=confusion_matrix(y_test,ypred)
    tn,fp,fn,tp=confmatrix.ravel()
    if disp==True:
      P('\nConfusion Matrix')
      P("%-3s" % 'TN:', "%-5s" % tn,
        "|  %-3s" % 'FP:', "%-5s" % fp)
      P("%-3s" % 'FN:', "%-5s" % fn,
        "|  %-3s" % 'TP:', "%-5s" % tp)
    return confmatrix
  # end of function dispConfMatrixAsArray
  #
  def getClassificationMetricsForPreds(self):
    """Compute metrics for classification models using the predicted class.
    Parameters
    ----------
    self : class-like
      class object
    """
    posLabel = np.unique(self.y_test)
    P("%-40s" % ("Mean Accuracy:"),
      "{:.3f}".format(self.estimator.score(self.x_test, self.y_test))
    )
    for n in posLabel:
      P("%-40s" % ("F1 Score Class " + str(n) + " :"), 
        "{:.3f}".format(
          f1_score(self.y_test,self.ypred,pos_label=n))
      )
      P("%-40s" % ("Recall Score Class "+str(n)+" :"), 
        "{:.3f}".format(
          recall_score(self.y_test,self.ypred,pos_label=n))
      )
    # end for
  # end of function getClassificationMetricsForPreds
  #
  def getClassificationMetricsForScores(self):
    """Compute metrics for classification models using the scores.
    Parameters
    ----------
    self : class-like
      class object
    """
    posLabel = np.unique(self.y_test)
    P("%-40s" % ("ROC AUC Score:"),
      "{:.3f}".format(roc_auc_score(self.y_test, self.yscore[:,1]))
    )
    P("%-40s" % ("Brier Score:"), "{:.3f}".format(
      ModelPipeline.brierScore(self.y_test, self.yscore))
    )
    for n in posLabel:
      P("%-40s" % ("Avrg Precision Score Class "+str(n)+" :"), 
        "{:.3f}".format(
          average_precision_score(self.y_test,self.yscore[:,1],pos_label=n))
      )
    # end for
  # end of function getClassificationMetricsForScores
  #
  def getClassificationMetrics(self):
    """Compute metrics for classification models.
    Parameters
    ----------
    self : class-like
      class object
    """
    P("\nModel Metrics:")
    ModelPipeline.getClassificationMetricsForPreds(self)
    if not "RidgeClassifier" in str(self.estimator):
      ModelPipeline.getClassificationMetricsForScores(self)
    # end if
    _ = ModelPipeline.dispConfMatrixAsArray(self.y_test,self.ypred,disp=True)
  # end of function getClassificationMetrics
  #
  def modelTrain(self):
    """Training pipeline.
    """
    self.estimator=self.estimator.fit(self.x_train,self.y_train)
    return self
  # end of function modelTrain
  #
  def modelPredict(self):
    """Predict pipeline.
    """
    self.ypred=self.estimator.predict(self.x_test)
    if not "RidgeClassifier" in str(self.estimator):
      self.yscore=self.estimator.predict_proba(self.x_test)
    # end if
    ModelPipeline.getClassificationMetrics(self)
    return self
  # end of function modeltrain
# end of class ModelPipeline
#
#
def reshapeArr(arr, est='rf'):
  shp = np.prod(arr.shape[1:])
  if est == 'rf':
    arr = arr.astype('float32')
    arr = arr.reshape(-1,shp)
  return arr
#
#
def printClassImbalance(y):
  nmin = min((y==1).sum(), (y==0).sum())
  nmaj = max((y==1).sum(), (y==0).sum())
  P(">> Nbr of samples in minority class: %s" % nmin)
  P(">> Nbr of samples in majority class: %s" % nmaj)
  P(">> Class imbalance: %s " % np.round(nmin/len(y), 3))
#
#

In [3]:
# === READ ENCODED DATA ===
# change path_to_module by the folder path
# update encoded_data_for_experiments.pkl by the name of your data file
with open(path_to_module+'encoded_data_for_experiments.pkl', 'rb') as f: 
  data = pkl.load(f)
P('Available encoded data sets for experiments:')
P(*list(data.keys()), sep="\n")

Available encoded data sets for experiments:
listgarten_elevation_cd33
CIRCLE_seq_10gRNA
SITE_seq_offtarget
elevation_guideseq
Listgarten_22gRNA
Kleinstiver_5gRNA
listgarten_elevation_hmg


In [4]:
# === DATA SPLIT ===
# select listgarten_elevation_cd33 for binary classification
dataset = 'listgarten_elevation_cd33'
X = data[dataset].data
y = data[dataset].target
#
# print class imbalance
P('Class Imbalance Statistics:')
printClassImbalance(y)
#
# train-test split with stratification
xtrain, xtest, ytrain, ytest = ttSplit(
  X, y, test_size = .3, random_state = 42, stratify = y)
#
# 3 dimensionals are converted to 2d for classifier
xtrain = reshapeArr(xtrain)
xtest = reshapeArr(xtest)

Class Imbalance Statistics:
>> Nbr of samples in minority class: 2273
>> Nbr of samples in majority class: 2580
>> Class imbalance: 0.468 


In [5]:
# === MODEL FIT AND PREDICT ===
clf = RF(n_estimators = 100, random_state = 42)
mdl = ModelPipeline(
  clf, xtrain, ytrain, xtest, ytest).modelTrain()
mdl = mdl.modelPredict()


Model Metrics:
Mean Accuracy:                           0.740
F1 Score Class 0.0 :                     0.716
Recall Score Class 0.0 :                 0.615
F1 Score Class 1.0 :                     0.761
Recall Score Class 1.0 :                 0.883
ROC AUC Score:                           0.885
Brier Score:                             0.157
Avrg Precision Score Class 0.0 :         0.352
Avrg Precision Score Class 1.0 :         0.882

Confusion Matrix
TN: 476   |  FP: 298  
FN: 80    |  TP: 602  
