## Data and Package Importing

In [1]:
# Wildcard imports from the project-local helper modules.
# NOTE(review): `sc`, `np`, `roc_kfold` and the scikit-learn estimators used further
# down are never imported explicitly in this notebook, so they presumably arrive
# through these star imports — confirm, and prefer explicit imports.
from QC import *
from ml_utils import *
# Configure scanpy's figure parameters with dpi=80.
sc.settings.set_figure_params(dpi=80)

In [None]:
# Reload the ml_utils module while working on it: drop the cached module object
# so that the star import below re-executes ml_utils.py and picks up any edits.
import sys
# pop() with a default instead of `del`, so this cell also runs on a kernel in
# which ml_utils has not been imported yet (no KeyError).
sys.modules.pop('ml_utils', None)
from ml_utils import *

### Load labeled data

In [2]:
# Read the labeled dataset (.h5ad file) into `s1`.
s1 = sc.read_h5ad(
    filename='3247-AS-1-GCCAAT_S1_labeled.h5ad',
)

In [3]:
%%time
sc.pp.normalize_total(s1, target_sum=10000) # each gene count value is divided by the total number of counts for that respective cell  
sc.pp.log1p(s1) # log1p normalization
sc.pp.scale(s1, max_value=10) # scaling by variance and centering to zero for visualization
sc.tl.pca(s1, n_comps=100) # perform PCA

CPU times: user 1min 59s, sys: 7.3 s, total: 2min 6s
Wall time: 29.9 s


In [13]:
# Binarize droplet labels to see how classifiers can detect live cells:
# 1 where label == 'live', 0 for every other label.
s1.obs['label_bin'] = (s1.obs['label'] == 'live').astype('int64')

---
### Machine Learning Classifiers

Test using wrapper function from `ml_utils.py`

In [14]:
# List the distinct values of the binarized label; this is the order of labels for plots.
s1.obs['label_bin'].unique()

array([1, 0])

In [None]:
%%time
# Logistic Regression Classifier
lr = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial', max_iter=1000)
lr_out = roc_kfold(clf=lr, X=s1.obsm['X_pca'], y=s1.obs['label_bin'], k=5, seed=18)

In [None]:
%%time
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
gbc_out = roc_kfold(clf=gbc, X=s1.obsm['X_pca'], y=s1.obs['label_bin'], k=5, seed=18)

In [None]:
%%time
# AdaBoost Classifier
abc = AdaBoostClassifier()
abc_out = roc_kfold(clf=abc, X=s1.obsm['X_pca'], y=s1.obs['label_bin'], k=5, seed=18)

---

So it looks like the __Gradient Boosting Classifier__ is our best bet based on AUROC and Accuracy values.  
Next, we can tune its hyperparameters using `RandomizedSearchCV` to get the best result, and then compare the optimized classifier against the default one.

In [None]:
from pprint import pprint

In [None]:
# Generate grid of hyperparameters to test for the GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
# Number of boosting stages (trees) in the ensemble: 100, 200, ..., 1000
n_estimators = list(range(100, 1001, 100))
# Loss function to be optimized
# NOTE(review): scikit-learn 1.1 renamed 'deviance' to 'log_loss' and 1.3 removed
# the old name — switch this entry when running on a newer release.
loss = ['deviance', 'exponential']
# Maximum number of levels in each tree: 10, 20, ..., 110, plus None (no limit)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for GradientBoostingClassifier it was an
# alias of 'sqrt' (so the grid would hold the same setting twice) and it is no
# longer accepted by scikit-learn >= 1.3.
max_features = ['sqrt', 'log2']
# Learning rate: 0.1, 0.2, ..., 1.0
learning_rate = np.linspace(0.1, 1, 10)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid (keys are GradientBoostingClassifier parameter names)
random_grid = {'n_estimators': n_estimators,
               'loss': loss,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate}
pprint(random_grid)

In [None]:
%%time
# Use the random grid to search for best hyperparameters
# First create the base model to tune
gbc = GradientBoostingClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
gbc_random = RandomizedSearchCV(estimator = gbc, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=18, n_jobs = -1)
# Fit the random search model
gbc_random.fit(s1.obsm['X_pca'], s1.obs['alive'])

In [None]:
# Display the hyperparameter combination selected by the randomized search.
gbc_random.best_params_

In [None]:
# Gradient Boosting Classifier with tuned hyperparameters
# (presumably the values reported by gbc_random.best_params_ — confirm after re-running the search)
gbc = GradientBoostingClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=4,
    # was 'auto': for this estimator 'auto' meant sqrt(n_features), and the alias
    # is no longer accepted by scikit-learn >= 1.3
    max_features='sqrt',
    max_depth=70,
    loss='exponential',
    learning_rate=0.3,
    # feature sub-sampling makes the fit stochastic; seed it for reproducibility
    random_state=0,
)
# Evaluate against the same binarized label as the untuned classifiers so the
# metrics are directly comparable. The previous target, s1.obs['alive'], is not
# defined anywhere in this notebook.
mets = roc_kfold(clf=gbc, X=s1.obsm['X_pca'], y=s1.obs['label_bin'], k=5, seed=18)