## Predicting success or failure of OpenCell targets
__Keith Cheveralls__<br>
__October 2021__

This notebook documents attempts to understand which features were important in determining whether a given protein could be successfully tagged using our split-FP approach to endogenous tagging ('successful' meaning that mNeonGreen signal was detected by fluorescence microscopy).

The results from this analysis are not used in the final 2021-opencell manuscript.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import scanpy as sc
import sklearn

from sklearn import inspection, metrics, model_selection
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot as plt
from matplotlib import rcParams

In [None]:
# Plot styling: scanpy's figure parameters first, then matplotlib rcParams overrides.
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False)
rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 12,
    'legend.fontsize': 12,
    'axes.grid': False,
    'figure.figsize': (5, 3),
})

In [None]:
# Read the 'library_success' sheet of the supplementary table, then discard
# the rows whose library_success value is 'WHOLE_PROTEOME'.
df = pd.read_excel(
    '../data/Suppl_Table_3_library_success.xlsx',
    sheet_name='library_success',
)
df = df[df['library_success'] != 'WHOLE_PROTEOME'].copy()

In [None]:
# normalize column names: lowercase, with any '?' removed
df.rename(columns={s: s.lower().replace('?', '') for s in df.columns}, inplace=True)
df.rename(columns={'is_essential': 'essential'}, inplace=True)

# Recode the string labels as booleans. The result is assigned back to the column
# instead of calling `replace(..., inplace=True)` on `df.essential`: that chained
# in-place form modifies an intermediate object, is deprecated by pandas, and
# leaves `df` unchanged when copy-on-write is enabled.
df['essential'] = df['essential'].replace({'Essential': True, 'Non-essential': False})
df['library_success'] = df['library_success'].replace({'successful': True, 'unsuccessful': False})

# nicknames for feature columns
df.rename(columns={'log_hek_rna_tpm': 'rna', 'log_hek_conc_nm': 'ms', 'hdr_unsorted': 'hdr'}, inplace=True)

# boolean flag for n-terminal tag
df['nterm'] = df.terminus_tagged == 'N'

# drop NAs (before coercing to boolean, because a missing value would coerce to True);
# rows with a missing outcome label are dropped too, since they cannot be classified
df.dropna(axis=0, how='any', subset=['rna', 'ms', 'essential', 'library_success'], inplace=True)

# coerce the flag and the label to a real boolean dtype, so that `~df.library_success`
# and the classifiers downstream do not depend on pandas downcasting object columns
df['essential'] = df.essential.astype(bool)
df['library_success'] = df.library_success.astype(bool)

df.shape

In [None]:
# targets with unsorted HDR
# subset of targets with a non-missing 'hdr' (unsorted HDR) value
df_uns = df[df['hdr'].notna()].copy()
df_uns.shape

### Exploratory analysis

In [None]:
# mean of each feature (of targets: 27% essential, 52% n-terminus, 76% successful)
# numeric_only=True keeps the numeric and boolean columns only; the frame also holds
# string columns (e.g. terminus_tagged), on which pandas >= 2.0 raises a TypeError
df.mean(numeric_only=True)

In [None]:
# per-outcome mean of each numeric/boolean feature; numeric_only=True skips the
# string columns, which would otherwise raise in pandas >= 2.0
df.groupby('library_success').mean(numeric_only=True)

In [None]:
# density histograms of 'rna', one per library_success value, each normalized on its own
sns.histplot(
    data=df,
    x='rna',
    hue='library_success',
    stat='density',
    common_norm=False,
)

In [None]:
# density histograms of 'ms', one per library_success value, each normalized on its own
sns.histplot(
    data=df,
    x='ms',
    hue='library_success',
    stat='density',
    common_norm=False,
)

In [None]:
# density histograms of 'ms', split by the 'essential' flag, each normalized on its own
sns.histplot(
    data=df,
    x='ms',
    hue='essential',
    stat='density',
    common_norm=False,
)

In [None]:
# histogram of 'ms' colored by the 'nterm' flag (no density normalization requested here)
sns.histplot(data=df, x='ms', hue='nterm')

In [None]:
# density histograms of 'hdr' for the unsorted-HDR subset, one per library_success value
sns.histplot(
    data=df_uns,
    x='hdr',
    hue='library_success',
    stat='density',
    common_norm=False,
)

In [None]:
# flag targets whose 'ms' value is above 29.3, then compare the 'hdr' distributions
# of the unsuccessful targets on either side of that cutoff
df_uns['thresh'] = df_uns['ms'].gt(29.3)
sns.histplot(
    data=df_uns[~df_uns['library_success']],
    x='hdr',
    hue='thresh',
    stat='density',
    common_norm=False,
)

In [None]:
# density histograms of 'hdr' for the unsorted-HDR subset, split by the 'essential' flag
sns.histplot(
    data=df_uns,
    x='hdr',
    hue='essential',
    stat='density',
    common_norm=False,
)

In [None]:
# median of each numeric/boolean column; numeric_only=True skips the string columns,
# on which pandas >= 2.0 raises a TypeError
df_uns.median(numeric_only=True)

In [None]:
# same medians restricted to the unsuccessful targets (numeric_only=True skips the
# string columns, on which pandas >= 2.0 raises a TypeError)
df_uns.loc[~df_uns.library_success].median(numeric_only=True)

In [None]:
# logistic fit of library_success against 'ms' (500 bootstrap resamples, y_jitter of .05)
sns.regplot(x='ms', y='library_success', data=df, logistic=True, y_jitter=.05, n_boot=500)

### Logistic regression for all targets

In [None]:
# feature columns used for the multi-feature models (this name is reassigned in later cells)
columns = ['rna', 'ms', 'nterm', 'essential']

In [None]:
def predict(df, columns, kind, split=False, balance=False, random_state=None, test_size=None):
    """
    Fit a classifier of `df.library_success` on the given feature columns.

    Parameters
    ----------
    df : dataframe with a `library_success` column and the columns named in `columns`
    columns : list of feature column names
    kind : 'forest' (RandomForestClassifier) or 'logit' (LogisticRegression)
    split : if True, fit on a stratified training subset and hold out the rest;
        if False, the 'train' and 'test' sets are both copies of the full data,
        so every returned score is in-sample
    balance : if True, fit with class_weight='balanced' (applied to both kinds)
    random_state : forwarded to train_test_split when `split` is True;
        None (the default) gives a different split on every call
    test_size : forwarded to train_test_split when `split` is True;
        None (the default) keeps train_test_split's own default

    Returns
    -------
    data : dict with X_train, X_test, y_train, y_test, plus
        y_test_pred (second column of predict_proba on X_test) and
        y_pred (second column of predict_proba on all rows of df, which
        includes the training rows when `split` is True)
    classifier : the fitted estimator

    Raises
    ------
    ValueError : if `kind` is not 'forest' or 'logit'
    """
    # validate up front: previously an unknown kind fell through both branches
    # and surfaced as an UnboundLocalError at `classifier.fit`
    if kind not in ('forest', 'logit'):
        raise ValueError("kind must be 'forest' or 'logit', got %r" % (kind,))

    y = df.library_success.values
    X = df[columns].values

    if split:
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_state
        )
    else:
        X_train, X_test = X.copy(), X.copy()
        y_train, y_test = y.copy(), y.copy()

    class_weight = 'balanced' if balance else None
    if kind == 'forest':
        classifier = RandomForestClassifier(
            random_state=0, oob_score=True, class_weight=class_weight
        )
    else:
        classifier = LogisticRegression(solver='lbfgs', class_weight=class_weight)

    classifier.fit(X_train, y_train)

    # column 1 of predict_proba is the probability of the second entry of classifier.classes_
    y_pred = classifier.predict_proba(X)[:, 1]
    y_test_pred = classifier.predict_proba(X_test)[:, 1]

    data = dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        y_test_pred=y_test_pred,
        y_pred=y_pred,
    )
    return data, classifier

In [None]:
# logistic regression on 'ms' alone, fit and scored on all targets (no held-out split);
# plot the predicted probabilities, colored by the true outcome
res, classifier = predict(df, ['ms'], kind='logit', split=False, balance=False)

res = pd.DataFrame({key: res[key] for key in ('y_test', 'y_test_pred')})
sns.displot(res, x='y_test_pred', hue='y_test')

### Results for all targets

In [None]:
# logistic regression on 'ms' with all targets; report at a 0.5 probability cutoff
res, classifier = predict(df, ['ms'], kind='logit', split=False, balance=False)

y, y_pred = res['y_test'], res['y_test_pred']
print(metrics.classification_report(y_true=y, y_pred=y_pred > 0.5))

In [None]:
# predicted class for one sample whose single feature value is 27.39
# (presumably 'ms', assuming `classifier` is the one-feature model from the preceding cell)
classifier.predict([[27.39]])

In [None]:
# mean HDR efficiency overall, then broken down by true outcome and by prediction
(
    df['hdr'].mean(),

    # failures: all, predicted to work, predicted to fail
    df.loc[~y, 'hdr'].mean(),
    df.loc[~y & (y_pred > 0.5), 'hdr'].mean(),
    df.loc[~y & (y_pred < 0.5), 'hdr'].mean(),

    # successes: all, predicted to work, predicted to fail
    df.loc[y, 'hdr'].mean(),
    df.loc[y & (y_pred > 0.5), 'hdr'].mean(),
    df.loc[y & (y_pred < 0.5), 'hdr'].mean(),
)

In [None]:
# 'hdr' against 'ms': true successes first, then true failures
mask = y
plt.scatter(df.loc[mask, 'ms'], df.loc[mask, 'hdr'])
plt.scatter(df.loc[~mask, 'ms'], df.loc[~mask, 'hdr'])

In [None]:
# 'hdr' against 'ms': targets predicted to succeed first, then the remaining targets
mask = y_pred > 0.5
plt.scatter(df.loc[mask, 'ms'], df.loc[mask, 'hdr'])
plt.scatter(df.loc[~mask, 'ms'], df.loc[~mask, 'hdr'])

In [None]:
# failed targets that the model predicted would work
dff = df.loc[~y & (y_pred > 0.5)]
plt.scatter(dff['ms'], dff['hdr'], label='Failures predicted to work')

# successful targets that the model predicted would fail
dff = df.loc[y & (y_pred < 0.5)]
plt.scatter(dff['ms'], dff['hdr'], label='Successes predicted to fail')

plt.legend()
plt.xlabel('protein abundance')
plt.ylabel('HDR efficiency')
# plt.savefig('/Users/keith.cheveralls/Box/KC-opencell-paper/crispr-success-abundance-hdr-wrong-predictions.pdf')

### Results for targets with unsorted HDR

In [None]:
# logistic regression on 'ms' for the unsorted-HDR subset; report at a 0.5 cutoff
res, classifier = predict(df_uns, ['ms'], kind='logit', split=False)

y, y_pred = res['y_test'], res['y_test_pred']
print(metrics.classification_report(y_true=y, y_pred=y_pred > 0.5))

In [None]:
# logistic regression on 'ms' and 'hdr' for the unsorted-HDR subset; report at a 0.5 cutoff
res, classifier = predict(df_uns, ['ms', 'hdr'], kind='logit', split=False, balance=False)

y, y_pred = res['y_test'], res['y_test_pred']
print(metrics.classification_report(y_true=y, y_pred=y_pred > 0.5))

In [None]:
# mean HDR efficiency: all targets, failures predicted to work, successes not predicted to work
(
    df_uns['hdr'].mean(),
    df_uns.loc[~y & (y_pred > 0.5), 'hdr'].mean(),
    df_uns.loc[y & ~(y_pred > 0.5), 'hdr'].mean(),
)

In [None]:
# aside: random forest predictions on a held-out test set
# (predict() sets no test_size, so train_test_split's default 75/25 split applies)
res, classifier = predict(df, columns, kind='forest', split=True)

res = pd.DataFrame({key: res[key] for key in ('y_test', 'y_test_pred')})
sns.displot(res, x='y_test_pred', hue='y_test')

In [None]:
# classification report for the held-out predictions at a 0.5 probability cutoff
# (`res` is the two-column dataframe built in the previous cell)
y, y_pred = res['y_test'], res['y_test_pred']
print(metrics.classification_report(y_true=y, y_pred=y_pred > 0.5))

### Logistic regression using one column at a time and all columns

In [None]:
# single-feature models: in-sample ROC AUC and classification report for each column
d = df.copy()

kind = 'logit'
columns = ['rna', 'ms', 'essential', 'nterm'] #+ ['hdr']
for column in columns:

    # fit on this column only (no split, so scores are in-sample)
    res, classifier = predict(d, [column], kind=kind)

    # AUC from the predicted probabilities for every row of d
    roc = metrics.roc_auc_score(d['library_success'], res['y_pred'])
    print('%s (ROC %0.2f)' % (column, roc))

    # report at the 0.5 probability cutoff
    print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))


In [None]:
# all combinations of two columns
# Each unordered pair is visited once. The previous nested loop over ordered pairs
# fitted every pair twice, as (a, b) and again as (b, a).
d = df.copy()

kind = 'logit'
columns = ['rna', 'ms', 'essential', 'nterm'] #+ ['hdr']
for ind, col_1 in enumerate(columns):
    for col_2 in columns[ind + 1:]:

        # in-sample ROC AUC of the logistic model on this pair
        res, classifier = predict(d, [col_1, col_2], kind='logit')
        roc = metrics.roc_auc_score(d.library_success, res['y_pred'])

        # out-of-bag accuracy of the random forest on the same pair
        res, classifier = predict(d, [col_1, col_2], kind='forest')
        oob = classifier.oob_score_
        print("'%s' ROC: %d | OOB: %d" % ([col_1, col_2], 100*roc, 100*oob))

In [None]:
# in-sample report for 'hdr' alone (unsorted-HDR subset)
res, classifier = predict(df_uns, ['hdr'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

In [None]:
# in-sample report for two columns, 'ms' and 'nterm' (all targets)
res, classifier = predict(df, ['ms', 'nterm'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

In [None]:
# in-sample report for 'ms' alone (unsorted-HDR subset)
res, classifier = predict(df_uns, ['ms'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

In [None]:
# in-sample report for 'ms' and 'nterm' (unsorted-HDR subset)
res, classifier = predict(df_uns, ['ms', 'nterm'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

In [None]:
# in-sample report for 'ms' and 'hdr' (unsorted-HDR subset)
res, classifier = predict(df_uns, ['ms', 'hdr'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

In [None]:
# in-sample report for four columns: 'ms', 'hdr', 'nterm' and 'rna' (unsorted-HDR subset)
res, classifier = predict(df_uns, ['ms', 'hdr', 'nterm', 'rna'], kind='logit', split=False)
print(metrics.classification_report(y_true=res['y_test'], y_pred=res['y_test_pred'] > 0.5))

### Cross-validated precision and recall for successes and failures

In [None]:
def cross_validate_precision_recall(df, columns):
    """
    Cross-validated accuracy, precision and recall of a logistic regression
    that predicts `df.library_success` from the given feature columns.

    Scores are averaged over 10 stratified shuffle splits (30% test, random_state=0).
    Precision and recall are reported twice: with success (True) as the positive
    class and with failure (False) as the positive class.

    Parameters
    ----------
    df : dataframe with a `library_success` column and the columns named in `columns`
    columns : list of feature column names

    Returns
    -------
    dict mapping 'accuracy', 'success_precision', 'success_recall',
    'failure_precision' and 'failure_recall' to integer percentages
    """
    X = df[columns].values
    y = df.library_success.values > 0

    classifier = LogisticRegression(solver='lbfgs')
    cv = sklearn.model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

    # All five metrics are scored on the same fitted models in one cross-validation
    # pass. The failure metrics only change which label counts as positive, so
    # there is no need to refit the classifier on inverted labels.
    scoring = {
        'accuracy': 'accuracy',
        'success_precision': sklearn.metrics.make_scorer(sklearn.metrics.precision_score, pos_label=True),
        'success_recall': sklearn.metrics.make_scorer(sklearn.metrics.recall_score, pos_label=True),
        'failure_precision': sklearn.metrics.make_scorer(sklearn.metrics.precision_score, pos_label=False),
        'failure_recall': sklearn.metrics.make_scorer(sklearn.metrics.recall_score, pos_label=False),
    }
    res = sklearn.model_selection.cross_validate(classifier, X, y, scoring=scoring, cv=cv)

    # round to the nearest percent; plain int() truncates, so e.g. 0.29*100 would give 28
    return {name: int(round(100 * res['test_%s' % name].mean())) for name in scoring}

In [None]:
# cross-validated scores using 'rna' alone (all targets)
cross_validate_precision_recall(df, ['rna'])

In [None]:
# cross-validated scores using 'ms' alone (all targets)
cross_validate_precision_recall(df, ['ms'])

In [None]:
# cross-validated scores using 'ms' and 'nterm' (all targets)
cross_validate_precision_recall(df, ['ms', 'nterm'])

In [None]:
# cross-validated scores using all four features (all targets)
cross_validate_precision_recall(df, ['ms', 'rna', 'nterm', 'essential'])

#### Unsorted HDR

In [None]:
# cross-validated scores using 'rna' alone (unsorted-HDR subset)
cross_validate_precision_recall(df_uns, ['rna'])

In [None]:
# cross-validated scores using 'ms' alone (unsorted-HDR subset)
cross_validate_precision_recall(df_uns, ['ms'])

In [None]:
# cross-validated scores using 'ms' and 'nterm' (unsorted-HDR subset)
cross_validate_precision_recall(df_uns, ['ms', 'nterm'])

In [None]:
# cross-validated scores using 'ms' and 'hdr' (unsorted-HDR subset)
cross_validate_precision_recall(df_uns, ['ms', 'hdr'])

In [None]:
# cross-validated scores using 'ms', 'hdr' and 'nterm' (unsorted-HDR subset)
cross_validate_precision_recall(df_uns, ['ms', 'hdr', 'nterm'])

### Feature importances from random forest

In [None]:
# Fit a random forest on the four features of `d` (the copy of df made in an earlier
# cell) and show its built-in feature importances next to permutation importances.
# Both are computed on the same rows the forest was trained on.
X = d[['rna', 'ms', 'essential', 'nterm']].to_numpy()
y = d['library_success'].to_numpy()

classifier = RandomForestClassifier(random_state=0, oob_score=True)
classifier.fit(X, y)
y_pred = classifier.predict_proba(X)[:, 1]

result = inspection.permutation_importance(
    classifier,
    X,
    y,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)

(classifier.feature_importances_, result['importances_mean'])