In [None]:
# default_exp predicttarg

# predicttarg

> Rule set 3 target-site predictions

In [None]:
# export
from rs3 import targetfeat
import joblib
import os

In [None]:
import lightgbm
import pandas as pd
from rs3 import targetdata
from scipy import stats

In [None]:
# Notebook-only setup (this cell is not exported): load_target_model() locates the model
# pickles via os.path.dirname(__file__), so define __file__ as a path inside the current
# working directory. Presumably needed because a notebook kernel does not set __file__.
__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'
import multiprocessing
# CPU count reported by the OS; used below as n_jobs for targetdata.build_conservation_df.
max_n_jobs = multiprocessing.cpu_count()

In [None]:
# export
def load_target_model(lite=False):
    """Load a serialized Rule Set 3 target model stored next to this module

    :param lite: bool, if truthy load 'target_lite_model.pkl', otherwise 'target_model.pkl'
    :return: the object deserialized by joblib.load
    """
    model_name = 'target_lite_model.pkl' if lite else 'target_model.pkl'
    # The pickles are looked up in the same directory as this file
    module_dir = os.path.dirname(__file__)
    model_path = os.path.join(module_dir, model_name)
    return joblib.load(model_path)

In [None]:
# The default (non-lite) model must expose a LightGBM regressor under the 'regressor' key.
# isinstance is used instead of an exact type() comparison so a subclass also passes.
assert isinstance(load_target_model()['regressor'], lightgbm.sklearn.LGBMRegressor)



In [None]:
# export
def predict_target(design_df, aa_seq_df, protein_domain_df=None, conservation_df=None,
                   id_cols=None, lite=False):
    """Make predictions using the Rule Set 3 target model

    :param design_df: DataFrame, passed to targetfeat.build_target_feature_df
    :param aa_seq_df: DataFrame, passed to targetfeat.build_target_feature_df
    :param protein_domain_df: DataFrame, passed to targetfeat.build_target_feature_df;
        the 'domain' feature is only requested when lite is False
    :param conservation_df: DataFrame, passed to targetfeat.build_target_feature_df;
        the 'conservation' feature is only requested when lite is False
    :param id_cols: list or str, defaults to ['sgRNA Context Sequence', 'Target Cut Length',
        'Target Transcript', 'Orientation'] when None
    :param lite: bool, whether to use the lite model
    :return: the value returned by the model's predict method for the target feature columns
    """
    model = load_target_model(lite=lite)
    if id_cols is None:
        id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
    # The lite model is limited to position and amino acid features
    features = ['position', 'aa'] if lite else ['position', 'aa', 'domain', 'conservation']
    feature_df, feature_cols = targetfeat.build_target_feature_df(
        design_df,
        features=features,
        aa_seq_df=aa_seq_df,
        protein_domain_df=protein_domain_df,
        conservation_df=conservation_df,
        id_cols=id_cols)
    # Only the feature columns reported by targetfeat are handed to the model
    return model.predict(feature_df[feature_cols])

In [None]:
# End-to-end test: score the example designs with the full and the lite model and check
# that the two sets of scores agree.
design_df = pd.read_table('test_data/sgrna-designs.txt')
# Build the three inputs of the full model. The recorded output below shows the
# conservation step taking about five minutes for 200 items.
# NOTE(review): these helpers presumably query a remote service -- confirm before running offline.
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)
# Full model: uses amino acid, protein domain and conservation inputs
predictions = predict_target(design_df=design_df,
                             aa_seq_df=aa_seq_df,
                             protein_domain_df=domain_df,
                             conservation_df=conservation_df)
# Scores are stored on design_df in place; the benchmark cell further down reads 'Target Score'
design_df['Target Score'] = predictions
# Lite model: only the amino acid sequences are supplied
lite_predictions = predict_target(design_df=design_df,
                                  aa_seq_df=aa_seq_df,
                                  lite=True)
design_df['Target Score Lite'] = lite_predictions
# Full and lite scores are required to have a Pearson r above 0.7
assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7

Getting amino acid sequences


100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


Getting protein domains


100%|██████████| 200/200 [00:50<00:00,  3.97it/s]


Getting conservation


100%|██████████| 200/200 [04:58<00:00,  1.49s/it]


In [None]:
# Benchmark test: compare 'Target Score' with measured activity from two datasets.
# Depends on design_df already carrying the 'Target Score' column added by the previous cell.
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')

# Inner merges keep only the rows whose four key columns match a row in design_df
sanger_designs = sanger_df.merge(design_df, how='inner',
                                 on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                     'Target Cut %'])
gecko_designs = gecko_df.merge(design_df, how='inner',
                                on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                    'Target Cut %'])
# Minimum Pearson r between measured activity and 'Target Score': 0.2 for the Behan2019
# data and 0.05 for the Aguirre2016 data
assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],
                      sanger_designs['Target Score'])[0] > 0.2
assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],
                      gecko_designs['Target Score'])[0] > 0.05
