In [None]:
# default_exp predicttarg

# predicttarg

> Rule set 3 target-site predictions

In [None]:
# export
from rs3 import targetfeat
import joblib
import os

In [None]:
import lightgbm
import pandas as pd
from rs3 import targetdata
from scipy import stats

In [None]:
# load_target_model looks for target_model.pkl next to __file__, so point
# __file__ at this notebook inside the current working directory
__file__ = '{}/03_predicttarg.ipynb'.format(os.path.abspath(''))

In [None]:
# export
def load_target_model():
    """Read the serialized Rule Set 3 target model from disk

    The file ``target_model.pkl`` is looked up in the directory that
    contains this module (the directory of ``__file__``)

    :return: the object deserialized by ``joblib.load``
    """
    module_dir = os.path.dirname(__file__)
    model_path = os.path.join(module_dir, 'target_model.pkl')
    return joblib.load(model_path)

In [None]:
# The loaded model object must expose a LightGBM regressor under the 'regressor' key
assert (type(load_target_model()['regressor'])
        is lightgbm.sklearn.LGBMRegressor)



In [None]:
# export
def predict_target(design_df, aa_seq_df, protein_domain_df,
                   id_cols=None):
    """Score sgRNA designs with the Rule Set 3 target model

    Builds the 'position', 'aa' and 'domain' target features for the designs
    and feeds them to the model returned by ``load_target_model``

    :param design_df: DataFrame, sgRNA designs handed to the feature builder
    :param aa_seq_df: DataFrame, forwarded to the feature builder as ``aa_seq_df``
    :param protein_domain_df: DataFrame, forwarded to the feature builder as
        ``protein_domain_df``
    :param id_cols: list or str, identifier columns forwarded to the feature
        builder; when None, defaults to 'sgRNA Context Sequence',
        'Target Cut Length', 'Target Transcript' and 'Orientation'
    :return: predictions as returned by the model's ``predict`` method
        (originally documented as a list; presumably array-like with one
        value per row of the feature DataFrame -- TODO confirm)
    """
    if id_cols is None:
        id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
    target_model = load_target_model()
    feature_df, feature_cols = targetfeat.build_target_feature_df(
        design_df,
        features=['position', 'aa', 'domain'],  # feature classes are fixed for this model
        aa_seq_df=aa_seq_df,
        protein_domain_df=protein_domain_df,
        id_cols=id_cols,
    )
    # Only the columns reported by the feature builder are passed to the model
    return target_model.predict(feature_df[feature_cols])

In [None]:
# Example sgRNA designs (tab-delimited) used to exercise predict_target
design_df = pd.read_table('test_data/sgrna-designs.txt')
# Amino acid sequences for the designs, built with 2 parallel jobs
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
# Protein domain information for every unique id in aa_seq_df
domain_df = targetdata.build_translation_overlap_df(
    aa_seq_df['id'].unique(),
    n_jobs=2,
)
predictions = predict_target(design_df=design_df,
                             aa_seq_df=aa_seq_df,
                             protein_domain_df=domain_df)
# Store the scores on design_df; the validation cell below reads this column
design_df['Target Score'] = predictions

Getting amino acid sequences


100%|██████████| 4/4 [00:00<00:00, 91.28it/s]


Getting protein domains


100%|██████████| 200/200 [00:40<00:00,  4.94it/s]


In [None]:
# Activity datasets (file names reference Behan 2019 and Aguirre 2017)
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_df = pd.read_csv('test_data/Aguirre2017_activity.csv')

# Both activity tables are joined to the scored designs on the same key columns
activity_merge_cols = ['sgRNA Sequence', 'sgRNA Context Sequence',
                       'Target Gene Symbol', 'Target Cut %']
sanger_designs = sanger_df.merge(design_df, how='inner', on=activity_merge_cols)
gecko_designs = gecko_df.merge(design_df, how='inner', on=activity_merge_cols)

# Element [0] of the pearsonr result is the correlation coefficient; require
# the target scores to correlate with measured activity above 0.05 in each set
assert stats.pearsonr(
    sanger_designs['avg_mean_centered_neg_lfc'],
    sanger_designs['Target Score'],
)[0] > 0.05
assert stats.pearsonr(
    gecko_designs['avg_mean_centered_neg_lfc'],
    gecko_designs['Target Score'],
)[0] > 0.05
