In [1]:
import pandas as pd
import gpplot
from scipy import stats
from rs3.predict import predict

from datasets import (GuideDataset, human_designs,
                      dataset_list)

In [2]:
# Names of the datasets listed in train_data_names.csv (its 'name' column).
train_data_names = pd.read_csv('../data/processed/train_data_names.csv')['name'].tolist()
# Per-guide annotations for the Hanna2020 aacc03 screen.
hanna_aacc03_annotations = pd.read_csv(
    '../data/raw/Hanna2020_aacc03_annotations.csv'
)

In [3]:
# Describe the Hanna2020 aacc03 activity file to GuideDataset: where it lives,
# which columns to read, and which design file it is paired with.
hanna_aacc03_data = GuideDataset(
    filepath='../data/processed/Hanna2020_aacc03_activity.csv',
    name='Hanna2020_Essential',
    # Column-name arguments
    sgrna_seq_col='sgRNA Sequence',
    context_seq_col='sgRNA Context Sequence',
    rank_col='scaled_neg_lfc',
    sgrna_group_col='Target Gene Symbol',
    cut_perc_col='Target Cut %',
    # Dataset-level settings
    endogenous=True,
    genomewide=False,
    tracr='Chen2013',
    design_file=human_designs,
)

# Load the file and set its sgRNAs before requesting the matching designs.
hanna_aacc03_data.load_data()
hanna_aacc03_data.set_sgrnas()
hanna_aacc03_designs = hanna_aacc03_data.get_designs()
hanna_aacc03_designs

Unnamed: 0,Input,Quota,Target Taxon,Target Gene ID,Target Gene Symbol,Target Transcript,Target Alias,CRISPR Mechanism,Target Domain,Reference Sequence,...,On-Target Ruleset,On-Target Efficacy Score,On-Target Rank,Off-Target Rank,On-Target Rank Weight,Off-Target Rank Weight,Combined Rank,Pick Order,Picking Round,Picking Notes
0,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,Azimuth_2.0,0.6142,9,5,1.0,1.0,2,1.0,1.0,
1,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,Azimuth_2.0,0.5501,16,1,1.0,1.0,4,2.0,1.0,
2,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,Azimuth_2.0,0.6165,8,9,1.0,1.0,5,11.0,3.0,Previously skipped due to: Outside Target Wind...
3,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,Azimuth_2.0,0.5112,20,6,1.0,1.0,8,4.0,1.0,
4,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,Azimuth_2.0,0.5017,24,4,1.0,1.0,10,6.0,2.0,Previously skipped due to: Spacing Violation: ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,Azimuth_2.0,0.3337,154,120,1.0,1.0,158,120.0,3.0,Previously skipped due to: Outside Target Wind...
2399,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,Azimuth_2.0,0.4010,144,132,1.0,1.0,159,81.0,2.0,Previously skipped due to: Spacing Violation: ...
2400,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,Azimuth_2.0,0.4183,141,143,1.0,1.0,160,82.0,2.0,Previously skipped due to: Spacing Violation: ...
2401,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,Azimuth_2.0,0.4390,133,158,1.0,1.0,161,153.0,4.0,Previously skipped due to: Outside Target Wind...


In [4]:
# Score every design row with RS3 using the Chen2013 tracr. With target=True
# the displayed result carries a 'Target Score' column and a combined
# sequence + target score alongside the sequence score.
hanna_aacc03_scores = predict(
    hanna_aacc03_designs,
    tracr='Chen2013',
    target=True,
    n_jobs_min=2,
    n_jobs_max=12,
    lite=False,
)
hanna_aacc03_scores

Calculating sequence-based features


100%|██████████| 2403/2403 [00:05<00:00, 433.14it/s] 


Getting amino acid sequences


100%|██████████| 1/1 [00:00<00:00,  4.52it/s]


Getting protein domains


100%|██████████| 10/10 [00:02<00:00,  4.27it/s]


Getting conservation


100%|██████████| 10/10 [00:00<00:00, 92.42it/s]


Unnamed: 0,Input,Quota,Target Taxon,Target Gene ID,Target Gene Symbol,Target Transcript,Target Alias,CRISPR Mechanism,Target Domain,Reference Sequence,...,Off-Target Rank Weight,Combined Rank,Pick Order,Picking Round,Picking Notes,RS3 Sequence Score (Chen2013 tracr),Transcript Base,Missing conservation information,Target Score,RS3 Sequence (Chen2013 tracr) + Target Score
0,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,1.0,2,1.0,1.0,,0.462968,ENST00000009589,False,-0.082090,0.380878
1,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,1.0,4,2.0,1.0,,0.681802,ENST00000009589,False,-0.049504,0.632298
2,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,1.0,5,11.0,3.0,Previously skipped due to: Outside Target Wind...,0.183004,ENST00000009589,False,-0.167990,0.015015
3,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,1.0,8,4.0,1.0,,-0.537057,ENST00000009589,False,0.051581,-0.485476
4,ENSG00000008988,MAX,9606,ENSG00000008988,RPS20,ENST00000009589.7,,CRISPRko,CDS,NC_000008.11,...,1.0,10,6.0,2.0,Previously skipped due to: Spacing Violation: ...,0.115983,ENST00000009589,False,0.095028,0.211011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,1.0,158,120.0,3.0,Previously skipped due to: Outside Target Wind...,-0.448485,ENST00000642195,False,-0.174673,-0.623158
2399,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,1.0,159,81.0,2.0,Previously skipped due to: Spacing Violation: ...,-1.069427,ENST00000642195,False,0.095516,-0.973910
2400,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,1.0,160,82.0,2.0,Previously skipped due to: Spacing Violation: ...,0.048147,ENST00000642195,False,0.554555,0.602702
2401,ENSG00000171453,MAX,9606,ENSG00000171453,POLR1C,ENST00000642195.1,,CRISPRko,CDS,NC_000006.12,...,1.0,161,153.0,4.0,Previously skipped due to: Outside Target Wind...,0.341778,ENST00000642195,False,0.157797,0.499575


In [5]:
hanna_activity = hanna_aacc03_data.get_sg_df(include_group=True, include_activity=True)

# Rename the group column to match the scored designs, then inner-join so that
# only guides present in both the activity table and the score table remain.
hanna_activity_predictions = (
    hanna_activity
    .rename(columns={'sgRNA Target': 'Target Gene Symbol'})
    .merge(
        hanna_aacc03_scores,
        how='inner',
        on=[
            'sgRNA Sequence',
            'sgRNA Context Sequence',
            'Target Gene Symbol',
            'PAM Sequence',
        ],
    )
)
hanna_activity_predictions

Unnamed: 0,sgRNA Sequence,sgRNA Context Sequence,PAM Sequence,Target Gene Symbol,sgRNA Activity,Input,Quota,Target Taxon,Target Gene ID,Target Transcript,...,Off-Target Rank Weight,Combined Rank,Pick Order,Picking Round,Picking Notes,RS3 Sequence Score (Chen2013 tracr),Transcript Base,Missing conservation information,Target Score,RS3 Sequence (Chen2013 tracr) + Target Score
0,AAAAAATTATGCTGAATACC,GATTAAAAAATTATGCTGAATACCAGGTAG,AGG,KPNB1,0.988858,ENSG00000108424,MAX,9606,ENSG00000108424,ENST00000290158.9,...,1.0,122,179.0,3.0,Previously skipped due to: Outside Target Wind...,-0.416875,ENST00000290158,False,-0.000684,-0.417559
1,AAAACAGGGGCACGAGGGTA,AGGAAAAACAGGGGCACGAGGGTATGGATA,TGG,HNRNPU,-1.111856,ENSG00000153187,MAX,9606,ENSG00000153187,ENST00000444376.7,...,1.0,311,286.0,4.0,Previously skipped due to: Outside Target Wind...,-0.542411,ENST00000444376,False,-0.319682,-0.862093
2,AAAACCGGCGCTGGAACCAC,GGTCAAAACCGGCGCTGGAACCACGGGACA,GGG,PSMA6,1.052915,ENSG00000100902,MAX,9606,ENSG00000100902,ENST00000261479.9,...,1.0,24,34.0,3.0,Previously skipped due to: Outside Target Wind...,0.103319,ENST00000261479,False,-0.252199,-0.148880
3,AAAACTATCAAGTTGGGATC,CCAAAAAACTATCAAGTTGGGATCAGGCAG,AGG,SF3B1,-0.606245,ENSG00000115524,MAX,9606,ENSG00000115524,ENST00000335508.10,...,1.0,146,84.0,2.0,Previously skipped due to: Spacing Violation: ...,0.398501,ENST00000335508,False,-0.402890,-0.004389
4,AAAACTCTTGGAGCAATATA,CCCAAAAACTCTTGGAGCAATATAAGGAAG,AGG,HNRNPU,1.107731,ENSG00000153187,MAX,9606,ENSG00000153187,ENST00000444376.7,...,1.0,138,256.0,4.0,Previously skipped due to: Outside Target Wind...,0.175236,ENST00000444376,False,0.033653,0.208890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,TTTGGCAGCTGTGGGCTTAG,TTTGTTTGGCAGCTGTGGGCTTAGTGGGAG,TGG,KPNB1,-1.262632,ENSG00000108424,MAX,9606,ENSG00000108424,ENST00000290158.9,...,1.0,234,210.0,3.0,Previously skipped due to: Outside Target Wind...,-0.445308,ENST00000290158,False,0.022228,-0.423080
2399,TTTGGCCATTGAAGCTTCAG,TGGATTTGGCCATTGAAGCTTCAGAGGTGA,AGG,KPNB1,-0.175935,ENSG00000108424,MAX,9606,ENSG00000108424,ENST00000290158.9,...,1.0,110,58.0,2.0,Previously skipped due to: Spacing Violation: ...,-0.387132,ENST00000290158,False,-0.121343,-0.508475
2400,TTTGTATATAAATGCCTTAC,ATCTTTTGTATATAAATGCCTTACTGGGAT,TGG,HNRNPU,0.529085,ENSG00000153187,MAX,9606,ENSG00000153187,ENST00000444376.7,...,1.0,170,97.0,2.0,Previously skipped due to: Spacing Violation: ...,-0.739140,ENST00000444376,False,0.173191,-0.565949
2401,TTTGTCCAGGAAGTTGTCCA,ACAATTTGTCCAGGAAGTTGTCCAGGGCAG,GGG,EEF2,-0.203730,ENSG00000167658,MAX,9606,ENSG00000167658,ENST00000309311.7,...,1.0,205,424.0,4.0,Previously skipped due to: Outside Target Wind...,0.015411,ENST00000309311,False,-0.655281,-0.639869


In [6]:
# Keep only the registered datasets whose name appears in train_data_names.
train_data_list = [ds for ds in dataset_list if ds.name in train_data_names]

# Load each selected dataset and pool its sgRNAs into a single set.
train_data_sgs = set()
for ds in train_data_list:
    ds.load_data()
    ds.set_sgrnas()
    train_data_sgs.update(ds.sgrnas)

In [7]:
# Drop every guide whose sequence also appears in the pooled training-set sgRNAs.
filtered_activity_predictions = hanna_activity_predictions.loc[
    lambda df: ~df['sgRNA Sequence'].isin(train_data_sgs)
]

In [9]:
# Pearson correlation of measured activity against each RS3 score column
# (sequence-only first, then sequence + target).
for score_col in ('RS3 Sequence Score (Chen2013 tracr)',
                  'RS3 Sequence (Chen2013 tracr) + Target Score'):
    print(stats.pearsonr(filtered_activity_predictions['sgRNA Activity'],
                         filtered_activity_predictions[score_col]))

(0.11029546957745733, 6.171789576923872e-06)
(0.1313718947162287, 6.99994709929524e-08)


In [10]:
# Bring the annotation columns onto the naming used by the prediction table,
# then inner-join the two on the shared guide/gene keys.
# NOTE(review): 'Ensembl transcript ID' is renamed to 'Transcript Base' but is
# not one of the merge keys; if the annotations file has that column, pandas
# will emit suffixed 'Transcript Base_x'/'Transcript Base_y' columns. Confirm
# whether it was meant to be a join key.
be_annotated_predictions = filtered_activity_predictions.merge(
    hanna_aacc03_annotations.rename(columns={
        'sgRNA sequence': 'sgRNA Sequence',
        'sgRNA context sequence': 'sgRNA Context Sequence',
        'Gene symbol': 'Target Gene Symbol',
        'Ensembl transcript ID': 'Transcript Base',
        'Ensembl gene ID': 'Target Gene ID',
        'sgRNA strand': 'Orientation',
    }),
    how='inner',
    on=[
        'sgRNA Sequence',
        'sgRNA Context Sequence',
        'Target Gene Symbol',
        'Target Gene ID',
        'Orientation',
    ],
)

# Keep rows whose mutation category mentions Nonsense or Splice; rows with a
# missing category are treated as non-matching (na=False).
be_filtered_predictions = (
    be_annotated_predictions
    .loc[lambda df: df['Mutation category'].str.contains('Nonsense|Splice', na=False)]
    .copy()
)

In [11]:
# Same two correlations as before, restricted to the Nonsense/Splice subset.
for score_col in ('RS3 Sequence Score (Chen2013 tracr)',
                  'RS3 Sequence (Chen2013 tracr) + Target Score'):
    print(stats.pearsonr(be_filtered_predictions['sgRNA Activity'],
                         be_filtered_predictions[score_col]))

(0.202923772295795, 0.08079864862957045)
(0.127816070043014, 0.27447394020404725)
