# Example: applying the modular splicing model (MMSplice)

The example test variants are BRCA1 variants taken from ClinVar.

## Splicing delta PSI prediction

In [1]:
from mmsplice.vcf_dataloader import SplicingVCFDataloader
from mmsplice import MMSplice, predict_all_table

from mmsplice.utils import max_varEff

Using TensorFlow backend.


In [2]:
# Input files for this example. All paths are relative, so they are resolved
# against the kernel's current working directory.

# Reference sequence (FASTA) and the variants to score (VCF).
fasta = '../tests/data/hg19.nochr.chr17.fa'
vcf = '../tests/data/test.vcf.gz'

# Gene annotation (GTF). Not referenced by the cells visible in this notebook,
# which use the pickled interval tree below instead.
gtf = '../tests/data/test.gtf'

# Pickled exon interval tree.
# NOTE(review): unpickling executes arbitrary code; only use a pickle from a trusted source.
gtfIntervalTree = '../tests/data/test.pkl'

In [3]:
# Build the variant dataloader from the pickled exon interval tree, the
# reference FASTA and the VCF.
# NOTE(review): out_file is the same path as the input pickle -- confirm this
# is intended and cannot overwrite the file being read.
# NOTE(review): overhang=(100, 100) appears to correspond to the
# intronl_len / intronr_len values of 100 in the sample output -- confirm.
dl = SplicingVCFDataloader(
    gtfIntervalTree,
    fasta,
    vcf,
    out_file=gtfIntervalTree,
    split_seq=False,
    overhang=(100, 100),
)

In [4]:
# Pull one sample from the dataloader to inspect its structure
# (the output shows 'inputs', 'inputs_mut' and 'metadata' keys).
# NOTE(review): if `dl` is a one-shot iterator, the sample fetched here is
# consumed and will not be seen by later cells that iterate over `dl` --
# confirm, or re-create `dl` before running predictions.
next(dl)

{'inputs': {'intronl_len': 100,
  'intronr_len': 100,
  'seq': 'CCTAAGAACTCATACAACCAGGACCCTGGAGTCGATTGATTAGAGCCTAGTCCAGGAGAATGAATTGACACTAATCTCTGCTTGTGTTCTCTGTCTCCAGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTCCTTCT'},
 'inputs_mut': {'intronl_len': 100,
  'intronr_len': 100,
  'seq': 'CCTAAGAACTCATACAACCAGGACCCTGGAGTCGATTGATTAGAGCCTAGTCCAGGAGAATGAATTGACACTAATCTCTGCTTGTGTTCTCTGTCTCCAGCAATTGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTCCTTCT'},
 'metadata': {'ExonInterval': {'Exon_End': 41197819,
   'Exon_Start': 41197695,
   'end': 41197919,
   'gene_id': 'ENSG00000012048',
   'intronl_len': 100,
   'intronr_len': 100,
   'isFirst': False,
   'isLast': False

In [5]:
# Instantiate the MMSplice model with explicit cut / length settings.
# NOTE(review): the exact meaning and units of each argument are defined by
# mmsplice.MMSplice; check its documentation before changing these values.
model = MMSplice(
    # exon settings
    exon_cut_l=0,
    exon_cut_r=0,
    # acceptor-side settings
    acceptor_intron_cut=6,
    acceptor_intron_len=50,
    acceptor_exon_len=3,
    # donor-side settings
    donor_intron_cut=6,
    donor_intron_len=13,
    donor_exon_len=5,
)



In [6]:
# Score the variants supplied by the dataloader with the model.
# NOTE(review): the pathogenicity / splicing_efficiency flags presumably
# produce the mmsplice_pathogenicity / mmsplice_dse columns seen in the
# result table further down -- confirm against the mmsplice documentation.
predictions = predict_all_table(
    model,
    dl,
    batch_size=1024,
    split_seq=False,  # same value as passed to SplicingVCFDataloader
    assembly=True,
    pathogenicity=True,
    splicing_efficiency=True,
)

3it [00:24,  8.79s/it]


In [7]:
# Post-process the prediction table with mmsplice.utils.max_varEff.
# NOTE(review): judging by the name, this presumably keeps the strongest
# predicted effect per variant -- confirm against the helper's implementation.
predictionsMax = max_varEff(predictions)

In [8]:
# Display the first rows after sorting ascending by mmsplice_dlogitPsi,
# i.e. the variants with the most negative predicted values.
(
    predictionsMax
    .sort_values(by=['mmsplice_dlogitPsi'])
    .head()
)

Unnamed: 0,ID,mmsplice_dlogitPsi,exons,mmsplice_pathogenicity,mmsplice_dse
632,17:41242891:GTGGGATACATACTACTGAATGCAAAGGACACCA...,-13.616748,17_41242961_41243049:-,1.0,-15.24226
957,17:41251814:CTTTTGAGGTTGTATCCGCTGCTTTGTCCTCAGA...,-10.040847,17_41251848_41251897:-,1.0,-8.3386
699,17:41245766:CATATTGCTTATACTGCTGCTTATAGGTTCAGCT...,-9.789052,17_41245603_41246877:-,1.0,-6.664236
147,17:41203074:TCTTACCT:['ATGTTG'],-8.18161,17_41203080_41203134:-,1.0,-12.337522
946,17:41251791:C:['CCCAATTCAATGTAGACAGACGTCTTTTGA...,-7.988797,17_41251792_41251894:-,1.0,-6.596324
