# Example code to apply the modular splicing model

The example test variants are BRCA1 variants taken from ClinVar.

## Splicing delta PSI prediction

In [1]:
from mmsplice.vcf_dataloader import SplicingVCFDataloader
from mmsplice import MMSplice, predict_all_table

from mmsplice.utils import max_varEff

Using TensorFlow backend.


In [2]:
# Input files for the example; every path is relative to this notebook's directory.
fasta = '../tests/data/hg19.nochr.chr17.fa'
vcf = '../tests/data/test.vcf.gz'
# Pickled exon interval tree. This is the annotation input handed to the
# dataloader in the cells below.
# NOTE(review): presumably the loader unpickles this file - only point it at trusted pickles.
gtfIntervalTree = '../tests/data/test.pkl'
# GTF annotation path; not referenced by the other cells visible in this notebook.
gtf = '../tests/data/test.gtf'

In [3]:
# Build the variant dataloader from the pickled exon interval tree, the
# reference FASTA and the VCF defined in the configuration cell.
dl = SplicingVCFDataloader(
    gtfIntervalTree,
    fasta,
    vcf,
    overhang=(50,10),
    split_seq=False,
    out_file=gtfIntervalTree,  # same pickle path as the first argument
)

In [4]:
# Preview one record without touching `dl`.
# next() advances whatever iterator it is given, so calling next(dl) here would
# consume the first record of `dl`; if the loader is single-pass, that record
# would then be missing from the predictions computed from `dl` further down.
# A throwaway loader built with the same arguments shows the same first record
# and leaves `dl` untouched.
next(SplicingVCFDataloader(gtfIntervalTree,
                           fasta,
                           vcf,
                           out_file=gtfIntervalTree,
                           split_seq=False, overhang=(50,10)))

{'inputs': {'intronl_len': 100,
  'intronr_len': 0,
  'seq': 'CCTAAGAACTCATACAACCAGGACCCTGGAGTCGATTGATTAGAGCCTAGTCCAGGAGAATGAATTGACACTAATCTCTGCTTGTGTTCTCTGTCTCCAGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTCCTTCTACTGTCCTGGCTACTAAATATTTTATGTACATCAGCCTGAAAAGGACTTCTGGCTATGCAAGGGTCCCTTAAAGATTTTCTGCTTGAAGTCTCCCTTGGAAATCTGCCATGAGCACAAAATTATGGTAATTTTTCACCTGAGAAGATTTTAAAACCATTTAAACGCCACCAATTGAGCAAGATGCTGATTCATTATTTATCAGCCCTATTCTTTCTATTCAGGCTGTTGTTGGCTTAGGGCTGGAAGCACAGAGTGGCTTGGCCTCAAGAGAATAGCTGGTTTCCCTAAGTTTACTTCTCTAAAACCCTGTGTTCACAAAGGCAGAGAGTCAGACCCTTCAATGGAAGGAGAGTGCTTGGGATCGATTATGTGACTTAAAGTCAGAATAGTCCTTGGGCAGTTCTCAAATGTTGGAGTGGAACATTGGGGAGGAAATTCTGAGGCAGGTATTAGAAATGAAAAGGAAACTTGAAACCTGGGCATGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGCAGATCACTGGAGGTCAGGAGTTCGAAACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAGAAATTAGCC

In [5]:
# Instantiate the MMSplice model. The keyword arguments are grouped by region
# for readability; keyword order does not affect the call.
# NOTE(review): the values are presumably cut offsets / lengths in nucleotides -
# confirm against the MMSplice documentation before changing them.
model = MMSplice(
    # exon trimming
    exon_cut_l=0,
    exon_cut_r=0,
    # acceptor side
    acceptor_intron_cut=6,
    acceptor_intron_len=50,
    acceptor_exon_len=3,
    # donor side
    donor_intron_cut=6,
    donor_intron_len=13,
    donor_exon_len=5,
)



In [6]:
# Run the model over everything the dataloader yields and collect the result
# in a single table.
predictions = predict_all_table(
    model,
    dl,
    assembly=True,
    split_seq=False,  # same split_seq value the dataloader was built with
    batch_size=1024,
)

3it [00:22,  8.11s/it]


In [7]:
# Reduce the raw prediction table with max_varEff. Judging by the table shown
# in the next cell, the result has ID, mmsplice_diff and exons columns.
# NOTE(review): presumably one row per variant ID (the largest effect) - confirm
# against mmsplice.utils.max_varEff.
predictionsMax = max_varEff(predictions)

In [8]:
# Show the five rows with the smallest (most negative) mmsplice_diff.
predictionsMax.sort_values(by='mmsplice_diff', ascending=True).head(5)

Unnamed: 0,ID,mmsplice_diff,exons
632,17:41242891:GTGGGATACATACTACTGAATGCAAAGGACACCA...,-13.616748,17_41242961_41243049:-
957,17:41251814:CTTTTGAGGTTGTATCCGCTGCTTTGTCCTCAGA...,-10.040847,17_41251848_41251897:-
699,17:41245766:CATATTGCTTATACTGCTGCTTATAGGTTCAGCT...,-9.789052,17_41245603_41246877:-
147,17:41203074:TCTTACCT:['ATGTTG'],-8.18161,17_41203080_41203134:-
946,17:41251791:C:['CCCAATTCAATGTAGACAGACGTCTTTTGA...,-7.988797,17_41251792_41251897:-


## Predict pathogenicity

In [9]:
# Rebind `dl` to a freshly constructed dataloader for the pathogenicity run.
# NOTE(review): presumably needed because the loader used for the delta PSI
# run has already been iterated - confirm whether the loader is single-pass.
dl = SplicingVCFDataloader(
    gtfIntervalTree,
    fasta,
    vcf,
    overhang=(50,10),
    split_seq=False,
    out_file=gtfIntervalTree,  # same pickle path as the first argument
)

In [10]:
# Same call as the delta PSI run, with pathogenicity=True added.
# This rebinds `predictions`, replacing the delta PSI table from the first section.
predictions = predict_all_table(
    model,
    dl,
    pathogenicity=True,
    assembly=True,
    split_seq=False,  # same split_seq value the dataloader was built with
    batch_size=1024,
)

3it [00:24,  8.16s/it]


In [11]:
# Reduce the pathogenicity prediction table with max_varEff; this rebinds
# `predictionsMax`, replacing the delta PSI summary from the first section.
# NOTE(review): presumably one row per variant ID - confirm against
# mmsplice.utils.max_varEff.
predictionsMax = max_varEff(predictions)

In [12]:
# Show the five rows with the largest mmsplice_diff.
# NOTE(review): the displayed values are all 1.0, which looks like a probability
# rather than a delta - confirm what this column holds when pathogenicity=True.
predictionsMax.sort_values(by='mmsplice_diff', ascending=False).head(5)

Unnamed: 0,ID,mmsplice_diff,exons
585,17:41245766:CATATTGCTTATACTGCTGCTTATAGGTTCAGCT...,1.0,17_41245603_41246877:-
524,17:41242891:GTGGGATACATACTACTGAATGCAAAGGACACCA...,1.0,17_41242961_41243049:-
233,17:41215387:ACCCCTAAAGAGATCATAGA:['TATT'],1.0,17_41215377_41215390:-
720,17:41251814:CTTTTGAGGTTGTATCCGCTGCTTTGTCCTCAGA...,1.0,17_41251848_41251897:-
779,17:41256281:G:['C'],1.0,17_41256206_41256278:-
