# Example code to apply the modular splicing model

The example test variants are BRCA1 variants taken from ClinVar.

## Splicing delta PSI prediction

In [1]:
from mmsplice.vcf_dataloader import SplicingVCFDataloader
from mmsplice import MMSplice, predict_all_table

from mmsplice.utils import max_varEff

Using TensorFlow backend.


In [2]:
# Input files for the example, given as relative paths under ../tests/data.
fasta = '../tests/data/hg19.nochr.chr17.fa'  # reference sequence (FASTA)
vcf = '../tests/data/test.vcf.gz'            # variants to score (gzipped VCF)
gtf = '../tests/data/test.gtf'               # annotation (GTF)
gtfIntervalTree = '../tests/data/test.pkl'   # pickled exon interval tree

In [3]:
# Dataloader for the delta PSI run. The positional arguments are the pickled
# exon interval tree, the reference FASTA and the VCF, in that order.
# NOTE(review): out_file is the same path as the pickle passed as the first
# argument — confirm that the input pickle is not overwritten.
# NOTE(review): the sample output shown for the first example reports
# intronl_len=50 and intronr_len=10 — confirm that this is what is expected
# with overhang=(100, 100).
dl = SplicingVCFDataloader(
    gtfIntervalTree,
    fasta,
    vcf,
    out_file=gtfIntervalTree,
    split_seq=False,
    overhang=(100, 100),
)

In [4]:
# Peek at the first example using a throwaway dataloader built with the same
# arguments as `dl`. Calling next() on `dl` itself would advance it, so the
# prediction cell that consumes `dl` afterwards would start from the second
# example (assumes the dataloader is a one-shot iterator — TODO confirm).
next(SplicingVCFDataloader(gtfIntervalTree,
                           fasta,
                           vcf,
                           out_file=gtfIntervalTree,
                           split_seq=False, overhang=(100,100)))

{'inputs': {'seq': 'TCCAGGAGAATGAATTGACACTAATCTCTGCTTGTGTTCTCTGTCTCCAGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAG',
  'intronl_len': 50,
  'intronr_len': 10},
 'inputs_mut': {'seq': 'TCCAGGAGAATGAATTGACACTAATCTCTGCTTGTGTTCTCTGTCTCCAGCAATTGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAG',
  'intronl_len': 50,
  'intronr_len': 10},
 'metadata': {'ranges': GenomicRanges(chr='17', start=41197685, end=41197869, id='ENST00000461221', strand='-'),
  'variant': {'CHROM': '17',
   'POS': 41197805,
   'ID': '182075',
   'REF': 'ACATCTGCC',
   'ALT': 'A',
   'STR': "17:41197805:ACATCTGCC:['A']"},
  'ExonInterval': {'isLast': False,
   'isFirst': False,
   'order': 23,
   'name': 'ENSE00003513816',
   'gene_id': 'ENSG00000012048',
   'Exon_Start': 41197695,
   'Exon_End': 41197819,
   'intronl_len': 50,
   'intronr_len': 10,
   'seqid

In [5]:
# Build the MMSplice model with every cut/length setting spelled out explicitly.
# All arguments are passed by keyword, so they are grouped by prefix here.
model = MMSplice(
    # exon_cut_* settings
    exon_cut_l=0,
    exon_cut_r=0,
    # acceptor_* settings
    acceptor_intron_cut=6,
    acceptor_intron_len=50,
    acceptor_exon_len=3,
    # donor_* settings
    donor_intron_cut=6,
    donor_intron_len=13,
    donor_exon_len=5,
)



In [6]:
# Score the examples yielded by `dl` with `model`.
# split_seq=False mirrors the value given to the dataloader above.
predictions = predict_all_table(
    model,
    dl,
    batch_size=1024,
    split_seq=False,
    assembly=True,
)

2it [00:29, 14.97s/it]


In [7]:
# Reduce the prediction table with max_varEff (presumably keeps the strongest
# effect per variant — confirm against mmsplice.utils). The result is displayed
# below with the columns ID, mmsplice_diff and exons.
predictionsMax = max_varEff(predictions)

In [8]:
# Show the five rows with the lowest mmsplice_diff (ascending sort, then head).
predictionsMax.sort_values(by='mmsplice_diff', ascending=True).head(5)

Unnamed: 0,ID,mmsplice_diff,exons
104,17:41201134:TTAC:['T'],-11.431542,17_41201138_41201211:-
691,17:41245766:CATATTGCTTATACTGCTGCTTATAGGTTCAGCT...,-10.751444,17_41245603_41246877:-
1212,17:41267740:TA:['T'],-8.586722,17_41267743_41267796:-
159,17:41203079:CCT:['ATGTTG'],-7.958546,17_41203080_41203134:-
69,17:41199658:AC:['A'],-7.749616,17_41199660_41199720:-


## Predict pathogenicity

In [9]:
# Build a new dataloader for the pathogenicity prediction. The inputs are the
# same as in the delta PSI section, but overhang is (50, 10) here.
# NOTE(review): out_file is the same path as the pickle passed as the first
# argument — confirm that the input pickle is not overwritten.
dl = SplicingVCFDataloader(
    gtfIntervalTree,
    fasta,
    vcf,
    out_file=gtfIntervalTree,
    split_seq=False,
    overhang=(50, 10),
)

In [10]:
# Score the examples yielded by `dl` with `model`, this time with
# pathogenicity=True. split_seq=False mirrors the dataloader setting.
predictions = predict_all_table(
    model,
    dl,
    batch_size=1024,
    split_seq=False,
    assembly=True,
    pathogenicity=True,
)

2it [00:10,  5.25s/it]


In [11]:
# Reduce the pathogenicity prediction table with max_varEff (presumably keeps
# the strongest effect per variant — confirm against mmsplice.utils).
predictionsMax = max_varEff(predictions)

In [12]:
# Show the five rows with the highest mmsplice_diff (descending sort, then head).
predictionsMax.sort_values(by='mmsplice_diff', ascending=False).head(5)

Unnamed: 0,ID,mmsplice_diff,exons
577,17:41245766:CATATTGCTTATACTGCTGCTTATAGGTTCAGCT...,1.0,17_41245601_41246877:-
230,17:41215387:ACCCCTAAAGAGATCATAGA:['TATT'],1.0,17_41215377_41215390:-
712,17:41251814:CTTTTGAGGTTGTATCCGCTGCTTTGTCCTCAGA...,1.0,17_41251792_41251894:-
771,17:41256281:G:['C'],1.0,17_41256206_41256278:-
203,17:41215348:AC:['A'],1.0,17_41215350_41215390:-
