## Goal -- how are our precision, sensitivity, and accuracy impacted when applying the PODER filters?

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import yaml
from snakemake.io import expand
import pyranges as pr
from pyfaidx import Fasta
from mizani.formatters import percent_format
from scipy import stats


p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

from plotnine import *

In [2]:
def my_theme(base_size=11, w=4, h=3):
    """
    Build the notebook's plotnine theme on top of `theme_minimal`.

    Overrides applied:
    - White panel and plot backgrounds
    - Grid lines and panel border removed
    - Black axis lines and ticks
    - Helvetica for all text elements; legend title removed

    Parameters:
    - base_size: Base font size (axis_title uses base_size + 1,
      legend_text uses base_size - 2)
    - w: Figure width in inches
    - h: Figure height in inches

    Returns:
    - plotnine.theme object
    """
    font = 'Helvetica'

    def white_rect():
        # plain white fill with no outline
        return element_rect(fill='white', color=None)

    def black_line():
        return element_line(color='black')

    def centered_text(**extra):
        # horizontally centered Helvetica text
        return element_text(hjust=0.5, family=font, **extra)

    # keyword order is kept the same as the settings were originally listed
    overrides = dict(
        # White background
        panel_background=white_rect(),
        plot_background=white_rect(),

        # Remove grid lines
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
        panel_border=element_blank(),

        # Keep axis lines & ticks (don't blank them)
        axis_line=black_line(),
        axis_ticks=black_line(),

        # Centered titles
        plot_title=centered_text(),
        axis_title_x=centered_text(),
        axis_title_y=centered_text(margin={'t':0, 'r':-2, 'b':0, 'l':0}),

        # Styling text
        legend_title=element_blank(),
        axis_title=element_text(size=base_size + 1, family=font),
        legend_text=element_text(size=base_size-2, family=font),
        axis_text=element_text(size=base_size, color='black', family=font),

        # Plot dimensions (width x height in inches) and surrounding white space
        figure_size=(w, h),
        plot_margin=0.05,
    )

    return theme_minimal(base_size=base_size) + theme(**overrides)

def clean_figure(ax):
    """
    Tidy an axes object in place.

    Hides the 'right' and 'top' spines and passes a 45 degree rotation
    to the x-axis tick parameters.

    Parameters:
    - ax: object exposing `spines` and `tick_params`
      (presumably a matplotlib Axes -- confirm against callers)
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [4]:
# project configuration, output-directory prefix and sample metadata
config = load_config()
od = '../'
meta_df = load_meta()

# composite sample label: <lab_number_sample>_<lab_sampleid>_<cell_line_id>
meta_df['lab_sample'] = (
    meta_df['lab_number_sample'].astype(str)
    + '_' + meta_df['lab_sampleid']
    + '_' + meta_df['cell_line_id']
)

def proc_cfg(entry, od):
    """
    Re-root a config path.

    Removes every occurrence of '../../' from `entry` and prepends `od`.

    Parameters:
    - entry: Path string taken from the config
    - od: Prefix string to put in front of the cleaned path

    Returns:
    - str, `od` followed by the cleaned `entry`
    """
    return od + entry.replace('../../', '')

# NOTE(review): init_plot_settings is not defined in this notebook; it
# presumably comes from one of the scripts.* star-imports and sets global
# plotting defaults -- confirm
init_plot_settings()

In [5]:
## SQANTI reads stuff
# resolve the class-summary path from the config (annot_completeness='C')
# and read it as a tab-separated table
f = proc_cfg(expand(config['lr']['qc_sirvs']['sqanti_reads']['class_summary'],
                    annot_completeness='C')[0], od)
df = pd.read_csv(f, sep='\t')

# rename structural categories
# NOTE: any category that is not a key of this map becomes NaN after .map()
m = {'antisense': 'Antisense',
     'full-splice_match': 'FSM',
     'genic': 'Genic',
     'incomplete-splice_match': 'ISM',
     'intergenic': 'Intergenic',
     'novel_in_catalog': 'NIC',
     'novel_not_in_catalog': 'NNC'}
df['structural_category'] = df.structural_category.map(m)

# add antisense gene ids as assc_gene_2
# (for antisense rows: the second '_'-delimited field of associated_gene)
df['assc_gene_2'] = df.associated_gene
inds = df.loc[df.structural_category=='Antisense'].index
# .str[1] rather than expand=True followed by [1]: the expanded frame has no
# column 1 when there are no antisense rows (or none contain '_'), which
# would raise a KeyError
df.loc[inds, 'assc_gene_2'] = df.loc[inds, 'assc_gene_2'].str.split('_').str[1]

# add relevant metadata
df = df.merge(meta_df[['lab_sample', 'population']],
              how='left', on='lab_sample')

# add # mapped reads (rows per lab_sample)
df['n_mapped_reads'] = df.groupby('lab_sample')['isoform'].transform('count')

# add spike type
# start from an object-dtype NaN column: writing strings into part of a
# float64 all-NaN column is an incompatible-dtype setitem (deprecated since
# pandas 2.1)
df['spike_type'] = pd.Series(np.nan, index=df.index, dtype=object)
df.loc[df.chrom.str.contains('ERCC'), 'spike_type'] = 'ERCC'
df.loc[df.chrom.str.contains('SIRV'), 'spike_type'] = 'SIRV'

# flag reads on SIRV1-SIRV7; all other rows are left as NaN in spike_type_2
spliced_sirvs = ['SIRV1', 'SIRV2', 'SIRV3', 'SIRV4', 'SIRV5', 'SIRV6', 'SIRV7']
df.loc[df.chrom.isin(spliced_sirvs), 'spike_type_2'] = 'spliced_sirv'

# add # spliced sirv reads
# (computed on the spliced-sirv subset only; index alignment leaves NaN on
# every other row)
df['n_spliced_sirv_reads'] = df.loc[df.spike_type_2=='spliced_sirv'].groupby('lab_sample')['isoform'].transform('count')

# splicing novelty: FSM and ISM are 'Known', everything else 'Novel'
df['splicing_novelty'] = 'Novel'
df.loc[df.structural_category.isin(['FSM', 'ISM']), 'splicing_novelty'] = 'Known'

# overall known vs. novel: only FSM is 'Known'
df['overall_nov'] = 'Novel'
df.loc[df.structural_category == 'FSM', 'overall_nov'] = 'Known'

## Ground truth sirv / ercc gtf
gtf_df = pr.read_gtf('../data/qc_sirvs/SIRV_ERCC_longSIRV_multi-fasta_20210507.gtf').df

# number of distinct transcript ids on the spliced SIRV chromosomes
temp = gtf_df.loc[gtf_df.Chromosome.isin(spliced_sirvs)]
n_spliced_sirvs_tot = len(temp.transcript_id.unique())

# there are 69 spliced sirvs as ground truth

# get the transcript length of each sirv transcript
# (sum of End - Start over the rows sharing a transcript_id)
gtf_df['exon_len'] = gtf_df['End'] - gtf_df['Start']
gtf_df['transcript_len'] = gtf_df.groupby('transcript_id')['exon_len'].transform('sum')

# per-gene mean / median of the row-level transcript_len values
gtf_df['mean_transcript_len'] = gtf_df.groupby('gene_id')['transcript_len'].transform('mean')
gtf_df['med_transcript_len'] = gtf_df.groupby('gene_id')['transcript_len'].transform('median')

# also get number of exons
gtf_df['n_exons'] = gtf_df.groupby('transcript_id')['exon_assignment'].transform('count')
gtf_df['mean_n_exons'] = gtf_df.groupby('gene_id')['n_exons'].transform('mean')
gtf_df['med_n_exons'] = gtf_df.groupby('gene_id')['n_exons'].transform('median')

# also get number of transcripts
gtf_df['n_transcripts'] = gtf_df.groupby('gene_id')['transcript_id'].transform('nunique')

# get # monoexonic transcripts
# count distinct transcript ids per (gene_id, n_exons) and keep n_exons == 1
temp = gtf_df[['gene_id', 'transcript_id', 'n_exons']].groupby(['gene_id', 'n_exons']).nunique().reset_index()
# chained, non-inplace drop/rename: the inplace versions operated on a
# .loc-filtered slice, which can trigger SettingWithCopyWarning
temp = (temp.loc[temp.n_exons==1]
            .drop('n_exons', axis=1)
            .rename({'transcript_id': 'n_monoexonic_transcripts'}, axis=1))
# left merge: genes without a monoexonic transcript get NaN in
# n_monoexonic_transcripts
gtf_df = gtf_df.merge(temp,
                      how='left',
                      on='gene_id')

# sort the long sirvs at least by length
sirv_order = ['1', '2', '3', '4', '5', '6', '7',
              '4001', '4002', '4003',
              '6001', '6002', '6003',
              '8001', '8002', '8003',
              '10001', '10002', '10003',
              '12001', '12002', '12003']
sirv_order = [f'SIRV{s}' for s in sirv_order]



## Apply filters one by one and create a label to say whether the transcript passed each filter 
* FSM? (pass_fsm_filter)
* Monoexonic? (pass_monoexonic_filter)
* Reproducible for novel transcripts? (pass_reproducibility_filter)
* Promoted ISM? (promoted_ism_filter)
* ISM? (pass_ism_filter)

In [6]:
# first, limit to spliced sirvs
# .copy() gives an independent frame, so the filter columns assigned to temp
# afterwards are not written to a slice of df (avoids SettingWithCopyWarning)
temp = df.loc[df.spike_type_2=='spliced_sirv'].copy()

In [7]:
# fsm filter: passes only when the read is a full-splice match
temp['pass_fsm_filter'] = False
temp.loc[temp.structural_category=='FSM', 'pass_fsm_filter'] = True

# monoexonic filter: single-exon reads fail
temp['pass_monoexonic_filter'] = True
temp.loc[temp.exons==1, 'pass_monoexonic_filter'] = False

# reproducibility filter -- only applies to potential novel transcripts
# we would consider; ie FSM are immune; monoexonic and ISM are already out
temp['pass_reproducibility_filter'] = False
# number of distinct samples in which each junction hash is observed
temp['sample_sharing'] = temp.groupby('jxnHash')['lab_sample'].transform('nunique')
# candidate novel reads: not FSM, not ISM, not monoexonic
# (fixes the misspelled `structural_cateogry` column name and the stray `]`
# that closed the .loc indexer before the column label)
is_candidate_novel = (
    (temp.structural_category != 'FSM')
    & (temp.structural_category != 'ISM')
    & (temp.exons != 1)
)
# a candidate passes when its junction hash is seen in at least 2 samples
temp.loc[is_candidate_novel & (temp.sample_sharing >= 2),
         'pass_reproducibility_filter'] = True

Unnamed: 0,isoform,chrom,strand,length,exons,structural_category,associated_gene,associated_transcript,ref_length,ref_exons,...,jxnHash,lab_sample,assc_gene_2,population,n_mapped_reads,spike_type,spike_type_2,n_spliced_sirv_reads,splicing_novelty,overall_nov
3904,0a5f1630-d746-49b6-abba-c6adfa5295d7:0,SIRV1,-,1362,6,FSM,SIRV1,SIRV103,1363.0,6.0,...,277f49c4a3633bf1591c1d77d02bc4c42e246ca47b54e8...,39_IN5_HG04217,SIRV1,ITU,6922,SIRV,spliced_sirv,2952.0,Known,Known
3907,0c85d074-a84a-4e9f-af13-098effd9fbbe:0,SIRV1,+,462,3,FSM,SIRV1,SIRV109,464.0,3.0,...,7736df0b7910af28a57a785c0ef320241b6b369bbe5164...,39_IN5_HG04217,SIRV1,ITU,6922,SIRV,spliced_sirv,2952.0,Known,Known
3908,0c8473b2-7937-4b90-be63-201092c8eba9:0,SIRV1,-,1359,6,FSM,SIRV1,SIRV103,1363.0,6.0,...,277f49c4a3633bf1591c1d77d02bc4c42e246ca47b54e8...,39_IN5_HG04217,SIRV1,ITU,6922,SIRV,spliced_sirv,2952.0,Known,Known
3909,0dd9fbea-d401-4926-84f2-27527783e934:0,SIRV1,-,667,5,FSM,SIRV1,SIRV105,670.0,5.0,...,3a1b1eddd311a31572338f49a0d58dcac043e8f71f1f11...,39_IN5_HG04217,SIRV1,ITU,6922,SIRV,spliced_sirv,2952.0,Known,Known
3912,1b0f17ac-35a9-445b-aed7-0d6dde421d3f:0,SIRV1,+,463,3,FSM,SIRV1,SIRV109,464.0,3.0,...,7736df0b7910af28a57a785c0ef320241b6b369bbe5164...,39_IN5_HG04217,SIRV1,ITU,6922,SIRV,spliced_sirv,2952.0,Known,Known
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427772,f4e4f36d-7d20-43fb-a440-640c62efc29a:0,SIRV7,-,412,3,FSM,SIRV7,SIRV704,428.0,3.0,...,7a0922befdaba1e23a5e78062a7794f5d5c5d892273509...,1_PY1_GM10492,SIRV7,MPC,6914,SIRV,spliced_sirv,3131.0,Known,Known
427774,f2195ba7-81d3-41c4-83a9-f9f0b54b8949:0,SIRV7,-,423,3,FSM,SIRV7,SIRV704,428.0,3.0,...,7a0922befdaba1e23a5e78062a7794f5d5c5d892273509...,1_PY1_GM10492,SIRV7,MPC,6914,SIRV,spliced_sirv,3131.0,Known,Known
427775,f800451c-e3d9-4e9c-bc1d-9a5d3186692f:0_2,SIRV7,-,426,3,FSM,SIRV7,SIRV704,428.0,3.0,...,7a0922befdaba1e23a5e78062a7794f5d5c5d892273509...,1_PY1_GM10492,SIRV7,MPC,6914,SIRV,spliced_sirv,3131.0,Known,Known
427776,feb40ebf-8075-4884-9c5e-b594e56d9f48:0,SIRV7,-,412,3,FSM,SIRV7,SIRV704,428.0,3.0,...,7a0922befdaba1e23a5e78062a7794f5d5c5d892273509...,1_PY1_GM10492,SIRV7,MPC,6914,SIRV,spliced_sirv,3131.0,Known,Known
