## Goal: What percentage of novel transcripts are population-specific (pop-spec), both overall and within each novelty category?

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import itertools
import os
import matplotlib.pyplot as plt
import yaml
from snakemake.io import expand
import pyranges as pr
from pyfaidx import Fasta
from mizani.formatters import percent_format
from scipy import stats


# Add the parent of the current working directory to the module search path
# (presumably so the `scripts.*` imports that follow can be resolved).
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

from plotnine import *

In [2]:
def my_theme(base_size=11, w=4, h=3):
    """
    Build the plotnine theme shared by the figures in this notebook.

    Starts from `theme_minimal` and overrides it with: white panel and plot
    backgrounds, no grid lines or panel border, black axis lines and ticks,
    centred plot/axis titles, no legend title, and Helvetica for the plot
    title, axis titles, axis text and legend text.

    Parameters:
    - base_size: Base font size. The `axis_title` element is sized
      base_size + 1, legend text base_size - 2, strip text base_size - 1.
    - w: Figure width in inches.
    - h: Figure height in inches.

    Returns:
    - plotnine.theme object
    """
    font = 'Helvetica'
    ink = 'black'
    paper = 'white'

    overrides = dict(
        # Backgrounds: plain white, no outline
        panel_background=element_rect(fill=paper, color=None),
        plot_background=element_rect(fill=paper, color=None),

        # No grid and no frame around the panel
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
        panel_border=element_blank(),

        # Axis lines and ticks stay visible
        axis_line=element_line(color=ink),
        axis_ticks=element_line(color=ink),

        # Centred titles; the y title gets a negative right margin
        plot_title=element_text(hjust=0.5, family=font),
        axis_title_x=element_text(hjust=0.5, family=font),
        axis_title_y=element_text(
            hjust=0.5,
            margin={'t': 0, 'r': -2, 'b': 0, 'l': 0},
            family=font,
        ),

        # Text sizes, all relative to base_size
        legend_title=element_blank(),
        axis_title=element_text(size=base_size + 1, family=font),
        legend_text=element_text(size=base_size - 2, family=font),
        axis_text=element_text(size=base_size, color=ink, family=font),
        strip_text_x=element_text(size=base_size - 1),
        strip_text_y=element_text(size=base_size - 1),

        # Overall figure geometry
        figure_size=(w, h),  # width x height in inches
        plot_margin=0.05,    # shrink the surrounding white space
    )

    return theme_minimal(base_size=base_size) + theme(**overrides)

In [4]:
# Relative prefix pointing at the parent of the notebook's directory.
od = '../'
# Project configuration; `load_config` is not defined in this notebook and
# presumably comes from one of the star imports above.
config = load_config()

def proc_cfg(entry, od):
    """
    Re-root a path string under the prefix `od`.

    Every occurrence of '../../' is removed from `entry`, and the result is
    appended to `od` by plain concatenation (no separator is inserted).

    Parameters:
    - entry: Path string, possibly containing '../../'.
    - od: Prefix to put in front of the cleaned path.

    Returns:
    - `od` followed by `entry` with all '../../' removed.
    """
    return od + entry.replace('../../', '')

In [5]:
# Load the PODER master table (tab-separated).
mt_df = pd.read_csv('../data/05_mastertable/26062025_PODER_mastertable.tsv', sep='\t')
# Optional filter, currently disabled:
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# Names of the per-population columns (second element returned by
# get_population_colors()); fetched once and reused below.
pops = get_population_colors()[1]
# `population_sharing` is taken from the table as-is; presumably it could be
# recomputed from the population columns with:
# mt_df['population_sharing'] = (mt_df[pops]>0).sum(axis=1)

# Population-specific flag: population_sharing is exactly 1 AND
# sample_sharing is greater than 1.
mt_df['pop_spec_t'] = (
    (mt_df.population_sharing == 1)
    & (mt_df.sample_sharing > 1)
)

# For each row, the name of the population column holding the largest value
# (idxmax returns the first such column on ties).
mt_df['det_pop'] = mt_df[pops].idxmax(axis=1)

In [10]:
# Number of distinct isoforms in each structural (novelty) category,
# broadcast back onto every row of that category.
mt_df['n_t_nov'] = (
    mt_df
    .groupby('structural_category')['isoform']
    .transform('nunique')
)

In [11]:
# Display the number of distinct isoforms per structural category.
mt_df.groupby('structural_category')[['isoform']].nunique()

Unnamed: 0_level_0,isoform
structural_category,Unnamed: 1_level_1
Antisense,1002
FSM,114578
Fusion,1066
Genic,114
Intergenic,731
NIC,17425
NNC,20959


In [12]:
# Number of distinct isoforms per (structural category, pop_spec_t) pair,
# broadcast back onto every row of that pair.
mt_df['n_t_pop_spec_nov'] = (
    mt_df
    .groupby(['structural_category', 'pop_spec_t'])['isoform']
    .transform('nunique')
)


In [15]:
# Collapse the per-row counts to one row per distinct combination of
# category, pop_spec_t flag and the two counts.
temp = mt_df.loc[
    :,
    ['structural_category', 'pop_spec_t', 'n_t_nov', 'n_t_pop_spec_nov'],
].drop_duplicates()

In [17]:
# Share (in percent) of each category's isoforms that fall in the row's
# pop_spec_t group.
temp['perc'] = 100 * (temp.n_t_pop_spec_nov / temp.n_t_nov)

In [19]:
# Display only the population-specific rows of the summary.
temp[temp['pop_spec_t']]

Unnamed: 0,structural_category,pop_spec_t,n_t_nov,n_t_pop_spec_nov,perc
14,FSM,True,114578,1568,1.3685
109,NNC,True,20959,443,2.11365
2372,NIC,True,17425,305,1.750359
5080,Intergenic,True,731,29,3.967168
5229,Antisense,True,1002,38,3.792415
9100,Fusion,True,1066,33,3.095685
27356,Genic,True,114,3,2.631579


In [21]:
# Denominator: total number of novel (non-FSM) transcripts.
# Take one n_t_nov value per category regardless of pop_spec_t, so that a
# category with no population-specific transcripts (and therefore no
# pop_spec_t == True row in `temp`) still contributes to the total.
n1 = (
    temp.loc[temp.structural_category != 'FSM', ['structural_category', 'n_t_nov']]
    .drop_duplicates()
    .n_t_nov
    .sum()
)
n1

np.int64(41297)

In [22]:
# Numerator: number of population-specific transcripts summed over all
# non-FSM categories.
n2 = temp.loc[
    temp['pop_spec_t'] & (temp['structural_category'] != 'FSM'),
    'n_t_pop_spec_nov',
].sum()

In [25]:
# Fraction of non-FSM transcripts that are population-specific.
n2 / n1

np.float64(0.02060682374022326)

In [24]:
# Display the number of population-specific non-FSM transcripts.
n2

np.int64(851)