## Goal: how does transcript length affect the number and type of transcripts detected?

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import yaml
from snakemake.io import expand
import pyranges as pr
from pyfaidx import Fasta
from mizani.formatters import percent_format
from scipy import stats


p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

from plotnine import *

In [2]:
def my_theme(base_size=11, w=4, h=3):
    """
    Build the plotnine theme shared by the figures in this notebook.

    Starts from `theme_minimal` and overrides it with a white background,
    no grid lines or panel border, black axis lines and ticks, Helvetica
    text, and no legend title.

    Parameters:
    - base_size: Base font size; axis titles use base_size + 1 and legend
      text uses base_size - 2
    - w: Figure width in inches
    - h: Figure height in inches

    Returns:
    - plotnine.theme object (theme_minimal plus the overrides)
    """
    font = 'Helvetica'
    white = dict(fill='white', color=None)
    black = dict(color='black')

    overrides = theme(
        # white panel and figure background
        panel_background=element_rect(**white),
        plot_background=element_rect(**white),

        # no grid, no frame around the panel
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
        panel_border=element_blank(),

        # axis lines and ticks stay visible
        axis_line=element_line(**black),
        axis_ticks=element_line(**black),

        # centered titles
        plot_title=element_text(hjust=0.5, family=font),
        axis_title_x=element_text(hjust=0.5, family=font),
        axis_title_y=element_text(hjust=0.5,
                                  margin={'t':0, 'r':-2, 'b':0, 'l':0},
                                  family=font),

        # text sizes derived from base_size
        legend_title=element_blank(),
        axis_title=element_text(size=base_size + 1, family=font),
        legend_text=element_text(size=base_size-2, family=font),
        axis_text=element_text(size=base_size, color='black', family=font),

        # figure dimensions (inches) and a tight outer margin
        figure_size=(w, h),
        plot_margin=0.05,
    )
    return theme_minimal(base_size=base_size) + overrides

def clean_figure(ax):
    """
    Tidy a matplotlib axes in place: hide the top and right spines and
    rotate the x tick labels by 45 degrees.
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [3]:
# Project configuration; load_config is not defined in this notebook, so it
# comes from one of the `scripts.*` star imports above.
config = load_config()
# Path prefix that proc_cfg prepends to config entries.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a config path: remove every '../../' occurrence from `entry`
    and prepend the prefix `od`.
    """
    return od + entry.replace('../../', '')

In [4]:
# Load the master table (one row per transcript, judging by how it is used below).
mt_df = pd.read_csv(
    '../data/05_mastertable/poder_master_table_fixed_genics.tsv',
    sep='\t',
)
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# Population-specific transcripts: seen in exactly one population but in
# more than one sample.
one_population = mt_df.population_sharing == 1
several_samples = mt_df.sample_sharing > 1
mt_df['pop_spec_t'] = one_population & several_samples

# For each row, the population column holding the largest value.
population_cols = get_population_colors()[1]
mt_df['det_pop'] = mt_df[population_cols].idxmax(axis=1)

KeyboardInterrupt: 

In [None]:
# Peek at transcript ids, gene ids and lengths, ordered by gene id.
preview_cols = ['isoform', 'geneid.v', 'length']
mt_df[preview_cols].sort_values(by='geneid.v').head()

## First, general distributions

In [None]:
# Transcript length KDE, one curve per gene biotype (all lengths).
biotype_len_cols = ['length', 'isoform', 'associated_gene_biotype_sub']
temp = mt_df[biotype_len_cols].drop_duplicates()

init_plot_settings()
ax = sns.displot(
    data=temp,
    x='length',
    hue='associated_gene_biotype_sub',
    kind='kde',
    common_norm=False,  # normalise each biotype's curve on its own
    linewidth=3,
    alpha=0.5,
)
ax.set(xlabel='Transcript length (bp)')

In [None]:
# Same KDE, restricted to transcripts shorter than 20 kb.
biotype_len_cols = ['length', 'isoform', 'associated_gene_biotype_sub']
temp = mt_df[biotype_len_cols].drop_duplicates()
temp = temp[temp['length'] < 20000]

init_plot_settings()
ax = sns.displot(
    data=temp,
    x='length',
    hue='associated_gene_biotype_sub',
    kind='kde',
    common_norm=False,
    linewidth=3,
    alpha=0.5,
)
ax.set(xlabel='Transcript length (bp)')


In [None]:
# Same KDE, restricted to transcripts shorter than 10 kb.
biotype_len_cols = ['length', 'isoform', 'associated_gene_biotype_sub']
temp = mt_df[biotype_len_cols].drop_duplicates()
temp = temp[temp['length'] < 10000]

init_plot_settings()
ax = sns.displot(
    data=temp,
    x='length',
    hue='associated_gene_biotype_sub',
    kind='kde',
    common_norm=False,
    linewidth=3,
    alpha=0.5,
)
ax.set(xlabel='Transcript length (bp)')


In [None]:
# Same KDE, restricted to transcripts shorter than 5 kb.
biotype_len_cols = ['length', 'isoform', 'associated_gene_biotype_sub']
temp = mt_df[biotype_len_cols].drop_duplicates()
temp = temp[temp['length'] < 5000]

init_plot_settings()
ax = sns.displot(
    data=temp,
    x='length',
    hue='associated_gene_biotype_sub',
    kind='kde',
    common_norm=False,
    linewidth=3,
    alpha=0.5,
)
ax.set(xlabel='Transcript length (bp)')


In [None]:
# Transcript length per gene biotype as boxplots (all lengths).
init_plot_settings(aspect='square')
ax = sns.boxplot(
    data=mt_df,
    x='associated_gene_biotype_sub',
    y='length',
)
clean_figure(ax)
ax.set(ylabel='Transcript length (bp)', xlabel='')

In [None]:
# Same boxplot restricted to transcripts shorter than 5 kb.
# Build the subset from mt_df here instead of filtering whatever `temp` an
# earlier cell left behind, so this cell gives the same result regardless of
# which cells ran before it.
temp = mt_df[['length', 'isoform', 'associated_gene_biotype_sub']].drop_duplicates()
temp = temp.loc[temp.length<5000]

init_plot_settings(aspect='square')
ax = sns.boxplot(temp,
                 y='length',
                 x='associated_gene_biotype_sub')
clean_figure(ax)
ax.set(ylabel='Transcript length (bp)', xlabel='')

## Split by known / novel / promoted ISM

In [None]:
uma_mt = pd.read_csv('../supp_tables/02_uma_mt.tsv', sep='\t')

# Promoted transcripts: FSMs in the PODER master table that are not FSMs
# in the UMA master table.
poder_fsm_ids = set(mt_df.loc[mt_df.structural_category=='FSM', 'isoform'])
uma_fsm_ids = set(uma_mt.loc[uma_mt.structural_category=='FSM', 'isoform'])
promoted_fsm_ids = list(poder_fsm_ids - uma_fsm_ids)

# Sanity check before touching mt_df; the expected number comes from the paper.
assert len(promoted_fsm_ids) == 10255, \
    f'expected 10255 promoted FSMs, found {len(promoted_fsm_ids)}'

# Label every transcript FSM / Novel, then carve the promoted FSMs out.
# Building the column from strings directly avoids creating a float (NaN)
# column and then writing strings into it, which newer pandas warns about.
is_fsm = mt_df.structural_category == 'FSM'
mt_df['gene_len_nov'] = np.where(is_fsm, 'FSM', 'Novel')
mt_df.loc[mt_df.isoform.isin(promoted_fsm_ids), 'gene_len_nov'] = 'FSM (Promoted ISM)'

## How long are the transcripts from these novelty designations?

In [None]:
# Raw transcript length for FSM / promoted FSM / novel transcripts.
init_plot_settings(aspect='square')
ax = sns.boxplot(
    data=mt_df,
    x='gene_len_nov',
    y='length',
)
ax.set(ylabel='Transcript length', xlabel='')
clean_figure(ax)

In [None]:
# Same comparison on log10-transformed lengths; the new column is reused
# by the plots and correlations further down.
init_plot_settings(aspect='square')
mt_df['log10_t_len'] = np.log10(mt_df['length'])
ax = sns.boxplot(
    data=mt_df,
    x='gene_len_nov',
    y='log10_t_len',
)
ax.set(ylabel='log10(Transcript length)', xlabel='')
clean_figure(ax)

In [None]:
# plotnine version of the log10 length boxplot, colored by novelty group
c_dict, order = get_novelty_colors(mt_df.gene_len_nov.unique().tolist())
c_dict.update({
    'FSM (Promoted ISM)': '#adc0a4',
    'Novel': '#aca6b0',
})

nov_len_aes = aes(x='gene_len_nov', y='log10_t_len', fill='gene_len_nov')
nov_len_plot = (
    ggplot(mt_df, nov_len_aes)
    + geom_boxplot()
    + scale_fill_manual(values=c_dict)
    + labs(y='log10(Transcript length)', x='')
    + my_theme(w=2.5, h=4)
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + theme(legend_position='none')
)
nov_len_plot

In [None]:
# Pairwise Mann-Whitney U tests on transcript length between novelty groups.
import itertools
nov_groups = mt_df.gene_len_nov.unique().tolist()
for grp_a, grp_b in itertools.combinations(nov_groups, 2):
    len_a = mt_df.loc[mt_df.gene_len_nov == grp_a, 'length'].tolist()
    len_b = mt_df.loc[mt_df.gene_len_nov == grp_b, 'length'].tolist()
    u_stat, p_val = st.mannwhitneyu(len_a, len_b)
    print(f'{grp_a} median: {np.median(len_a)}')
    print(f'{grp_b} median: {np.median(len_b)}')
    print(u_stat)
    print(p_val)
    print()

In [None]:
# log10 length by novelty group, one facet per gene biotype, with group
# sizes printed under the boxes.

# 1. Group sizes per (biotype, novelty group)
facet_keys = ['associated_gene_biotype_sub', 'gene_len_nov']
n_labels = mt_df.groupby(facet_keys).size().reset_index(name='n')

# 2. Human-readable label with thousands separators
n_labels['label'] = 'n = ' + n_labels['n'].apply('{:,}'.format)

# 3. Same y position for every label, just below the global minimum
y_min = mt_df['log10_t_len'].min()
n_labels['y'] = y_min - 0.2  # tweak -0.2 as needed to move lower

n_biotypes = len(mt_df.associated_gene_biotype_sub.unique().tolist())
biotype_len_plot = (
    ggplot(mt_df, aes(x='gene_len_nov', y='log10_t_len'))
    + geom_boxplot()
    + facet_wrap('~associated_gene_biotype_sub',
                 ncol=n_biotypes,  # all facets on a single row
                 scales='free_y')
    + labs(y='log10(Transcript length)', x='')
    + geom_text(
        n_labels,
        aes(x='gene_len_nov', y='y', label='label'),
        inherit_aes=False,
        size=6,  # increase for readability if needed
    )
    + my_theme(w=8, h=4)
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
)
biotype_len_plot

In [None]:
# Pairwise Mann-Whitney U tests on transcript length between novelty groups,
# run separately within each gene biotype.
import itertools
for c in mt_df.associated_gene_biotype_sub.unique().tolist():
    temp = mt_df.loc[mt_df.associated_gene_biotype_sub==c]
    for s in itertools.combinations(mt_df.gene_len_nov.unique().tolist(), 2):
        x = temp.loc[temp.gene_len_nov==s[0], 'length'].tolist()
        y = temp.loc[temp.gene_len_nov==s[1], 'length'].tolist()

        # Not every biotype contains every novelty group; the test is
        # undefined for an empty sample, so report the pair and move on
        # instead of erroring out or printing NaNs.
        if not x or not y:
            print(c)
            print(f'Skipping {s[0]} vs {s[1]}: no transcripts in at least one group')
            print()
            continue

        stat, pval = st.mannwhitneyu(x, y)
        print(c)
        print(f'{s[0]} median: {np.median(x)}')
        print(f'{s[1]} median: {np.median(y)}')
        print(stat)
        print(pval)
        print()

In [None]:
# Same faceted boxplot, limited to protein-coding and lncRNA genes and to
# the Novel vs. FSM comparison.
keep_biotypes = ['Protein Coding', 'lncRNA']
keep_groups = ['Novel', 'FSM']
in_biotype = mt_df.associated_gene_biotype_sub.isin(keep_biotypes)
in_group = mt_df.gene_len_nov.isin(keep_groups)
temp = mt_df.loc[in_biotype & in_group]

# 1. Group sizes per (biotype, novelty group)
facet_keys = ['associated_gene_biotype_sub', 'gene_len_nov']
n_labels = temp.groupby(facet_keys).size().reset_index(name='n')

# 2. Human-readable label with thousands separators
n_labels['label'] = 'n = ' + n_labels['n'].apply('{:,}'.format)

# 3. Same y position for every label, just below the subset's minimum
y_min = temp['log10_t_len'].min()
n_labels['y'] = y_min - 0.2  # tweak -0.2 as needed to move lower

n_biotypes = len(temp.associated_gene_biotype_sub.unique().tolist())
pc_lnc_len_plot = (
    ggplot(temp, aes(x='gene_len_nov', y='log10_t_len'))
    + geom_boxplot()
    + facet_wrap('~associated_gene_biotype_sub',
                 ncol=n_biotypes,  # all facets on a single row
                 scales='free_y')
    + labs(y='log10(Transcript length)', x='')
    + geom_text(
        n_labels,
        aes(x='gene_len_nov', y='y', label='label'),
        inherit_aes=False,
        size=8,  # increase for readability if needed
    )
    + my_theme(w=5, h=3)
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
)
pc_lnc_len_plot

In [None]:
# Mann-Whitney U tests on transcript length between the novelty groups kept
# in `temp`, run separately within each gene biotype.
import itertools
for c in temp.associated_gene_biotype_sub.unique().tolist():
    temp2 = temp.loc[temp.associated_gene_biotype_sub==c]
    for s in itertools.combinations(temp.gene_len_nov.unique().tolist(), 2):
        # Mask temp2 with its own column; masking it with temp's column only
        # worked through implicit index alignment.
        x = temp2.loc[temp2.gene_len_nov==s[0], 'length'].tolist()
        y = temp2.loc[temp2.gene_len_nov==s[1], 'length'].tolist()
        stat, pval = st.mannwhitneyu(x, y)
        print(c)
        print(f'{s[0]} median: {np.median(x)}')
        print(f'{s[1]} median: {np.median(y)}')
        print(stat)
        print(pval)
        print()

## Correlate transcript expression w/ transcript length

In [None]:
# Read the merged TPM matrix and relabel its columns with sample names.
tpm_cfg_entry = config['lr']['kallisto']['quant']['merge_matrix_tpm_tsv']
f = expand(proc_cfg(tpm_cfg_entry, od))[0]

# cell_line_id -> sample, for the merged-run-mode rows of the metadata
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]
sample_d = dict(zip(meta.cell_line_id, meta['sample']))

df = pd.read_csv(f, sep='\t')
# keep only the part of each column name before the first underscore
df.columns = [col if col == 'transcript_id' else col.split('_')[0]
              for col in df.columns]
df = (
    df
    .rename(columns={'transcript_id': 'tid'})
    .rename(columns=sample_d)
    .set_index('tid')
)

In [None]:
# Collapse the matrix to one column: the maximum TPM of each transcript
# across all samples. Written as a single reduction so that re-running the
# cell keeps `max_tpm` instead of dropping it and leaving an empty frame.
df = df.max(axis=1).to_frame(name='max_tpm')
df.head()

In [None]:
# Attach each transcript's max TPM to the master table (left join on the
# transcript id) and add a log10(TPM + 1) version.
mt_df = pd.merge(
    mt_df, df,
    how='left',
    left_on='isoform',
    right_index=True,
)
mt_df['log10_max_tpm'] = np.log10(mt_df['max_tpm'].add(1))

In [None]:
def plot_thing(df, c1, c2, hue, c_dict):
    """
    Joint scatter plot of `c2` against `c1`, colored by `hue`, with one
    linear regression line per hue group drawn on the joint axes.

    For every hue group the slope, p-value and number of points of the
    linear fit are printed, and the slope is shown in the legend.

    Parameters:
    - df: DataFrame holding the columns `c1`, `c2` and `hue`
    - c1: Name of the x column
    - c2: Name of the y column
    - hue: Name of the grouping column
    - c_dict: Mapping from hue value to color, used both as the scatter
      palette and as the base color of the regression lines

    Returns:
    - None; the figure is drawn as a side effect
    """
    grid = sns.jointplot(data=df, x=c1, y=c2,
                         hue=hue, palette=c_dict,
                         joint_kws={'data': df, 's': 40, 'alpha': 0.4})
    ax = grid.ax_joint

    # plot regression lines and annotate them with their slope
    # https://stackoverflow.com/questions/48145924/different-colors-for-points-and-line-in-seaborn-regplot/68135585#68135585
    # https://stackoverflow.com/questions/45902739/seaborn-annotate-the-linear-regression-equation
    for s in df[hue].unique().tolist():
        # Fit on the same rows regplot uses: it drops missing values, while
        # linregress would propagate them and report a NaN slope.
        temp = df.loc[df[hue] == s].dropna(subset=[c1, c2])

        # A missing hue value or a group without a color or with fewer
        # than two complete points cannot be fitted or drawn; skip it.
        if s not in c_dict or len(temp.index) < 2:
            print('Skipping {}: no color or fewer than 2 complete points'.format(s))
            continue

        color = c_dict[s]
        line_color = adjust_lightness(color, 0.5)

        # get coeffs of linear fit
        slope, intercept, r_value, p_value, std_err = stats.linregress(temp[c1], temp[c2])

        print('Slope of {} correlation: {}'.format(s, slope))
        print(f'Pval = {p_value}')
        print(f'n = {len(temp.index)}')

        # first pass draws the fit with its confidence band in the group
        # color; second pass overlays the line alone in the adjusted color
        # and carries the legend label
        sns.regplot(data=temp, x=c1, y=c2,
                    scatter=False, ax=ax, color=color)
        sns.regplot(data=temp, x=c1, y=c2,
                    scatter=False, ax=ax, color=color, ci=0,
                    line_kws={'color': line_color,
                              'linestyle': '-',
                              'label': "m={0:.1f}".format(slope)})

    ax.legend(title='')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    _ = ax.set(xlabel='log10(Transcript length)', ylabel='Log10(Max. transcript TPM+1)')

In [None]:
# Transcript length vs. max expression, one regression per gene biotype.
c_dict = {
    'Protein Coding': 'k',
    'Fusion Gene': 'b',
    'lncRNA': 'r',
    'Novel Gene': 'g',
}
plot_thing(mt_df,
           c1='log10_t_len',
           c2='log10_max_tpm',
           hue='associated_gene_biotype_sub',
           c_dict=c_dict)

In [None]:
# Transcript length vs. max expression, one regression per structural category.
struct_cats = mt_df.structural_category.unique().tolist()
c_dict, order = get_novelty_colors(struct_cats)
plot_thing(mt_df,
           c1='log10_t_len',
           c2='log10_max_tpm',
           hue='structural_category',
           c_dict=c_dict)


In [None]:
# Collect the regression slope of expression on length for every
# structural category, in the order the categories appear in mt_df.
hue = 'structural_category'
c1 = 'log10_t_len'
c2 = 'log10_max_tpm'

slopes = []
for s in mt_df[hue].unique().tolist():
    grp = mt_df.loc[mt_df[hue] == s]
    fit = stats.linregress(grp[c1], grp[c2])
    slopes.append(fit.slope)

In [None]:
# Bar plot of the per-category slopes.
slope_records = {
    'slope': slopes,
    hue: mt_df[hue].unique().tolist(),
}
temp = pd.DataFrame(slope_records)

slope_plot = (
    ggplot(temp, aes(x='structural_category', y='slope', fill='structural_category'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4)
    + labs(y='Slope between transcript length\nand transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
slope_plot

In [None]:
# Pearson and Spearman correlation between length and max expression for
# every structural category, as an alternative to the slopes.
hue = 'structural_category'
c1 = 'log10_t_len'
c2 = 'log10_max_tpm'

pearsons = []
spearmans = []
for s in mt_df[hue].unique().tolist():
    grp = mt_df.loc[mt_df[hue] == s]
    lengths = grp[c1].tolist()
    expression = grp[c2].tolist()

    r, _r_p = st.pearsonr(lengths, expression)
    rho, _rho_p = st.spearmanr(lengths, expression)
    pearsons += [r]
    spearmans += [rho]

In [None]:
# Bar plot of the per-category Pearson correlations.
pearson_records = {
    'pearson': pearsons,
    hue: mt_df[hue].unique().tolist(),
}
temp = pd.DataFrame(pearson_records)

pearson_bar = (
    ggplot(temp, aes(x='structural_category', y='pearson', fill='structural_category'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4)
    + labs(y='Pearson R correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
pearson_bar

In [None]:
# Bar plot of the per-category Spearman correlations.
spearman_records = {
    'spearman': spearmans,
    hue: mt_df[hue].unique().tolist(),
}
temp = pd.DataFrame(spearman_records)

spearman_bar = (
    ggplot(temp, aes(x='structural_category', y='spearman', fill='structural_category'))
    + geom_bar(stat='identity')
    + scale_fill_manual(values=c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4)
    + labs(y='Spearman Rho correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
spearman_bar

## What if we use all samples' TPMs? 

In [None]:
# Re-read the full TPM matrix (the earlier copy was collapsed to max_tpm)
# and relabel its columns with sample names.
tpm_cfg_entry = config['lr']['kallisto']['quant']['merge_matrix_tpm_tsv']
f = expand(proc_cfg(tpm_cfg_entry, od))[0]

# cell_line_id -> sample, for the merged-run-mode rows of the metadata
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]
sample_d = dict(zip(meta.cell_line_id, meta['sample']))

df = pd.read_csv(f, sep='\t')
# keep only the part of each column name before the first underscore
df.columns = [col if col == 'transcript_id' else col.split('_')[0]
              for col in df.columns]
df = (
    df
    .rename(columns={'transcript_id': 'tid'})
    .rename(columns=sample_d)
    .set_index('tid')
)

In [None]:
# Column labels of the TPM matrix, i.e. one entry per sample.
samples = list(df.columns)

In [None]:
# mt_df already has one column per sample (detection); replace those with
# the per-sample TPM columns by dropping them and left-joining the matrix.
mt_df = (
    mt_df
    .drop(columns=samples)
    .merge(df, how='left', left_on='isoform', right_index=True)
)

In [None]:

# log10-transform every sample's TPM into a parallel '<sample>_log10_tpm'
# column; TPM == 0 gives -inf, which is turned into NaN so that
# unexpressed transcripts can be filtered out later.
log_tpm_cols = ['{}_log10_tpm'.format(sample) for sample in samples]
log_tpm = np.log10(mt_df[samples])
mt_df[log_tpm_cols] = log_tpm
mt_df[log_tpm_cols] = mt_df[log_tpm_cols].replace(-np.inf, np.nan)  # or .where(df['tpm'] > 0)


In [None]:
# Length/expression correlation per sample and structural category,
# computed on the transcripts expressed in that sample (non-null log TPM).
hue = 'structural_category'
c1 = 'log10_t_len'

pearsons = []
spearmans = []
samples = []
novs = []

for c in log_tpm_cols:
    sample_id = c.split('_')[0]
    for s in mt_df[hue].unique().tolist():
        temp = mt_df.loc[mt_df[hue] == s]
        expressed = temp[c].notnull()

        x = temp.loc[expressed, c1].tolist()
        y = temp.loc[expressed, c].tolist()
        r, _r_p = st.pearsonr(x, y)
        rho, _rho_p = st.spearmanr(x, y)

        pearsons += [r]
        spearmans += [rho]
        samples += [sample_id]
        novs += [s]

In [None]:
# One row per (sample, structural category) with both correlations.
corr_records = {
    'pearson': pearsons,
    'spearman': spearmans,
    'sample': samples,
    hue: novs,
}
temp = pd.DataFrame(corr_records)

In [None]:
# Population colors, plus a population label taken from the first three
# characters of each sample name.
p_c_dict, p_order = get_population_colors()
temp['population'] = temp['sample'].str[0:3]

In [None]:
# Distribution of per-sample Pearson correlations for each structural
# category; individual samples are overlaid and colored by population.
category_aes = aes(x='structural_category', y='pearson', fill='structural_category')
population_aes = aes(fill='population', color='population')

pearson_violin = (
    ggplot(temp, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4, w=5)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Pearson R correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
pearson_violin

In [None]:
# Distribution of per-sample Spearman correlations for each structural
# category; individual samples are overlaid and colored by population.
category_aes = aes(x='structural_category', y='spearman', fill='structural_category')
population_aes = aes(fill='population', color='population')

spearman_violin = (
    ggplot(temp, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4, w=5)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Spearman Rho correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
spearman_violin

In [None]:
# Pearson violins restricted to the categories of interest: FSM, NIC, NNC.
main_cats = ['FSM', 'NIC', 'NNC']
temp2 = temp.loc[temp.structural_category.isin(main_cats)]
c_dict, order = get_novelty_colors(temp2.structural_category.unique().tolist())

category_aes = aes(x='structural_category', y='pearson', fill='structural_category')
population_aes = aes(fill='population', color='population')

pearson_violin_main = (
    ggplot(temp2, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4, w=3)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Pearson R correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
pearson_violin_main

In [None]:
# Spearman violins restricted to the categories of interest: FSM, NIC, NNC.
main_cats = ['FSM', 'NIC', 'NNC']
temp2 = temp.loc[temp.structural_category.isin(main_cats)]
c_dict, order = get_novelty_colors(temp2.structural_category.unique().tolist())

category_aes = aes(x='structural_category', y='spearman', fill='structural_category')
population_aes = aes(fill='population', color='population')

spearman_violin_main = (
    ggplot(temp2, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4, w=3)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Spearman Rho correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
spearman_violin_main

In [None]:
# Pairwise Mann-Whitney U tests on the per-sample Pearson correlations
# between the structural categories kept in temp2.
import itertools
cats_present = temp2.structural_category.unique().tolist()
for cat_a, cat_b in itertools.combinations(cats_present, 2):
    vals_a = temp2.loc[temp2.structural_category == cat_a, 'pearson'].tolist()
    vals_b = temp2.loc[temp2.structural_category == cat_b, 'pearson'].tolist()
    u_stat, p_val = st.mannwhitneyu(vals_a, vals_b)
    print(f'{cat_a} median: {np.median(vals_a)}')
    print(f'{cat_b} median: {np.median(vals_b)}')
    print(u_stat)
    print(p_val)
    print()

In [None]:
# Pairwise Mann-Whitney U tests on the per-sample Spearman correlations
# between the structural categories kept in temp2.
import itertools
cats_present = temp2.structural_category.unique().tolist()
for cat_a, cat_b in itertools.combinations(cats_present, 2):
    vals_a = temp2.loc[temp2.structural_category == cat_a, 'spearman'].tolist()
    vals_b = temp2.loc[temp2.structural_category == cat_b, 'spearman'].tolist()
    u_stat, p_val = st.mannwhitneyu(vals_a, vals_b)
    print(f'{cat_a} median: {np.median(vals_a)}')
    print(f'{cat_b} median: {np.median(vals_b)}')
    print(u_stat)
    print(p_val)
    print()

In [None]:
# so all of these are significantly different

## Transcript length distributions by novelty

In [None]:
# Quick look at the first rows of mt_df as it stands at this point in the notebook.
mt_df.head()

In [None]:
# Transcript length KDE, one curve per novelty group (FSM / promoted / novel).
novelty_len_cols = ['length', 'isoform', 'gene_len_nov']
temp = mt_df[novelty_len_cols].drop_duplicates()

init_plot_settings()
ax = sns.displot(
    data=temp,
    x='length',
    hue='gene_len_nov',
    kind='kde',
    common_norm=False,  # normalise each group's curve on its own
    linewidth=3,
    alpha=0.5,
)
ax.set(xlabel='Transcript length (bp)')

## Transcript length to expression correlation by category (including promoted ISM)

In [None]:
# Add a structural category column in which promoted ISMs are split out of
# the FSMs under their own label.
promoted_label = 'FSM (Promoted ISM)'
mt_df['structural_category_2'] = mt_df.structural_category
mt_df.loc[mt_df.gene_len_nov==promoted_label, 'structural_category_2'] = promoted_label

c_dict, order = get_novelty_colors(mt_df.structural_category.unique().tolist())
# The color key has to be the exact category label used in the data;
# a differently spelled key leaves the promoted category without a fill
# color in scale_fill_manual.
c_dict[promoted_label] = mute_color(c_dict['FSM'], factor=1.4)
# Insert the new category right after the first entry of the plotting order
# (presumably 'FSM' -- confirm against get_novelty_colors).
order = [order[0]] + [promoted_label] + order[1:]
order

In [None]:
# Length/expression correlation per sample and category, this time with
# promoted ISMs as their own category; only transcripts expressed in the
# sample (non-null log TPM) are used.
hue = 'structural_category_2'
c1 = 'log10_t_len'

pearsons = []
spearmans = []
samples = []
novs = []

for c in log_tpm_cols:
    sample_id = c.split('_')[0]
    for s in mt_df[hue].unique().tolist():
        temp = mt_df.loc[mt_df[hue] == s]
        expressed = temp[c].notnull()

        x = temp.loc[expressed, c1].tolist()
        y = temp.loc[expressed, c].tolist()
        r, _r_p = st.pearsonr(x, y)
        rho, _rho_p = st.spearmanr(x, y)

        pearsons += [r]
        spearmans += [rho]
        samples += [sample_id]
        novs += [s]

In [None]:
# One row per (sample, category) with both correlations.
corr_records = {
    'pearson': pearsons,
    'spearman': spearmans,
    'sample': samples,
    hue: novs,
}
temp = pd.DataFrame(corr_records)

In [None]:
# Population colors, plus a population label taken from the first three
# characters of each sample name.
p_c_dict, p_order = get_population_colors()
temp['population'] = temp['sample'].str[0:3]

In [None]:
# Per-sample Pearson correlations by category (promoted ISMs separate);
# individual samples are overlaid and colored by population.
category_aes = aes(x='structural_category_2', y='pearson', fill='structural_category_2')
population_aes = aes(fill='population', color='population')

pearson_violin_cat2 = (
    ggplot(temp, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=5, w=5)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Pearson R correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
pearson_violin_cat2

In [None]:
# Per-sample Spearman correlations by category (promoted ISMs separate);
# individual samples are overlaid and colored by population.
category_aes = aes(x='structural_category_2', y='spearman', fill='structural_category_2')
population_aes = aes(fill='population', color='population')

spearman_violin_cat2 = (
    ggplot(temp, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=4, w=5)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Spearman Rho correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
spearman_violin_cat2

In [None]:
# Pearson violins restricted to the categories of interest, with the
# color dict and plotting order passed through rm_color_cats for them.
cats = ['FSM', 'FSM (Promoted ISM)', 'NIC', 'NNC']
temp2 = temp.loc[temp.structural_category_2.isin(cats)]
c_dict, order = rm_color_cats(c_dict, order, cats)

category_aes = aes(x='structural_category_2', y='pearson', fill='structural_category_2')
population_aes = aes(fill='population', color='population')

pearson_violin_cat2_main = (
    ggplot(temp2, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=5, w=3)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Pearson R correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
pearson_violin_cat2_main

In [None]:
# Spearman violins restricted to the categories of interest, with the
# color dict and plotting order passed through rm_color_cats for them.
cats = ['FSM', 'FSM (Promoted ISM)', 'NIC', 'NNC']
temp2 = temp.loc[temp.structural_category_2.isin(cats)]
c_dict, order = rm_color_cats(c_dict, order, cats)

category_aes = aes(x='structural_category_2', y='spearman', fill='structural_category_2')
population_aes = aes(fill='population', color='population')

spearman_violin_cat2_main = (
    ggplot(temp2, category_aes)
    + geom_violin(alpha=0.7)
    + geom_boxplot(outlier_size=0, width=0.1)
    + geom_jitter(population_aes, size=1.5, alpha=0.5, width=0.25)
    + scale_color_manual(values=p_c_dict)
    + scale_fill_manual(values=c_dict | p_c_dict)
    + scale_x_discrete(limits=order)
    + my_theme(h=5, w=3)
    + theme(legend_box='horizontal')
    + guides(fill="none")
    + labs(y='Spearman Rho correlation between\ntranscript length and transcript expression', x='')
    + theme(axis_text_x=element_text(rotation=90))
)
spearman_violin_cat2_main

In [None]:
# Pairwise Mann-Whitney U tests on the per-sample Pearson correlations
# between the categories kept in temp2 (promoted ISMs separate).
import itertools
cats_present = temp2.structural_category_2.unique().tolist()
for cat_a, cat_b in itertools.combinations(cats_present, 2):
    vals_a = temp2.loc[temp2.structural_category_2 == cat_a, 'pearson'].tolist()
    vals_b = temp2.loc[temp2.structural_category_2 == cat_b, 'pearson'].tolist()
    u_stat, p_val = st.mannwhitneyu(vals_a, vals_b)
    print(f'{cat_a} median: {np.median(vals_a)}')
    print(f'{cat_b} median: {np.median(vals_b)}')
    print(u_stat)
    print(p_val)
    print()

In [None]:
# Pairwise Mann-Whitney U tests on the per-sample Spearman correlations
# between the categories kept in temp2 (promoted ISMs separate).
import itertools
cats_present = temp2.structural_category_2.unique().tolist()
for cat_a, cat_b in itertools.combinations(cats_present, 2):
    vals_a = temp2.loc[temp2.structural_category_2 == cat_a, 'spearman'].tolist()
    vals_b = temp2.loc[temp2.structural_category_2 == cat_b, 'spearman'].tolist()
    u_stat, p_val = st.mannwhitneyu(vals_a, vals_b)
    print(f'{cat_a} median: {np.median(vals_a)}')
    print(f'{cat_b} median: {np.median(vals_b)}')
    print(u_stat)
    print(p_val)
    print()