In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
ab = '../'+expand(config['data']['ab'], species='human')[0]
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
read_annot = '../'+expand(config['data']['read_annot'], species='human')[0]
t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='human')[0]
lib_meta = '../'+expand(config['data']['meta'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
cerb_t_metadata = '../'+expand(config['data']['t_info'], species='human')[0]
major_isos = '../'+expand(config['data']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = '../'+expand(config['data']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]
pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]
ref_t_metadata = '../'+expand(config['ref']['t_info'], species='human')[0]
ref_g_metadata = '../'+expand(config['ref']['g_info'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = '../'+expand(config['data']['meta'], species='mouse')[0]

In [4]:
# first make tables for the different splice junctions and splice sites; determine their novelty
ca = cerberus.read(cerberus_h5)

In [5]:
ref_sources = ['v29', 'v40']

In [31]:
def get_source_table(df):
    """
    Get a melted form table for each entry in a tss, ic, or tes table
    for each form of support for each triplet feature.
    
    Parameters:
        df (pandas DataFrame): DataFrame of tsss, ics, or tess
    
    Returns:
        df (pandas DataFrame): Long-form DataFrame of support for each tss, ic, or tes
    """
    keep_cols = ['Name', 'source']
    df = ic[keep_cols].copy(deep=True)
    df['list_source'] = df.source.str.split(',')
    df = df.explode('list_source')
    df.drop('source', axis=1, inplace=True)
    
    return df

# chatgpt wrote this for me thanx chatgpt
def sequential_pairs(x):
    """
    Get sequential pairs of tuples in list.
    Example: [1,2,3,4] -> [(1,2),(3,4)]
    """
    p = []
    for i in range(0, len(x) - 1, 2):
        p.append((x[i], x[i + 1]))
    return p

def explode_ic(ic):
    """
    Explode an ic df to long form with splice junction entries
    """
    # remove the monoexonic entries
    ic = ic.loc[~(ic.Coordinates == '-')]
    
    # explode into series of ss coords
    keep_cols = ['Chromosome', 'Coordinates',
                 'Strand', 'gene_id',
                 'Name']
    df = ic.copy(deep=True)
    df = df[keep_cols]
    df['ss_coords'] = df.Coordinates.str.split('-')
    
    # get pairs of sss to form sjs
    df['sj_coords'] = df.ss_coords.apply(sequential_pairs)
    df = df.explode('sj_coords')
    df.drop(['Coordinates', 'ss_coords'], axis=1, inplace=True)
                
    return df

def get_ss_sj_from_ic(ic, how):
    ic = ic.copy(deep=True)

    # get coords of each splice site in each splice junction
    df = explode_ic(ic)
    df['Start'] = df['sj_coords'].str[0].astype(int)
    df['End'] = df['sj_coords'].str[1].astype(int)
    df.drop('sj_coords', axis=1, inplace=True)

    # label sss as 5' or 3' and melt
    if how == 'ss':
        assert len(df.loc[(df.Start>df.End)&(df.Strand=='+')].index) == 0
        # since these are intron coords, the start defines a 3' ss 
        # and the end defines a 5' ss
        df.rename({'Start':'ss_3', 'End':'ss_5'}, axis=1, inplace=True)
        id_cols = ['Chromosome', 'Strand', 'gene_id', 'Name']
        df = df.melt(id_vars=id_cols,
                     var_name='ss_type',
                     value_name='Start')
        
    # for sjs, reorder according to min and max coords
    # in bed standard format
    elif how == 'sj':
        df['temp_Start'] = df.Start
        df['temp_End'] = df.End
        df['Start'] = df[['temp_Start', 'temp_End']].min(axis=1)
        df['End'] = df[['temp_Start', 'temp_End']].max(axis=1)
        df.drop(['temp_Start', 'temp_End'], axis=1, inplace=True)
        
    # merge source info in w/ coord info
    df2 = get_source_table(ic)
    df = df.merge(df2, how='left', on=['Name'])

    # figure out novelty and source of each ss / sj
    df.drop('Name', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
    if how == 'ss':
        gb_cols += ['ss_type']
    elif how == 'sj':
        gb_cols += ['End']
    df.rename({'list_source': 'source'},
              axis=1, inplace=True)
    df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
                                                     False: 'Novel'})
    df = df.groupby(gb_cols).agg(','.join).reset_index()
    df = cerberus.update_novelty(df)
    
    return df

def get_sj_from_ic(ic):
    """
    Get a splice junction table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice junction
    """
    return get_ss_sj_from_ic(ic, 'sj')

def get_ss_from_ic(ic):
    """
    Get a splice site table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice site
    """
    return get_ss_sj_from_ic(ic, 'ss')       

In [115]:
df = get_sj_from_ic(ca.ic)
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
0,chr1,+,ENSG00000000460,169662523,169683468,"v40,v29",Known
1,chr1,+,ENSG00000000460,169683625,169683755,"v40,v29",Known
2,chr1,+,ENSG00000000460,169683932,169783810,"v40,v29",Known
3,chr1,+,ENSG00000000460,169783928,169784876,"v40,v29",Known
4,chr1,+,ENSG00000000460,169784913,169798856,"v40,v29",Known


In [159]:
# # introp = '../data/human/intropolis/smol_intropolis.bed'
# # introp = '../data/human/intropolis/smol_intropolis.tab'

# # introp = '../data/human/intropolis/intropolis.bed'
# introp = '../data/human/intropolis/intropolis.tab'

# i_df = pd.read_csv(introp, sep='\t', header=None,
#                    usecols=[8,9,10,11],
#                    names=['Chromosome', 'Start', 'End', 'Strand'])
# source = 'intropolis'
# ref = False
# i_df = i_df.loc[i_df.Start.notnull()]

# # decrement all starts
# i_df['Start'] = i_df['Start'] - 1

# i_df['source'] = source

introp = '../data/human/intropolis/intropolis.bed'
i_df = pr.read_bed(introp).df
source = 'intropolis'
ref = False
i_df = i_df.loc[i_df.Start.notnull()]
i_df['source'] = source

In [160]:
i_df.loc[i_df.Start > i_df.End]

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source


In [164]:
# i_df.loc[i_df.Strand=='+']

In [165]:
i_df.loc[i_df.Start.isnull()]

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source


In [166]:
# print(len(df))
# df = df.loc[(df.Start>=i_df.Start.min())&\
#             (df.End<=i_df.End.max())]
# df = df.loc[df.Chromosome.isin(i_df.Chromosome.tolist())]
# print(len(df))

In [167]:
df = df.loc[df.Chromosome=='chr18']
i_df = i_df.loc[i_df.Chromosome=='chr18']

In [168]:
df = df.sort_values(by='Start')
i_df = i_df.sort_values(by='Start')

In [169]:
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
0,chr18,+,ENSG00000262352,11595,13151,"v40,v29",Known
1,chr18,+,ENSG00000262352,11595,15616,"v40,v29",Known
2,chr18,+,ENSG00000262352,13354,15616,"v40,v29",Known
4145,chr18,-,ENSG00000263305,14653,14850,"v40,v29",Known
4144,chr18,-,ENSG00000263305,14653,16855,"v40,v29",Known


In [170]:
# df.loc[df.Start>df.End]

In [171]:
i_df.head()

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source
34447704,chr18,10020,10032,.,.,-,intropolis
34447719,chr18,10020,105511,.,.,-,intropolis
34447718,chr18,10020,63875,.,.,-,intropolis
34447717,chr18,10020,10586,.,.,-,intropolis
34447716,chr18,10020,10521,.,.,-,intropolis


In [172]:
i_df.loc[i_df.Start.isin(df.Start.tolist())]

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source
34128643,chr18,11595,13151,.,.,+,intropolis
34128642,chr18,11595,12614,.,.,+,intropolis
34128641,chr18,11595,12582,.,.,+,intropolis
34128640,chr18,11595,12554,.,.,+,intropolis
34128644,chr18,11595,15616,.,.,+,intropolis
...,...,...,...,...,...,...,...
34757951,chr18,80202932,80220121,.,.,-,intropolis
34757952,chr18,80202932,80227160,.,.,-,intropolis
34757953,chr18,80202932,80229362,.,.,-,intropolis
34757950,chr18,80202932,80217859,.,.,-,intropolis


In [173]:
df.loc[df.Start==215504]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
4161,chr18,-,ENSG00000079134,215504,216485,"v40,v29,lapa,gtex",Known


In [174]:
# try merging
temp = df.merge(i_df,
                how='left',
                on=['Chromosome', 'Start', 'End', 'Strand'])

In [175]:
temp.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source_x,novelty,Name,Score,source_y
0,chr18,+,ENSG00000262352,11595,13151,"v40,v29",Known,.,.,intropolis
1,chr18,+,ENSG00000262352,11595,15616,"v40,v29",Known,.,.,intropolis
2,chr18,+,ENSG00000262352,13354,15616,"v40,v29",Known,.,.,intropolis
3,chr18,-,ENSG00000263305,14653,14850,"v40,v29",Known,.,.,intropolis
4,chr18,-,ENSG00000263305,14653,16855,"v40,v29",Known,.,.,intropolis


In [176]:
temp.loc[temp.source_y.isnull()]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source_x,novelty,Name,Score,source_y
5,chr18,+,ENSG00000262181,45235,45282,"v40,v29",Known,,,
6,chr18,+,ENSG00000262181,45556,45640,"v40,v29",Known,,,
7,chr18,+,ENSG00000262181,45787,45907,"v40,v29",Known,,,
16,chr18,-,ENSG00000262081,84148,84728,"v40,v29",Known,,,
18,chr18,-,ENSG00000262081,85701,86110,"v40,v29",Known,,,
...,...,...,...,...,...,...,...,...,...,...
7652,chr18,-,ENSG00000271702,77633120,77633278,"v40,v29",Known,,,
7691,chr18,+,ENSG00000256463,78992252,78993056,v29,Known,,,
7706,chr18,+,ENSG00000166377,79113354,79114993,"v40,v29,lapa",Known,,,
7725,chr18,+,ENSG00000166377,79277196,79377889,v29,Known,,,


In [177]:
temp.loc[temp.source_y.notnull()]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source_x,novelty,Name,Score,source_y
0,chr18,+,ENSG00000262352,11595,13151,"v40,v29",Known,.,.,intropolis
1,chr18,+,ENSG00000262352,11595,15616,"v40,v29",Known,.,.,intropolis
2,chr18,+,ENSG00000262352,13354,15616,"v40,v29",Known,.,.,intropolis
3,chr18,-,ENSG00000263305,14653,14850,"v40,v29",Known,.,.,intropolis
4,chr18,-,ENSG00000263305,14653,16855,"v40,v29",Known,.,.,intropolis
...,...,...,...,...,...,...,...,...,...,...
7921,chr18,+,ENSG00000267251,80187942,80189280,"v40,v29",Known,.,.,intropolis
7922,chr18,+,ENSG00000267251,80189439,80197448,"v40,v29",Known,.,.,intropolis
7923,chr18,+,ENSG00000267251,80197638,80202603,"v40,v29",Known,.,.,intropolis
7924,chr18,-,ENSG00000178184,80202018,80202709,"v40,v29,lapa,gtex",Known,.,.,intropolis


In [153]:
i_df.loc[i_df.Start==215504]

Unnamed: 0,Chromosome,Start,End,Strand,source
18185374,chr18,215504.0,246323.0,-,intropolis
18185373,chr18,215504.0,224923.0,-,intropolis
18185376,chr18,215504.0,267965.0,-,intropolis
18185375,chr18,215504.0,265302.0,-,intropolis
18185372,chr18,215504.0,218885.0,-,intropolis
18185371,chr18,215504.0,218763.0,-,intropolis
18185370,chr18,215504.0,216485.0,-,intropolis
18185369,chr18,215504.0,215945.0,-,intropolis
18185368,chr18,215504.0,215678.0,-,intropolis
18185367,chr18,215504.0,215621.0,-,intropolis


In [128]:
i_df = pr.PyRanges(i_df)
i_df.to_bed('chr18_intropolis.bed')
df = pr.PyRanges(df)
df.to_bed('chr18_introns.bed')

In [129]:
i_df = i_df.df
df = df.df

In [136]:
df.loc[df.End==21110817]
df.loc[(df.End==21110817)&(df.Start==21070613)]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
5188,chr18,-,ENSG00000067900,21070613,21110817,"v40,v29,lapa,gtex",Known


In [134]:
i_df.loc[(i_df.End==21110817)&(i_df.Start==21070614)]

Unnamed: 0,Chromosome,Start,End,Strand,source
412420,chr18,21070614,21110817,-,intropolis


In [137]:
i_df.loc[(i_df.End==21110817)&(i_df.Start==21070615)]

Unnamed: 0,Chromosome,Start,End,Strand,source


In [140]:
df.loc[(df.Start==21242393)&(df.End==21366026)]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
1290,chr18,+,ENSG00000141449,21242393,21366026,"v40,v29,lapa,gtex",Known


In [142]:
i_df.loc[(i_df.Start==21242394)&(i_df.End==21366026)]

Unnamed: 0,Chromosome,Start,End,Strand,source
97097,chr18,21242394,21366026,+,intropolis


In [131]:
df.loc[df.gene_id=='ENSG00000067900']

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
5154,chr18,-,ENSG00000067900,20951387,20953577,"v40,v29,lapa,gtex",Known
5155,chr18,-,ENSG00000067900,20953785,20954782,"v40,v29,lapa,gtex",Known
5156,chr18,-,ENSG00000067900,20955044,20955166,"v40,v29,lapa,gtex",Known
5157,chr18,-,ENSG00000067900,20955245,20959839,"v40,v29,lapa,gtex",Known
5158,chr18,-,ENSG00000067900,20959928,20960135,"v40,v29,lapa,gtex",Known
5159,chr18,-,ENSG00000067900,20960206,20960662,"v40,v29,lapa,gtex",Known
5160,chr18,-,ENSG00000067900,20960206,20966916,"v40,v29,lapa,gtex",Known
5161,chr18,-,ENSG00000067900,20960711,20966916,"v40,v29,lapa,gtex",Known
5162,chr18,-,ENSG00000067900,20967076,20967751,"v40,v29,lapa,gtex",Known
5163,chr18,-,ENSG00000067900,20967940,20968771,"v40,v29,lapa,gtex",Known


In [87]:
temp = df.merge(i_df, on=['Chromosome', 'Start', 'End', 'Strand'], how='left')

In [88]:
temp.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source_x,novelty,source_y
0,chr1,+,ENSG00000223972,12057,12178,"v40,v29",Known,
1,chr1,+,ENSG00000223972,12227,12612,"v40,v29",Known,
2,chr1,+,ENSG00000223972,12697,12974,"v40,v29",Known,
3,chr1,+,ENSG00000223972,12721,13220,"v40,v29",Known,
4,chr1,+,ENSG00000223972,13052,13220,"v40,v29",Known,


In [89]:
temp.loc[temp.source_y.notnull()]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source_x,novelty,source_y


In [90]:
# try getting nearest and seeing if there's a uniform transformation 
# to get them
df = pr.PyRanges(df)
i_df = pr.PyRanges(i_df)
temp = df.nearest(i_df,
                  strandedness='same',
                  nb_cpu=1)

In [91]:
temp.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty,Start_b,End_b,Strand_b,source_b,Distance
0,chr1,+,ENSG00000223972,12057,12178,"v40,v29",Known,10923,170574,+,intropolis,0
1,chr1,+,ENSG00000223972,12227,12612,"v40,v29",Known,10923,170574,+,intropolis,0
2,chr1,+,ENSG00000223972,12697,12974,"v40,v29",Known,10923,170574,+,intropolis,0
3,chr1,+,ENSG00000223972,12721,13220,"v40,v29",Known,10923,170574,+,intropolis,0
4,chr1,+,ENSG00000223972,13052,13220,"v40,v29",Known,10923,170574,+,intropolis,0
5,chr1,+,ENSG00000223972,13374,13452,"v40,v29",Known,10923,170574,+,intropolis,0
6,chr1,+,ENSG00000243485,30039,30563,"v40,v29",Known,10923,170574,+,intropolis,0
7,chr1,+,ENSG00000243485,30667,30975,"v40,v29,lapa",Known,10923,170574,+,intropolis,0


In [97]:
# temp = temp.df
temp['Start_dist'] = abs(temp.Start_b-temp.Start)
temp['End_dist'] = abs(temp.End_b-temp.End)

In [98]:
temp

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty,Start_b,End_b,Strand_b,source_b,Distance,Start_dist,End_dist
0,chr1,+,ENSG00000223972,12057,12178,"v40,v29",Known,10923,170574,+,intropolis,0,1134,158396
1,chr1,+,ENSG00000223972,12227,12612,"v40,v29",Known,10923,170574,+,intropolis,0,1304,157962
2,chr1,+,ENSG00000223972,12697,12974,"v40,v29",Known,10923,170574,+,intropolis,0,1774,157600
3,chr1,+,ENSG00000223972,12721,13220,"v40,v29",Known,10923,170574,+,intropolis,0,1798,157354
4,chr1,+,ENSG00000223972,13052,13220,"v40,v29",Known,10923,170574,+,intropolis,0,2129,157354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537,chr1,-,ENSG00000215912,2787706,2789502,"v40,v29",Known,2279427,2789291,-,intropolis,0,508279,211
1538,chr1,-,ENSG00000215912,2790346,2800043,"v40,v29",Known,2297400,2805203,-,intropolis,0,492946,5160
1539,chr1,-,ENSG00000215912,2800868,2801576,"v40,v29",Known,2297400,2805203,-,intropolis,0,503468,3627
1540,chr1,-,ENSG00000233234,2811998,2812234,"v40,v29",Known,2302755,2815806,-,intropolis,0,509243,3572


In [99]:
temp.sort_values(by='Start_dist').head()
temp

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty,Start_b,End_b,Strand_b,source_b,Distance,Start_dist,End_dist
0,chr1,+,ENSG00000223972,12057,12178,"v40,v29",Known,10923,170574,+,intropolis,0,1134,158396
1,chr1,+,ENSG00000223972,12227,12612,"v40,v29",Known,10923,170574,+,intropolis,0,1304,157962
2,chr1,+,ENSG00000223972,12697,12974,"v40,v29",Known,10923,170574,+,intropolis,0,1774,157600
3,chr1,+,ENSG00000223972,12721,13220,"v40,v29",Known,10923,170574,+,intropolis,0,1798,157354
4,chr1,+,ENSG00000223972,13052,13220,"v40,v29",Known,10923,170574,+,intropolis,0,2129,157354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537,chr1,-,ENSG00000215912,2787706,2789502,"v40,v29",Known,2279427,2789291,-,intropolis,0,508279,211
1538,chr1,-,ENSG00000215912,2790346,2800043,"v40,v29",Known,2297400,2805203,-,intropolis,0,492946,5160
1539,chr1,-,ENSG00000215912,2800868,2801576,"v40,v29",Known,2297400,2805203,-,intropolis,0,503468,3627
1540,chr1,-,ENSG00000233234,2811998,2812234,"v40,v29",Known,2302755,2815806,-,intropolis,0,509243,3572
