In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
ab = '../'+expand(config['data']['ab'], species='human')[0]
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
read_annot = '../'+expand(config['data']['read_annot'], species='human')[0]
t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='human')[0]
lib_meta = '../'+expand(config['data']['meta'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
cerb_t_metadata = '../'+expand(config['data']['t_info'], species='human')[0]
major_isos = '../'+expand(config['data']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = '../'+expand(config['data']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]
pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]
ref_t_metadata = '../'+expand(config['ref']['t_info'], species='human')[0]
ref_g_metadata = '../'+expand(config['ref']['g_info'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = '../'+expand(config['data']['meta'], species='mouse')[0]

In [4]:
# first make tables for the different splice junctions and splice sites; determine their novelty
ca = cerberus.read(cerberus_h5)

In [5]:
ref_sources = ['v29', 'v40']

In [6]:
ic = ca.ic.copy(deep=True)


In [78]:
how = 'ss'
# how='sj'


In [79]:
# ic = ic.copy(deep=True)

# # get coords of each splice site in each splice junction
# df = explode_ic(ic)
# df['Start'] = df['sj_coords'].str[0].astype(int)
# df['End'] = df['sj_coords'].str[1].astype(int)
# df.drop('sj_coords', axis=1, inplace=True)

# # label sss as 5' or 3' and melt
# if how == 'ss':
#     assert len(df.loc[(df.Start>df.End)&(df.Strand=='+')].index) == 0
#     # since these are intron coords, the start defines a 3' ss 
#     # and the end defines a 5' ss
#     df.rename({'Start':'ss_3', 'End':'ss_5'}, axis=1, inplace=True)
#     id_cols = ['Chromosome', 'Strand', 'gene_id', 'Name']
#     df = df.melt(id_vars=id_cols,
#                  var_name='ss_type',
#                  value_name='Start')
    
# # merge source info in w/ coord info
# df2 = get_source_table(ic)
# df = df.merge(df2, how='left', on=['Name'])

# # figure out novelty and source of each ss / sj
# df.drop('Name', axis=1, inplace=True)
# df.drop_duplicates(inplace=True)
# gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
# if how == 'ss':
#     gb_cols += ['ss_type']
# df.rename({'list_source': 'source'},
#           axis=1, inplace=True)
# df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
#                                                  False: 'Novel'})
# df = df.groupby(gb_cols).agg(','.join).reset_index()
# df = cerberus.update_novelty(df)

In [82]:
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,ss_type,source,novelty
0,chr1,+,ENSG00000000460,169662523,ss_3,"v40,v29",Known
1,chr1,+,ENSG00000000460,169683468,ss_5,"v40,v29",Known
2,chr1,+,ENSG00000000460,169683625,ss_3,"v40,v29",Known
3,chr1,+,ENSG00000000460,169683755,ss_5,"v40,v29",Known
4,chr1,+,ENSG00000000460,169683932,ss_3,"v40,v29",Known


In [186]:
def get_source_table(df):
    """
    Get a melted form table for each entry in a tss, ic, or tes table
    for each form of support for each triplet feature.
    
    Parameters:
        df (pandas DataFrame): DataFrame of tsss, ics, or tess
    
    Returns:
        df (pandas DataFrame): Long-form DataFrame of support for each tss, ic, or tes
    """
    keep_cols = ['Name', 'source']
    df = ic[keep_cols].copy(deep=True)
    df['list_source'] = df.source.str.split(',')
    df = df.explode('list_source')
    df.drop('source', axis=1, inplace=True)
    
    return df

# chatgpt wrote this for me thanx chatgpt
def sequential_pairs(x):
    """
    Get sequential pairs of tuples in list.
    Example: [1,2,3,4] -> [(1,2),(3,4)]
    """
    p = []
    for i in range(0, len(x) - 1, 2):
        p.append((x[i], x[i + 1]))
    return p

def explode_ic(ic):
    """
    Explode an ic df to long form with splice junction entries
    """
    # remove the monoexonic entries
    ic = ic.loc[~(ic.Coordinates == '-')]
    
    # explode into series of ss coords
    keep_cols = ['Chromosome', 'Coordinates',
                 'Strand', 'gene_id',
                 'Name']
    df = ic.copy(deep=True)
    df = df[keep_cols]
    df['ss_coords'] = df.Coordinates.str.split('-')
    
    # get pairs of sss to form sjs
    df['sj_coords'] = df.ss_coords.apply(sequential_pairs)
    df = df.explode('sj_coords')
    df.drop(['Coordinates', 'ss_coords'], axis=1, inplace=True)
                
    return df

def get_ss_sj_from_ic(ic, how):
    ic = ic.copy(deep=True)

    # get coords of each splice site in each splice junction
    df = explode_ic(ic)
    df['Start'] = df['sj_coords'].str[0].astype(int)
    df['End'] = df['sj_coords'].str[1].astype(int)
    df.drop('sj_coords', axis=1, inplace=True)

    # label sss as 5' or 3' and melt
    if how == 'ss':
        assert len(df.loc[(df.Start>df.End)&(df.Strand=='+')].index) == 0
        # since these are intron coords, the start defines a 3' ss 
        # and the end defines a 5' ss
        df.rename({'Start':'ss_3', 'End':'ss_5'}, axis=1, inplace=True)
        id_cols = ['Chromosome', 'Strand', 'gene_id', 'Name']
        df = df.melt(id_vars=id_cols,
                     var_name='ss_type',
                     value_name='Start')
        
    # for sjs, reorder according to min and max coords
    # in bed standard format
    elif how == 'sj':
        df['temp_Start'] = df.Start
        df['temp_End'] = df.End
        df['Start'] = df[['temp_Start', 'temp_End']].min(axis=1)
        df['End'] = df[['temp_Start', 'temp_End']].max(axis=1)
        df.drop(['temp_Start', 'temp_End'], axis=1, inplace=True)

    # merge source info in w/ coord info
    df2 = get_source_table(ic)
    df = df.merge(df2, how='left', on=['Name'])

    # figure out novelty and source of each ss / sj
    df.drop('Name', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
    if how == 'ss':
        gb_cols += ['ss_type']
    elif how == 'sj':
        gb_cols += ['End']
    df.rename({'list_source': 'source'},
              axis=1, inplace=True)
    df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
                                                     False: 'Novel'})
    df = df.groupby(gb_cols).agg(','.join).reset_index()
    df = cerberus.update_novelty(df)
    
    return df

def get_sj_from_ic(ic):
    """
    Get a splice junction table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice junction
    """
    return get_ss_sj_from_ic(ic, 'sj')

def get_ss_from_ic(ic):
    """
    Get a splice site table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice site
    """
    return get_ss_sj_from_ic(ic, 'ss')       

In [183]:
# df = get_ss_from_ic(ca.ic)
# df.head()

In [187]:
df = get_sj_from_ic(ca.ic)
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
0,chr1,+,ENSG00000000460,169662523,169683468,"v40,v29",Known
1,chr1,+,ENSG00000000460,169683625,169683755,"v40,v29",Known
2,chr1,+,ENSG00000000460,169683932,169783810,"v40,v29",Known
3,chr1,+,ENSG00000000460,169783928,169784876,"v40,v29",Known
4,chr1,+,ENSG00000000460,169784913,169798856,"v40,v29",Known


In [196]:
temp = df.loc[df.Chromosome == 'chr18']
temp['Score'] = 0
temp.drop(['novelty', 'source', 'gene_id'], axis=1, inplace=True)
temp = pr.PyRanges(temp)
temp.to_bed('chr18_ic_sj.bed')

In [142]:
df.loc[df.Start > df.End]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty


In [168]:
# introp = '../data/human/intropolis/smol_intropolis.bed'
introp = '../data/human/intropolis/intropolis.bed'
i_df = pr.read_bed(introp).df
source = 'intropolis'
ref = False
i_df['source'] = source

In [190]:
temp = pr.PyRanges(i_df.loc[i_df.Chromosome == 'chr18'])
temp.to_bed('chr18_intropolis.bed')

In [144]:
i_df.loc[i_df.Start > i_df.End]

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source


In [145]:
df.loc[df.Start > df.End]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty


In [146]:
print(len(df.index))

453567


In [147]:
i_df.loc[(i_df.Chromosome=='chr1')&(i_df.Start<159662523)&(i_df.Start>23415904)&(i_df.Start==23417390)].sort_values(by='Start').head()

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,source


In [148]:
df.loc[(df.Chromosome=='chr1')&(df.Start<159662523)&(df.Start>23415904)].sort_values(by='Start').head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty
38956,chr1,-,ENSG00000204219,23417390,23417903,"v40,v29,lapa,gtex",Known
38958,chr1,-,ENSG00000204219,23418009,23424564,"lapa,gtex",Novel
38957,chr1,-,ENSG00000204219,23418009,23419076,"v40,v29,lapa,gtex",Known
38959,chr1,-,ENSG00000204219,23419139,23424564,"v40,v29,lapa,gtex",Known
38960,chr1,-,ENSG00000204219,23419207,23424564,lapa,Novel


In [149]:
# # look for individual splice sites in intropolis
# for coord in ['Start', 'End']:
#     for strand in ['+', '-']:
#         temp = df.loc[df.Strand == strand][['Chromosome', 'Strand', coord]].drop_duplicates().copy(deep=True)
#         n = len(temp.index)
#         temp2 = temp.merge(i_df, on=['Chromosome', 'Strand', coord], how='inner')
#         n_num = len(temp[coord].unique().tolist())
#         print(f'{n_num}/{n} ss {coord} from cerberus validated in intropolis')

In [176]:
# can I merge intropolis w/ my sj tsv?
# df = df.merge(i_df.drop(['Name', 'Score'], axis=1), how='left', on=['Chromosome', 'Start', 'Strand'], suffixes=('', '_merge'))
temp = df.merge(i_df.drop(['Name', 'Score'], axis=1), how='left', on=['Chromosome', 'Start', 'End', 'Strand'], suffixes=('', '_merge'))

In [185]:
for coord in ['Start', 'End']:
    for strand in ['+', '-']:
        temp = df.loc[df.Strand == strand][['Chromosome', 'Strand', coord]].drop_duplicates().copy(deep=True)
        n = len(temp.index)
        temp2 = temp.merge(i_df, on=['Chromosome', 'Strand', coord], how='inner')
        n_num = len(temp2[['Chromosome', 'Strand', coord]].drop_duplicates().index)
        print(f'{n_num}/{n} sj {strand} {coord} from cerberus validated in intropolis')

1401/162023 sj + Start from cerberus validated in intropolis
2193/153811 sj - Start from cerberus validated in intropolis
2002/157950 sj + End from cerberus validated in intropolis
1310/158037 sj - End from cerberus validated in intropolis


In [177]:
temp.loc[~(temp.source_merge.isnull())]
# len(df.index)
# df[['Start', 'source_merge']].groupby('source_merge').count()
# df.loc[df.End==df.End_merge-1]

Unnamed: 0,Chromosome,Strand,gene_id,Start,End,source,novelty,source_merge
156355,chr16,+,ENSG00000140678,31371785,31372595,"v40,v29",Known,intropolis
202498,chr18,-,ENSG00000101577,2954598,2960649,"v40,v29,lapa,gtex",Known,intropolis
202540,chr18,-,ENSG00000101605,3168980,3173938,"v40,v29,lapa",Known,intropolis
202543,chr18,-,ENSG00000101605,3176133,3187480,"v40,v29,lapa",Known,intropolis
202830,chr18,-,ENSG00000132204,1273532,1275966,"v40,v29",Known,intropolis
203777,chr18,-,ENSG00000154845,9584827,9588089,"v40,v29,lapa,gtex",Known,intropolis
204230,chr18,-,ENSG00000170579,3534614,3567490,"v40,v29,lapa,gtex",Known,intropolis
206668,chr19,+,ENSG00000065268,992122,993718,lapa,Novel,intropolis
206669,chr19,+,ENSG00000065268,992122,994018,"v40,v29,lapa,gtex",Known,intropolis
206670,chr19,+,ENSG00000065268,992122,994211,"v40,v29,lapa,gtex",Known,intropolis


In [None]:
# concatenate sources where relevant
inds = df.loc[~df.source_merge.isnull()].index.tolist()
df.loc[inds, 'source'] = df.loc[inds, 'source']+','+df.loc[inds, 'source_merge'

In [247]:
for coord in ['Start', 'End']:
    for strand in ['+', '-']:
        temp = df.loc[df.Strand == strand][['Chromosome', 'Strand', coord]].drop_duplicates().copy(deep=True)
        n = len(temp.index)
        temp2 = temp.merge(i_df, on=['Chromosome', 'Strand', coord], how='inner')
        n_num = len(temp[coord].unique().tolist())
        print(f'{n_num}/{n} sj {coord} from cerberus validated in intropolis')

161845/162019 sj Start from cerberus validated in intropolis
153688/153811 sj Start from cerberus validated in intropolis
157762/157947 sj End from cerberus validated in intropolis
157890/158037 sj End from cerberus validated in intropolis


In [216]:
sj_coords = df.loc[df.Chromosome == 'chr1', 'Start'].unique().tolist()
sj_coords[:5]
# sj_coords = [s for s in sj_coords]
# sj_coords = [s+1 for s in sj_coords]
n = len(sj_coords)
temp2 = i_df.loc[i_df.Start.isin(sj_coords)]
n2 = len(temp2.Start.unique().tolist())
print(f'{n2}/{n} sj starts from cerberus validated in intropolis')
for s in ['-', '+']:
    
    
assert '-' in temp2.Strand.tolist()
assert '+' in temp2.Strand.tolist()

3191/31054 sj starts from cerberus validated in intropolis


In [204]:
sj_coords = df.loc[df.Chromosome == 'chr1', 'End'].tolist()
sj_coords[:5]
sj_coords = [s for s in sj_coords]
n = len(list(set(sj_coords)))
temp2 = i_df.loc[i_df.End.isin(sj_coords)]
n2 = len(temp2.End.unique().tolist())
print(f'{n2}/{n} sj starts from cerberus validated in intropolis smol')
assert '-' in temp2.Strand.tolist()
assert '+' in temp2.Strand.tolist()

11/30422 sj starts from cerberus validated in intropolis smol


In [183]:
df.sort_values(by=['Chromosome', 'Start'], ascending=[True,True])

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty,End
17717,chr1,+,ENSG00000223972,12057,"v40,v29",Known,12178
17718,chr1,+,ENSG00000223972,12227,"v40,v29",Known,12612
17719,chr1,+,ENSG00000223972,12697,"v40,v29",Known,12974
17720,chr1,+,ENSG00000223972,12721,"v40,v29",Known,13220
17721,chr1,+,ENSG00000223972,13052,"v40,v29",Known,13220
...,...,...,...,...,...,...,...
451549,chrY,+,ENSG00000182484PARY,57211620,"v40,v29",Known,57211760
453050,chrY,-,ENSG00000227159PARY,57213203,"v40,v29",Known,57213125
453051,chrY,-,ENSG00000227159PARY,57213525,"v40,v29",Known,57213357
453052,chrY,-,ENSG00000227159PARY,57213879,"v40,v29",Known,57213602


In [177]:
df.dtypes

Chromosome    object
Strand        object
gene_id       object
Start          int64
source        object
novelty       object
End            int64
dtype: object

In [182]:
temp = df.merge(i_df, on=['Chromosome', 'End', 'Strand'], how='inner')

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty,End,Name,Score


In [40]:
# ca.ic.loc[ca.ic.gene_id.duplicated(keep=False)].sort_values(by='gene_id')

In [83]:
ic = ca.ic.copy(deep=True)
# ic = ic.loc[ic.gene_id == 'ENSG00000000003']

# remove the monoexonic entries
ic = ic.loc[~(ic.Coordinates == '-')]

# limit to only the relevant columns
keep_cols = ['Chromosome', 'Coordinates',
             'Strand', 'gene_id',
             'Name']
df = ic.copy(deep=True)
df = df[keep_cols]


df['list_coords'] = df.Coordinates.str.split('-')
df = df.explode('list_coords')
df.drop('Coordinates', axis=1, inplace=True)

# add in source later
keep_cols = ['Name', 'source']
df2 = ic[keep_cols].copy()
df2['list_source'] = df2.source.str.split(',')
df2 = df2.explode('list_source')
df2.drop('source', axis=1, inplace=True)


AttributeError: 'DataFrame' object has no attribute 'novelty'

In [84]:
print(len(df.index))
df = df.merge(df2, how='left', on=['Name'])
print(len(df.index))

4670426
8329720


In [85]:
df.drop('Name', axis=1, inplace=True)
df.drop_duplicates(inplace=True)
print(len(df.index))
df.rename({'list_coords': 'Start',
           'list_source': 'source'},
          axis=1, inplace=True)
gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
                                                   False: 'Novel'})
df = df.groupby(gb_cols).agg(','.join).reset_index()
        

1971524


In [86]:
df = cerberus.update_novelty(df)

In [89]:
df.loc[df.source=='lapa']

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty
12,chr1,+,ENSG00000000460,169795226,lapa,Novel
123,chr1,+,ENSG00000000971,196737010,lapa,Novel
198,chr1,+,ENSG00000004487,23071361,lapa,Novel
222,chr1,+,ENSG00000007933,171090964,lapa,Novel
371,chr1,+,ENSG00000010165,171789999,lapa,Novel
...,...,...,...,...,...,...
655540,chrX,-,ENSG00000285756,3829219,lapa,Novel
655848,chrY,+,ENSG00000067048,12916975,lapa,Novel
656165,chrY,+,ENSG00000154620,13702953,lapa,Novel
656166,chrY,+,ENSG00000154620,13704333,lapa,Novel


In [97]:
# ic.loc[ic.Name.duplicated(keep=False)].sort_values(by='Name')

In [37]:
ic.head()

Unnamed: 0,Chromosome,Coordinates,Strand,source,gene_id,Name,list_coords,coords
1084,chr1,100038316-100049908-100050004-100058665-100058...,+,"v40,v29,lapa,gtex",ENSG00000156875,ENSG00000156875_1,"[100038316, 100049908, 100050004, 100058665, 1...",101729567
1085,chr1,100058728-100059877-100060005-100061834,+,"v40,v29",ENSG00000156875,ENSG00000156875_2,"[100058728, 100059877, 100060005, 100061834]",101737223
1086,chr1,1001263-1008193-1008279-1013983,+,"v40,v29,lapa,gtex",ENSG00000187608,ENSG00000187608_3,"[1001263, 1008193, 1008279, 1013983]",101737357
1087,chr1,1001281-1008193-1008279-1013983,+,"v40,v29,lapa",ENSG00000187608,ENSG00000187608_2,"[1001281, 1008193, 1008279, 1013983]",101774100
1088,chr1,100133315-100136881-100136928-100137018-100137...,+,"v40,v29,lapa",ENSG00000122435,ENSG00000122435_1,"[100133315, 100136881, 100136928, 100137018, 1...",101639703


In [None]:
# ne