In [60]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [61]:
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [62]:
ab = '../'+expand(config['data']['ab'], species='human')[0]
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
read_annot = '../'+expand(config['data']['read_annot'], species='human')[0]
t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='human')[0]
lib_meta = '../'+expand(config['data']['meta'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
cerb_t_metadata = '../'+expand(config['data']['t_info'], species='human')[0]
major_isos = '../'+expand(config['data']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = '../'+expand(config['data']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]
pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]
ref_t_metadata = '../'+expand(config['ref']['t_info'], species='human')[0]
ref_g_metadata = '../'+expand(config['ref']['g_info'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = '../'+expand(config['data']['meta'], species='mouse')[0]

In [4]:
# first make tables for the different splice junctions and splice sites; determine their novelty
ca = cerberus.read(cerberus_h5)

In [71]:
ref_sources = ['v29', 'v40']

In [96]:
ic = ca.ic.copy(deep=True)

In [101]:


# # Test example
# input_list = [2, 3, 4, 5, 7]
# output_pairs = sequential_pairs(input_list)
# print(output_pairs)  # Output: [(2, 3), (4, 5)]

In [136]:
ic = ic.copy(deep=True)

df = explode_ic(df, 'ss')
df2 = get_source_table(ic)

# # merge source info in w/ coord info
# df = df.merge(df2, how='left', on=['Name'])

# # figure out novelty and source of each coordinate
# df.drop('Name', axis=1, inplace=True)
# df.drop_duplicates(inplace=True)
# df.rename({'sj_coords': 'Start',
#            'list_source': 'source'},
#           axis=1, inplace=True)
# gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
# df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
#                                                  False: 'Novel'})
# df = df.groupby(gb_cols).agg(','.join).reset_index()
# df = cerberus.update_novelty(df)

In [132]:
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Name,sj_coords
1084,chr1,+,ENSG00000156875,ENSG00000156875_1,"(100038316, 100049908)"
1084,chr1,+,ENSG00000156875,ENSG00000156875_1,"(100050004, 100058665)"
1084,chr1,+,ENSG00000156875,ENSG00000156875_1,"(100058728, 100059877)"
1084,chr1,+,ENSG00000156875,ENSG00000156875_1,"(100060005, 100061834)"
1084,chr1,+,ENSG00000156875,ENSG00000156875_1,"(100061949, 100067976)"


In [133]:
df2.head()

Unnamed: 0,Name,list_source
1084,ENSG00000156875_1,v40
1084,ENSG00000156875_1,v29
1084,ENSG00000156875_1,lapa
1084,ENSG00000156875_1,gtex
1085,ENSG00000156875_2,v40


In [149]:
def get_source_table(df):
    """
    Get a melted form table for each entry in a tss, ic, or tes table
    for each form of support for each triplet feature.
    
    Parameters:
        df (pandas DataFrame): DataFrame of tsss, ics, or tess
    
    Returns:
        df (pandas DataFrame): Long-form DataFrame of support for each tss, ic, or tes
    """
    keep_cols = ['Name', 'source']
    df = ic[keep_cols].copy(deep=True)
    df['list_source'] = df.source.str.split(',')
    df = df.explode('list_source')
    df.drop('source', axis=1, inplace=True)
    
    return df

# chatgpt wrote this for me thanx chatgpt
def sequential_pairs(x):
    """
    Get sequential pairs of tuples in list.
    Example: [1,2,3,4] -> [(1,2),(3,4)]
    """
    p = []
    for i in range(0, len(x) - 1, 2):
        p.append((x[i], x[i + 1]))
    return p

def explode_ic(ic, how):
    """
    Explode an ic df to long form, either on the splice site
    or the splice junction level.
    
    Parameters:
        how (str): {'ss', 'sj'}
    """
    # remove the monoexonic entries
    ic = ic.loc[~(ic.Coordinates == '-')]
    
    # explode into series of ss coords
    keep_cols = ['Chromosome', 'Coordinates',
                 'Strand', 'gene_id',
                 'Name']
    df = ic.copy(deep=True)
    df = df[keep_cols]
    df['ss_coords'] = df.Coordinates.str.split('-')
    
    if how == 'ss':
        df = df.explode('ss_coords')
        df.drop('Coordinates', axis=1, inplace=True)
    
    # get pairs of sss to form sjs
    elif how == 'sj':
        df['sj_coords'] = df.ss_coords.apply(sequential_pairs)
        df = df.explode('sj_coords')
        df.drop(['Coordinates', 'ss_coords'], axis=1, inplace=True)
        
    return df

def get_ss_sj_from_ic(ic, how):
    ic = ic.copy(deep=True)
    
    df = explode_ic(ic, how)
    df2 = get_source_table(ic)
    
    # merge source info in w/ coord info
    df = df.merge(df2, how='left', on=['Name'])

    list_col = f'{how}_coords'
    
    # figure out novelty and source of each ss / sj
    df.drop('Name', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    df.rename({list_col: 'Start',
               'list_source': 'source'},
              axis=1, inplace=True)
    gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
    df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
                                                     False: 'Novel'})
    df = df.groupby(gb_cols).agg(','.join).reset_index()
    df = cerberus.update_novelty(df)
    
    # if sj, turn tuple coords into bed format
    if how == 'sj':
        df['temp'] = df['Start']
        import pdb; pdb.set_trace()
        df['Start'] = df.temp.str[0]
        df['End'] = df.temp.str[1]
        df.drop('temp', axis=1, inplace=True)
    
    return df

def get_sj_from_ic(ic):
    """
    Get a splice junction table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice junction
    """
    return get_ss_sj_from_ic(ic, 'sj')

def get_ss_from_ic(ic):
    """
    Get a splice site table from an intron chain table.
    Retain source and novelty information.
    
    Parameters:
        ic (pandas DataFrame): DataFrame formatted as cerberus ic table
        
    Returns:
        df (pandas DataFrame): DataFrame with entries for each splice site
    """
    return get_ss_sj_from_ic(ic, 'ss')       

In [141]:
df = get_ss_from_ic(ca.ic)

In [150]:
df = get_sj_from_ic(ca.ic)

> [0;32m<ipython-input-149-49147ecaa0f1>[0m(89)[0;36mget_ss_sj_from_ic[0;34m()[0m
[0;32m     87 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'temp'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m[[0m[0;34m'Start'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     88 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 89 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'Start'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     90 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'End'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     91 [0;31m        [0mdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m'temp'[0m[0;34m,[0m [0maxis[0m[0;34m=[0m

ipdb>  df.head()


  Chromosome Strand          gene_id                   Start   source novelty  \
0       chr1      +  ENSG00000000460  (169662523, 169683468)  v40,v29   Known   
1       chr1      +  ENSG00000000460  (169683625, 169683755)  v40,v29   Known   
2       chr1      +  ENSG00000000460  (169683932, 169783810)  v40,v29   Known   
3       chr1      +  ENSG00000000460  (169783928, 169784876)  v40,v29   Known   
4       chr1      +  ENSG00000000460  (169784913, 169798856)  v40,v29   Known   

                     temp  
0  (169662523, 169683468)  
1  (169683625, 169683755)  
2  (169683932, 169783810)  
3  (169783928, 169784876)  
4  (169784913, 169798856)  


ipdb>  n


> [0;32m<ipython-input-149-49147ecaa0f1>[0m(90)[0;36mget_ss_sj_from_ic[0;34m()[0m
[0;32m     88 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     89 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'Start'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 90 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'End'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     91 [0;31m        [0mdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m'temp'[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     92 [0;31m[0;34m[0m[0m
[0m


ipdb>  df.head()


  Chromosome Strand          gene_id      Start   source novelty  \
0       chr1      +  ENSG00000000460  169662523  v40,v29   Known   
1       chr1      +  ENSG00000000460  169683625  v40,v29   Known   
2       chr1      +  ENSG00000000460  169683932  v40,v29   Known   
3       chr1      +  ENSG00000000460  169783928  v40,v29   Known   
4       chr1      +  ENSG00000000460  169784913  v40,v29   Known   

                     temp  
0  (169662523, 169683468)  
1  (169683625, 169683755)  
2  (169683932, 169783810)  
3  (169783928, 169784876)  
4  (169784913, 169798856)  


ipdb>  n


> [0;32m<ipython-input-149-49147ecaa0f1>[0m(91)[0;36mget_ss_sj_from_ic[0;34m()[0m
[0;32m     89 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'Start'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     90 [0;31m        [0mdf[0m[0;34m[[0m[0;34m'End'[0m[0;34m][0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mtemp[0m[0;34m.[0m[0mstr[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 91 [0;31m        [0mdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m'temp'[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     92 [0;31m[0;34m[0m[0m
[0m[0;32m     93 [0;31m    [0;32mreturn[0m [0mdf[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  df.head()


  Chromosome Strand          gene_id      Start   source novelty  \
0       chr1      +  ENSG00000000460  169662523  v40,v29   Known   
1       chr1      +  ENSG00000000460  169683625  v40,v29   Known   
2       chr1      +  ENSG00000000460  169683932  v40,v29   Known   
3       chr1      +  ENSG00000000460  169783928  v40,v29   Known   
4       chr1      +  ENSG00000000460  169784913  v40,v29   Known   

                     temp        End  
0  (169662523, 169683468)  169683468  
1  (169683625, 169683755)  169683755  
2  (169683932, 169783810)  169783810  
3  (169783928, 169784876)  169784876  
4  (169784913, 169798856)  169798856  


ipdb>  c


In [151]:
df.head()

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty,End
0,chr1,+,ENSG00000000460,169662523,"v40,v29",Known,169683468
1,chr1,+,ENSG00000000460,169683625,"v40,v29",Known,169683755
2,chr1,+,ENSG00000000460,169683932,"v40,v29",Known,169783810
3,chr1,+,ENSG00000000460,169783928,"v40,v29",Known,169784876
4,chr1,+,ENSG00000000460,169784913,"v40,v29",Known,169798856


In [95]:
df.loc[df.source=='lapa']

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty
12,chr1,+,ENSG00000000460,169795226,lapa,Novel
123,chr1,+,ENSG00000000971,196737010,lapa,Novel
198,chr1,+,ENSG00000004487,23071361,lapa,Novel
222,chr1,+,ENSG00000007933,171090964,lapa,Novel
371,chr1,+,ENSG00000010165,171789999,lapa,Novel
...,...,...,...,...,...,...
655540,chrX,-,ENSG00000285756,3829219,lapa,Novel
655848,chrY,+,ENSG00000067048,12916975,lapa,Novel
656165,chrY,+,ENSG00000154620,13702953,lapa,Novel
656166,chrY,+,ENSG00000154620,13704333,lapa,Novel


In [40]:
# ca.ic.loc[ca.ic.gene_id.duplicated(keep=False)].sort_values(by='gene_id')

In [83]:
ic = ca.ic.copy(deep=True)
# ic = ic.loc[ic.gene_id == 'ENSG00000000003']

# remove the monoexonic entries
ic = ic.loc[~(ic.Coordinates == '-')]

# limit to only the relevant columns
keep_cols = ['Chromosome', 'Coordinates',
             'Strand', 'gene_id',
             'Name']
df = ic.copy(deep=True)
df = df[keep_cols]


df['list_coords'] = df.Coordinates.str.split('-')
df = df.explode('list_coords')
df.drop('Coordinates', axis=1, inplace=True)

# add in source later
keep_cols = ['Name', 'source']
df2 = ic[keep_cols].copy()
df2['list_source'] = df2.source.str.split(',')
df2 = df2.explode('list_source')
df2.drop('source', axis=1, inplace=True)


AttributeError: 'DataFrame' object has no attribute 'novelty'

In [84]:
print(len(df.index))
df = df.merge(df2, how='left', on=['Name'])
print(len(df.index))

4670426
8329720


In [85]:
df.drop('Name', axis=1, inplace=True)
df.drop_duplicates(inplace=True)
print(len(df.index))
df.rename({'list_coords': 'Start',
           'list_source': 'source'},
          axis=1, inplace=True)
gb_cols = ['Chromosome', 'Strand', 'gene_id', 'Start']
df['novelty'] = df.source.isin(ref_sources).map({True: 'Known',
                                                   False: 'Novel'})
df = df.groupby(gb_cols).agg(','.join).reset_index()
        

1971524


In [86]:
df = cerberus.update_novelty(df)

In [89]:
df.loc[df.source=='lapa']

Unnamed: 0,Chromosome,Strand,gene_id,Start,source,novelty
12,chr1,+,ENSG00000000460,169795226,lapa,Novel
123,chr1,+,ENSG00000000971,196737010,lapa,Novel
198,chr1,+,ENSG00000004487,23071361,lapa,Novel
222,chr1,+,ENSG00000007933,171090964,lapa,Novel
371,chr1,+,ENSG00000010165,171789999,lapa,Novel
...,...,...,...,...,...,...
655540,chrX,-,ENSG00000285756,3829219,lapa,Novel
655848,chrY,+,ENSG00000067048,12916975,lapa,Novel
656165,chrY,+,ENSG00000154620,13702953,lapa,Novel
656166,chrY,+,ENSG00000154620,13704333,lapa,Novel


In [97]:
# ic.loc[ic.Name.duplicated(keep=False)].sort_values(by='Name')

In [37]:
ic.head()

Unnamed: 0,Chromosome,Coordinates,Strand,source,gene_id,Name,list_coords,coords
1084,chr1,100038316-100049908-100050004-100058665-100058...,+,"v40,v29,lapa,gtex",ENSG00000156875,ENSG00000156875_1,"[100038316, 100049908, 100050004, 100058665, 1...",101729567
1085,chr1,100058728-100059877-100060005-100061834,+,"v40,v29",ENSG00000156875,ENSG00000156875_2,"[100058728, 100059877, 100060005, 100061834]",101737223
1086,chr1,1001263-1008193-1008279-1013983,+,"v40,v29,lapa,gtex",ENSG00000187608,ENSG00000187608_3,"[1001263, 1008193, 1008279, 1013983]",101737357
1087,chr1,1001281-1008193-1008279-1013983,+,"v40,v29,lapa",ENSG00000187608,ENSG00000187608_2,"[1001281, 1008193, 1008279, 1013983]",101774100
1088,chr1,100133315-100136881-100136928-100137018-100137...,+,"v40,v29,lapa",ENSG00000122435,ENSG00000122435_1,"[100133315, 100136881, 100136928, 100137018, 1...",101639703


In [None]:
# ne