In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Append the directory two levels above the current working directory to the
# module search path. NOTE(review): presumably this is the repository root that
# contains the `proc_revisions` package imported just below — confirm.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [3]:
# Prefix for every project file used in this notebook; entries read from the
# config are joined onto it by plain string concatenation.
od = '../../proc_revisions/'
config_file = od + '/config.yml'

# Parse the YAML config into the `config` dict.
with open(config_file) as f:
    config = yaml.safe_load(f)

In [4]:
# Input files for the human analysis. Each config entry is a path template:
# expand() fills in the wildcards and returns a list, so the single resulting
# path is taken with [0] and prefixed with the project directory `od`.
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
unfilt_ab = od+expand(config['lr']['cerberus']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]

# Reference transcript / gene metadata tables.
ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]

# Analysis parameters shared by the cells below; `min_tpm` and `gene_subset`
# are the values handed to get_tpm_table().
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

# Mouse library metadata.
m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]

# GTEx cerberus GTF. The [0] is required here as for every other path:
# expand() returns a list, and `od + <list>` raises TypeError.
gtex_cerb_gtf = od+expand(config['gtex']['cerberus']['gtf'], species='human')[0]

## Compare the number of unique intron chains detected in GTEx (all) and in ENCODE (using a 1 TPM filter)

In [8]:
# Read the cerberus h5 file; the `.ic` table of the resulting object is what
# the cells below query.
# NOTE(review): `cerberus` is not imported explicitly in the import cell; it
# presumably comes in through one of the `from proc_revisions... import *`
# lines — confirm, or add an explicit import.
ca = cerberus.read(cerberus_h5)

In [17]:
# limit to same samples from the gtex paper?
def get_gtex_match_samples():
    """
    Return the hard-coded sample names used to restrict the analysis to
    the GTEx-matched samples.

    Returns:
        samples (list of str): Sample names, in a fixed order. A new list
            is built on every call, so callers may modify it freely.
    """
    sample_names = (
        'adipose brain brain_ad '
        'hmec mcf10a mcf7 '
        'heart liver lung muscle '
        'h9_panc_beta h9_panc_progen panc1'
    )
    return sample_names.split()

In [24]:
# Intron-chain (how='ic') TPM table built from the filtered abundance file,
# limited to the GTEx-matched samples and the module-level `min_tpm` /
# `gene_subset` settings. `ids` is used in a later cell to pick out the
# matching rows of ca.ic.
# (An earlier variant of this call passed gene_subset=None instead.)
samples = get_gtex_match_samples()
tl_df, ids = get_tpm_table(
    pd.read_csv(filt_ab, sep='\t'),
    how='ic',
    min_tpm=min_tpm,
    gene_subset=gene_subset,
    sample=samples,
)

Calculating ic TPM values
Subsetting for ['adipose', 'brain', 'brain_ad', 'hmec', 'mcf10a', 'mcf7', 'heart', 'liver', 'lung', 'muscle', 'h9_panc_beta', 'h9_panc_progen', 'panc1'] samples
Subsetting for polya genes
Enforcing minimum TPM
Total # ics detected: 145392
# ics >= 1 tpm: 108624
Applying gene type and novelty subset
Number of ics reported: 106774


In [25]:
# get the gtex ICs
df = ca.ic.loc[ca.ic.source.str.contains('gtex')].copy(deep=True)
df['gtex'] = True

# ICs whose Name is among the ids returned by get_tpm_table()
df2 = ca.ic.loc[ca.ic.Name.isin(ids)].copy(deep=True)
df2['tl'] = True

# Outer merge on every shared column: one row per IC present in either set.
# A flag column is NaN on rows that came only from the other set.
merge_cols = ['Chromosome', 'Strand', 'Coordinates', 'source', 'novelty', 'gene_id', 'ic', 'Name']
df = df.merge(df2, how='outer', on=merge_cols)

# Turn the missing flags into False by assigning the result back to the frame.
# Calling fillna(..., inplace=True) on `df.gtex` / `df.tl` acts on a temporary
# Series; with pandas Copy-on-Write it leaves `df` unchanged, so the NaNs would
# survive and the group counts below would be wrong. The cast keeps both
# columns as real booleans.
df['gtex'] = df['gtex'].fillna(False).astype(bool)
df['tl'] = df['tl'].fillna(False).astype(bool)

In [28]:
# keep only the spliced ICs, i.e. drop rows whose Coordinates value is '-'
df = df[df['Coordinates'].ne('-')]

In [29]:
# number of ICs in each (tl, gtex) combination
df.groupby(['tl', 'gtex'])['Name'].count().reset_index()

Unnamed: 0,tl,gtex,Name
0,False,True,59184
1,True,False,73415
2,True,True,28578


In [31]:
# preview ICs flagged gtex but not tl, restricted to ic == 1
df.loc[df['tl'].eq(False) & df['gtex'].eq(True) & df['ic'].eq(1)].head()

Unnamed: 0,Chromosome,Strand,Coordinates,Name,source,novelty,gene_id,ic,gtex,tl
61,chr1,+,11068713-11070658,ENSG00000226849_1,"v40,v29,lapa,gtex",Known,ENSG00000226849,1,True,False
97,chr1,+,11610113-11611842-11611978-11612789,ENSG00000235643_1,"v40,v29,lapa,gtex",Known,ENSG00000235643,1,True,False
136,chr1,+,13315658-13318391-13318700-13319371-13319953-1...,ENSG00000204501_1,"v40,v29,gtex",Known,ENSG00000204501,1,True,False
138,chr1,+,143874823-143875218-143875239-143876051-143876...,ENSG00000265531_1,"v40,v29,gtex",Known,ENSG00000265531,1,True,False
166,chr1,+,149782774-149783169-149783190-149784002-149784...,ENSG00000150337_1,"v40,v29,gtex",Known,ENSG00000150337,1,True,False
