In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Put the directory two levels above the current working directory on
# sys.path so that the `scripts` package imported below can be resolved.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
def clean_figure(ax):
    """Hide the right and top spines of `ax` and rotate its x ticks by 45.

    Parameters
    ----------
    ax
        Axes-like object exposing a `spines` mapping and `tick_params`.
        It is modified in place; nothing is returned.
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [3]:
# Project configuration; load_config comes from one of the star-imported
# `scripts` modules (presumably scripts.utils -- verify).
config = load_config()
# Directory prefix that is prepended to the config-supplied relative paths
# in the cells below.
od = '../../snakemake/compare_external/'

In [4]:
# get ic <-> tid
# Every GTF is read with pyranges, passed through cerberus.get_ic, and tagged
# with the catalog label it is paired with in `sources`.
gtfs = [od+config['lr']['gtf_filt_with_genes']]
sources = ['PODER']

ic_frames = []
for gtf_path, source_name in zip(gtfs, sources):
    gtf_ics = cerberus.get_ic(pr.read_gtf(gtf_path))
    gtf_ics['source'] = source_name
    ic_frames.append(gtf_ics)

# Concatenate once after the loop rather than re-copying a growing frame
# (seeded from an empty DataFrame) on every iteration.
df = pd.concat(ic_frames, axis=0)

ics_df = df.copy(deep=True)
# Composite intron-chain identifier: <Chromosome>_<Strand>_<ic>.
ics_df['ic_id'] = (ics_df.Chromosome.astype(str) + '_'
                   + ics_df.Strand.astype(str) + '_'
                   + ics_df.ic.astype(str))

In [5]:
# Build a presence/absence table of multi-exonic intron chains across the
# annotation catalogs listed in `sources`.
ics = [od+config['gtex']['cerb']['ics'],
       od+config['chess']['cerb']['ics'],
       od+config['enc']['cerb']['ics'],
       od+config['lr']['cerb']['ics'],
       od+config['ref']['cerb']['ics'],
       od+config['ref']['refseq']['cerb']['ics']]
sources = ['GTEx', 'CHESS3', 'ENCODE4', 'PODER', 'GENCODE v47', 'RefSeq v110']

ic_tables = []
for ic_path, source_name in zip(ics, sources):
    ic_table = pd.read_csv(ic_path, sep='\t')

    # remove monoexonic, add source, drop name
    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not write onto a slice of the table just read.
    ic_table = ic_table.loc[ic_table.Coordinates != '-'].copy()
    ic_table['source'] = source_name
    ic_table = ic_table.drop('Name', axis=1)

    ic_tables.append(ic_table)

# Single concatenation after the loop instead of growing a frame per file.
df = pd.concat(ic_tables, axis=0)

# One row per (Strand, Coordinates, Chromosome) and one column per source:
# the aggregation yields True wherever that source has the chain, and
# combinations that never occur are filled with False. The trailing
# reset_index() turns the three keys back into ordinary columns and leaves a
# fresh 0..n-1 index.
df = df.pivot_table(
    index=['Strand', 'Coordinates', 'Chromosome'],
    columns='source',
    aggfunc=lambda x: True,
    fill_value=False).reset_index()

In [6]:
# Preview the first rows of the per-source presence/absence table.
df.head()

source,Strand,Coordinates,Chromosome,CHESS3,ENCODE4,GENCODE v47,GTEx,PODER,RefSeq v110
0,+,10000-10669-10744-13745-14013-18875-18940-19252,KZ208910.1,False,False,False,False,False,True
1,+,10000-10669-10744-18875-18940-19252,KZ208910.1,False,False,False,False,False,True
2,+,10000-10669-10744-19252,KZ208910.1,False,False,False,False,False,True
3,+,10000-10669-10749-18875-18940-19252,KZ208910.1,False,False,False,False,False,True
4,+,10000-10669-10749-19252,KZ208910.1,False,False,False,False,False,True


In [9]:
# Write the overlap table as a tab-separated supplementary table, without
# the row index.
df.to_csv(
    '../../supp_tables/16_inter_catalog_overlap.tsv',
    index=False,
    sep='\t',
)

In [10]:
# Number of rows in the overlap table.
df.shape[0]

631370

## Add the novelty categories

In [12]:
# Intron chains per transcript for the three external catalogs: every GTF is
# read with pyranges, passed through cerberus.get_ic, and tagged with the
# catalog label it is paired with in `sources`.
gtfs = [od+config['gtex']['gtf'],
        od+config['enc']['gtf'],
        od+config['chess']['gtf']]
sources = ['GTEx', 'ENCODE4', 'CHESS3']

ic_frames = []
for gtf_path, source_name in zip(gtfs, sources):
    gtf_ics = cerberus.get_ic(pr.read_gtf(gtf_path))
    gtf_ics['source'] = source_name
    ic_frames.append(gtf_ics)

# Concatenate once after the loop rather than re-copying a growing frame
# (seeded from an empty DataFrame) on every iteration.
df = pd.concat(ic_frames, axis=0)

ics_df = df.copy(deep=True)

In [13]:
# get structural categories
# NOTE(review): absolute, user-specific directory -- this cell only runs on a
# machine with this exact layout; consider moving the location into the
# project config.
f_dir = '/Users/fairliereese/Documents/programming/mele_lab/projects/240903_pt/ref/compare_external/'
files = [f'{f_dir}gtex_sqanti_classification.txt',
         f'{f_dir}enc_sqanti_classification.txt',
         f'{f_dir}chess_sqanti_classification.txt']
# Catalog labels in the same order as `files`.
# NOTE(review): these labels are not attached to the rows, so after
# concatenation a row can no longer be traced to its catalog -- confirm
# whether a per-row source column is wanted.
refs = ['GTEx', 'ENCODE4', 'CHESS3']

# Raw structural_category value -> short display label. Built once, outside
# the loop; Series.map turns any value missing from this dict into NaN.
category_labels = {
    'antisense': 'Antisense',
    'full-splice_match': 'FSM',
    'fusion': 'Fusion',
    'genic': 'Genic',
    'genic_intron': 'Genic',
    'incomplete-splice_match': 'ISM',
    'novel_in_catalog': 'NIC',
    'novel_not_in_catalog': 'NNC',
    'intergenic': 'Intergenic',
}

classification_tables = []
for classification_path in files:
    classification = pd.read_csv(classification_path, sep='\t')
    # Report how many rows each classification file contributes.
    print(len(classification))
    classification['structural_category'] = (
        classification['structural_category'].map(category_labels)
    )
    classification_tables.append(classification)

# Single concatenation after the loop instead of growing a frame per file.
df2 = pd.concat(classification_tables, axis=0)

93718
245884
387944


In [17]:
# Drop monoexonic entries (intron chain recorded as '-') and print the row
# count before and after the filter.
n_rows_before = len(ics_df.index)
print(n_rows_before)
has_intron_chain = ics_df['ic'] != '-'
ics_df = ics_df.loc[has_intron_chain]
n_rows_after = len(ics_df.index)
print(n_rows_after)

508058
455963


In [15]:
# Preview the concatenated classification tables (structural_category has
# already been relabelled).
df2.head()

Unnamed: 0,isoform,chrom,strand,length,exons,structural_category,associated_gene,associated_transcript,ref_length,ref_exons,...,seq_A_downstream_TTS,dist_to_CAGE_peak,within_CAGE_peak,dist_to_polyA_site,within_polyA_site,polyA_motif,polyA_dist,polyA_motif_found,ORF_seq,ratio_TSS
0,00146542-c373-4da6-8d48-2242bf43bade_ENSG00000...,chr1,-,1911,11,NIC,ENSG00000160055.20,novel,1538.0,11.0,...,CTGTGCCAAATATACCAATT,,,,,,,,,
1,0016cd97-0137-4194-9821-910931c6e972_ENSG00000...,chr1,+,4492,23,NNC,ENSG00000086015.23,novel,5207.0,27.0,...,TTTATGTCTTTTGCTTGCTT,,,,,,,,,
2,001eea3f-368c-444f-96c0-539cb2204403_ENSG00000...,chr1,+,2671,16,FSM,ENSG00000162736.18,ENST00000699545.1,2725.0,16.0,...,CTTGTACTGCTAGTTATTCT,,,,,,,,,
3,00294b07-572e-422c-a1bc-6f089eff895c_ENSG00000...,chr1,-,1345,10,NIC,ENSG00000116885.18,novel,1463.0,11.0,...,CCTTTCCCATTTTATTCTTA,,,,,,,,,
4,00321d78-2502-4c1c-bbac-bb2e5f99582a_ENSG00000...,chr1,-,3208,8,NIC,ENSG00000143549.22,novel,7064.0,10.0,...,CCTCATGCCACCCTGCATTT,,,,,,,,,


In [19]:
# Attach a structural category to every transcript: left join of
# ics_df.transcript_id against df2.isoform, so unmatched transcripts are kept.
# NOTE(review): the join key is the isoform id alone; if an id occurs more
# than once in df2, the matching ics_df rows are repeated -- confirm this is
# intended.
category_lookup = df2[['isoform', 'structural_category']]
ics_df = pd.merge(
    ics_df,
    category_lookup,
    how='left',
    left_on='transcript_id',
    right_on='isoform',
)

In [23]:
# Preview ics_df after the structural_category merge.
ics_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,ic,source,isoform,structural_category
0,chr1,-,00146542-c373-4da6-8d48-2242bf43bade_ENSG00000...,ENSG00000160055.19,32222306-32222018-32221866-32221197-32221130-3...,GTEx,00146542-c373-4da6-8d48-2242bf43bade_ENSG00000...,NIC
1,chr1,-,00294b07-572e-422c-a1bc-6f089eff895c_ENSG00000...,ENSG00000116885.18,36450257-36444033-36444003-36438910-36438755-3...,GTEx,00294b07-572e-422c-a1bc-6f089eff895c_ENSG00000...,NIC
2,chr1,-,00321d78-2502-4c1c-bbac-bb2e5f99582a_ENSG00000...,ENSG00000143549.19,154191901-154191311-154191185-154176248-154176...,GTEx,00321d78-2502-4c1c-bbac-bb2e5f99582a_ENSG00000...,NIC
3,chr1,-,004b70dd-1c22-40d9-912d-cdc35d51c816_ENSG00000...,ENSG00000175756.13,1374996-1374790-1374704-1374590-1373999-1373902,GTEx,004b70dd-1c22-40d9-912d-cdc35d51c816_ENSG00000...,NIC
4,chr1,-,004f8830-0766-47d5-af4f-11c745414a2d_ENSG00000...,ENSG00000143549.19,154182987-154176248-154176114-154173201-154173...,GTEx,004f8830-0766-47d5-af4f-11c745414a2d_ENSG00000...,NIC


In [24]:
# Number of distinct values of every column per (isoform, source) pair; the
# structural_category count is exposed under the name n_cats.
temp = (
    ics_df
    .groupby(['isoform', 'source'])
    .nunique()
    .reset_index()
    .rename(columns={'structural_category': 'n_cats'})
)

In [27]:
# Intron chain value(s) recorded for one example isoform.
ics_df.loc[ics_df['isoform'] == 'ENST00000007722.11', 'ic'].values

array(['50056645-50064076-50064204-50064527-50064607-50068055-50068305-50070843-50070930-50071310-50071518-50071985-50072182-50073915-50074004-50074143-50074280-50074447-50074534-50075458-50075526-50075598-50075735-50076325-50076475-50076583-50076681-50076973-50077121-50077378-50077447-50078045-50078125-50078206-50078284-50078823-50078926-50079075-50079258-50079434-50079557-50080261-50080375-50081309-50081408-50087743-50087869-50089109',
       '50056645-50064076-50064204-50064527-50064607-50068055-50068305-50070843-50070930-50071310-50071518-50071985-50072182-50073915-50074004-50074143-50074280-50074447-50074534-50075458-50075526-50075598-50075735-50076325-50076475-50076583-50076681-50076973-50077121-50077378-50077447-50078045-50078125-50078206-50078284-50078823-50078926-50079075-50079258-50079434-50079557-50080261-50080375-50081309-50081408-50087743-50087869-50089109'],
      dtype=object)

In [28]:
# Sources that contain at least one isoform with more than one structural
# category.
temp.loc[temp['n_cats'] > 1, 'source'].unique()

array(['GTEx'], dtype=object)