In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
# from proc_revisions.mane_utils import *

In [2]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
species = 'human'
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
lib_meta = od+expand(config['lr']['meta'], species=species)[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='human')[0]
gtf_file = od+expand(config['lr']['cerberus']['gtf'], species='human')[0]

gtex_gtf_file = od+expand(config['gtex']['gtf'], species='human')[0]

gene_subset = 'polya'
min_tpm = 1

species = 'mouse'
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
m_gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
m_lib_meta = od+expand(config['lr']['meta'], species=species)[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
m_ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]


## Human -- how many v47 transcripts do we detect by biotype, novelty?

In [4]:
def get_ic_id(df):
    coord_col = 'ic'
    if 'ic' not in df.columns:
        coord_col = 'Coordinates'
    df['ic_id'] = df.Chromosome.astype(str)+'_'+\
                  df.Strand.astype(str)+'_'+\
                  df[coord_col].astype(str)
    return df

In [5]:
ref_gtf = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/paper_rnawg/proc_revisions/ref/human/gencode.vM36.annotation.gtf'
ref_gtf_df = pr.read_gtf(ref_gtf)
ref_ics = cerberus.get_ic(ref_gtf_df)

In [6]:
ref_gtf_df = ref_gtf_df.df

In [7]:
# add biotype 
ref_gtf_df[['gene_id', 'gene_type']].groupby('gene_type').nunique()
ref_ics = ref_ics.merge(ref_gtf_df[['gene_id', 'gene_type']].drop_duplicates(),
                        how='left', 
                        on='gene_id')

In [8]:
print(m_gtf)
gtf_df = pr.read_gtf(m_gtf)
ics = cerberus.get_ic(gtf_df)

../../proc_revisions/data/mouse/lr/cerberus/cerberus.gtf


In [47]:
gtf_df = gtf_df.df

In [9]:
# get ids for uniq chr+strand+ic pairs
ref_ics = get_ic_id(ref_ics)
ics = get_ic_id(ics)

In [54]:
ics = cerberus.get_ic(pr.PyRanges(gtf_df))
ics = get_ic_id(ics)
ics.loc[ics.ic_id.str.startswith('chr1_-_36445049-36443609-36443490-36439726-364')]

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,ic,ic_id
1,chr1,-,"ENSMUSG00000001143[1,1,1]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
2,chr1,-,"ENSMUSG00000001143[1,1,2]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
3,chr1,-,"ENSMUSG00000001143[1,1,3]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
4,chr1,-,"ENSMUSG00000001143[1,1,8]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
5,chr1,-,"ENSMUSG00000001143[1,10,1]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
6,chr1,-,"ENSMUSG00000001143[1,10,2]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
7,chr1,-,"ENSMUSG00000001143[1,10,8]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
8,chr1,-,"ENSMUSG00000001143[1,11,1]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
9,chr1,-,"ENSMUSG00000001143[1,11,2]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...
10,chr1,-,"ENSMUSG00000001143[1,11,7]",ENSMUSG00000001143.13,36445049-36443609-36443490-36439726-36439608-3...,chr1_-_36445049-36443609-36443490-36439726-364...


In [60]:
ref_ics.loc[ref_ics.gene_id=='ENSMUSG00000001143.14']

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,ic,gene_type,ic_id
2,chr1,-,ENSMUST00000001171.13,ENSMUSG00000001143.14,36484130-36482690-36482571-36478807-36478689-3...,protein_coding,chr1_-_36484130-36482690-36482571-36478807-364...
777,chr1,-,ENSMUST00000115011.8,ENSMUSG00000001143.14,36484130-36482690-36482571-36478807-36478689-3...,protein_coding,chr1_-_36484130-36482690-36482571-36478807-364...
886,chr1,-,ENSMUST00000123583.2,ENSMUSG00000001143.14,36484130-36483223-36483117-36482690-36482600-3...,protein_coding,chr1_-_36484130-36483223-36483117-36482690-364...
908,chr1,-,ENSMUST00000125304.8,ENSMUSG00000001143.14,36484130-36482690-36482571-36478807-36478689-3...,protein_coding,chr1_-_36484130-36482690-36482571-36478807-364...
1120,chr1,-,ENSMUST00000134594.8,ENSMUSG00000001143.14,36478689-36477913-36477830-36467542-36467380-3...,protein_coding,chr1_-_36478689-36477913-36477830-36467542-364...
1477,chr1,-,ENSMUST00000152088.2,ENSMUSG00000001143.14,36467380-36467297-36467182-36464038,protein_coding,chr1_-_36467380-36467297-36467182-36464038
3610,chr1,-,ENSMUST00000192969.6,ENSMUSG00000001143.14,36478689-36477913-36477830-36467542-36467380-3...,protein_coding,chr1_-_36478689-36477913-36477830-36467542-364...
3697,chr1,-,ENSMUST00000193502.2,ENSMUSG00000001143.14,-,protein_coding,chr1_-_-


In [59]:
ref_gtf_df.loc[ref_gtf_df.gene_id.str.startswith('ENSMUSG00000001143')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont
91282,chr1,HAVANA,gene,36458951,36484352,.,-,.,ENSMUSG00000001143.14,protein_coding,...,,,,,,,,,,
91283,chr1,HAVANA,transcript,36458951,36478749,.,-,.,ENSMUSG00000001143.14,protein_coding,...,protein_coding,Lman2l-207,3,cds_start_NF,OTTMUST00000127607.1,,,ENSMUSP00000141806.2,,
91284,chr1,HAVANA,exon,36478689,36478749,.,-,.,ENSMUSG00000001143.14,protein_coding,...,protein_coding,Lman2l-207,3,cds_start_NF,OTTMUST00000127607.1,1,ENSMUSE00001344612.2,ENSMUSP00000141806.2,,
91285,chr1,HAVANA,CDS,36478689,36478749,.,-,2,ENSMUSG00000001143.14,protein_coding,...,protein_coding,Lman2l-207,3,cds_start_NF,OTTMUST00000127607.1,1,ENSMUSE00001344612.2,ENSMUSP00000141806.2,,
91286,chr1,HAVANA,exon,36477830,36477913,.,-,.,ENSMUSG00000001143.14,protein_coding,...,protein_coding,Lman2l-207,3,cds_start_NF,OTTMUST00000127607.1,2,ENSMUSE00001245826.2,ENSMUSP00000141806.2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91391,chr1,HAVANA,UTR,36478689,36478807,.,-,.,ENSMUSG00000001143.14,protein_coding,...,nonsense_mediated_decay,Lman2l-203,5,,OTTMUST00000051152.1,4,ENSMUSE00001241012.2,ENSMUSP00000137344.2,,
91392,chr1,HAVANA,UTR,36477830,36477913,.,-,.,ENSMUSG00000001143.14,protein_coding,...,nonsense_mediated_decay,Lman2l-203,5,,OTTMUST00000051152.1,5,ENSMUSE00001212118.2,ENSMUSP00000137344.2,,
91393,chr1,HAVANA,UTR,36467472,36467542,.,-,.,ENSMUSG00000001143.14,protein_coding,...,nonsense_mediated_decay,Lman2l-203,5,,OTTMUST00000051152.1,6,ENSMUSE00000807125.2,ENSMUSP00000137344.2,,
91394,chr1,HAVANA,transcript,36482825,36484338,.,-,.,ENSMUSG00000001143.14,protein_coding,...,retained_intron,Lman2l-208,,,OTTMUST00000127134.1,,,,,


In [12]:
# get expressed novel genes
df = pd.read_csv(m_filt_ab, sep='\t')
df, _ = get_tpm_table(df, 
                      how='iso',
                      min_tpm=1,
                      gene_subset=None,
                      species='mouse')

df.reset_index(inplace=True)
df['gid'] = df.annot_transcript_id.str.split('[', expand=True)[0]
df.head()
df.reset_index(inplace=True)
df.rename({'gid_stable':'gid'}, axis=1, inplace=True)
ab_df = pd.read_csv(m_filt_ab, sep='\t')
ab_df['gid'] = ab_df.annot_transcript_id.str.split('[', expand=True)[0]
ab_df = ab_df[['gid', 'gene_novelty']].drop_duplicates()
ab_df.loc[ab_df.gid.duplicated(keep=False)].sort_values(by='gid')
assert len(ab_df.loc[ab_df.gid.duplicated(keep=False)].index) == 0
df = df.merge(ab_df, how='left', on='gid')
df.loc[df.gene_novelty=='Fusion', 'gene_novelty'] = 'Readthrough'
df.head()

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 153398
# isos >= 1 tpm: 142504
Number of isos reported: 142504


Unnamed: 0,index,annot_transcript_id,gastroc_36d_f_1,hippocampus_2mo_m_2,gastroc_14d_f_1,cortex_wt_m_1_2,heart_2mo_f_2,adrenal_25d_m_2,heart_14d_f_1,adrenal_10d_m_2,...,hippocampus_18-20mo_f_1,gastroc_18-20mo_f_1,gastroc_25d_m_1,cortex_wt_f_1_2,adrenal_36d_m_1,forelimb_e11_1_1,adrenal_gland_1_2,hippocampus_18-20mo_m_2,gid,gene_novelty
0,0,"ENSMUSG00000051951[2,2,3]",0.0,1.586226,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.307316,ENSMUSG00000051951,Known
1,1,"ENSMUSG00000089699[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000089699,Known
2,2,"ENSMUSG00000103161[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000103161,Known
3,3,"ENSMUSG00000025902[1,2,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.543755,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known
4,4,"ENSMUSG00000025902[1,2,3]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known


In [13]:
df.gene_novelty.unique()

array(['Known', 'Readthrough', 'Intergenic'], dtype=object)

In [14]:
df[['gene_novelty', 'annot_transcript_id']].groupby('gene_novelty').nunique().rename({'annot_transcript_id':'n_t'}, axis=1)

Unnamed: 0_level_0,n_t
gene_novelty,Unnamed: 1_level_1
Intergenic,29
Known,142366
Readthrough,109


In [15]:
df[['gene_novelty', 'gid']].groupby('gene_novelty').nunique().rename({'gid':'n_g'}, axis=1)

Unnamed: 0_level_0,n_g
gene_novelty,Unnamed: 1_level_1
Intergenic,24
Known,24913
Readthrough,55


In [16]:
df.rename({'annot_transcript_id':'transcript_id'}, axis=1, inplace=True)

In [17]:
# add gene novelties to each ic
ics = ics.merge(df[['gene_novelty', 'transcript_id']],
                how='inner', 
                on='transcript_id')

In [18]:
print(len(ics.loc[ics.gene_novelty.notnull()].transcript_id.unique()))
ics = ics.loc[ics.gene_novelty.notnull()]

142504


In [19]:
# limit to non-monoexonic
ics.loc[ics.ic_id.str.endswith('-')].head()
ics = ics.loc[~ics.ic_id.str.endswith('-')]

In [20]:
# limit to non dupe
temp = ics[['gene_novelty', 'ic_id']].drop_duplicates()
temp = temp.loc[temp.ic_id.duplicated(keep=False)].sort_values(by='ic_id')
assert len(temp.index) == 0

ics = ics[['gene_novelty', 'ic_id']].drop_duplicates()

In [41]:
ics.head()

Unnamed: 0,gene_novelty,ic_id
0,Known,chr1_-_36445049-36443609-36443490-36439726-364...
4,Known,chr1_-_36445049-36443609-36443490-36439726-364...
7,Known,chr1_-_36445049-36443609-36443490-36439726-364...
11,Known,chr1_-_36445049-36443609-36443490-36439726-364...
15,Known,chr1_-_36445049-36443609-36443519-36439726-364...


In [42]:
ref_ics.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,ic,gene_type,ic_id
0,chr1,-,ENSMUST00000000266.9,ENSMUSG00000026535.10,173810239-173805003-173804830-173802712-173802...,protein_coding,chr1_-_173810239-173805003-173804830-173802712...
1,chr1,-,ENSMUST00000000834.4,ENSMUSG00000000817.11,161615512-161614719-161614673-161610573-161610...,protein_coding,chr1_-_161615512-161614719-161614673-161610573...
2,chr1,-,ENSMUST00000001171.13,ENSMUSG00000001143.14,36484130-36482690-36482571-36478807-36478689-3...,protein_coding,chr1_-_36484130-36482690-36482571-36478807-364...
3,chr1,-,ENSMUST00000001172.12,ENSMUSG00000079610.10,36586166-36581946-36581842-36581143-36580939-3...,protein_coding,chr1_-_36586166-36581946-36581842-36581143-365...
4,chr1,-,ENSMUST00000001339.6,ENSMUSG00000001305.6,186481360-186472060-186471800-186469882-186469...,protein_coding,chr1_-_186481360-186472060-186471800-186469882...


In [35]:
merge = ics[['ic_id', 'gene_novelty']].merge(
            ref_ics[['ic_id', 'gene_type']].drop_duplicates(),
            how='left',
            on='ic_id')

In [48]:
t = merge.ic_id.values[0]
merge.loc[merge.ic_id==t]

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v36
0,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False


In [36]:
# where are dupes coming from?
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id')
merge.loc[merge.ic_id.duplicated(keep=False)][['gene_type', 'ic_id']].groupby('gene_type').count().rename({'ic_id':'counts'}, axis=1)

# so just try limiting it to lncs and protein coding
merge = merge.loc[(merge.gene_type.isin(['protein_coding', 'lncRNA']))|\
                  (merge.gene_type.isnull())]
merge.loc[merge.ic_id.duplicated(keep=False)][['gene_type', 'ic_id']].groupby('gene_type').count().rename({'ic_id':'counts'}, axis=1)

merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id')

# who cares cause they're not from novel genes anyway right?

Unnamed: 0,ic_id,gene_novelty,gene_type


In [37]:
merge['in_v36'] = merge.gene_type.notnull()

In [38]:
# nov = merge.loc[merge.gene_novelty!='Known'].copy(deep=True)
merge[['ic_id', 'gene_novelty', 'in_v36']].groupby(['gene_novelty', 'in_v36']).nunique().rename({'ic_id':'n_ic'}, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ic
gene_novelty,in_v36,Unnamed: 2_level_1
Intergenic,False,29
Known,False,78546
Known,True,1027
Readthrough,False,84


In [40]:
# nov = merge.loc[merge.gene_novelty!='Known'].copy(deep=True)
merge[['ic_id', 'gene_type', 'gene_novelty', 'in_v36']].groupby(['gene_novelty', 'gene_type', 'in_v36'], dropna=False).nunique().rename({'ic_id':'n_ic'}, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_ic
gene_novelty,gene_type,in_v36,Unnamed: 3_level_1
Intergenic,,False,29
Known,lncRNA,True,65
Known,protein_coding,True,962
Known,,False,78546
Readthrough,,False,84


In [None]:
merge.loc[merge.

In [None]:
## somethings wrong here

## What about by novelty category and GENCODE gene biotype?

In [29]:
# get novelty types of each IC from cerberus
ca = cerberus.read(m_cerberus_h5)

In [31]:
temp = ca.ic.copy(deep=True)
temp.drop('ic', axis=1, inplace=True)
temp.rename({'Coordinates':'ic'}, axis=1, inplace=True)
temp = get_ic_id(temp)

In [32]:
temp.head()

Unnamed: 0,Chromosome,Strand,ic,Name,source,novelty,gene_id,ic_id
0,chr1,+,-,ENSMUSG00000001138_3,"vM25,vM21,lapa",Known,ENSMUSG00000001138,chr1_+_-
1,chr1,+,-,ENSMUSG00000007805_1,"vM25,vM21,lapa",Known,ENSMUSG00000007805,chr1_+_-
2,chr1,+,-,ENSMUSG00000015314_4,"vM25,vM21,lapa",Known,ENSMUSG00000015314,chr1_+_-
3,chr1,+,-,ENSMUSG00000016494_3,"vM25,vM21,lapa",Known,ENSMUSG00000016494,chr1_+_-
4,chr1,+,-,ENSMUSG00000018196_6,"vM25,vM21,lapa",Known,ENSMUSG00000018196,chr1_+_-


In [33]:
merge.head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v36
0,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
1,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
2,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
3,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
4,chr1_-_36445049-36443609-36443519-36439726-364...,Known,,False


In [54]:
merge = merge.merge(temp[['ic_id', 'novelty']], 
                    how='left',
                    on='ic_id')

In [55]:
# merge.drop('novelty_x', axis=1, inplace=True)
# merge.rename({'novelty_y': 'novelty'}, axis=1, inplace=True)

In [56]:
merge.in_v47.unique()

array([ True, False])

In [57]:
merge.loc[merge.in_v47==False].head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47,novelty
2,chr1_-_169893787-169888890-169888675-169878819...,Known,,False,NIC
6,chr1_-_27635064-27625151-27625088-27623926-276...,Known,,False,NNC
7,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NIC
8,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NIC
9,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NNC


In [58]:
# CHECK FOR DUPES
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()
# merge.drop_duplicates(inplace=True)
# merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47,novelty
16383,chr10_+_14838459-14839904-14839985-14840074-14...,Known,protein_coding,True,NNC
16382,chr10_+_14838459-14839904-14839985-14840074-14...,Known,protein_coding,True,Known
14040,chr10_-_5617930-5617400-5617300-5617187,Known,lncRNA,True,Known
14041,chr10_-_5617930-5617400-5617300-5617187,Known,lncRNA,True,Known
14180,chr10_-_73252469-73252023-73251762-73250991-73...,Readthrough,,False,NNC


In [59]:
# deduplicate using 1 and 0 for known and novel. ANy time 
# an intron chain is called known, regardless of gene its assigned to, call 
# it known
merge['gb_known_col'] = 0
merge.loc[merge.novelty == 'Known', 'gb_known_col'] = 1
merge = merge[['ic_id', 'gene_type',
               'in_v47', 'gb_known_col']].groupby(['ic_id',
                                                   'gene_type',
                                                   'in_v47'],
                                                  dropna=False).max().reset_index()

In [60]:
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()

Unnamed: 0,ic_id,gene_type,in_v47,gb_known_col
112049,chr8_-_144314925-144314820,lncRNA,True,1
112050,chr8_-_144314925-144314820,protein_coding,True,1


In [61]:
# add known or novel this way
merge['novelty'] = 'Known'
merge.loc[merge.gb_known_col==0, 'novelty'] = 'Novel'

In [62]:
merge[['ic_id', 'gene_type', 'novelty', 'in_v47']].groupby(['gene_type', 'novelty', 'in_v47'], dropna=False).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ic_id
gene_type,novelty,in_v47,Unnamed: 3_level_1
lncRNA,Known,True,7051
lncRNA,Novel,True,695
protein_coding,Known,True,66755
protein_coding,Novel,True,689
,Known,False,2000
,Novel,False,44563
