In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
# from proc_revisions.mane_utils import *

In [2]:
# Root of the proc_revisions directory, given relative to the current working directory.
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
# Load the YAML config; the path templates expanded in the next cell come from it.
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
# ---- human inputs: every path is `od` + the config template expanded for the species ----
species = 'human'
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species=species)[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
lib_meta = od+expand(config['lr']['meta'], species=species)[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species=species)[0]
# same config entry as `gtf`; kept under its own name
gtf_file = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]

gtex_gtf_file = od+expand(config['gtex']['gtf'], species=species)[0]

# analysis settings
gene_subset = 'polya'
min_tpm = 1

# ---- mouse inputs (m_ prefix) ----
# NOTE: `species` is left set to 'mouse' once this cell has run.
species = 'mouse'
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
m_gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
m_lib_meta = od+expand(config['lr']['meta'], species=species)[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
m_ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]


## Human -- how many v47 transcripts do we detect by biotype, novelty?

In [4]:
def get_ic_id(df):
    """
    Add an `ic_id` column of the form <Chromosome>_<Strand>_<coordinates>.

    The coordinates are read from the `ic` column when it exists and from
    `Coordinates` otherwise. The column is written onto `df` itself.

    Parameters:
        df (pandas DataFrame): Needs Chromosome and Strand columns plus
            either ic or Coordinates

    Returns:
        df (pandas DataFrame): The same object that was passed in, now
            carrying the `ic_id` column
    """
    coord_col = 'ic' if 'ic' in df.columns else 'Coordinates'
    id_parts = [df[c].astype(str) for c in ('Chromosome', 'Strand', coord_col)]
    df['ic_id'] = id_parts[0] + '_' + id_parts[1] + '_' + id_parts[2]
    return df

In [5]:
# GENCODE v47 reference annotation. This overrides the config-derived `ref_gtf`.
# The path is built from `od` rather than an absolute, user-specific location
# so the notebook is not tied to one machine.
# NOTE(review): assumes `od` resolves to the same proc_revisions directory the
# original absolute path pointed into — confirm.
ref_gtf = f'{od}ref/human/gencode.v47.annotation.gtf'
ref_gtf_df = pr.read_gtf(ref_gtf)
# intron chains of the reference transcripts
ref_ics = cerberus.get_ic(ref_gtf_df)

In [6]:
# Replace the object returned by pr.read_gtf with its `.df` attribute
# (presumably the pandas DataFrame behind a PyRanges object — confirm).
# NOTE(review): the name is rebound to its own attribute, so re-running this
# cell applies `.df` to the already-converted object.
ref_gtf_df = ref_gtf_df.df

In [7]:
# Attach each reference gene's biotype (gene_type) to its intron chains.
# The left merge keeps every reference intron chain, even when no biotype is found.
ref_ics = ref_ics.merge(ref_gtf_df[['gene_id', 'gene_type']].drop_duplicates(),
                        how='left',
                        on='gene_id')

In [8]:
# Read the human cerberus GTF and extract the intron chain of each transcript.
gtf_df = pr.read_gtf(gtf)
ics = cerberus.get_ic(gtf_df)

In [9]:
# Build an `ic_id` (chromosome + strand + intron chain coordinates) for both the
# reference and the observed intron chains so they can be matched on one key.
# get_ic_id adds the column in place and returns the same frame.
ref_ics = get_ic_id(ref_ics)
ics = get_ic_id(ics)

In [10]:
# # whats happening to my ics
# ics = cerberus.get_ic(gtf_df)

In [11]:
# temp.head()

In [12]:
# temp.loc[temp.gene_id=='ENSG00000000457'].head()

In [13]:
# ics.head()

In [14]:
# Transcript-level TPM table, restricted to isoforms that pass the minimum TPM
# filter (no gene-biotype subsetting, so novel genes are kept).
df = pd.read_csv(filt_ab, sep='\t')
df, _ = get_tpm_table(df,
                      how='iso',
                      min_tpm=min_tpm,
                      gene_subset=None)

# Move annot_transcript_id out of the index exactly once; a second reset_index
# would only add a meaningless positional 'index' column.
df = df.reset_index()
# The gene ID is the part of the transcript ID before the '['.
df['gid'] = df.annot_transcript_id.str.split('[', expand=True)[0]

# One gene_novelty label per gene, taken from the same filtered abundance file.
ab_df = pd.read_csv(filt_ab, sep='\t')
ab_df['gid'] = ab_df.annot_transcript_id.str.split('[', expand=True)[0]
ab_df = ab_df[['gid', 'gene_novelty']].drop_duplicates()
# every gene must map to exactly one novelty label, otherwise the merge below
# would duplicate transcripts
assert len(ab_df.loc[ab_df.gid.duplicated(keep=False)].index) == 0

df = df.merge(ab_df, how='left', on='gid')
# genes labelled 'Fusion' are reported as 'Readthrough'
df.loc[df.gene_novelty=='Fusion', 'gene_novelty'] = 'Readthrough'
df.head()

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 236615
# isos >= 1 tpm: 213476
Number of isos reported: 213476


Unnamed: 0,index,annot_transcript_id,gm12878_1_4,hl60_1_2,brodmann_area_46_9_1,right_cardiac_atrium_3_1,hl60_1_1,hl60_m2_24hr_1_1,hl60_m2_72hr_1_2,brodmann_area_46_8_1,...,mucosa_of_descending_colon_2_1,hl60_m1_12hr_1_2,k562_1_1,left_lung_2_1,pgp1_astro_1_1,hepg2_2_1,gm12878_1_1,pgp1_astro_1_2,gid,gene_novelty
0,0,"ENSG00000000003[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.812232,...,2.755743,0.0,0.0,0.0,1.672048,0.0,0.0,0.0,ENSG00000000003,Known
1,1,"ENSG00000000003[1,1,5]",0.0,0.0,2.029236,5.753204,0.0,0.0,0.0,3.248926,...,15.156588,0.0,0.0,0.0,20.064574,56.52392,0.0,17.065041,ENSG00000000003,Known
2,2,"ENSG00000000003[1,1,6]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.133615,0.0,0.0,0.0,0.0,1.177582,0.0,0.0,ENSG00000000003,Known
3,3,"ENSG00000000003[1,1,7]",0.0,0.0,4.734884,8.054485,0.0,0.0,0.0,10.55901,...,39.958278,0.0,0.0,12.297724,61.865771,120.11333,1.686719,42.662601,ENSG00000000003,Known
4,4,"ENSG00000000003[1,1,8]",0.0,0.0,1.352824,4.602563,0.0,0.0,0.0,4.873389,...,33.06892,0.0,0.0,17.568178,18.392527,98.91686,0.0,34.130081,ENSG00000000003,Known


In [15]:
# which gene novelty categories are present among the expressed transcripts?
df['gene_novelty'].unique()

array(['Known', 'Intergenic', 'Readthrough'], dtype=object)

In [16]:
# number of distinct expressed transcripts per gene novelty category
(df.groupby('gene_novelty')[['annot_transcript_id']]
   .nunique()
   .rename(columns={'annot_transcript_id': 'n_t'}))

Unnamed: 0_level_0,n_t
gene_novelty,Unnamed: 1_level_1
Intergenic,113
Known,213179
Readthrough,184


In [17]:
# number of distinct genes per gene novelty category
(df.groupby('gene_novelty')[['gid']]
   .nunique()
   .rename(columns={'gid': 'n_g'}))

Unnamed: 0_level_0,n_g
gene_novelty,Unnamed: 1_level_1
Intergenic,68
Known,29386
Readthrough,79


In [18]:
# use the same transcript ID column name as the intron chain table
df = df.rename(columns={'annot_transcript_id': 'transcript_id'})

In [19]:
# Attach each transcript's gene novelty to its intron chain. The inner merge
# keeps only the transcripts present in the TPM-filtered table `df`.
ics = pd.merge(ics,
               df[['gene_novelty', 'transcript_id']],
               on='transcript_id',
               how='inner')

In [20]:
# ics.rename({'gene_novelty_x':'gene_novelty'}, axis=1, inplace=True)
# ics.drop('gene_novelty_y', axis=1, inplace=True)

In [21]:
# ics.head()

In [22]:
# keep only intron chains that received a gene novelty, then report how many
# distinct transcripts remain
ics = ics.loc[ics.gene_novelty.notnull()]
print(len(ics.transcript_id.unique()))

213476


In [23]:
# Limit to non-monoexonic transcripts: ic_ids ending in '-' are treated as
# monoexonic and removed.
ics = ics.loc[~ics.ic_id.str.endswith('-')]

In [24]:
# Collapse to one row per (gene novelty, intron chain) pair.
temp = ics[['gene_novelty', 'ic_id']].drop_duplicates()
# An ic_id that still appears more than once would be attached to more than one
# gene novelty category; require that this never happens.
temp = temp.loc[temp.ic_id.duplicated(keep=False)].sort_values(by='ic_id')
assert len(temp.index) == 0

# Only reduce `ics` once the check above has passed.
ics = ics[['gene_novelty', 'ic_id']].drop_duplicates()

In [46]:
# Match observed intron chains to reference intron chains on ic_id. The left
# merge keeps unmatched observed chains (gene_type is NaN for those).
merge = pd.merge(ics[['ic_id', 'gene_novelty']],
                 ref_ics[['ic_id', 'gene_type']].drop_duplicates(),
                 on='ic_id',
                 how='left')

In [47]:
# An intron chain shared by reference genes of different biotypes appears as a
# duplicated ic_id after the merge. Keep only reference matches from
# protein-coding or lncRNA genes, plus intron chains with no reference match.
# NOTE: rows matched to any other reference biotype are dropped here.
merge = merge.loc[merge.gene_type.isin(['protein_coding', 'lncRNA']) |
                  merge.gene_type.isnull()]

# Show the ic_ids that are still duplicated. In the recorded run they belong to
# a known gene, so they do not affect the novel-gene counts.
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id')

Unnamed: 0,ic_id,gene_novelty,gene_type
111544,chr8_-_144314925-144314820,Known,lncRNA
111545,chr8_-_144314925-144314820,Known,protein_coding


In [48]:
# an intron chain is in v47 when the reference merge found a gene_type for it
merge = merge.assign(in_v47=merge.gene_type.notnull())

In [49]:
# intron chains from genes that are not 'Known', counted by gene novelty and
# by whether the chain is found in v47
nov = merge.loc[merge.gene_novelty!='Known'].copy(deep=True)
(nov.groupby(['gene_novelty', 'in_v47'])[['ic_id']]
    .nunique()
    .rename(columns={'ic_id': 'n_ic'}))

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ic
gene_novelty,in_v47,Unnamed: 2_level_1
Intergenic,False,79
Intergenic,True,34
Readthrough,False,114
Readthrough,True,17


In [50]:
# Fraction of intron chains from intergenic novel genes that are found in v47,
# computed from `nov` rather than from numbers copied out of the table above.
(nov.loc[(nov.gene_novelty == 'Intergenic') & nov.in_v47, 'ic_id'].nunique()
 / nov.loc[nov.gene_novelty == 'Intergenic', 'ic_id'].nunique())

0.3008849557522124

## What about by novelty category and GENCODE gene biotype?

In [51]:
# Load the cerberus annotation; the novelty of each intron chain is read from
# its `ic` table in a later cell.
ca = cerberus.read(cerberus_h5)

In [52]:
# preview the merged table before adding the cerberus novelty
merge.head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47
0,chr1_-_169893787-169888890-169888675-169878819...,Known,protein_coding,True
1,chr1_-_169893787-169888890-169888675-169878819...,Known,protein_coding,True
2,chr1_-_169893787-169888890-169888675-169878819...,Known,,False
3,chr1_-_169888675-169878819-169878633-169876091...,Known,protein_coding,True
4,chr1_-_169894006-169888890-169888675-169878819...,Known,protein_coding,True


In [53]:
# Work on a separate frame so ca.ic is untouched: drop its `ic` column and
# rename `Coordinates` to `ic`, so get_ic_id builds the ID from the coordinates.
temp = ca.ic.drop(columns='ic').rename(columns={'Coordinates': 'ic'})
temp = get_ic_id(temp)

In [54]:
# Attach the cerberus novelty of each intron chain. The right-hand side is not
# deduplicated, so an ic_id can come back on several rows.
merge = pd.merge(merge,
                 temp[['ic_id', 'novelty']],
                 on='ic_id',
                 how='left')

In [55]:
# merge.drop('novelty_x', axis=1, inplace=True)
# merge.rename({'novelty_y': 'novelty'}, axis=1, inplace=True)

In [56]:
# sanity check: both values of the flag should still be present after the merge
merge['in_v47'].unique()

array([ True, False])

In [57]:
# preview intron chains that are absent from v47
merge.loc[~merge.in_v47].head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47,novelty
2,chr1_-_169893787-169888890-169888675-169878819...,Known,,False,NIC
6,chr1_-_27635064-27625151-27625088-27623926-276...,Known,,False,NNC
7,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NIC
8,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NIC
9,chr1_-_27635064-27625151-27625088-27623929-276...,Known,,False,NNC


In [58]:
# Check for duplicated ic_ids introduced by the novelty merge (display only;
# nothing is modified here).
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()
# merge.drop_duplicates(inplace=True)
# merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47,novelty
16383,chr10_+_14838459-14839904-14839985-14840074-14...,Known,protein_coding,True,NNC
16382,chr10_+_14838459-14839904-14839985-14840074-14...,Known,protein_coding,True,Known
14040,chr10_-_5617930-5617400-5617300-5617187,Known,lncRNA,True,Known
14041,chr10_-_5617930-5617400-5617300-5617187,Known,lncRNA,True,Known
14180,chr10_-_73252469-73252023-73251762-73250991-73...,Readthrough,,False,NNC


In [59]:
# Deduplicate using 1 for known and 0 for novel. Any time an intron chain is
# called known, regardless of the gene it is assigned to, call it known:
# taking the max of the flag per (ic_id, gene_type, in_v47) does exactly that.
# dropna=False keeps the groups whose gene_type is NaN.
merge['gb_known_col'] = (merge.novelty == 'Known').astype('int64')
merge = (merge.groupby(['ic_id', 'gene_type', 'in_v47'], dropna=False)
              [['gb_known_col']]
              .max()
              .reset_index())

In [60]:
# ic_ids that are still duplicated after collapsing on the known flag
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()

Unnamed: 0,ic_id,gene_type,in_v47,gb_known_col
112049,chr8_-_144314925-144314820,lncRNA,True,1
112050,chr8_-_144314925-144314820,protein_coding,True,1


In [61]:
# turn the 0/1 flag back into a label: 0 -> 'Novel', anything else -> 'Known'
merge['novelty'] = np.where(merge.gb_known_col == 0, 'Novel', 'Known')

In [62]:
# number of distinct intron chains per reference biotype, novelty and v47
# membership; dropna=False keeps the rows without a reference biotype
(merge.groupby(['gene_type', 'novelty', 'in_v47'], dropna=False)[['ic_id']]
      .nunique())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ic_id
gene_type,novelty,in_v47,Unnamed: 3_level_1
lncRNA,Known,True,7051
lncRNA,Novel,True,695
protein_coding,Known,True,66755
protein_coding,Novel,True,689
,Known,False,2000
,Novel,False,44563
