In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
# from proc_revisions.mane_utils import *

In [2]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
species = 'human'
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
lib_meta = od+expand(config['lr']['meta'], species=species)[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='human')[0]
gtf_file = od+expand(config['lr']['cerberus']['gtf'], species='human')[0]

gtex_gtf_file = od+expand(config['gtex']['gtf'], species='human')[0]

gene_subset = 'polya'
min_tpm = 1

species = 'mouse'
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
m_gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
m_lib_meta = od+expand(config['lr']['meta'], species=species)[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
m_ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]


## Mouse -- how many vM36 transcripts do we detect by biotype, novelty?

In [4]:
def get_ic_id(df):
    coord_col = 'ic'
    if 'ic' not in df.columns:
        coord_col = 'Coordinates'
    df['ic_id'] = df.Chromosome.astype(str)+'_'+\
                  df.Strand.astype(str)+'_'+\
                  df[coord_col].astype(str)
    return df

In [5]:
ref_gtf = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/paper_rnawg/proc_revisions/ref/human/gencode.vM36.annotation.gtf'
ref_gtf_df = pr.read_gtf(ref_gtf)
ref_ics = cerberus.get_ic(ref_gtf_df)

In [6]:
ref_gtf_df = ref_gtf_df.df

In [7]:
# add biotype 
ref_gtf_df[['gene_id', 'gene_type']].groupby('gene_type').nunique()
ref_ics = ref_ics.merge(ref_gtf_df[['gene_id', 'gene_type']].drop_duplicates(),
                        how='left', 
                        on='gene_id')

In [8]:
gtf_df = pr.read_gtf(m_gtf)
ics = cerberus.get_ic(gtf_df)

In [9]:
# get ids for uniq chr+strand+ic pairs
ref_ics = get_ic_id(ref_ics)
ics = get_ic_id(ics)

In [23]:
# get expressed novel genes
df = pd.read_csv(m_filt_ab, sep='\t')
df, _ = get_tpm_table(df, 
                      how='iso',
                      min_tpm=1,
                      gene_subset=None, # no gene subset cause we want novel genes
                      species='mouse')

df.reset_index(inplace=True)
df['gid'] = df.annot_transcript_id.str.split('[', expand=True)[0]
df.head()
df.reset_index(inplace=True)
df.rename({'gid_stable':'gid'}, axis=1, inplace=True)
ab_df = pd.read_csv(m_filt_ab, sep='\t')
ab_df['gid'] = ab_df.annot_transcript_id.str.split('[', expand=True)[0]
ab_df = ab_df[['gid', 'gene_novelty']].drop_duplicates()
ab_df.loc[ab_df.gid.duplicated(keep=False)].sort_values(by='gid')
assert len(ab_df.loc[ab_df.gid.duplicated(keep=False)].index) == 0
df = df.merge(ab_df, how='left', on='gid')
df.loc[df.gene_novelty=='Fusion', 'gene_novelty'] = 'Readthrough'
df.head()

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 153398
# isos >= 1 tpm: 142504
Number of isos reported: 142504


Unnamed: 0,index,annot_transcript_id,cortex_2mo_f_1,gastroc_10d_f_2,hippocampus_2mo_f_1,hippocampus_wt_m_1_1,hippocampus_5x_m_1_2,adrenal_2mo_m_1,cortex_wt_m_1_2,adrenal_4d_m_2,...,cortex_2mo_m_1,gastroc_36d_f_2,adrenal_18-20mo_f_1,adrenal_36d_m_1,adrenal_gland_1_2,cortex_14d_f_1,gastroc_2mo_m_2,gastroc_2mo_f_1,gid,gene_novelty
0,0,"ENSMUSG00000051951[2,2,3]",0.0,0.0,0.0,0.998864,0.431823,0.0,0.0,0.0,...,2.658793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000051951,Known
1,1,"ENSMUSG00000089699[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000089699,Known
2,2,"ENSMUSG00000103161[1,1,1]",0.0,0.0,0.0,0.998864,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000103161,Known
3,3,"ENSMUSG00000025902[1,2,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known
4,4,"ENSMUSG00000025902[1,2,3]",0.0,0.0,0.0,0.0,0.431823,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known


In [24]:
ab_df = pd.read_csv(m_filt_ab, sep='\t')
ab_df.gene_novelty.unique()

array(['Known', 'Fusion', 'Intergenic'], dtype=object)

In [25]:
df.gene_novelty.unique()

array(['Known', 'Readthrough', 'Intergenic'], dtype=object)

In [26]:
df[['gene_novelty', 'annot_transcript_id']].groupby('gene_novelty').nunique().rename({'annot_transcript_id':'n_t'}, axis=1)

Unnamed: 0_level_0,n_t
gene_novelty,Unnamed: 1_level_1
Intergenic,29
Known,142366
Readthrough,109


In [27]:
df[['gene_novelty', 'gid']].groupby('gene_novelty').nunique().rename({'gid':'n_g'}, axis=1)

Unnamed: 0_level_0,n_g
gene_novelty,Unnamed: 1_level_1
Intergenic,24
Known,24913
Readthrough,55


In [28]:
df.rename({'annot_transcript_id':'transcript_id'}, axis=1, inplace=True)

In [29]:
# add gene novelties to each ic
ics = ics.merge(df[['gene_novelty', 'transcript_id']],
                how='inner', 
                on='transcript_id')

In [30]:
# ics.rename({'gene_novelty_x':'gene_novelty'}, axis=1, inplace=True)
# ics.drop('gene_novelty_y', axis=1, inplace=True)

In [31]:
# ics.head()

In [32]:
print(len(ics.loc[ics.gene_novelty.notnull()].transcript_id.unique()))
ics = ics.loc[ics.gene_novelty.notnull()]

142504


In [33]:
# limit to non-monoexonic
ics.loc[ics.ic_id.str.endswith('-')].head()
ics = ics.loc[~ics.ic_id.str.endswith('-')]

In [34]:
# limit to non dupe
temp = ics[['gene_novelty', 'ic_id']].drop_duplicates()
temp = temp.loc[temp.ic_id.duplicated(keep=False)].sort_values(by='ic_id')
assert len(temp.index) == 0

ics = ics[['gene_novelty', 'ic_id']].drop_duplicates()

In [35]:
merge = ics[['ic_id', 'gene_novelty']].merge(
            ref_ics[['ic_id', 'gene_type']].drop_duplicates(),
            how='left',
            on='ic_id')

In [36]:
# where are dupes coming from?
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id')
merge.loc[merge.ic_id.duplicated(keep=False)][['gene_type', 'ic_id']].groupby('gene_type').count().rename({'ic_id':'counts'}, axis=1)

# so just try limiting it to lncs and protein coding
merge = merge.loc[(merge.gene_type.isin(['protein_coding', 'lncRNA']))|\
                  (merge.gene_type.isnull())]
merge.loc[merge.ic_id.duplicated(keep=False)][['gene_type', 'ic_id']].groupby('gene_type').count().rename({'ic_id':'counts'}, axis=1)

merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id')

# who cares cause they're not from novel genes anyway right?

Unnamed: 0,ic_id,gene_novelty,gene_type


In [37]:
merge['in_v47'] = merge.gene_type.notnull()

In [38]:
nov = merge.loc[merge.gene_novelty!='Known'].copy(deep=True)
nov[['ic_id', 'gene_novelty', 'in_v47']].groupby(['gene_novelty', 'in_v47']).nunique().rename({'ic_id':'n_ic'}, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ic
gene_novelty,in_v47,Unnamed: 2_level_1
Intergenic,False,29
Readthrough,False,84


In [39]:
# 34/(79+34)

## What about by novelty category and GENCODE gene biotype?

In [40]:
# get novelty types of each IC from cerberus
ca = cerberus.read(m_cerberus_h5)

In [42]:
merge.head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47
0,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
1,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
2,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
3,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False
4,chr1_-_36445049-36443609-36443519-36439726-364...,Known,,False


In [43]:
temp = ca.ic.copy(deep=True)
temp.drop('ic', axis=1, inplace=True)
temp.rename({'Coordinates':'ic'}, axis=1, inplace=True)
temp = get_ic_id(temp)

In [44]:
merge = merge.merge(temp[['ic_id', 'novelty']], 
                    how='left',
                    on='ic_id')

In [45]:
# merge.drop('novelty_x', axis=1, inplace=True)
# merge.rename({'novelty_y': 'novelty'}, axis=1, inplace=True)

In [46]:
merge.in_v47.unique()

array([False,  True])

In [47]:
merge.loc[merge.in_v47==False].head()

Unnamed: 0,ic_id,gene_novelty,gene_type,in_v47,novelty
0,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False,Known
1,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False,NNC
2,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False,NNC
3,chr1_-_36445049-36443609-36443490-36439726-364...,Known,,False,NNC
4,chr1_-_36445049-36443609-36443519-36439726-364...,Known,,False,NIC


In [51]:
# CHECK FOR DUPES
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()
merge.drop_duplicates(inplace=True)
merge.loc[merge.ic_id.duplicated(keep=False)].sort_values(by='ic_id').head()

AttributeError: 'NoneType' object has no attribute 'loc'

In [49]:
merge[['ic_id', 'gene_type', 'novelty', 'in_v47']].groupby(['gene_type', 'novelty', 'in_v47'], dropna=False).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ic_id
gene_type,novelty,in_v47,Unnamed: 3_level_1
lncRNA,Known,True,57
lncRNA,NIC,True,1
lncRNA,NNC,True,7
protein_coding,Known,True,962
,ISM,False,4908
,Known,False,48619
,NIC,False,13020
,NNC,False,12115
