In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# add the directory two levels above the current working directory to the
# module search path so that the `scripts` package imported below resolves
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.mane_utils import *
from scripts.plotting import *

In [2]:
# load the snakemake config; path templates and reference locations used
# throughout the notebook are looked up in this dictionary
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [4]:
def _cfg_path(template, **wildcards):
    """
    Fill in the wildcards of a config path template with snakemake's
    expand() and return the first resulting path prefixed with '../'.
    """
    return '../'+expand(template, **wildcards)[0]


_data = config['data']
_ref = config['ref']

# human input files
ab = _cfg_path(_data['ab'], species='human')
filt_ab = _cfg_path(_data['filt_ab'], species='human')
read_annot = _cfg_path(_data['read_annot'], species='human')
t_metadata = _cfg_path(_ref['cerberus']['t_info'], species='human')
lib_meta = _cfg_path(_data['meta'], species='human')
swan_file = _cfg_path(_data['sg'], species='human')
cerberus_h5 = _cfg_path(_data['cerb_annot'], species='human')
major_isos = _cfg_path(_data['major_isos'], species='human', obs_col='sample')
pp_summary = _cfg_path(_data['p_pred']['summary'], species='human')
ref_t_metadata = _cfg_path(_ref['t_info'], species='human')
ref_g_metadata = _cfg_path(_ref['g_info'], species='human')

# this config entry is used as-is (no wildcards are filled in)
sr_ab = '../'+config['sr']['ab']

# analysis settings
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# mouse input files
m_lib_meta = _cfg_path(_data['meta'], species='mouse')
m_ab = _cfg_path(_data['ab'], species='mouse')
m_filt_ab = _cfg_path(_data['filt_ab'], species='mouse')
m_read_annot = _cfg_path(_data['read_annot'], species='mouse')
m_t_metadata = _cfg_path(_ref['cerberus']['t_info'], species='mouse')
m_swan_file = _cfg_path(_data['sg'], species='mouse')
m_cerberus_h5 = _cfg_path(_data['cerb_annot'], species='mouse')
m_pi_tpm_table = _cfg_path(_data['pi_tpm']['triplet'], species='mouse', obs_col='sample')
m_major_isos = _cfg_path(_data['major_isos'], species='mouse', obs_col='sample')
m_pp_summary = _cfg_path(_data['p_pred']['summary'], species='mouse')
m_ref_t_metadata = _cfg_path(_ref['t_info'], species='mouse')
m_ref_g_metadata = _cfg_path(_ref['g_info'], species='mouse')
m_ref_gtf = _cfg_path(_ref['cerberus']['gtf'], species='mouse')
m_gtf = _cfg_path(_data['cerb_gtf'], species='mouse')

mouse_ver = 'vM25_cerberus'

I want a GTF that's the union of the reference cerberus-ized GTF and the mouse GTF. First I'll build that union, then I'll try to limit the novel transcripts in the GTF to those that have passed our filtering.

In [120]:
def get_union_cerb_gtf(gtf1,
                       gtf2, 
                       h5):
    """
    Merge 2 GTFs. Preferentially keep entries from gtf1 vs. gtf2.

    All entries of gtf1 are kept. From gtf2, only entries of transcripts
    whose transcript ID is not already in gtf1, and whose gene ID is
    present in gtf1, are added. Both GTFs are first restricted to gene IDs
    containing 'ENSMUS'.

    Parameters:
        gtf1 (str): Path to the GTF whose entries take priority
        gtf2 (str): Path to the GTF that contributes additional transcripts
        h5 (str): Path to the cerberus annotation; its TSS / TES tables
            are handed to cerberus.update_gtf_ends

    Returns:
        df (pandas DataFrame): Merged GTF entries after cerberus.sort_gtf
            and cerberus.update_gtf_ends
    """

    def preproc_gtf(gtf):
        """
        Read a GTF, convert gene IDs with cerberus.get_stable_gid, keep
        'ENSMUS' genes, and collect the set of transcript IDs.
        """
        df = pr.read_gtf(gtf,
                         duplicate_attr=True,
                         rename_attr=True).df
        df['gene_id'] = cerberus.get_stable_gid(df, 'gene_id')
        df = df.loc[df.gene_id.str.contains('ENSMUS')]
        # entries without a transcript ID (e.g. gene rows) are left out so
        # that a missing value can never count as a "new" transcript ID
        tids = set(df.transcript_id.dropna().unique())
        return df, tids

    gtf1_df, gtf1_tids = preproc_gtf(gtf1)
    gtf2_df, gtf2_tids = preproc_gtf(gtf2)

    # keep only the gtf2 transcripts that are NOT already in gtf1;
    # the difference must be gtf2 - gtf1, otherwise every novel
    # transcript from gtf2 is discarded
    new_ids = list(gtf2_tids - gtf1_tids)
    gtf2_df = gtf2_df.loc[gtf2_df.transcript_id.isin(new_ids)]

    # filter gtf2 based on genes that are in gtf1
    gtf1_gids = gtf1_df.gene_id.unique()
    gtf2_df = gtf2_df.loc[gtf2_df.gene_id.isin(gtf1_gids)]

    # use cerberus functions to fix GTF entries
    ca = cerberus.read(h5)
    tss = pr.PyRanges(ca.tss.rename({'Name': 'tss_id'}, axis=1))
    tes = pr.PyRanges(ca.tes.rename({'Name': 'tes_id'}, axis=1))
    df = pd.concat([gtf1_df, gtf2_df], axis=0)
    df = cerberus.sort_gtf(df)
    df = cerberus.update_gtf_ends(df, tss, tes)

    return df

In [121]:
# merge the mouse reference GTF (priority) with the mouse GTF
df = get_union_cerb_gtf(gtf1=m_ref_gtf, gtf2=m_gtf, h5=m_cerberus_h5)

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


In [122]:
# write the merged table out as a GTF file, relative to the working directory
pr.PyRanges(df).to_gtf('vM25_cerberus_2.gtf')

In [94]:
# read the mouse reference GTF into a DataFrame and replace gene_id with the
# output of cerberus.get_stable_gid
# NOTE(review): presumably this strips the version suffix from the gene id — confirm
ref_df = pr.read_gtf(m_ref_gtf, duplicate_attr=True).df
ref_df['gene_id'] = cerberus.get_stable_gid(ref_df, 'gene_id')

In [95]:
# unique transcript ids found in the reference GTF
ref_ids = ref_df.transcript_id.unique().tolist()

In [96]:
# read the mouse GTF; rename_attr=True renames attributes that clash with
# reserved names (here 'source', which gets the '_attr' suffix)
df = pr.read_gtf(m_gtf, duplicate_attr=True, rename_attr=True).df

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


In [97]:
# keep only entries whose gene id contains 'ENSMUS'; take an explicit copy so
# the gene_id assignment below writes to an independent frame instead of a
# slice of the original (avoids pandas' SettingWithCopyWarning)
df = df.loc[df.gene_id.str.contains('ENSMUS')].copy()
df['gene_id'] = cerberus.get_stable_gid(df, 'gene_id')

In [98]:
# filter the new gtf based on unique transcript ids
new_ids = df.transcript_id.unique().tolist()
print(len(new_ids))
new_ids = list(set(new_ids)-set(ref_ids))
print(len(new_ids))
df = df.loc[df.transcript_id.isin(new_ids)]

149102
85674


In [99]:
# calculate how many transcripts we have and how many new ones we should get

In [100]:
# load the mouse cerberus annotation and wrap its TSS / TES tables as
# PyRanges objects, renaming the 'Name' column to 'tss_id' / 'tes_id'
ca = cerberus.read(m_cerberus_h5)
tss = pr.PyRanges(ca.tss.rename({'Name': 'tss_id'}, axis=1))
tes = pr.PyRanges(ca.tes.rename({'Name': 'tes_id'}, axis=1))

In [101]:
# stack the reference entries on top of the entries for the new transcripts
new_df = pd.concat([ref_df, df], axis=0)

In [102]:
# sort the combined entries, then pass them through cerberus.update_gtf_ends
# together with the TSS / TES regions
# NOTE(review): presumably this adjusts entry coordinates to the cerberus ends — confirm
new_df = cerberus.sort_gtf(new_df)
new_df = cerberus.update_gtf_ends(new_df, tss, tes)

In [104]:
# preview the first rows of the combined table
new_df.head()
# new_df.gene_id.tail()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_status,source_attr,NNC_transcript,ISM-suffix_to_IDs,ISM_to_IDs,ISM_transcript,transcript_novelty,ISM-suffix_transcript,ISM-prefix_transcript,ISM-prefix_to_IDs
0,chrX,,gene,161082474,161258263,.,+,.,ENSMUSG00000000037,protein_coding,...,,,,,,,,,,
1,chrX,HAVANA,transcript,161162699,161258263,.,+,.,ENSMUSG00000000037,protein_coding,...,,,,,,,,,,
2,chrX,HAVANA,exon,161162699,161163248,.,+,.,ENSMUSG00000000037,protein_coding,...,,,,,,,,,,
3,chrX,HAVANA,exon,161171489,161171535,.,+,.,ENSMUSG00000000037,protein_coding,...,,,,,,,,,,
4,chrX,HAVANA,exon,161177246,161177317,.,+,.,ENSMUSG00000000037,protein_coding,...,,,,,,,,,,


In [105]:
# look up one 'ENCODEMG' gene id in the reference table (the displayed result is empty)
ref_df.loc[ref_df.gene_id=='ENCODEMG000055991']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,havana_gene,havana_transcript,mgi_id,transcript_type,level,original_transcript_id,protein_id,ccdsid,tag,ont


In [106]:
# list the feature types present in the combined table
new_df.Feature.unique()

['gene', 'transcript', 'exon']
Categories (3, object): ['exon', 'gene', 'transcript']

In [117]:
# remove things that don't have a gene in v40
t_df = new_df[['transcript_id', 'gene_id']].drop_duplicates()
t_gids = t_df.gene_id.unique().tolist()
g_df = new_df.loc[new_df.Feature=='gene']
g_gids = g_df.gene_id.unique().tolist()
t_df.loc[~t_df.gene_id.isin(g_gids)]

len(new_df.loc[new_df.gene_id.isin(g_gids)])
new_df = new_df.loc[new_df.gene_id.isin(g_gids)]



In [118]:
# check one gene id after the gene-row filter (the displayed result is empty)
new_df.loc[new_df.gene_id =='ENSMUSG00000097131']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_status,source_attr,NNC_transcript,ISM-suffix_to_IDs,ISM_to_IDs,ISM_transcript,transcript_novelty,ISM-suffix_transcript,ISM-prefix_transcript,ISM-prefix_to_IDs


In [119]:
# write the filtered, combined table out as a GTF file, relative to the working directory
pr.PyRanges(new_df).to_gtf('vM25_cerberus.gtf')

In [111]:
# inspect all rows of one gene id; the displayed rows contain only
# 'transcript' and 'exon' features
# NOTE(review): this cell's execution count (111) precedes the gene-row filter
# cell (117), so the output appears to show the table before filtering — confirm
# new_df.gene_id.head()
# new_df.gene_id.tail()
# new_df.loc[new_df.gene_id.str.contains('.')]
new_df.loc[new_df.gene_id=='ENSMUSG00000097131']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_status,source_attr,NNC_transcript,ISM-suffix_to_IDs,ISM_to_IDs,ISM_transcript,transcript_novelty,ISM-suffix_transcript,ISM-prefix_transcript,ISM-prefix_to_IDs
973796,chr1,ENSEMBL,transcript,72236414,72266186,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973797,chr1,ENSEMBL,exon,72236414,72236528,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973798,chr1,ENSEMBL,exon,72237865,72237923,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973799,chr1,ENSEMBL,exon,72241370,72241503,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973800,chr1,ENSEMBL,exon,72264742,72266186,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973801,chr1,ENSEMBL,transcript,72243970,72274919,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973802,chr1,ENSEMBL,exon,72243970,72244163,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973803,chr1,ENSEMBL,exon,72264742,72264892,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973804,chr1,ENSEMBL,exon,72269467,72269548,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,
973805,chr1,ENSEMBL,exon,72272147,72274919,.,+,.,ENSMUSG00000097131,lincRNA,...,KNOWN,ENSEMBL,,,,,,,,


In [110]:
# try to make this into a talon db

In [None]:
# talon_initialize_database \
#     --f vM25_cerberus.gtf \
#     --g mm10 \
#     --a vM25+cerberus \
#     --o vM25_cerberus