In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Make the directory two levels above the working directory importable so
# the `proc_revisions` package can be found by the imports below.
_cwd_parent = os.path.dirname(os.getcwd())
p = os.path.dirname(_cwd_parent)
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Directory that every path template in the config is resolved against
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'

# Read the YAML text first, then parse it with the safe loader
with open(config_file) as f:
    config = yaml.safe_load(f.read())

In [3]:
def _species_path(template, species):
    """
    Fill the `species` wildcard of a config path template and prefix the
    first resulting path with the `od` directory.
    """
    return od+expand(template, species=species)[0]


# human long-read inputs
ab = _species_path(config['lr']['talon']['ab'], 'human')
talon_filt_ab = _species_path(config['lr']['talon']['filt_ab'], 'human')
filt_ab = _species_path(config['lr']['cerberus']['filt_ab'], 'human')
read_annot = _species_path(config['lr']['talon']['full_annot'], 'human')
t_metadata = _species_path(config['ref']['cerberus']['new_gtf_t_info'], 'human')
lib_meta = _species_path(config['lr']['meta'], 'human')
swan_file = _species_path(config['lr']['swan']['sg'], 'human')
read_lens = _species_path(config['lr']['read_len_meta'], 'human')
gtf = _species_path(config['lr']['cerberus']['gtf'], 'human')

# human short-read abundance
sr_ab = _species_path(config['sr']['ab'], 'human')


# analysis settings
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# mouse long-read inputs
m_lib_meta = _species_path(config['lr']['meta'], 'mouse')
m_ab = _species_path(config['lr']['talon']['ab'], 'mouse')
m_talon_filt_ab = _species_path(config['lr']['talon']['filt_ab'], 'mouse')
m_read_annot = _species_path(config['lr']['talon']['full_annot'], 'mouse')
m_t_metadata = _species_path(config['ref']['cerberus']['new_gtf_t_info'], 'mouse')
m_swan_file = _species_path(config['lr']['swan']['sg'], 'mouse')

mouse_ver = 'vM25_cerberus'

In [4]:
# Load the tab-separated filtered cerberus abundance table for human
df = pd.read_table(filt_ab)

In [5]:
# Preview the first rows of the abundance table
df.head(n=5)

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,brodmann_area_46_5_1,h1_1_2,hffc6_1_1,hl60_m1_72hr_1_1,h1_de_1_1,hl60_m0_1_2,aorta_2_1,h9_chondro_1_2,hepg2_1_1,mucosa_of_descending_colon_2_1
0,52125,187550187550,ENSG00000121410.11,"ENSG00000121410[1,1,2]",A1BG,"A1BG[1,1,2]",8.0,1722.0,Known,Known,...,0,0,0,0,0,0,0,0,121,0
1,52125,187551187551,ENSG00000121410.11,"ENSG00000121410[1,4,2]",A1BG,"A1BG[1,4,2]",7.0,2301.0,Known,Known,...,0,0,0,0,0,0,0,0,0,0
2,52125,10496451049645,ENSG00000121410.11,"ENSG00000121410[1,5,2]",A1BG,"A1BG[1,5,2]",9.0,1791.0,Known,NIC,...,0,0,0,0,0,0,0,0,6,0
3,52125,187549,ENSG00000121410.11,"ENSG00000121410[2,2,1]",A1BG,"A1BG[2,2,1]",2.0,2134.0,Known,Known,...,1,0,0,0,0,1,0,0,1,0
4,52125,187549,ENSG00000121410.11,"ENSG00000121410[2,2,2]",A1BG,"A1BG[2,2,2]",2.0,2134.0,Known,Known,...,10,1,12,0,3,1,0,35,0,1


In [6]:
# restrict to the readthrough genes, i.e. rows whose gene_novelty is 'Fusion'
df = df[df['gene_novelty'].eq('Fusion')]
df.head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,brodmann_area_46_5_1,h1_1_2,hffc6_1_1,hl60_m1_72hr_1_1,h1_de_1_1,hl60_m0_1_2,aorta_2_1,h9_chondro_1_2,hepg2_1_1,mucosa_of_descending_colon_2_1
200296,59968,214728214728,TALONG000059968,"TALONG000059968[1,1,1]",TALONG000059968,"TALONG000059968[1,1,1]",16.0,2661.0,Fusion,ISM,...,5,5,2,0,13,2,0,1,5,3
200299,61086,223753,TALONG000061086,"TALONG000061086[1,1,1]",TALONG000061086,"TALONG000061086[1,1,1]",6.0,1367.0,Fusion,ISM,...,0,0,0,0,0,0,0,0,0,0
200300,61086,224292,TALONG000061086,"TALONG000061086[1,2,1]",TALONG000061086,"TALONG000061086[1,2,1]",6.0,1363.0,Fusion,ISM,...,0,1,0,0,0,0,0,0,0,0
200301,61194,224539,TALONG000061194,"TALONG000061194[1,1,1]",TALONG000061194,"TALONG000061194[1,1,1]",5.0,3650.0,Fusion,ISM,...,1,0,0,0,0,0,0,0,0,0
200302,61194,225029,TALONG000061194,"TALONG000061194[1,2,2]",TALONG000061194,"TALONG000061194[1,2,2]",4.0,1273.0,Fusion,ISM,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def get_fusion_sample_t_coords(ab, gtf, min_tpm, sample, species, ofile):
    """
    Get genomic start / stop coords for fusion transcripts expressed in
    a given sample, write them to a tab-separated file, and return them.

    Parameters:
        ab (str): Path to tab-separated abundance table. Must contain
            the `gene_novelty` and `annot_transcript_id` columns
        gtf (str): Path to GTF file whose `transcript_id` values are
            matched against `annot_transcript_id`
        min_tpm (float): Minimum TPM, forwarded to `get_det_table`
        sample (str): Sample to restrict to; must be a column of the
            transposed detection table
        species (str): Species, forwarded to `get_det_table`
        ofile (str): Path of the tab-separated output file

    Returns:
        gtf_df (pandas DataFrame): One row per 'transcript' GTF entry
            with Chromosome, Start, End, Strand, gene_id, and
            transcript_id. The same table is written to `ofile`.
    """

    df = pd.read_csv(ab, sep='\t')

    # transcripts that belong to genes labelled as fusion
    fusion_tids = df.loc[df.gene_novelty=='Fusion', 'annot_transcript_id'].tolist()
    print(len(fusion_tids))

    # transcripts detected in the requested sample; after transposing,
    # rows are transcripts and columns are samples
    det_df = get_det_table(df,
                           groupby='sample',
                           how='iso',
                           min_tpm=min_tpm,
                           species=species)
    det_df = det_df.transpose()
    sample_tids = det_df.loc[det_df[sample]==True].index.tolist()

    tids = list(set(fusion_tids)&set(sample_tids))

    # NOTE(review): `pr` is not imported in this notebook's import cell;
    # presumably it is pyranges, exposed through one of the `import *`
    # lines -- confirm
    gtf_df = pr.read_gtf(gtf).df

    # keep only the transcript-level entries of the selected transcripts
    keep = gtf_df.transcript_id.isin(tids)&(gtf_df.Feature=='transcript')
    gtf_df = gtf_df.loc[keep, ['Chromosome', 'Start', 'End', 'Strand',
                               'gene_id', 'transcript_id']]

    # previously referenced the undefined name `gtgtf_dff`, which raised
    # a NameError before anything was written
    gtf_df.to_csv(ofile, sep='\t', index=False)

    # callers assign and inspect the result, so hand the table back
    return gtf_df

In [10]:
# get the transcripts that are fusion that are expressed in gm12878
# `ofile` was not defined in any remaining cell, so this cell failed on a
# fresh kernel; it is defined here explicitly.
# NOTE(review): the output filename below is a placeholder -- confirm the
# intended location.
ofile = 'gm12878_fusion_t_coords.tsv'
df = get_fusion_sample_t_coords(filt_ab,
                                gtf,
                                min_tpm,
                                'gm12878',
                                'human',
                                ofile)

548
Calculating iso TPM values
Enforcing minimum TPM
Total # isos detected: 237022
# isos >= 1 tpm: 213032
Number of isos reported: 213032
Found 51 total samples


In [11]:
# Preview the first rows of the table assigned to `df` by the previous cell
df.head(n=5)


Unnamed: 0,Chromosome,Start,End,Strand,gene_id,transcript_id
108449,chr1,103525927,103555289,+,TALONG000169854,"TALONG000169854[2,2,1]"
108465,chr1,103525648,103555289,+,TALONG000169854,"TALONG000169854[1,1,1]"
108511,chr1,168576554,168581452,+,TALONG000189600,"TALONG000189600[1,2,1]"
108515,chr1,168576554,168581452,+,TALONG000189600,"TALONG000189600[1,1,1]"
108533,chr1,192609265,192660310,+,TALONG000190575,"TALONG000190575[1,1,1]"
