In [1]:
import pandas as pd
import pyranges as pr
import sys
import os
from Bio import SearchIO
import yaml
from snakemake.io import expand
import pdb
import swan_vis as swan

# Put the directory two levels above the current working directory on the
# import path. Presumably this is the repository root that holds the
# `scripts` package imported just below -- confirm against the repo layout.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *
from scripts.mane_utils import *

In [2]:
# Load the Snakemake config; the path templates under its 'data' and 'ref'
# keys are expanded per species in the next cell.
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    # safe_load only builds plain Python objects from the YAML
    config = yaml.safe_load(f)

In [3]:
def _config_path(template, species, **wildcards):
    """
    Expand one path template from `config` and make it relative to this
    notebook.

    Parameters:
        template (str): Path template taken from `config`
        species (str): Value passed to `expand` for the `species` wildcard
        **wildcards: Any further wildcard values the template needs
            (e.g. `obs_col`)

    Returns:
        str: First path produced by `expand`, prefixed with '../'
    """
    return '../'+expand(template, species=species, **wildcards)[0]


# analysis settings
ver = 'v40_cerberus'
mouse_ver = 'vM25_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'

# human inputs
ab = _config_path(config['data']['ab'], 'human')
filt_ab = _config_path(config['data']['filt_ab'], 'human')
read_annot = _config_path(config['data']['read_annot'], 'human')
ref_t_metadata = _config_path(config['ref']['t_info'], 'human')
ref_g_metadata = _config_path(config['ref']['g_info'], 'human')
t_metadata = _config_path(config['ref']['cerberus']['t_info'], 'human')
lib_meta = _config_path(config['data']['meta'], 'human')
swan_file = _config_path(config['data']['sg'], 'human')
cerberus_h5 = _config_path(config['data']['cerb_annot'], 'human')
cerb_t_metadata = _config_path(config['data']['t_info'], 'human')
major_isos = _config_path(config['data']['major_isos'], 'human', obs_col=obs_col)
pp_summary = _config_path(config['data']['p_pred']['summary'], 'human')

# mouse inputs (same config keys, `m_` prefix)
m_lib_meta = _config_path(config['data']['meta'], 'mouse')
m_ab = _config_path(config['data']['ab'], 'mouse')
m_filt_ab = _config_path(config['data']['filt_ab'], 'mouse')
m_read_annot = _config_path(config['data']['read_annot'], 'mouse')
m_ref_t_metadata = _config_path(config['ref']['t_info'], 'mouse')
m_ref_g_metadata = _config_path(config['ref']['g_info'], 'mouse')
m_t_metadata = _config_path(config['ref']['cerberus']['t_info'], 'mouse')
m_swan_file = _config_path(config['data']['sg'], 'mouse')
m_cerberus_h5 = _config_path(config['data']['cerb_annot'], 'mouse')
m_cerb_t_metadata = _config_path(config['data']['t_info'], 'mouse')
m_major_isos = _config_path(config['data']['major_isos'], 'mouse', obs_col=obs_col)
m_pp_summary = _config_path(config['data']['p_pred']['summary'], 'mouse')

## Save TPMs of transcripts expressed in 16 cell lines

In [55]:
# datasets to report on
datasets = get_ljungman_datasets()

# isoform-level TPM table built from the filtered abundance file, using the
# min_tpm / gene_subset settings from the config cell
ab_df, _ = get_tpm_table(pd.read_csv(filt_ab, sep='\t'),
                         how='iso',
                         min_tpm=min_tpm,
                         gene_subset=gene_subset)

# keep only the columns for the selected datasets and write them out
ab_df = ab_df.loc[:, datasets]
ab_df.to_csv('transcript_abundance.tsv', sep='\t')

Calculating iso TPM values
Subsetting for polya genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 206806


## Get transcripts expressed in 16 cell lines

In [56]:
# build the isoform detection table that the following cells use to
# get tids that are expressed in ljungman datasets
datasets = get_ljungman_datasets()
df = get_det_table(pd.read_csv(filt_ab, sep='\t'),
                   how='iso',
                   min_tpm=min_tpm,
                   gene_subset=gene_subset)

Calculating iso TPM values
Subsetting for polya genes
Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 206806
Found 138 total libraries


In [57]:
# limit to 16 cell lines
# Row selection: here the datasets are index labels of the detection table,
# whereas the TPM table earlier in the notebook is subset by column.
df = df.loc[datasets]

In [58]:
# flip the table so each row is a transcript, then keep the rows whose
# values across the selected datasets sum to at least 1
df = df.T
df = df.loc[df.sum(axis=1).ge(1)]

In [59]:
# Row labels left after the transpose + filter above; used further down as
# the set of transcript ids expressed in the selected datasets.
tids = df.index.tolist()

## Get v29 transcripts

In [60]:
# Load the human cerberus annotation (path from config['data']['cerb_annot']).
# NOTE(review): `cerberus` is not imported by name in the imports cell;
# presumably it arrives through one of the `from scripts.* import *` lines --
# confirm, or import it explicitly.
ca = cerberus.read(cerberus_h5)

In [61]:
# rows of the cerberus transcript map whose source is the v29 annotation,
# and the distinct transcript ids they contain
ref_df = ca.t_map.loc[ca.t_map['source'] == 'v29']
ref_tids = ref_df['transcript_id'].unique().tolist()

## Find transcripts w/ evidence of fusion

In [62]:
# Load the human SwanGraph (path from config['data']['sg']).
# NOTE(review): the path ends in `.p`, which looks like a pickle -- only load
# graphs produced by this workflow / from a trusted source.
sg = swan.read(swan_file)

Read in graph from ../data/human/swan_graph.p


In [63]:
# Plan for the next cells:
#   1. get the edges (exons / introns) used by each transcript
#   2. get the genes associated with each edge
#   3. keep the transcripts whose edges are associated with more than 1 gene

In [64]:
# get a list of exons / introns: expand every transcript's path into rows,
# then attach the gene id / name of the transcript each row came from
df = (
    swan.pivot_path_list(sg.t_df, 'path')
    .reset_index()
    .merge(sg.t_df[['gid', 'gname']].reset_index(), on='tid', how='left')
)

In [65]:
# edge:gid lookup -- one row per distinct (edge, gene) pair seen above
eg_df = df.loc[:, ['edge_id', 'gid']].drop_duplicates()

In [66]:
# swap each transcript's own gene for every gene that uses the same edge:
# drop the per-transcript gene columns, then join the edge:gid table back in
df = (
    df.drop(columns=['gid', 'gname'])
    .merge(eg_df[['edge_id', 'gid']], how='left', on='edge_id')
)

In [67]:
# tid:gid pairs, with the edge column removed
temp = df.drop(columns='edge_id')

# how many genes are represented by the edges of each transcript?
gt_df = (
    temp.groupby('tid')
    .nunique()
    .rename(columns={'gid': 'n_genes'})
    .reset_index()
)

# comma-joined gene ids and gene names for each transcript
temp = (
    temp.merge(sg.t_df[['gname', 'gid']].drop_duplicates(), how='left', on='gid')
    .drop_duplicates()
    .groupby('tid')
    .agg(','.join)
    .reset_index()
)
gt_df = gt_df.merge(temp, on='tid', how='left')

# limit to transcripts w/ >1 gene
gt_df = gt_df.loc[gt_df['n_genes'] > 1]

In [68]:
gt_df = (
    gt_df
    # keep only transcripts expressed in the 16 cell lines
    .loc[lambda d: d.tid.isin(tids)]
    # flag the ones that are v29 annotated transcripts
    .assign(Known=lambda d: d.tid.isin(ref_tids))
)

In [69]:
# Write the multi-gene transcript table as a TSV in the working directory;
# the index is left out because `tid` is already a column.
gt_df.to_csv('spliced_fusion_transcripts.tsv', sep='\t', index=False)

## Get all unspliced transcripts w/ gene name, gene id

In [70]:
# Build a table of monoexonic (unspliced) transcripts with their gene id /
# name and the coordinates of the single edge they use.

# label monoexonic transcripts: their path holds exactly one edge
t_df = sg.t_df[['tid', 'gid', 'gname', 'path']].reset_index(drop=True)
t_df['unspliced'] = t_df['path'].map(len) == 1
t_df = t_df.loc[t_df['unspliced']].set_index('tid')

# get edges used in each monoexonic transcript
ue_t_df = swan.pivot_path_list(t_df, path_col='path')

# Look every edge up once. Repeating an edge id here would repeat its row in
# ue_df and multiply rows in the edge_id merge further down.
ue_df = sg.edge_df.loc[ue_t_df.edge_id.unique().tolist()]

# add coords for these edges: `start` is the coordinate of vertex v1 and
# `stop` the coordinate of vertex v2 (so start can be greater than stop)
ue_df = ue_df.merge(sg.loc_df[['chrom', 'coord']],
                    how='left', left_on='v1', right_on='vertex_id')
ue_df = ue_df.rename(columns={'coord': 'start'})
ue_df = ue_df.merge(sg.loc_df[['coord']],
                    how='left', left_on='v2', right_on='vertex_id')
ue_df = ue_df.rename(columns={'coord': 'stop'})
ue_df = ue_df.drop(columns=['v1', 'v2'])

# merge info in with transcript info
ue_t_df = ue_t_df.reset_index()
ue_t_df = ue_t_df.merge(ue_df, on='edge_id', how='left')
ue_t_df = ue_t_df.merge(sg.t_df.reset_index(drop=True)[['tid', 'gid', 'gname']],
                        on='tid', how='left')

In [71]:
ue_t_df = (
    ue_t_df
    # keep only transcripts expressed in the 16 cell lines
    .loc[lambda d: d.tid.isin(tids)]
    # flag the ones that are v29 annotated transcripts
    .assign(Known=lambda d: d.tid.isin(ref_tids))
)

In [72]:
# Write the unspliced transcript table as a TSV in the working directory,
# without the row index.
ue_t_df.to_csv('unspliced_transcripts.tsv', sep='\t', index=False)

In [74]:
# Preview the first rows of the table that was just written out.
ue_t_df.head()

Unnamed: 0,tid,edge_id,strand,edge_type,annotation,chrom,start,stop,gid,gname,Known
0,"ENSG00000001460[12,13,1]",947924,-,exon,False,chr1,24357356,24356949,ENSG00000001460,STPG1,False
3,"ENSG00000002834[9,9,1]",974058,+,exon,False,chr17,38919325,38921820,ENSG00000002834,LASP1,False
4,"ENSG00000003249[5,6,1]",339329,-,exon,True,chr16,90007811,90004821,ENSG00000003249,DBNDD1,True
5,"ENSG00000004399[17,17,1]",997925,-,exon,False,chr3,129555524,129555164,ENSG00000004399,PLXND1,False
6,"ENSG00000004487[11,18,1]",287,+,exon,True,chr1,23080823,23083739,ENSG00000004487,KDM1A,True
