In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import itertools

# Add the directory two levels above the current working directory to the
# module search path so that the project-local `scripts` package can be imported.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

# NOTE(review): `cerberus` and `pr` are used further down in this notebook but
# are never imported explicitly in it; presumably they are provided by these
# star imports -- TODO confirm.
from scripts.utils import *
from scripts.plotting import *

In [3]:
# Parse the snakemake YAML config into `config`; the path is relative to the
# notebook's working directory.
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [5]:
# File locations taken from the config. Each entry is passed through `expand`
# with species='human', the first result is taken, and '../' is prepended.
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
lib_meta = '../'+expand(config['data']['meta'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
ca_plus = '../'+expand(config['data']['ca_plus'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
gtf = '../'+expand(config['data']['cerb_gtf'], species='human')[0]
sfs = '../'+expand(config['ref']['sfs'], species='human')[0]
# annotation version string handed to get_gtf_info
ver = 'v40_cerberus'

# this entry is used as-is (no `expand` call)
phastcons = '../'+config['ref']['phastcons100']['txt']

# gene subset handed to get_gtf_info
gene_subset = 'polya'

# expanded without a species value, unlike the entries above
biosamp_name_map = '../'+expand(config['ref']['biosamp_map'])[0]

cage_meta = '../'+expand(config['cage']['meta'], species='human')[0]
rampage_meta = '../'+expand(config['rampage']['meta'], species='human')[0]

min_tpm = 1

In [29]:
# Gene-level table from get_gtf_info for the chosen gene subset and annotation
# version (the other two return values are discarded); only the stable gene ID
# and gene name columns are kept.
df, _, _ = get_gtf_info(how='gene',
                        subset=gene_subset,
                        ver=ver,
                        add_stable_gid=True)
df = df[['gid_stable', 'gname']]

In [32]:
# Read the cerberus annotation and keep the TSS rows whose `source` string
# contains 'lapa'.
# NOTE(review): presumably these are the TSSs supported by the observed data
# -- TODO confirm.
ca = cerberus.read(cerberus_h5)
tss_df = ca.tss.loc[(ca.tss.source.str.contains('lapa'))]

In [34]:
# Restrict the TSSs to genes present in the gene table `df`, printing the
# number of rows before and after the restriction.
print(len(tss_df.index))
tss_df = tss_df[tss_df.gene_id.isin(df.gid_stable.tolist())]
print(len(tss_df.index))

# Bring in the gene name with a left merge on the stable gene ID, then discard
# the merge key, which duplicates `gene_id`.
tss_df = (
    tss_df
    .merge(df, how='left', left_on='gene_id', right_on='gid_stable')
    .drop(columns='gid_stable')
)

76347
71895


In [39]:
# get two duplicate pyranges objects to merge
tss1 = pr.PyRanges(tss_df)
tss2 = pr.PyRanges(tss_df)

# Join the TSS regions against themselves on the same strand; columns coming
# from the second copy receive the '_other' suffix.
# NOTE(review): how=None presumably keeps only overlapping pairs and slack=0
# presumably means regions are not extended -- confirm against pyranges docs.
tss1 = tss1.join(tss2,
                 how=None,
                 strandedness='same',
                 slack=0,
                 suffix='_other')

# Keep only the joined rows whose two TSSs belong to different genes.
# NOTE: this rebinds `df`, which until here held the gene ID / gene name table.
df = tss1.df
df = df.loc[df.gene_id!=df.gene_id_other]

In [None]:
# number of joined rows whose two TSSs come from different genes
len(df.index)

1890

In [47]:
# Unique gene IDs appearing on either side of a retained pair, and how many
# there are.
gene_ids = list(set(df.gene_id.tolist()+df.gene_id_other.tolist()))
len(gene_ids)

1448

In [46]:
# Save the retained pairs as a tab-separated file in the working directory.
# index=False is not passed, so the DataFrame index is written as well.
df.to_csv('readthrough_genes.tsv', sep='\t')

In [58]:
## how many gencode polya genes are there w/ overlapping tsss
# Rebuild the gene ID / gene name table with the same get_gtf_info call as
# earlier, because `df` was rebound to the joined TSS pairs in between.
df, _, _ = get_gtf_info(how='gene',
                        subset=gene_subset,
                        ver=ver,
                        add_stable_gid=True)
df = df[['gid_stable', 'gname']]

In [59]:
# number of distinct stable gene IDs in the gene table
print(len(df.gid_stable.unique().tolist()))

52274


In [60]:
# Re-read the cerberus annotation and keep the TSS rows whose `source` string
# contains 'v40'.
# NOTE(review): presumably these are the TSSs annotated in GENCODE v40
# -- TODO confirm.
ca = cerberus.read(cerberus_h5)
tss_df = ca.tss.loc[(ca.tss.source.str.contains('v40'))]

In [61]:
# Percentage of genes in the gene table that appear in an observed
# cross-gene TSS overlap. Computed from the live objects instead of
# hand-copied counts so the value cannot go stale on a re-run: at this point
# `gene_ids` still holds the IDs derived from the observed ('lapa') TSS pairs
# and `df` holds the gene ID / gene name table.
(len(gene_ids)/len(df.gid_stable.unique().tolist()))*100

2.77001951256839

In [62]:
# Restrict the TSSs to genes present in the gene table `df`, printing the
# number of rows before and after the restriction.
print(len(tss_df.index))
tss_df = tss_df[tss_df.gene_id.isin(df.gid_stable.tolist())]
print(len(tss_df.index))

# Bring in the gene name with a left merge on the stable gene ID, then discard
# the merge key, which duplicates `gene_id`.
tss_df = (
    tss_df
    .merge(df, how='left', left_on='gene_id', right_on='gid_stable')
    .drop(columns='gid_stable')
)

135780
126506


In [63]:
# get two duplicate pyranges objects to merge
tss1 = pr.PyRanges(tss_df)
tss2 = pr.PyRanges(tss_df)

# Join the TSS regions against themselves on the same strand; columns coming
# from the second copy receive the '_other' suffix.
# NOTE(review): how=None presumably keeps only overlapping pairs and slack=0
# presumably means regions are not extended -- confirm against pyranges docs.
tss1 = tss1.join(tss2,
                 how=None,
                 strandedness='same',
                 slack=0,
                 suffix='_other')

# Keep only the joined rows whose two TSSs belong to different genes.
# NOTE: this rebinds `df`, which until here held the gene ID / gene name table.
df = tss1.df
df = df.loc[df.gene_id!=df.gene_id_other]

In [65]:
# Unique gene IDs appearing on either side of a retained pair, and how many
# there are. NOTE: this overwrites the `gene_ids` list built earlier.
gene_ids = list(set(df.gene_id.tolist()+df.gene_id_other.tolist()))
len(gene_ids)

1882

In [66]:
# Save the retained pairs as a tab-separated file in the working directory.
# index=False is not passed, so the DataFrame index is written as well.
df.to_csv('gencode_readthrough_genes.tsv', sep='\t')

In [67]:
## what's the intersection of gencode read through and observed read through?
# Load the two pair tables written above: df1 from 'readthrough_genes.tsv',
# df2 from 'gencode_readthrough_genes.tsv'.
df1 = pd.read_csv('readthrough_genes.tsv', sep='\t')
df2 = pd.read_csv('gencode_readthrough_genes.tsv', sep='\t')

In [99]:
def sort_gnames_name_fusion(df):
    """
    Put each gene pair into a consistent order and build a pair identifier,
    so that the pairs (A, B) and (B, A) receive the same `fusion_name`.

    Parameters:
        df (pandas DataFrame): Table with at least the columns `gene_id`,
            `gname`, `gene_id_other` and `gname_other`. It is not modified.

    Returns:
        df (pandas DataFrame): New table holding the four input columns plus
            `gid_1` / `gid_2` (row-wise min / max of the two gene IDs),
            `gname_1` / `gname_2` (row-wise min / max of the two gene names)
            and `fusion_name` ('<gid_1>-<gid_2>').

    NOTE: gene names are ordered independently of gene IDs, so `gname_1` is
    not guaranteed to be the name of the gene in `gid_1`.
    """
    # Explicit copy: assigning new columns onto a bare column selection of the
    # caller's frame triggers pandas' SettingWithCopyWarning.
    df = df[['gene_id', 'gname', 'gene_id_other', 'gname_other']].copy()

    df['gid_1'] = df[['gene_id', 'gene_id_other']].min(axis=1)
    df['gname_1'] = df[['gname', 'gname_other']].min(axis=1)
    df['gid_2'] = df[['gene_id', 'gene_id_other']].max(axis=1)
    df['gname_2'] = df[['gname', 'gname_other']].max(axis=1)

    # order-independent identifier for the gene pair
    df['fusion_name'] = df.gid_1+'-'+df.gid_2

    return df

In [100]:
# Add the ordered gene ID / gene name columns and `fusion_name` to both pair
# tables so that pairs can be matched between them.
df1 = sort_gnames_name_fusion(df1)
df2 = sort_gnames_name_fusion(df2)

In [101]:
# for df in [df1, df2]:
#     df = df[['gene_id', 'gname', 'gene_id_other', 'gname_other']]
#     df['gid_1'] = df[['gene_id', 'gene_id_other']].min(axis=1)
#     df['gname_1'] = df[['gname', 'gname_other']].min(axis=1)
#     df['gid_2'] = df[['gene_id', 'gene_id_other']].max(axis=1)
#     df['gname_2'] = df[['gname', 'gname_other']].max(axis=1)

In [102]:
# df1 = df1[['gene_id', 'gname', 'gene_id_other', 'gname_other']]
# df1['gid_1'] = df1[['gene_id', 'gene_id_other']].min(axis=1)
# df1['gname_1'] = df1[['gname', 'gname_other']].min(axis=1)
# df1['gid_2'] = df1[['gene_id', 'gene_id_other']].max(axis=1)
# df1['gname_2'] = df1[['gname', 'gname_other']].max(axis=1)
# # df2 = df2[['gene_id', 'gname', 'gene_id_other', 'gname_other']]

In [107]:
# what % of readthrough events that we detect have an equivalent gencode event?
# NOTE(review): `n` and `n_num` count unique gene IDs involved in the pairs,
# not the pairs themselves, although the printed message says "events".
# n: unique gene IDs across all pairs in df1
n = len(set(df1.gene_id.tolist()+df1.gene_id_other.tolist()))
# pairs in df1 whose fusion_name also occurs in df2
temp = df1.loc[df1.fusion_name.isin(df2.fusion_name.tolist())]
# n_num: unique gene IDs across those shared pairs
n_num = len(set(temp.gene_id.tolist()+temp.gene_id_other.tolist()))
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) readthrough events we detect have an equivalent in GENCODE')

48.90% (708/1448) readthrough events we detect have an equivalent in GENCODE
