In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Walk two directory levels up from the current working directory and append
# that directory to sys.path (presumably so the `proc_revisions` package
# imported right after this can be found).
p = os.getcwd()
for _level in range(2):
    p = os.path.dirname(p)
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Project output directory, relative to this notebook. The trailing slash is
# kept on purpose: later cells build paths with plain `od + <relative path>`.
od = '../../proc_revisions/'

# Build the config path with os.path.join so the trailing slash in `od` does
# not produce a doubled separator ('../../proc_revisions//config.yml').
config_file = os.path.join(od, 'config.yml')

# Parse the project-wide YAML config; safe_load avoids constructing
# arbitrary Python objects from the file.
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
def _cfg_path(template, species='human', **wildcards):
    """Expand a config path template and prefix it with `od`.

    `template` is an entry from `config`; `species` and any extra
    `wildcards` are forwarded to snakemake's `expand`. Only the first
    expanded path is used, concatenated directly onto `od`.
    """
    return od + expand(template, species=species, **wildcards)[0]


# Human long-read inputs
ab = _cfg_path(config['lr']['talon']['fusion_fix']['ab'])
filt_ab = _cfg_path(config['lr']['cerberus']['filt_ab'])
read_annot = _cfg_path(config['lr']['talon']['full_annot'])
t_metadata = _cfg_path(config['ref']['cerberus']['new_gtf_t_info'])
lib_meta = _cfg_path(config['lr']['meta'])
swan_file = _cfg_path(config['lr']['swan']['sg'])
cerberus_h5 = _cfg_path(config['lr']['cerberus']['ca_triplets'], obs_col='sample')
cerb_t_metadata = _cfg_path(config['lr']['cerberus']['gtf_t_info'])
major_isos = _cfg_path(config['lr']['analysis']['major_isos'], obs_col='sample')
pi_tpm_table = _cfg_path(config['lr']['mane']['pi_tpm']['triplet'], obs_col='sample')

# Human reference annotation metadata
ref_t_metadata = _cfg_path(config['ref']['new_gtf_t_info'])
ref_g_metadata = _cfg_path(config['ref']['new_gtf_g_info'])

# pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]


# Analysis parameters
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

# Mouse library metadata
m_lib_meta = _cfg_path(config['lr']['meta'], species='mouse')

## Get bed files for TSSs detected in each of the GM12878 / K562 datasets

These files are generated for use in Milad's TSS prediction.

In [4]:
# Keep only the human datasets whose name contains one of the two cell-line tags.
datasets = [
    name
    for name in get_datasets(species='human')
    if any(tag in name for tag in ('gm12878', 'k562'))
]
datasets

['gm12878_1_1',
 'gm12878_1_2',
 'gm12878_1_3',
 'gm12878_1_4',
 'gm12878_2_1',
 'gm12878_2_2',
 'gm12878_3_1',
 'k562_1_1',
 'k562_2_1',
 'k562_2_2',
 'k562_3_1',
 'k562_3_2']

In [5]:
# Load the filtered abundance table (tab-separated).
ab_df = pd.read_csv(filt_ab, sep='\t')

# Build the per-library TSS detection table (1 TPM minimum, no gene subset)
# and flip it so that libraries become columns.
df = get_det_table(
    ab_df,
    how='tss',
    min_tpm=1,
    groupby='library',
    gene_subset=None,
).T

Calculating tss TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # tsss detected: 73812
# tsss >= 1 tpm: 64913
Number of tsss reported: 64913
Found 138 total libraries


In [6]:
# Per-library TSS TPM table, requested with the same arguments as the
# detection table; the second value returned by get_tpm_table is not used.
tpm_df, _ = get_tpm_table(
    ab_df,
    how='tss',
    min_tpm=1,
    groupby='library',
    gene_subset=None,
)

Calculating tss TPM values
Enforcing minimum TPM
Total # tsss detected: 73812
# tsss >= 1 tpm: 64913
Number of tsss reported: 64913


In [7]:
# Read the cerberus annotation and take an independent (deep) copy of its
# TSS table so later edits cannot touch the annotation object.
ca = cerberus.read(cerberus_h5)
tss_df = ca.tss.copy()
tss_df.head(n=5)

Unnamed: 0,Chromosome,Start,End,Strand,Name,source,novelty,gene_id,tss
0,chr1,169794989,169795129,+,ENSG00000000460_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000460,1
1,chr1,169795358,169795459,+,ENSG00000000460_2,"v40,v29,lapa,pls,encode_procap,pol2",Known,ENSG00000000460,2
2,chr1,169794679,169794780,+,ENSG00000000460_3,"v40,v29,lapa,gtex,pls,lrgasp_cage,encode_proca...",Known,ENSG00000000460,3
3,chr1,169795870,169795971,+,ENSG00000000460_4,"v40,v29,pls,encode_procap",Known,ENSG00000000460,4
4,chr1,169661956,169662057,+,ENSG00000000460_5,"v40,v29,dels",Known,ENSG00000000460,5


In [8]:
# Write one tab-separated TSS table per selected library. Each file holds the
# TSSs flagged True for that library in the detection table `df`, annotated
# with the library's TPM (from `tpm_df`) and the cerberus TSS record
# (from `tss_df`).
#
# NOTE(review): the files are named *.bed but are written with a header row
# and with Name/Strand in columns 4/5 (no score column), so they are not
# strict BED — confirm the downstream consumer expects this layout.

# Output column order; identical for every library, so defined once.
cols = ['Chromosome', 'Start', 'End', 'Name',
        'Strand', 'gene_id', 'tpm',
        'source', 'novelty', 'dataset']

for d in datasets:
    # Rows (TSS names) flagged as detected in this library.
    temp = df.loc[df[d] == True, [d]].copy(deep=True)

    # This library's TPM column, renamed without mutating the frame in place:
    # an inplace rename on a column subset of `tpm_df` can raise
    # SettingWithCopyWarning on older pandas and buys nothing here.
    beep = tpm_df[[d]].rename({d: 'tpm'}, axis=1)

    # Attach TPM by TSS name (index), then the TSS annotation by `Name`.
    temp = temp.merge(beep, left_index=True, right_index=True, how='left')
    temp = temp.merge(tss_df, how='left', left_index=True, right_on='Name')
    temp['dataset'] = d

    temp = temp[cols]
    fname = f'{d}_cerberus.bed'
    temp.to_csv(fname, sep='\t', index=False)

In [9]:
# Preview the table left in `temp` by the export loop (the last dataset processed).
temp.head(n=5)

Unnamed: 0,Chromosome,Start,End,Name,Strand,gene_id,tpm,source,novelty,dataset
142915,chr20,50958366,50958605,ENSG00000000419_1,-,ENSG00000000419,24.781289,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,k562_3_2
142922,chr20,50936125,50936226,ENSG00000000419_8,-,ENSG00000000419,19.697948,"lapa,encode_cage,lrgasp_cage",Novel,k562_3_2
7347,chr1,169893845,169894009,ENSG00000000457_1,-,ENSG00000000457,2.541671,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,k562_3_2
0,chr1,169794989,169795129,ENSG00000000460_1,+,ENSG00000000460,45.114655,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,k562_3_2
2,chr1,169794679,169794780,ENSG00000000460_3,+,ENSG00000000460,3.812506,"v40,v29,lapa,gtex,pls,lrgasp_cage,encode_proca...",Known,k562_3_2


In [10]:
# Preview the per-library TSS TPM table.
tpm_df.head(n=5)

Unnamed: 0_level_0,hffc6_1_2,mucosa_of_descending_colon_1_1,hl60_m1_12hr_1_2,ovary_3_1,hct116_1_1,hl60_m2_24hr_1_2,calu3_1_1,caco2_1_2,huvec_1_1,cardiac_septum_1_1,...,right_cardiac_atrium_1_1,posterior_vena_cava_2_1,h1_1_3,ocily7_1_2,heart_left_ventricle_4_1,h9_chondro_1_1,pgp1_1_1,lower_lobe_of_right_lung_1_1,a673_1_2,hepg2_1_1
tss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003_1,71.861107,360.746229,0.0,74.958432,61.459303,0.0,223.307303,481.080358,97.733451,4.412956,...,14.401181,26.213121,174.196734,0.0,15.994626,41.051453,117.928358,25.737767,123.006788,189.713335
ENSG00000000005_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.103239,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000419_1,95.814809,40.082914,179.579325,37.479216,37.738168,97.710321,100.908739,161.031082,41.059616,24.271256,...,25.202067,26.213121,23.072415,53.023228,8.724341,32.673606,34.182133,9.359188,51.880414,33.987731
ENSG00000000419_8,0.0,0.0,1.044066,0.0,1.078233,0.0,0.0,0.0,0.578304,1.103239,...,0.0,0.0,0.0,0.0,1.454057,0.0,0.0,3.509696,0.0,9.269381
ENSG00000000457_1,1.596913,3.81742,2.088132,0.0,5.391167,0.0,3.737361,5.032221,1.734913,3.309717,...,3.600295,0.0,5.768104,8.435514,8.724341,5.026709,1.709107,3.509696,1.673562,4.94367


In [11]:
# Row counts: TSSs in the last exported table vs. all rows of the detection table.
print(len(temp), len(df), sep='\n')

18095
64913
