In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Put the directory two levels above the current working directory on the
# import path. Presumably this is the repository root that contains the
# `proc_revisions` package imported below -- confirm against the repo layout.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Base directory that every config-derived path below is prefixed with.
od = '../../proc_revisions/'

# `od` already ends in a slash, so this path contains a doubled separator;
# that is tolerated by POSIX path resolution.
config_file = f'{od}/config.yml'

# Parse the YAML configuration into a plain Python object.
with open(config_file) as config_handle:
    config = yaml.safe_load(config_handle)

In [3]:
def _cfg_path(template, species='human', **wildcards):
    """Expand a config path template and prefix it with `od`.

    `template` is passed to snakemake's `expand` together with `species`
    and any extra wildcards; the first expanded string is returned,
    prepended with the base directory `od`.
    """
    return od + expand(template, species=species, **wildcards)[0]


_lr = config['lr']
_ref = config['ref']

# Paths resolved for the human data.
ab = _cfg_path(_lr['talon']['ab'])
filt_ab = _cfg_path(_lr['cerberus']['filt_ab'])
read_annot = _cfg_path(_lr['talon']['full_annot'])
t_metadata = _cfg_path(_ref['cerberus']['new_gtf_t_info'])
lib_meta = _cfg_path(_lr['meta'])
swan_file = _cfg_path(_lr['swan']['sg'])
cerberus_h5 = _cfg_path(_lr['cerberus']['ca_triplets'], obs_col='sample')
cerb_t_metadata = _cfg_path(_lr['cerberus']['gtf_t_info'])
major_isos = _cfg_path(_lr['analysis']['major_isos'], obs_col='sample')
pi_tpm_table = _cfg_path(_lr['mane']['pi_tpm']['triplet'], obs_col='sample')

ref_t_metadata = _cfg_path(_ref['new_gtf_t_info'])
ref_g_metadata = _cfg_path(_ref['new_gtf_g_info'])

# pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]


# Analysis parameters.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

# Library metadata path resolved for mouse instead of human.
m_lib_meta = _cfg_path(_lr['meta'], species='mouse')

## Get bed files for TSSs detected in each of the GM12878 / K562 datasets

These files are generated for Milad's TSS prediction.

In [15]:
# Keep only the human datasets whose name contains one of the two
# cell-line tags of interest.
datasets = [
    name
    for name in get_datasets(species='human')
    if any(tag in name for tag in ('gm12878', 'k562'))
]
datasets

['gm12878_1_1',
 'gm12878_1_2',
 'gm12878_1_3',
 'gm12878_1_4',
 'gm12878_2_1',
 'gm12878_2_2',
 'gm12878_3_1',
 'k562_1_1',
 'k562_2_1',
 'k562_2_2',
 'k562_3_1',
 'k562_3_2']

In [19]:
# Read the tab-separated filtered abundance table.
ab_df = pd.read_csv(filt_ab, sep='\t')

# Build the per-library TSS table with the project helper and flip it so
# that libraries become columns (the export loop below indexes `df[d]` by
# dataset name). Presumably the values are detection booleans -- confirm
# against `get_det_table`.
df = get_det_table(
    ab_df,
    how='tss',
    min_tpm=1,
    groupby='library',
    gene_subset=None,
).T

Calculating tss TPM values
Enforcing minimum TPM
Total # tsss detected: 72504
# tsss >= 1 tpm: 63636
Number of tsss reported: 63636
Found 138 total libraries


In [40]:
# TSS-level TPM table from the same abundance data and settings as the
# detection table; the helper's second return value is not needed here
# and is discarded.
tpm_df, _ = get_tpm_table(
    ab_df,
    how='tss',
    min_tpm=1,
    groupby='library',
    gene_subset=None,
)

Calculating tss TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # tsss detected: 72504
# tsss >= 1 tpm: 63636
Number of tsss reported: 63636


In [52]:
# Load the cerberus annotation object from the triplets h5 file.
# NOTE(review): `cerberus` is not imported explicitly in this notebook; it
# presumably arrives through one of the star imports -- confirm.
ca = cerberus.read(cerberus_h5)

# Work on an independent copy of the TSS table so the annotation object is
# left untouched (pandas `copy()` is a deep copy by default).
tss_df = ca.tss.copy()
tss_df.head()

Unnamed: 0,Chromosome,Start,End,Strand,Name,source,novelty,gene_id,tss
0,chr1,169794989,169795129,+,ENSG00000000460_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000460,1
1,chr1,169795358,169795459,+,ENSG00000000460_2,"v40,v29,lapa,pls,encode_procap,pol2",Known,ENSG00000000460,2
2,chr1,169794679,169794780,+,ENSG00000000460_3,"v40,v29,lapa,gtex,pls,lrgasp_cage,encode_proca...",Known,ENSG00000000460,3
3,chr1,169795870,169795971,+,ENSG00000000460_4,"v40,v29,pls,encode_procap",Known,ENSG00000000460,4
4,chr1,169661956,169662057,+,ENSG00000000460_5,"v40,v29,dels",Known,ENSG00000000460,5


In [55]:
# Write one tab-separated file per selected dataset listing the TSSs
# flagged in that dataset, annotated with their TPM and cerberus metadata.
#
# NOTE(review): the files carry a header row and the column order below
# differs from the standard BED layout (chrom, start, end, name, score,
# strand) even though they are named `.bed` -- confirm the consumer expects
# this format.

# Output columns, in the order they are written (loop-invariant).
cols = ['Chromosome', 'Start', 'End', 'Name',
        'Strand', 'gene_id', 'tpm',
        'source', 'novelty', 'dataset']

for d in datasets:
    # Rows (TSSs) flagged True for this dataset; the explicit comparison is
    # kept so any non-boolean entries are excluded exactly as before.
    temp = df.loc[df[d] == True, [d]].copy(deep=True)

    # `rename` returns a new frame here; the previous in-place rename acted
    # on a column subset of `tpm_df`, which is the pattern pandas warns
    # about (SettingWithCopyWarning).
    beep = tpm_df[[d]].rename(columns={d: 'tpm'})

    # Attach this dataset's TPM values by TSS index, then the TSS
    # coordinates / annotation by matching the index against `Name`.
    temp = temp.merge(beep, left_index=True, right_index=True, how='left')
    temp = temp.merge(tss_df, how='left', left_index=True, right_on='Name')
    temp['dataset'] = d
    temp = temp[cols]

    # Written to the current working directory.
    fname = f'{d}_cerberus.bed'
    temp.to_csv(fname, sep='\t', index=False)

In [54]:
# Preview the per-dataset table left in `temp` by the export loop.
temp.head(5)

Unnamed: 0,Chromosome,Start,End,Name,Strand,gene_id,tpm,source,novelty,dataset
151001,chrX,100636638,100636856,ENSG00000000003_1,-,ENSG00000000003,1.686523,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_1_1
141162,chr20,50958366,50958605,ENSG00000000419_1,-,ENSG00000000419,84.326133,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_1_1
7229,chr1,169893845,169894009,ENSG00000000457_1,-,ENSG00000000457,5.059568,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_1_1
0,chr1,169794989,169795129,ENSG00000000460_1,+,ENSG00000000460,13.492181,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_1_1
7232,chr1,27635083,27635235,ENSG00000000938_1,-,ENSG00000000938,3.373045,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_1_1


In [23]:
# Preview the first rows of the TPM table.
tpm_df.head(5)

tss,ENSG00000000003_1,ENSG00000000005_3,ENSG00000000419_1,ENSG00000000419_8,ENSG00000000457_1,ENSG00000000457_2,ENSG00000000457_3,ENSG00000000460_1,ENSG00000000460_2,ENSG00000000460_3,...,TALONG000281814_1,TALONG000285649_1,TALONG000299720_1,TALONG000329121_1,TALONG000372813_1,TALONG000401320_1,TALONG000418142_1,TALONG000436039_1,TALONG000436164_1,TALONG000443992_1
library,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a673_1_1,True,False,True,True,True,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
a673_1_2,True,False,True,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
adrenal_gland_1_1,True,False,True,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
adrenal_gland_2_1,True,False,True,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
adrenal_gland_3_1,True,False,True,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Row counts: the last exported per-dataset table, then the full table.
print(len(temp), len(df), sep='\n')

19575
63636
