In [15]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import matplotlib as mpl
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus
import subprocess

# Add the directory two levels above the current working directory to the
# import path so that the `scripts` package imported below can be resolved.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

# NOTE(review): star imports hide where helper names come from (e.g.
# `get_tpm_table`, used later, presumably lives in one of these modules) —
# consider importing the needed names explicitly.
from scripts.utils import *
from scripts.plotting import *

In [18]:
# Input paths, relative to the notebook's working directory.
# Tab-separated experiment metadata (read with pandas below).
exp_meta = '../metadata.tsv'
# Tab-separated filtered abundance table passed to get_tpm_table below.
filt_ab = '../cerberus/cerberus_filtered_abundance.tsv'
# Cerberus annotation file loaded with cerberus.read below.
c_annot = '../cerberus/cerberus_annot.h5'

## experiment IDs

Find the experiment accessions for the samples of interest and copy their LAPA TSS BED files over with `scp`.

In [7]:
# Load the tab-separated experiment metadata table.
df = pd.read_csv(exp_meta, sep='\t')

In [16]:
# Keep only the columns needed to map biosamples to experiment accessions.
df = df[['Experiment accession', 'Biosample term name']]
samples = ['GM12878', 'K562']
df = df.loc[df['Biosample term name'].isin(samples)]
exp_ids = df['Experiment accession'].unique().tolist()

# Remote directory holding one LAPA TSS BED file per experiment accession.
remote_dir = 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset'

for e in exp_ids:
    # Build the argument vector directly instead of formatting a command
    # string and whitespace-splitting it, so each argument stays intact
    # regardless of its content.
    cmd = ['scp', '{}/{}.bed'.format(remote_dir, e), '.']
    print(cmd)
    # check=True raises CalledProcessError if a transfer fails, so a missing
    # file stops the cell instead of being silently skipped.
    subprocess.run(cmd, check=True)

['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR526TQU.bed', '.']
['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR962BVU.bed', '.']
['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR983KDL.bed', '.']
['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR706ANY.bed', '.']
['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR589FUJ.bed', '.']
['scp', 'freese@hpc3.rcic.uci.edu:/dfs7/samlab/mcelik/rnawg/data/results/lapa/tss/dataset/ENCSR838WFC.bed', '.']


## Get BED files with TPM for each expressed cerberus TSS in GM12878 and K562

In [57]:
# Read the filtered abundance table and hand it straight to get_tpm_table,
# requesting TSS-level values with a minimum of 1 TPM. The second return
# value is kept as `ic_ids`.
feat = 'tss'
df, ic_ids = get_tpm_table(
    pd.read_csv(filt_ab, sep='\t'),
    how=feat,
    min_tpm=1,
)

Calculating tss TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # tsss detected: 76487
# tsss >= 1 tpm: 67730
Number of tsss reported: 67730


In [58]:
# Restrict the TPM table to the samples of interest. Columns are picked by
# substring match on the sample name and kept grouped in `samples` order.
samples = ['gm12878', 'k562']
cols = [col for s in samples for col in df.columns if s in col]

df = df[cols]

In [59]:
# Reshape to long format (one row per index entry / dataset pair, original
# index preserved), then keep only rows whose TPM is at least 1.
df = df.melt(ignore_index=False, var_name='dataset', value_name='tpm')
df = df[df['tpm'].ge(1)]


In [49]:
# Load the cerberus annotation; its TSS table (`ca.tss`) is merged with the
# TPM table in a later cell.
ca = cerberus.read(c_annot)

In [50]:
# Preview the first rows of the cerberus TSS table.
ca.tss.head()

Unnamed: 0,Chromosome,Start,End,Strand,Name,source,novelty,gene_id,tss
0,chr1,169794989,169795129,+,ENSG00000000460_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000460,1
1,chr1,169795358,169795459,+,ENSG00000000460_2,"v40,v29,lapa,pls",Known,ENSG00000000460,2
2,chr1,169794679,169794780,+,ENSG00000000460_3,"v40,v29,lapa,gtex,pls",Known,ENSG00000000460,3
3,chr1,169795870,169795971,+,ENSG00000000460_4,"v40,v29,pls",Known,ENSG00000000460,4
4,chr1,169661956,169662057,+,ENSG00000000460_5,"v40,v29,dels",Known,ENSG00000000460,5


In [60]:
# Work on a deep copy of the TSS table, then left-join it onto the TPM table
# by matching the TPM table's index against the TSS table's `Name` column.
tss_df = ca.tss.copy(deep=True)
df = pd.merge(
    df,
    tss_df,
    how='left',
    left_index=True,
    right_on='Name',
)

In [61]:
# Split each TSS name on underscores and take the first piece as the gene ID.
# This overwrites the `gene_id` column that came in with the TSS table.
name_parts = df['Name'].str.split('_', expand=True)
df['gene_id'] = name_parts[0]
df.head()


Unnamed: 0,dataset,tpm,Chromosome,Start,End,Strand,Name,source,novelty,gene_id,tss
140910,gm12878_3_1,31.562599,chr20,50958366,50958605,-,ENSG00000000419_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000419,1
140917,gm12878_3_1,15.237117,chr20,50936125,50936226,-,ENSG00000000419_8,"lapa,encode_cage",Novel,ENSG00000000419,8
7180,gm12878_3_1,9.795289,chr1,169893845,169894009,-,ENSG00000000457_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000457,1
0,gm12878_3_1,20.678944,chr1,169794989,169795129,+,ENSG00000000460_1,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,ENSG00000000460,1
2,gm12878_3_1,4.353462,chr1,169794679,169794780,+,ENSG00000000460_3,"v40,v29,lapa,gtex,pls",Known,ENSG00000000460,3


In [62]:
# Keep and order the columns to be written out: coordinates first, then the
# name and strand, followed by the per-dataset annotations. The `tss` column
# from the TSS table is dropped here.
cols = [
    'Chromosome', 'Start', 'End', 'Name', 'Strand',
    'gene_id', 'tpm', 'source', 'novelty', 'dataset',
]
df = df.loc[:, cols]

In [63]:
# Display the reordered table (pandas truncates the rendered rows).
df

Unnamed: 0,Chromosome,Start,End,Name,Strand,gene_id,tpm,source,novelty,dataset
140910,chr20,50958366,50958605,ENSG00000000419_1,-,ENSG00000000419,31.562599,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_3_1
140917,chr20,50936125,50936226,ENSG00000000419_8,-,ENSG00000000419,15.237117,"lapa,encode_cage",Novel,gm12878_3_1
7180,chr1,169893845,169894009,ENSG00000000457_1,-,ENSG00000000457,9.795289,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_3_1
0,chr1,169794989,169795129,ENSG00000000460_1,+,ENSG00000000460,20.678944,"v40,v29,lapa,gtex,encode_cage,fantom_cage,enco...",Known,gm12878_3_1
2,chr1,169794679,169794780,ENSG00000000460_3,+,ENSG00000000460,4.353462,"v40,v29,lapa,gtex,pls",Known,gm12878_3_1
...,...,...,...,...,...,...,...,...,...,...
123103,chr17,27089494,27089595,ENSG00000285822_1,+,ENSG00000285822,2.288277,"v40,v29,lapa,fantom_cage,encode_rampage,pls",Known,k562_2_2
108893,chr15,41332804,41332933,ENSG00000285920_3,+,ENSG00000285920,16.017940,"lapa,encode_cage,fantom_cage,encode_rampage,pls",Novel,k562_2_2
47613,chr5,36876606,36876750,ENSG00000285967_2,-,ENSG00000285967,11.441386,"v40,v29,lapa,encode_cage,fantom_cage,encode_ra...",Known,k562_2_2
51243,chr6,63572421,63572599,ENSG00000285976_1,+,ENSG00000285976,75.513146,"v40,lapa,encode_cage,fantom_cage,encode_rampag...",Known,k562_2_2


In [64]:
# Write one tab-separated file per dataset, named '<dataset>_cerberus.bed',
# into the current working directory. The header row is written; the index
# is not.
for d in df['dataset'].unique().tolist():
    fname = f'{d}_cerberus.bed'
    temp = df.loc[df['dataset'].eq(d)]
    temp.to_csv(fname, sep='\t', index=False)