In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import matplotlib as mpl

# Put the directory two levels above the current working directory on the
# import path; presumably this is the repository root that contains the
# `proc_revisions` package imported below -- TODO confirm.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Directory of the proc_revisions pipeline, relative to the working directory.
# The trailing slash is kept on purpose: other cells build paths by plain
# string concatenation (`od + ...`).
od = '../../proc_revisions/'

# os.path.join yields a single separator even though `od` already ends in '/'.
config_file = os.path.join(od, 'config.yml')

# Parse the pipeline configuration into a plain nested dict.
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
# Input files for the human long-read data. Each config entry is a path
# template that `expand` fills in; the first (only) expansion is prefixed
# with the pipeline directory `od`.
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]

# biosamp_name_map = '../'+expand(config['ref']['biosamp_map'])[0]

# cage_meta = '../'+expand(config['cage']['meta'], species='human')[0]
# rampage_meta = '../'+expand(config['rampage']['meta'], species='human')[0]

# Analysis parameters (each defined exactly once).
# min_tpm is handed to get_feat_support by the exploratory cells below.
min_tpm = 1
gene_subset = 'polya'

In [4]:
def filt_unsup_ism(filt_ab, cerberus_h5, wildcards, ofile):
    """
    Drop unspliced transcripts and unsupported ISM transcripts from an
    abundance table and write the remaining rows to `ofile`.

    A transcript is removed when the `novelty` of its intron chain (taken
    from the `ic` table of the cerberus annotation) is
      * 'Unspliced', or
      * 'ISM' while its TSS and/or its TES has the support category 'Novel'
        in the table returned by `get_feat_support`.

    Parameters:
        filt_ab (str): Path to the tab-separated abundance file. It must
            contain an `annot_transcript_id` column.
        cerberus_h5 (str): Path to the cerberus annotation file.
        wildcards: Mapping with a 'species' entry, either 'human' or 'mouse'.
        ofile (str): Path the filtered, tab-separated table is written to.

    Raises:
        ValueError: If the species is neither 'human' nor 'mouse'.
    """
    species = wildcards['species']

    # Reference annotation versions and the external datasets that count as
    # support for each transcript end, per species.
    species_sources = {
        'human': {
            'ref': ['v29', 'v40'],
            'tss': ['encode_cage', 'fantom_cage', 'encode_rampage', 'gtex', 'pls',
                    'encode_procap', 'lrgasp_cage', 'pol2', 'ca_h3k4me3'],
            'tes': ['gtex', 'pas', 'polya_atlas'],
        },
        'mouse': {
            'ref': ['vM21', 'vM25'],
            'tss': ['h3k4me3', 'fantom_cage', 'pls', 'pol2'],
            'tes': ['pas', 'polya_atlas'],
        },
    }
    # Fail early with a clear message instead of hitting unbound source lists
    # further down.
    if species not in species_sources:
        raise ValueError(
            f"Unsupported species '{species}'; "
            f"expected one of {sorted(species_sources)}"
        )
    sources = species_sources[species]

    # Support category for every TSS / TES. min_tpm=0 is passed so that no end
    # is excluded on expression grounds.
    end_support = {}
    for feat in ('tss', 'tes'):
        feat_df = get_feat_support(filt_ab,
                                   cerberus_h5,
                                   feat,
                                   sources['ref'],
                                   sources[feat],
                                   min_tpm=0,
                                   how=feat,
                                   species=species)
        end_support[feat] = feat_df.rename({'Name': feat,
                                            'support': f'{feat}_support'},
                                           axis=1)

    # Read the abundance table once; annotate a copy so the rows that are
    # finally written keep exactly the columns of the input file.
    ab_df = pd.read_csv(filt_ab, sep='\t')
    df = ab_df.copy()
    for feat in ('tss', 'tes', 'ic'):
        df = add_feat(df, 'annot_transcript_id', feat)

    # Attach intron chain metadata (including `novelty`), keyed on the
    # intron chain name.
    # NOTE(review): `cerberus` is not imported explicitly in this notebook;
    # presumably it arrives through one of the star imports -- confirm.
    ca = cerberus.read(cerberus_h5)
    temp_ic = ca.ic.drop('ic', axis=1).rename({'Name': 'ic'}, axis=1)
    df = df.merge(temp_ic, on='ic', how='left')

    # Attach the support category of each transcript's TSS and TES.
    df = df.merge(end_support['tss'], how='left', on='tss')
    df = df.merge(end_support['tes'], how='left', on='tes')

    # An ISM that is unsupported at both ends is already covered by the
    # "either end" condition, so one mask is enough and no id is collected
    # more than once.
    unspliced = df.novelty == 'Unspliced'
    unsup_ism = (df.novelty == 'ISM') & \
                ((df.tss_support == 'Novel') | (df.tes_support == 'Novel'))
    rm_tids = df.loc[unspliced | unsup_ism, 'annot_transcript_id'].unique()
    keep_tids = df.loc[~df.annot_transcript_id.isin(rm_tids),
                       'annot_transcript_id'].unique()

    # filter the abundance file
    ab_df = ab_df.loc[ab_df.annot_transcript_id.isin(keep_tids)]
    ab_df.to_csv(ofile, sep='\t', index=False)

In [5]:
# Exercise filt_unsup_ism on the human inputs; the filtered abundance table
# is written to 'test_out.tsv' in the current working directory.
wildcards = {'species': 'human'}
filt_unsup_ism(filt_ab, cerberus_h5, wildcards, 'test_out.tsv')

Calculating tss TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Number of tsss reported: 76501
154251
76501
Calculating tes TPM values
Number of tess reported: 85348
184856
85348


In [15]:
# Support table for TSSs, this time with the notebook-level minimum TPM
# applied (min_tpm) rather than 0.
feat = 'tss'
ref_sources = ['v29', 'v40']
support_sources = [
    'encode_cage', 'fantom_cage', 'encode_rampage', 'gtex', 'pls',
    'encode_procap', 'lrgasp_cage', 'pol2', 'ca_h3k4me3',
]

tss_df = get_feat_support(
    filt_ab, cerberus_h5, feat, ref_sources, support_sources,
    min_tpm=min_tpm, how=feat,
)


Calculating tss TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # tsss detected: 76501
# tsss >= 1 tpm: 67354
Number of tsss reported: 67354
154251
67354


In [16]:
# Support table for TESs, with the notebook-level minimum TPM applied
# (min_tpm).
feat = 'tes'
ref_sources = ['v29', 'v40']
support_sources = ['gtex', 'pas', 'polya_atlas']

tes_df = get_feat_support(
    filt_ab, cerberus_h5, feat, ref_sources, support_sources,
    min_tpm=min_tpm, how=feat,
)

Calculating tes TPM values
Enforcing minimum TPM
Total # tess detected: 85348
# tess >= 1 tpm: 74074
Number of tess reported: 74074
184856
74074


In [25]:
# Load the abundance table and report how many transcripts it holds.
df = pd.read_csv(filt_ab, sep='\t')
print(df.shape[0])

# Add one identifier column per triplet feature (tss, tes, ic), derived from
# the transcript id.
for feat_col in ['tss', 'tes', 'ic']:
    df = add_feat(df, 'annot_transcript_id', feat_col)

df.head()

236662


Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,hl60_m1_24hr_1_1,k562_2_1,pgp1_endo_1_1,h1_de_1_1,pgp1_astro_1_2,hl60_1_2,psoas_muscle_2_1,tss,tes,ic
0,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,1]",TSPAN6,"TSPAN6[1,1,1]",8.0,2206.0,Known,Known,...,0,0,0,5,0,0,0,ENSG00000000003_1,ENSG00000000003_1,ENSG00000000003_1
1,57105,202958202958,ENSG00000000003.14,"ENSG00000000003[1,1,5]",TSPAN6,"TSPAN6[1,1,5]",8.0,2206.0,Known,Known,...,0,0,13,31,6,0,0,ENSG00000000003_1,ENSG00000000003_5,ENSG00000000003_1
2,57105,202958202958,ENSG00000000003.14,"ENSG00000000003[1,1,6]",TSPAN6,"TSPAN6[1,1,6]",8.0,2206.0,Known,Known,...,0,0,0,1,0,0,0,ENSG00000000003_1,ENSG00000000003_6,ENSG00000000003_1
3,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,7]",TSPAN6,"TSPAN6[1,1,7]",8.0,2206.0,Known,Known,...,0,0,32,72,15,0,2,ENSG00000000003_1,ENSG00000000003_7,ENSG00000000003_1
4,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,8]",TSPAN6,"TSPAN6[1,1,8]",8.0,2206.0,Known,Known,...,0,1,31,31,12,0,1,ENSG00000000003_1,ENSG00000000003_8,ENSG00000000003_1


In [27]:
# Load the cerberus annotation from the h5 file.
# NOTE(review): `cerberus` is not imported explicitly in the import cell;
# presumably it is provided by one of the star imports -- confirm.
ca = cerberus.read(cerberus_h5)

In [33]:
# Preview the intron chain (`ic`) table of the cerberus annotation.
ca.ic.head()


Unnamed: 0,Chromosome,Strand,Coordinates,Name,source,novelty,gene_id,ic
0,chr1,+,-,ENSG00000004487_18,"v40,v29,lapa",Known,ENSG00000004487,18
1,chr1,+,-,ENSG00000033122_5,"v40,v29,lapa",Known,ENSG00000033122,5
2,chr1,+,-,ENSG00000049246_6,"v40,v29,lapa",Known,ENSG00000049246,6
3,chr1,+,-,ENSG00000077157_8,"v40,v29,lapa",Known,ENSG00000077157,8
4,chr1,+,-,ENSG00000099260_3,"v40,v29,lapa",Known,ENSG00000099260,3


In [31]:
# Use the intron chain name as the join key: drop the numeric `ic` column
# and let `Name` take its place, then attach the ic metadata to every
# transcript.
# NOTE: this cell reassigns `df`, so running it twice merges the metadata twice.
temp_ic = ca.ic.drop(columns='ic').rename(columns={'Name': 'ic'})
df = pd.merge(df, temp_ic, how='left', on='ic')
print(df.shape[0])

236662


In [36]:
# Start the removal list with every transcript whose intron chain is unspliced.
rm_tids = df.loc[df.novelty == 'Unspliced', 'annot_transcript_id'].tolist()
print(len(rm_tids))


3369


In [39]:
# Rename the key and support columns so both tables can be merged onto the
# transcript table without clashing.
tss_df = tss_df.rename(columns={'Name': 'tss', 'support': 'tss_support'})
tes_df = tes_df.rename(columns={'Name': 'tes', 'support': 'tes_support'})
tes_df.head()

Unnamed: 0,tes,tes_support
0,ENSG00000000460_1,Known
1,ENSG00000000460_2,Known
2,ENSG00000000460_3,Known
3,ENSG00000000460_6,Supported
4,ENSG00000000971_1,Known


In [43]:
# List the distinct support categories present in the TSS table.
tss_df.tss_support.unique()

array(['Known', 'Supported', 'Novel'], dtype=object)

In [41]:
# Attach the TSS support category; the row count should stay unchanged.
df = pd.merge(df, tss_df, on='tss', how='left')
df.shape[0]

236662

In [42]:
# Attach the TES support category; the row count should stay unchanged.
df = pd.merge(df, tes_df, on='tes', how='left')
df.shape[0]

236662

In [45]:
# unsupported at both
# NOTE(review): these transcripts also satisfy the TSS-only and TES-only
# filters applied in the following cells, so their ids end up in rm_tids more
# than once and len(rm_tids) overcounts unique transcripts.
rm_tids.extend(
    df.loc[(df.novelty == 'ISM')
           & (df.tss_support == 'Novel')
           & (df.tes_support == 'Novel'),
           'annot_transcript_id'].tolist()
)
print(len(rm_tids))

3474


In [46]:
# unsupported at tss
rm_tids.extend(
    df.loc[(df.novelty == 'ISM') & (df.tss_support == 'Novel'),
           'annot_transcript_id'].tolist()
)
print(len(rm_tids))

7119


In [47]:
# unsupported at tes
rm_tids.extend(
    df.loc[(df.novelty == 'ISM') & (df.tes_support == 'Novel'),
           'annot_transcript_id'].tolist()
)
print(len(rm_tids))

7993


In [49]:
# Every transcript id that was not flagged for removal.
keep_tids = df.loc[~df.annot_transcript_id.isin(rm_tids), 'annot_transcript_id'].tolist()
len(keep_tids)

228879

In [51]:
# filter the abundance file
# Re-read the table so the helper columns added above are not carried over,
# then keep only the retained transcripts.
df = (
    pd.read_csv(filt_ab, sep='\t')
    .loc[lambda ab: ab.annot_transcript_id.isin(keep_tids)]
)
# df.to_csv(ofile, sep='\t', index=False)

In [52]:
# Preview the filtered abundance table.
df.head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,aorta_2_1,brodmann_area_46_4_1,mucosa_of_descending_colon_1_1,hl60_m1_24hr_1_1,k562_2_1,pgp1_endo_1_1,h1_de_1_1,pgp1_astro_1_2,hl60_1_2,psoas_muscle_2_1
0,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,1]",TSPAN6,"TSPAN6[1,1,1]",8.0,2206.0,Known,Known,...,0,0,0,0,0,0,5,0,0,0
1,57105,202958202958,ENSG00000000003.14,"ENSG00000000003[1,1,5]",TSPAN6,"TSPAN6[1,1,5]",8.0,2206.0,Known,Known,...,1,2,36,0,0,13,31,6,0,0
2,57105,202958202958,ENSG00000000003.14,"ENSG00000000003[1,1,6]",TSPAN6,"TSPAN6[1,1,6]",8.0,2206.0,Known,Known,...,0,0,1,0,0,0,1,0,0,0
3,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,7]",TSPAN6,"TSPAN6[1,1,7]",8.0,2206.0,Known,Known,...,4,4,60,0,0,32,72,15,0,2
4,57105,202958,ENSG00000000003.14,"ENSG00000000003[1,1,8]",TSPAN6,"TSPAN6[1,1,8]",8.0,2206.0,Known,Known,...,2,4,89,0,1,31,31,12,0,1


In [53]:
# Number of transcripts remaining after filtering.
print(len(df.index))

228879
