In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus


# Put the directory three levels above the current working directory on the
# import path; the `scripts` imports just below presumably resolve from there.
p = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if p not in sys.path:
    sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
# --- Input files (relative to this notebook's directory) ---
h5 = '../cerberus_annot.h5'                      # read with cerberus.read
ab = '../talon/human_talon_abundance.tsv'
filt_ab = '../cerberus_filtered_abundance.tsv'   # abundance table fed to the TPM / detection helpers
major_set = '../swan/isos_sample_gene_90.tsv'    # table of "major (90% set)" isoforms
swan_file = '../swan/swan.p'                     # read with swan.read

# --- Analysis settings ---
obs_col = 'sample'   # passed as obs_col to get_expressed_triplets
min_tpm = 1          # minimum TPM threshold

# Samples used for the "mouse match" triplet subsets.
mm_tissues = [
    'adrenal gland',
    'heart',
    'muscle',
    'brain',
    'pgp1_excite_neuron',
    'pgp1_astro',
    'h9_osteocyte',
    'h1',
    'wtc11',
]

In [3]:
# Load the cerberus annotation from the h5 file set in the config cell.
ca = cerberus.read(h5)
# Read the Swan graph from `swan_file`.
# NOTE(review): `swan` is not imported in the visible import cell; presumably
# it arrives through one of the `from scripts... import *` star imports.
# Confirm, or import it explicitly.
sg = swan.read(swan_file)

Read in graph from ../swan/swan.p


Triplets for each source in cerberus annotation

In [4]:
# Compute the triplets per annotation source (the Swan graph is handed over
# as `sg`) and register them on the cerberus annotation.
source_triplets = ca.get_source_triplets(sg=sg)
ca.add_triplets(source_triplets)

Expressed triplets

In [5]:
# Triplets for the set of isoforms that pass the TPM threshold in the
# filtered abundance table, stored under source 'obs_det'.
df = pd.read_csv(filt_ab, sep='\t')

# Use the shared `min_tpm` setting from the config cell rather than a
# hard-coded 1, so this cell stays in step with the sample-level cells.
df, tids = get_tpm_table(
    df,
    how='iso',
    min_tpm=min_tpm,
)

# `source` is passed by keyword, matching the other get_subset_triplets calls
# in this notebook.
df = ca.get_subset_triplets(tids, source='obs_det', sg=sg)
ca.add_triplets(df)

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 234040
# isos >= 1 tpm: 212911
Number of isos reported: 212911


Sample-level expressed triplets

In [6]:
# TODO - check whether the TPM calculation in utils.py and the one
# used here (presumably cerberus' own, in the next cell) give the same results

In [7]:
# Sample-level triplets: grouped by `obs_col`, thresholded at `min_tpm`,
# and stored under source 'sample_det'.
sample_det_triplets = ca.get_expressed_triplets(
    sg,
    obs_col=obs_col,
    min_tpm=min_tpm,
    source='sample_det',
)
ca.add_triplets(sample_det_triplets)

Union of major (90% set) expressed triplets

In [8]:
# Triplets computed over the union of all transcript IDs listed in the
# major-isoform table, stored under source 'obs_major'.
major_isos = pd.read_csv(major_set, sep='\t')
major_tids = major_isos['tid'].unique().tolist()
obs_major_triplets = ca.get_subset_triplets(major_tids, source='obs_major', sg=sg)
ca.add_triplets(obs_major_triplets)

Sample-level major (90% set) expressed triplets

In [9]:
# Sample-level triplets restricted to the major-isoform table, stored under
# source 'sample_major'.
major_isos = pd.read_csv(major_set, sep='\t')
sample_major_triplets = ca.get_expressed_triplets(
    sg,
    obs_col=obs_col,
    min_tpm=min_tpm,
    source='sample_major',
    subset=major_isos,
)
ca.add_triplets(sample_major_triplets)

Mouse match triplets

In [10]:
# Triplets for isoforms detected in at least one of the mouse-match samples,
# stored under source 'obs_mm_det'.
df = pd.read_csv(filt_ab, sep='\t')

# Use the shared `min_tpm` setting from the config cell rather than a
# hard-coded 1, so the threshold is changed in one place only.
df = get_det_table(
    df,
    groupby='sample',
    how='iso',
    min_tpm=min_tpm,
)

# After transposing, the samples are columns: keep only the mouse-match
# samples, then keep rows with a truthy value in at least one of them.
df = df.transpose()
df = df[mm_tissues]
df = df.loc[df.any(axis=1)]
# The remaining index holds the IDs passed on as `tids`.
tids = df.index.tolist()

df = ca.get_subset_triplets(tids, source='obs_mm_det', sg=sg)
ca.add_triplets(df)

Calculating iso TPM values
Enforcing minimum TPM
Total # isos detected: 234040
# isos >= 1 tpm: 212911
Number of isos reported: 212911
Found 50 total samples


Mouse match major (90% set) triplets

In [11]:
# Triplets over the major isoforms of the mouse-match samples only, stored
# under source 'obs_mm_major'. The two prints report the row count of the
# major-isoform table before and after the sample filter.
major_isos = pd.read_csv(major_set, sep='\t')
print(len(major_isos.index))

in_mm_tissue = major_isos['sample'].isin(mm_tissues)
major_isos = major_isos.loc[in_mm_tissue]
print(len(major_isos.index))

mm_major_tids = major_isos['tid'].unique().tolist()
mm_major_triplets = ca.get_subset_triplets(mm_major_tids, source='obs_mm_major', sg=sg)
ca.add_triplets(mm_major_triplets)

1534016
300835


Remove non-polyA genes

In [12]:
# Fetch gene-level GTF info for the 'polya' subset of the v40_cerberus
# annotation; only the first of the three returned values is used.
# The discarded values get explicit throwaway names instead of `_`: binding
# `_` in the notebook namespace stops IPython from updating its last-output
# variables (`_`, `__`, `___`).
df, _unused_1, _unused_2 = get_gtf_info(how='gene', ver='v40_cerberus', subset='polya')

In [13]:
# Add a 'gid_stable' column derived from the 'gid' column by
# cerberus.get_stable_gid (presumably the gene ID without its version
# suffix — confirm). The next cell matches it against ca.triplets.gid.
df['gid_stable'] = cerberus.get_stable_gid(df, 'gid')

In [14]:
# Keep only triplet rows whose gene ID is in the polyA gene list; the row
# count is printed before and after the filter.
polya_gids = df['gid_stable'].tolist()
print(len(ca.triplets))

is_polya = ca.triplets['gid'].isin(polya_gids)
ca.triplets = ca.triplets.loc[is_polya]
print(len(ca.triplets))

1600865
1535912


In [15]:
# Spot check: show the ELN rows for the 'obs_det' and 'obs_mm_det' sources.
is_eln = ca.triplets['gname'] == 'ELN'
is_det_source = ca.triplets['source'].isin(['obs_det', 'obs_mm_det'])
ca.triplets.loc[is_eln & is_det_source]

Unnamed: 0,source,gid,n_tss,n_tes,n_ic,n_iso,splicing_ratio,tss_ratio,tes_ratio,spl_ratio,gname,sample,gene_tpm
651,obs_det,ENSG00000049540,3.0,7.0,140.0,283.0,28.0,0.078947,0.184211,0.736842,ELN,,
614,obs_mm_det,ENSG00000049540,2.0,5.0,50.0,73.0,14.285714,0.09396,0.234899,0.671141,ELN,,


Write triplets to file

In [16]:
# Write the annotation to a new h5 file in the current directory, leaving the
# input file from the config cell untouched. Presumably the output includes
# the triplets added above — confirm against cerberus' write().
ca.write('cerberus_annot_triplets.h5')

In [17]:
# Also write the triplets table on its own as a tab-separated file;
# index=False leaves the DataFrame index out of the output.
ca.triplets.to_csv('triplets.tsv', sep='\t', index=False)

In [18]:
# List the distinct triplet sources now present in the table.
ca.triplets['source'].unique()

array(['v40', 'v29', 'lapa', 'gtex', 'all', 'obs_det', 'sample_det',
       'obs_major', 'sample_major', 'obs_mm_det', 'obs_mm_major'],
      dtype=object)

## add simplex coords to triplets

In [19]:
# ca = cerberus.read('cerberus_annot_triplets.h5')
# ca.triplets = cerberus.compute_simplex_coords(ca.triplets)
# ca.write('cerberus_annot_triplets.h5')

## sanity checks

In [31]:
# Sanity check: the 'sample_major' triplet row for WASH7P in the caco2 sample.
is_caco2 = ca.triplets['sample'] == 'caco2'
is_wash7p = ca.triplets['gname'] == 'WASH7P'
is_sample_major = ca.triplets['source'] == 'sample_major'
ca.triplets.loc[is_caco2 & is_wash7p & is_sample_major]

Unnamed: 0,source,gid,n_tss,n_tes,n_ic,n_iso,splicing_ratio,gname,sample,gene_tpm
11419,sample_major,ENSG00000227232,1.0,1.0,3.0,3.0,3.0,WASH7P,caco2,17.223036


In [3]:
# temp = ca.t_map[['transcript_id', 'tss_first_sd_issue']].copy(deep=True)
# print(temp.groupby('tss_first_sd_issue').count())

# temp = ca.t_map[['transcript_id', 'tes_last_sa_issue']].copy(deep=True)
# print(temp.groupby('tes_last_sa_issue').count())

# null are ok because we feed more ends than intron chains
# for lapa
# source_trip.loc[source_trip.n_iso.isnull()]

# ca.ic.loc[(ca.ic.gene_id =='ENSG00000002079')&(ca.ic.source.str.contains('lapa'))]
# ca.tss.loc[(ca.tss.gene_id =='ENSG00000002079')&(ca.tss.source.str.contains('lapa'))]

# ca.t_map.loc[(ca.t_map.gene_id == 'ENSG00000002079')&(ca.t_map.source.str.contains('lapa'))]