In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus


# Put the directory two levels above the current working directory on the
# import path; the `scripts.*` imports that follow presumably rely on this.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [4]:
# Load gene-level metadata for the 'vM25_cerberus' annotation version. Only
# the first of the three values returned by get_gtf_info is kept.
df, _, _ = get_gtf_info(how='gene', ver='vM25_cerberus')
df.head()

Unnamed: 0,gid,gname,length,biotype,biotype_category,tf
0,ENSMUSG00000000544.14,Gpa33,36373,protein_coding,protein_coding,False
1,ENSMUSG00000001138.13,Cnnm3,16471,protein_coding,protein_coding,False
2,ENSMUSG00000003135.15,Cnot11,11998,protein_coding,protein_coding,False
3,ENSMUSG00000003464.13,Pex19,9839,protein_coding,protein_coding,False
4,ENSMUSG00000004552.16,Ctse,37300,protein_coding,protein_coding,False


In [5]:
# All three inputs live under the same cerberus output directory, so the
# shared prefix is spelled out once and the file names are appended to it.
_cerberus_dir = '/Users/fairliereese/mortazavi_lab/data/mousewg/lr_bulk/cerberus'
h5 = f'{_cerberus_dir}/cerberus_annot.h5'  # cerberus annotation
filt_ab = f'{_cerberus_dir}/cerberus_filtered_abundance.tsv'  # filtered abundance table
swangraph = f'{_cerberus_dir}/swan/swan.p'  # pickled SwanGraph

## Get source triplets from the cerberus annotation

In [8]:
# Read the cerberus annotation and the SwanGraph, then attach the graph to
# the annotation object (it is read back later as `ca.sg`).
# NOTE(review): `swan` is not imported by name in the import cell; presumably
# it is provided by one of the `from scripts.* import *` lines -- confirm.
ca = cerberus.read(h5)
sg = swan.read(swangraph)
ca.set_sg(sg)

Read in graph from /Users/fairliereese/mortazavi_lab/data/mousewg/lr_bulk/cerberus/swan/swan.p


In [9]:
# Build the source triplets table from the annotation and write it as a TSV
# (relative path, so it lands in the current working directory).
source_trip = ca.get_source_triplets(sg)
source_trip.to_csv('mouse_source_triplets.tsv', sep='\t', index=False)

## Get triplets from a list of transcript IDs (tids)

In [11]:
# Read the filtered abundance table and compute isoform-level TPMs with a
# minimum TPM of 1; `tids` is the second value returned by get_tpm_table and
# is passed to get_subset_triplets in a later cell.
# NOTE: this rebinds `df`, replacing the gene metadata table loaded earlier.
df = pd.read_csv(filt_ab, sep='\t')
df, tids = get_tpm_table(df,
               how='iso',
               min_tpm=1,
               species='mouse',
               sample='mouse')

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 145708
# isos >= 1 tpm: 136394
Number of isos reported: 136394


In [12]:
def get_subset_triplets(ca, tids, source):
    """
    Count the unique TSSs, ICs, TESs, and isoforms per gene for a set of
    transcript IDs and label the resulting table with a source name.

    Parameters:
        ca: Object with a `sg` attribute whose `t_df` table has `gid` and
            `gname` columns (only `ca.sg.t_df[['gid', 'gname']]` is read)
        tids: List-like of transcript IDs to compute the counts from
        source: Value written to the `source` column of the output

    Returns:
        df (pandas DataFrame): Per-gene table with `gid`, `n_tss`, `n_ic`,
            `n_tes`, `n_iso`, whatever `cerberus.compute_splicing_ratio`
            adds, `gname`, and `source`
    """
    df = pd.DataFrame()
    df['tid'] = tids

    # Add one feature column per call. The last call keeps the gene ID column
    # (drop_gid=False) so there is a key to group on.
    # NOTE(review): the column names produced by add_feat ('tss', 'ic', 'tes',
    # 'temp_gid') are inferred from the rename below -- confirm.
    df = add_feat(df, kind='tss', col='tid')
    df = add_feat(df, kind='ic', col='tid')
    df = add_feat(df, kind='tes', col='tid', drop_gid=False)
    df = df.rename({'temp_gid': 'gene_id',
                    'tss': 'tss_id',
                    'ic': 'ic_id',
                    'tes': 'tes_id'}, axis=1)

    # number of distinct values of each column per gene
    df = df.groupby('gene_id').nunique()
    df = df.rename({'tss_id': 'n_tss',
                    'ic_id': 'n_ic',
                    'tes_id': 'n_tes',
                    'tid': 'n_iso'}, axis=1)
    df = df.reset_index()
    df = cerberus.compute_splicing_ratio(df)

    # Attach gene names. add_stable_gid works on a `gene_id` column, so the
    # lookup table is renamed for that call and renamed back afterwards.
    # NOTE(review): presumably add_stable_gid makes these IDs match the ones
    # derived from the transcript IDs above -- confirm.
    df = df.rename({'gene_id': 'gid'}, axis=1)
    temp = ca.sg.t_df[['gid', 'gname']].copy(deep=True)
    temp = temp.rename({'gid': 'gene_id'}, axis=1)
    temp = cerberus.add_stable_gid(temp)
    # reset_index returns a new frame; the original call discarded it
    temp = temp.reset_index(drop=True)
    temp = temp.drop_duplicates()
    temp = temp.rename({'gene_id': 'gid'}, axis=1)
    df = df.merge(temp, how='left', on='gid')

    df['source'] = source
    return df

In [13]:
# Compute triplets for the transcript IDs in `tids`, label them 'lapa_det',
# append them to the source triplets, and overwrite the TSV written earlier.
# NOTE: `source_trip` is rebound to the concatenation, so re-running this cell
# without first re-creating `source_trip` appends the 'lapa_det' rows again.
temp = get_subset_triplets(ca, tids, 'lapa_det')
source_trip = pd.concat([source_trip, temp], axis=0)
source_trip.to_csv('mouse_source_triplets.tsv', sep='\t', index=False)

## Expressed triplets

In [14]:
# Compute expressed triplets from the annotation and write them as a TSV in
# the current working directory.
# NOTE(review): presumably `obs_col` names the metadata column used to group
# datasets and `min_tpm` is the expression threshold -- confirm against the
# cerberus documentation.
obs_col = 'sample'
min_tpm = 1
e_trip = ca.get_expressed_triplets(obs_col=obs_col, min_tpm=min_tpm)
e_trip.to_csv('mouse_expressed_triplets.tsv', sep='\t', index=False)

In [17]:
# # reference + expressed triplets
# e_trip = pd.read_csv('mouse_expressed_triplets.tsv', sep='\t')
# s_trip = pd.read_csv('mouse_source_triplets.tsv', sep='\t')

# # for now, hack together dfs to work with existing code
# e_trip.rename({'sample': 'source'}, axis=1, inplace=True)
# trips = pd.concat([s_trip, e_trip])
# trips.loc[trips.source == 'lapa_det', 'source'] = 'obs'
# trips.loc[trips.source=='all', 'source'] = 'cerberus'

# # remove all observed triplets
# trips = trips.loc[trips.source != 'lapa']

In [18]:
# c_dict, order = get_lr_bulk_sample_colors()
