In [3]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus
import plotly.io as pio


# Make the parent of the current working directory importable so that the
# `scripts` package imported just below can be resolved.
# NOTE(review): this depends on the directory the kernel was started from.
p = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not pile up duplicate entries.
if p not in sys.path:
    sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [4]:
def get_pi_table(sg, ca, obs_col, obs_conds, sources):
    """
    Build a per-transcript table of pi values annotated with novelty and
    source transcript ids.

    Parameters:
        sg: Object exposing `adata` and `t_df`, both of which are passed to
            `swan.calc_pi` (presumably a SwanGraph -- confirm with caller).
        ca: Object exposing `tss`, `ic`, `tes` tables (each with `novelty`
            and `Name` columns) and a `t_map` table with `source`,
            `transcript_id` and `original_transcript_id` columns
            (presumably a cerberus annotation -- confirm with caller).
        obs_col (str): Passed to `swan.calc_pi` as `obs_col`; the grouping
            the pi values are computed over.
        obs_conds (list of str): Columns of the transposed pi table to keep.
        sources (list of str): Values of `ca.t_map.source` for which an
            `<source>_tid` column of original transcript ids is added.

    Returns:
        pandas.DataFrame: One row per entry of the transposed pi table, with
            the `obs_conds` columns (NaN replaced by 0), a feature id and
            `<feat>_novelty` column for each of tss / ic / tes, a
            `transcript_novelty` column, one `<source>_tid` column per
            source and a `gid` column.
    """
    # get pi table
    df, _ = swan.calc_pi(sg.adata, sg.t_df, obs_col=obs_col)
    df = df.sparse.to_dense()
    df = df.transpose()
    df = df[obs_conds]
    df = df.fillna(0)

    # The rest of the function addresses rows through a `tid` column.
    # NOTE(review): assumes the transposed index is named `tid` -- confirm.
    df = df.reset_index()

    # get novelty of each triplet feature from the cerberus tables
    feat_tables = {'tss': ca.tss,
                   'ic': ca.ic,
                   'tes': ca.tes}
    for feat, feat_table in feat_tables.items():
        nov_c = '{}_novelty'.format(feat)

        # add_feat is expected to add a column named `feat`, which is the
        # merge key below -- TODO confirm against scripts.utils
        df = add_feat(df, col='tid', kind=feat)

        # rename returns a new frame; nothing sliced from `ca` is modified
        feat_df = feat_table[['novelty', 'Name']].rename(
            {'novelty': nov_c, 'Name': feat}, axis=1)
        df = df.merge(feat_df, how='left', on=feat)

    # a transcript is only Known when all three of its features are Known
    all_known = ((df.tss_novelty == 'Known')
                 & (df.ic_novelty == 'Known')
                 & (df.tes_novelty == 'Known'))
    df['transcript_novelty'] = 'Novel'
    df.loc[all_known, 'transcript_novelty'] = 'Known'

    # original transcript ids for each requested source
    for source in sources:
        source_col = '{}_tid'.format(source)

        t_map = ca.t_map.loc[ca.t_map.source == source,
                             ['original_transcript_id', 'transcript_id']]
        t_map = t_map.rename({'transcript_id': 'tid',
                              'original_transcript_id': source_col}, axis=1)
        tid_df = df[['tid']].merge(t_map, how='left', on='tid')

        # collapse to one row per tid: comma-join every original id that
        # maps to it; tids without any mapping go back to NaN afterwards
        tid_df = tid_df.fillna('')
        tid_df = tid_df.groupby('tid').agg(','.join).reset_index()
        tid_df.loc[tid_df[source_col] == '', source_col] = np.nan
        df = df.merge(tid_df, how='left', on='tid')

    # add gid: everything in the tid before the first '['
    df['gid'] = df.tid.str.split('[', expand=True)[0]

    return df


In [5]:
# Input files, given relative to the notebook's working directory.
# cerberus annotation, read with cerberus.read
c_annot = '../lr_bulk/cerberus/cerberus_annot.h5'
# saved swan graph, read with swan.read
swan_file = '../lr_bulk/cerberus/swan/swan.p'

In [6]:
# Load the two inputs used by every cell below.
# NOTE(review): `swan` is not imported explicitly in this notebook; it is
# presumably provided by one of the `from scripts... import *` lines -- confirm.
# NOTE(review): the `.p` extension suggests a pickle; only load trusted files.
sg = swan.read(swan_file)
ca = cerberus.read(c_annot)

Read in graph from ../lr_bulk/cerberus/swan/swan.p


In [23]:
# sample-level
obs_col = 'sample'
obs_conds = get_sample_datasets('ljungman')
# Translate the dataset labels (row labels of sg.adata.obs) into their unique
# values in the `sample` column. Sorted rather than list(set(...)) so the
# column order of the written table is the same on every run; iteration order
# of a set of strings changes between interpreter sessions.
obs_conds = sorted(set(sg.adata.obs.loc[obs_conds, obs_col].tolist()))
sources = ['v29', 'v40']
df = get_pi_table(sg, ca, obs_col, obs_conds, sources)
df.to_csv('cell_line_pi.tsv', sep='\t')

In [None]:
# replicate-level
obs_col = 'dataset'
# Unique dataset labels, sorted rather than list(set(...)) so the column order
# of the written table is the same on every run; iteration order of a set of
# strings changes between interpreter sessions.
obs_conds = sorted(set(get_sample_datasets('ljungman')))
sources = ['v29', 'v40']
df = get_pi_table(sg, ca, obs_col, obs_conds, sources)
df.to_csv('replicate_pi.tsv', sep='\t')

16