In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import swan_vis as swan


p = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
# --- input files ---------------------------------------------------------
# tab-separated transcript abundance table (loaded with pandas in the next cell)
filt_ab = "../cerberus/cerberus_filtered_abundance.tsv"
# cerberus annotation; not referenced by the cells shown in this notebook section
h5 = "../cerberus/cerberus_annot.h5"
# saved SwanGraph (loaded with swan.read in the next cell)
swan_obj = "swan.p"

# --- analysis settings ---------------------------------------------------
species = "mouse"
gene_subset = "polya"
# minimum TPM handed to get_major_isos() as its `min_tpm` argument
min_tpm = 1

In [3]:
# Load both inputs; the two reads are independent of each other.
# transcript abundance table (tab-separated)
t_df = pd.read_csv(filt_ab, sep='\t')
# saved SwanGraph
sg = swan.read(swan_obj)

Read in graph from swan.p


In [4]:
def get_major_isos(sg, t_df, 
                   obs_col,
                   ofile,
                   min_tpm=1,
                   in_progress_file='isos_90_in_progress.tsv'):
    """
    Get major isoforms per sample / gene combination.

    Within every gene / `obs_col` group, isoforms that pass the TPM filter
    are ranked by percent isoform usage (pi, on a 0-100 scale) and the
    smallest top-ranked set whose cumulative pi reaches 90 is kept. When the
    isoforms that pass the filter sum to less than 90 (because the rest were
    removed by the TPM filter), all of them are kept.

    Parameters: 
        sg (swan_vis SwanGraph): SwanGraph; `sg.adata` and `sg.t_df` are
            passed to `swan.calc_pi` / `swan.calc_tpm`
        t_df (pandas DataFrame): Transcript table; must contain the columns
            `annot_gene_name`, `annot_gene_id` and `annot_transcript_id`.
            The caller's DataFrame is not modified.
        obs_col (str): Column in sg.adata.obs to use
        ofile (str): Where to save results (tab-separated, no index)
        min_tpm (float): Isoforms whose TPM in a given `obs_col` group is
            below this value are not considered for that group
        in_progress_file (str or None): Where to dump the ranked table
            before the 90% selection (debugging aid). Pass None to skip
            writing it.

    Returns:
        df (pandas DataFrame): One row per retained isoform / `obs_col`
            group, with columns `tid`, `gname`, `gid`, <obs_col>, `pi` and
            `pi_rank` (1 = highest pi within its gene / group), ordered by
            descending pi
    """

    # cumulative pi that the major isoforms of a gene have to account for
    pi_cutoff = 90
    group_cols = ['gname', 'gid', obs_col]

    # rename on a new frame instead of inplace on a slice of the caller's
    # table (the inplace variant triggers SettingWithCopyWarning)
    t_df = t_df[['annot_gene_name', 'annot_transcript_id', 'annot_gene_id']].rename(
        {'annot_gene_name': 'gname',
         'annot_gene_id': 'gid',
         'annot_transcript_id': 'tid'},
        axis=1)

    # after the transpose, rows are transcripts and columns are the
    # obs_col groups
    pi_df, _ = swan.calc_pi(sg.adata, sg.t_df, obs_col=obs_col)
    pi_df = pi_df.sparse.to_dense().transpose()
    tpm_df = swan.calc_tpm(sg.adata, obs_col=obs_col, how='max').sparse.to_dense().transpose()

    # melt to have one entry per tid / sample combination
    def melt_transcript_sample(wide, col_name):
        # transcript ids arrive as the index (named 'tid' or unnamed);
        # turn them into a regular 'tid' column before merging
        wide = wide.reset_index().rename({'index': 'tid'}, axis=1)
        wide = wide.merge(t_df[['tid', 'gname', 'gid']], how='inner', on='tid')
        wide = wide.set_index(['tid', 'gname', 'gid'])
        long = wide.melt(ignore_index=False, value_name=col_name, var_name=obs_col)
        long = long.dropna(subset=[col_name])
        return long.reset_index()

    df = melt_transcript_sample(pi_df, 'pi')
    tpm_long = melt_transcript_sample(tpm_df, 'tpm')

    # add tpm info in and subset based on tpm thresh
    df = df.merge(tpm_long, how='left', on=['tid', 'gname', 'gid', obs_col])
    # diagnostic: preview of isoforms with pi > 0 that the tpm filter removes
    print(df.loc[(df.tpm < min_tpm)&(df.pi > 0)].head())
    df = df.loc[df.tpm >= min_tpm].drop('tpm', axis=1)

    # determine the rank of each pi value for each sample / gene combo;
    # a stable sort keeps the ordering of tied pi values deterministic
    df = df.sort_values(by='pi', ascending=False, kind='mergesort')
    df['pi_rank'] = df.groupby(group_cols).cumcount()+1

    if in_progress_file is not None:
        # same layout as the historical dump, including the combined key
        in_progress = df.assign(
            gname_gid_biosamp=df.gname+'_'+df.gid+'_'+df[obs_col].astype(str))
        in_progress.to_csv(in_progress_file, sep='\t')

    df = df.reset_index(drop=True)

    # nothing passed the tpm filter: skip the selection, still write ofile
    if not df.empty:
        # running total of pi within each gene / group, highest pi first
        df['cum_pi'] = df.groupby(group_cols)['pi'].cumsum()

        # the last running total is the total pi left after the tpm filter;
        # if that is already below the cutoff, every remaining isoform is
        # needed, so the target is capped at the group's own total
        target = (df.groupby(group_cols)['cum_pi']
                    .transform('last')
                    .clip(upper=pi_cutoff))

        # number of isoforms required = first rank at which the running
        # total reaches the target
        df['hit_rank'] = df['pi_rank'].where(df['cum_pi'] >= target)
        df['n_isos'] = df.groupby(group_cols)['hit_rank'].transform('min')

        df = df.loc[df.pi_rank <= df.n_isos]
        df = df.drop(['cum_pi', 'hit_rank', 'n_isos'], axis=1)

    df.to_csv(ofile, sep='\t', index=False)
    return df

In [7]:
# sg.adata.obs['sample'].loc[sg.adata.obs['sample'].isnull()]

In [9]:
# obs_col = 'sample'
# conditions = sg.adata.obs[obs_col].unique().tolist()

In [10]:
# sample-level run: major isoforms per gene within each `sample` group
fname = 'isos_sample_gene_90.tsv'
df = get_major_isos(sg, t_df, obs_col='sample', ofile=fname, min_tpm=min_tpm)

                              tid    gname                    gid  \
260272  ENSMUSG00000051285[1,3,6]   Pcmtd1  ENSMUSG00000051285.17   
260287  ENSMUSG00000097797[1,2,2]  Gm26901   ENSMUSG00000097797.6   
260299  ENSMUSG00000025911[1,7,6]   Adhfe1  ENSMUSG00000025911.14   
260305  ENSMUSG00000025912[1,1,1]    Mybl1  ENSMUSG00000025912.16   
260334  ENSMUSG00000098234[1,1,1]    Snhg6   ENSMUSG00000098234.7   

                 sample          pi       tpm  
260272  adrenal_18-20mo    0.369004  0.859054  
260287  adrenal_18-20mo  100.000000  0.859054  
260299  adrenal_18-20mo    0.735294  0.859054  
260305  adrenal_18-20mo   33.333336  0.859054  
260334  adrenal_18-20mo    6.666667  0.859054  


In [10]:
# Re-load both inputs from disk.
# NOTE(review): presumably done so the tissue-level run below starts from
# freshly loaded objects; confirm whether this reload is still needed
t_df = pd.read_csv(filt_ab, sep='\t')
sg = swan.read(swan_obj)

Read in graph from swan.p


In [5]:
# tissue-level run: major isoforms per gene within each `tissue` group
fname = 'isos_tissue_gene_90.tsv'
df = get_major_isos(sg, t_df, obs_col='tissue', ofile=fname, min_tpm=min_tpm)

                         tid          gname                   gid  \
0  ENSMUSG00000102693[1,1,1]  4933401J01Rik  ENSMUSG00000102693.1   
2  ENSMUSG00000103377[1,1,1]        Gm37180  ENSMUSG00000103377.1   
3  ENSMUSG00000104017[1,1,1]        Gm37363  ENSMUSG00000104017.1   
4  ENSMUSG00000103201[1,1,1]        Gm37329  ENSMUSG00000103201.1   
5  ENSMUSG00000103161[1,1,1]        Gm38148  ENSMUSG00000103161.1   

        tissue     pi       tpm  
0  hippocampus  100.0  0.448623  
2  hippocampus  100.0  0.779937  
3  hippocampus  100.0  0.779937  
4  hippocampus  100.0  0.779937  
5  hippocampus  100.0  0.983370  
