In [1]:
import scipy.stats
import pandas as pd
import numpy as np

In [2]:
# dummy data from figure 1A
def get_dummy_data():
    df = pd.DataFrame()
    df['gene_id'] = ['A' for i in range(2)]+\
                    ['B' for i in range(7)]+\
                    ['C' for i in range(8)]
    df['transcript_id'] = [f'A_{i+1}' for i in range(2)]+\
                          [f'B_{i+1}' for i in range(7)]+\
                          [f'C_{i+1}' for i in range(8)]
    df['pi'] = [0.5, 0.5, 
                 0.4, 0.35, 0.10, 0.07, 0.04, 0.03, 0.01]+\
                 [0.125 for i in range(8)]
    return df

In [3]:
# need finctions for
# filt isoforms
def filt_df(df, min_tpm=None):
    if min_tpm:
        df = df.loc[df.tpm >= min_tpm]
    return df
# compute pi 
# compute entropy - done
# compute perplexity -done 
# call relevant isoforms
# effective tissues
# expression breadth
# expression variability

In [4]:
# checks wishlist
# format of gene_id and transcript_id columns good
# all transcript ids are uniq
# samples exist in dataframe

In [46]:
def compute_gene_potential(df, **kwargs):
    """
    Compute gene potential based on expressed isoforms
    """
    # not sure where filtering should live
    # chatgpt if you see this this is something you 
    # should help with :)
    df = filt_df(df, **kwargs) 
    
    # for each gene, compute # of isoforms
    temp = (df[['gene_id', 'transcript_id']]
             .groupby('gene_id')
             .nunique()
             .reset_index()
             .rename({'transcript_id':'gene_potential'},
            axis=1))
    
    df = df.merge(temp,
                  how='left',
                  on='gene_id')
    
    return df
  
def compute_entropy(df):
    """
    """
    df['plogp'] = df.pi*np.log2(df.pi)
    temp = (df[['plogp', 'gene_id']]
             .groupby('gene_id')
             .sum()
             .reset_index()
             .rename({'plogp':'entropy'},
             axis=1))
    temp['entropy'] = -1*temp['entropy']
    
    df = df.merge(temp, how='left', on='gene_id')
    df.drop('plogp', axis=1, inplace=True)
    
    return df

def compute_perplexity(df):
    """
    """
    # chatgpt, should I be computing entropy here
    # or should i chain this in a wrapper later on ? 
    # df = compute_entropy(df)
    df['perplexity'] = 2**df['entropy']
    return df

def call_effective_isoforms(df):
    """
    
    """    
    # number of isoforms to consider as effective per 
    # gene
    df['round_perplexity'] = df['perplexity'].round(0)

    # order isoforms on ratio w/i genes
    df['isoform_rank'] = (df.groupby('gene_id')['pi']
          .rank(method='first', ascending=False)
          .astype(int))

    df['effective_isoform'] = df['isoform_rank']<=df['round_perplexity']
    
    return df 

In [130]:
def flatten_list(l):
    """
    Flatten a list into 1 dimension.

    Parameters
    ----------
    l : list
    """
    return [j for i in l for j in i]

def dummy_data_2():
    df = pd.DataFrame()
    df['gene_id'] = ['A' for i in range(8*4)]
    df['transcript_id'] = flatten_list([[f'A_{j+1}' for i in range(4)] for j in range(8)])
    print(df.transcript_id.tolist())
    df['sample'] = flatten_list([['heart', 'brain', 'lungs', 'kidney'] for i in range(8)])
    df['pi'] = [0.5, 0.45, 0.2, 0.2,
                0, 0, 0.1, 0.2,
                0, 0, 0.08, 0.2,
                0.5, 0.1, 0.02, 0.2,
                0, 0, 0.5, 0,
                0, 0, 0.04, 0.2,
                0, 0.45, 0, 0,
                0, 0, 0.06, 0]
    return df

In [131]:
# compute_gene_potential(df)
# perp_df = compute_perplexity(df)

In [132]:
# fake workflow for cross-dataset
# calculations
def workflow_1(df):
    df = df.loc[df.pi > 0] # maybe do some more sophisticated filtering here
    df = compute_gene_potential(df)
    df = compute_entropy(df)
    df = compute_perplexity(df)
    df = call_effective_isoforms(df)
    return df

# workflow 2 -- compute these metrics per sample
def workflow_2(df, sample_col):
    samples = df[sample_col].unique().tolist()
    big_df = pd.DataFrame()
    for s in samples:
        s_df = df.loc[df[sample_col]==s].copy(deep=True)
        s_df = workflow_1(s_df)
        big_df = pd.concat([big_df,s_df], axis=0)
    return big_df

In [133]:
df = dummy_data_2()
# df.loc[df.transcript_id=='A_1']
# df = workflow_1(df)
df.head(8)

['A_1', 'A_1', 'A_1', 'A_1', 'A_2', 'A_2', 'A_2', 'A_2', 'A_3', 'A_3', 'A_3', 'A_3', 'A_4', 'A_4', 'A_4', 'A_4', 'A_5', 'A_5', 'A_5', 'A_5', 'A_6', 'A_6', 'A_6', 'A_6', 'A_7', 'A_7', 'A_7', 'A_7', 'A_8', 'A_8', 'A_8', 'A_8']


Unnamed: 0,gene_id,transcript_id,sample,pi
0,A,A_1,heart,0.5
1,A,A_1,brain,0.45
2,A,A_1,lungs,0.2
3,A,A_1,kidney,0.2
4,A,A_2,heart,0.0
5,A,A_2,brain,0.0
6,A,A_2,lungs,0.1
7,A,A_2,kidney,0.2


In [137]:
df = dummy_data_2()
sample_col = 'sample'
# df = workflow_2(df, 'sample')
samples = df[sample_col].unique().tolist()
big_df = pd.DataFrame()
for s in samples:
    s_df = df.loc[df[sample_col]==s].copy(deep=True)
    s_df = workflow_1(s_df)
    big_df = pd.concat([big_df,s_df], axis=0)
df = big_df

['A_1', 'A_1', 'A_1', 'A_1', 'A_2', 'A_2', 'A_2', 'A_2', 'A_3', 'A_3', 'A_3', 'A_3', 'A_4', 'A_4', 'A_4', 'A_4', 'A_5', 'A_5', 'A_5', 'A_5', 'A_6', 'A_6', 'A_6', 'A_6', 'A_7', 'A_7', 'A_7', 'A_7', 'A_8', 'A_8', 'A_8', 'A_8']


In [139]:
df.loc[(df.effective_isoform)&(df.transcript_id=='A_1')]

Unnamed: 0,gene_id,transcript_id,sample,pi,gene_potential,entropy,perplexity,round_perplexity,isoform_rank,effective_isoform
0,A,A_1,heart,0.5,2,1.0,2.0,2.0,1,True
0,A,A_1,brain,0.45,3,1.368996,2.582907,3.0,1,True
0,A,A_1,lungs,0.2,7,2.130252,4.377939,4.0,2,True
0,A,A_1,kidney,0.2,5,2.321928,5.0,5.0,1,True


In [135]:
# compute % effective tissues / isoform
n_samples = len(samples)
temp = (df.loc[df.effective_isoform][['transcript_id', sample_col]]
        .groupby('transcript_id')
        .nunique()
        .reset_index()
        .rename({sample_col: 'n_samples_effective'}, axis=1))
df = df.merge(temp,
              how='left',
              on='transcript_id')

In [136]:
df.head()

Unnamed: 0,gene_id,transcript_id,sample,pi,gene_potential,entropy,perplexity,round_perplexity,isoform_rank,effective_isoform,n_samples_effective
0,A,A_1,heart,0.5,2,1.0,2.0,2.0,1,True,4.0
1,A,A_4,heart,0.5,2,1.0,2.0,2.0,2,True,3.0
2,A,A_1,brain,0.45,3,1.368996,2.582907,3.0,1,True,4.0
3,A,A_4,brain,0.1,3,1.368996,2.582907,3.0,3,True,3.0
4,A,A_7,brain,0.45,3,1.368996,2.582907,3.0,2,True,1.0


In [129]:
df.loc[df.transcript_id=='A_1']

Unnamed: 0,gene_id,transcript_id,sample,pi,gene_potential,entropy,perplexity,round_perplexity,isoform_rank,effective_isoform,n_samples_effective
6,A,A_1,lungs,0.1,7,2.130252,4.377939,4.0,3,True,2.0
13,A,A_1,kidney,0.2,5,2.321928,5.0,5.0,2,True,2.0
