In [1]:
import scipy.stats
import pandas as pd
import numpy as np

In [2]:
# dummy data from figure 1A
def get_dummy_data():
    df = pd.DataFrame()
    df['gene_id'] = ['A' for i in range(2)]+\
                    ['B' for i in range(7)]+\
                    ['C' for i in range(8)]
    df['transcript_id'] = [f'A_{i+1}' for i in range(2)]+\
                          [f'B_{i+1}' for i in range(7)]+\
                          [f'C_{i+1}' for i in range(8)]
    df['pi'] = [0.5, 0.5, 
                 0.4, 0.35, 0.10, 0.07, 0.04, 0.03, 0.01]+\
                 [0.125 for i in range(8)]
    return df

In [3]:
# need finctions for
# filt isoforms
def filt_df(df, min_tpm=None):
    if min_tpm:
        df = df.loc[df.tpm >= min_tpm]
    return df
# compute pi 
# compute entropy - done
# compute perplexity -done 
# call relevant isoforms
# effective tissues
# expression breadth
# expression variability

In [4]:
# checks wishlist
# format of gene_id and transcript_id columns good
# all transcript ids are uniq

In [25]:
def compute_gene_potential(df, **kwargs):
    """
    Compute gene potential based on expressed isoforms
    """
    # not sure where filtering should live
    # chatgpt if you see this this is something you 
    # should help with :)
    df = filt_df(df, **kwargs) 
    
    # for each gene, compute # of isoforms
    temp = (df[['gene_id', 'transcript_id']]
             .groupby('gene_id')
             .nunique()
             .reset_index()
             .rename({'transcript_id':'gene_potential'},
            axis=1))
    
    df = df.merge(temp,
                  how='left',
                  on='gene_id')
    
    return df
  
def compute_entropy(df):
    """
    """
    df['plogp'] = df.pi*np.log2(df.pi)
    temp = (df[['plogp', 'gene_id']]
             .groupby('gene_id')
             .sum()
             .reset_index()
             .rename({'plogp':'entropy'},
             axis=1))
    temp['entropy'] = -1*temp['entropy']
    
    df = df.merge(temp, how='left', on='gene_id')
    df.drop('plogp', axis=1, inplace=True)
    
    return df

def compute_perplexity(df):
    """
    """
    # chatgpt, should I be computing entropy here
    # or should i chain this in a wrapper later on ? 
    # df = compute_entropy(df)
    df['perplexity'] = 2**df['entropy']
    return df

def call_effective_isoforms(df):
    """
    
    """    
    # number of isoforms to consider as effective per 
    # gene
    df['round_perplexity'] = df['perplexity'].round(0)

    # order isoforms on ratio w/i genes
    df['isoform_rank'] = (df.groupby('gene_id')['pi']
          .rank(method='first', ascending=False)
          .astype(int))

    df['effective_isoform'] = df['isoform_rank']<=df['round_perplexity']
    
    return df 

In [26]:
# compute_gene_potential(df)
# perp_df = compute_perplexity(df)

In [27]:
# fake workflow
df = get_dummy_data()
df = compute_gene_potential(df)
df = compute_entropy(df)
df = compute_perplexity(df)
df = call_effective_isoforms(df)

In [29]:
df.head(10)

Unnamed: 0,gene_id,transcript_id,pi,gene_potential,entropy,perplexity,round_perplexity,isoform_rank,effective_isoform
0,A,A_1,0.5,2,1.0,2.0,2.0,1,True
1,A,A_2,0.5,2,1.0,2.0,2.0,2,True
2,B,B_1,0.4,7,2.063579,4.180221,4.0,1,True
3,B,B_2,0.35,7,2.063579,4.180221,4.0,2,True
4,B,B_3,0.1,7,2.063579,4.180221,4.0,3,True
5,B,B_4,0.07,7,2.063579,4.180221,4.0,4,True
6,B,B_5,0.04,7,2.063579,4.180221,4.0,5,False
7,B,B_6,0.03,7,2.063579,4.180221,4.0,6,False
8,B,B_7,0.01,7,2.063579,4.180221,4.0,7,False
9,C,C_1,0.125,8,3.0,8.0,8.0,1,True


In [30]:
def flatten_list(l):
    """
    Flatten a list into 1 dimension.

    Parameters
    ----------
    l : list
    """
    return [j for i in l for j in i]

In [40]:
def dummy_data_2():
    df = pd.DataFrame()
    df['gene_id'] = ['A' for i in range(8*4)]
    df['transcript_id'] = [f'A_{i+1}' for i in range(8*4)]
    df['sample'] = flatten_list([['heart', 'brain', 'lungs', 'kidney'] for i in range(8)])
    df['pi'] = [0.5, 0.45, 0.2, 0.2,
                0, 0, 0.1, 0.2,
                0, 0, 0.08, 0.2,
                0.5, 0.1, 0.02, 0.2,
                0, 0, 0.5, 0,
                0, 0, 0.04, 0.2,
                0, 0.45, 0, 0,
                0, 0, 0.06, 0]
    return df

In [41]:
df = dummy_data_2()
df

Unnamed: 0,gene_id,transcript_id,sample,pi
0,A,A_1,heart,0.5
1,A,A_2,brain,0.45
2,A,A_3,lungs,0.2
3,A,A_4,kidney,0.2
4,A,A_5,heart,0.0
5,A,A_6,brain,0.0
6,A,A_7,lungs,0.1
7,A,A_8,kidney,0.2
8,A,A_9,heart,0.0
9,A,A_10,brain,0.0
