In [1]:
import scipy.stats
import pandas as pd
import numpy as np

In [2]:
# dummy data from figure 1A
def get_dummy_data():
    """
    Build a toy isoform table covering three genes (A, B and C).

    Returns
    -------
    pandas.DataFrame
        One row per transcript with columns `gene_id`, `transcript_id`
        and `pi`. Genes A, B and C contribute 2, 7 and 8 transcripts.
    """
    gene_ids = ['A'] * 2 + ['B'] * 7 + ['C'] * 8

    transcript_ids = ([f'A_{i+1}' for i in range(2)]
                      + [f'B_{i+1}' for i in range(7)]
                      + [f'C_{i+1}' for i in range(8)])

    # isoform ratios, listed gene by gene
    pi_gene_a = [0.5, 0.5]
    pi_gene_b = [0.4, 0.35, 0.10, 0.07, 0.04, 0.03, 0.01]
    pi_gene_c = [0.125] * 8

    return pd.DataFrame({
        'gene_id': gene_ids,
        'transcript_id': transcript_ids,
        'pi': pi_gene_a + pi_gene_b + pi_gene_c,
    })

In [3]:
# need functions for
# filter isoforms
def filt_df(df, min_tpm=None):
    """
    Optionally drop rows whose expression is below a TPM threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Isoform table. A `tpm` column is only needed when `min_tpm`
        is given.
    min_tpm : float, optional
        Keep rows with `tpm >= min_tpm`. `None` (the default) disables
        filtering; an explicit 0 is applied like any other threshold.

    Returns
    -------
    pandas.DataFrame
        `df` itself when no threshold is given, otherwise the rows that
        pass the threshold.
    """
    # compare against None (not truthiness) so a threshold of 0 is honoured
    if min_tpm is None:
        return df
    return df.loc[df['tpm'] >= min_tpm]
# compute pi 
# compute entropy - done
# compute perplexity -done 
# call relevant isoforms
# effective tissues
# expression breadth
# expression variability

In [4]:
# checks wishlist
# format of gene_id and transcript_id columns is valid
# all transcript ids are unique
# samples exist in dataframe

In [46]:
def compute_gene_potential(df, **kwargs):
    """
    Annotate every row with the number of distinct isoforms of its gene.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `gene_id` and `transcript_id` columns.
    **kwargs
        Forwarded unchanged to `filt_df`, which runs before counting.

    Returns
    -------
    pandas.DataFrame
        The filtered table left-merged with a per-gene `gene_potential`
        column.
    """
    # TODO: decide whether filtering belongs here or in the calling workflow
    kept = filt_df(df, **kwargs)

    # lookup table: gene_id -> number of distinct transcript ids
    isoforms_per_gene = (kept.groupby('gene_id')['transcript_id']
                             .nunique()
                             .rename('gene_potential')
                             .reset_index())

    return kept.merge(isoforms_per_gene, how='left', on='gene_id')
  
def compute_entropy(df):
    """
    Add the per-gene Shannon entropy (base 2) of the isoform ratios.

    For each gene the entropy is `-sum(pi * log2(pi))` over its rows.
    Rows whose `pi` is zero, negative or missing contribute 0 to the sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `gene_id` and `pi` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: `df` left-merged with a per-gene `entropy` column.
    """
    pi = df['pi']

    # Take the log of strictly positive ratios only. Everything else is
    # masked to NaN first, so log2 never sees 0 (no divide-by-zero warning),
    # and the resulting NaN terms are then set to 0.
    plogp = (pi * np.log2(pi.where(pi > 0))).fillna(0.0)

    # Work on a separate two-column frame so the caller's table never
    # receives a temporary `plogp` column.
    gene_sums = (df[['gene_id']]
                 .assign(plogp=plogp)
                 .groupby('gene_id')['plogp']
                 .sum())

    # `0.0 - x` negates the sum while reporting zero entropy as 0.0, not -0.0
    entropy = (0.0 - gene_sums).rename('entropy').reset_index()

    return df.merge(entropy, how='left', on='gene_id')

def compute_perplexity(df):
    """
    Add a `perplexity` column equal to 2 raised to the `entropy` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain an `entropy` column; it is not computed here.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with `perplexity` added.
    """
    # TODO: decide whether entropy should be computed here or chained
    # by a wrapper (currently the caller is expected to run it first)
    entropy = df['entropy']
    df['perplexity'] = entropy.rpow(2)
    return df

def call_effective_isoforms(df):
    """
    Flag the top-ranked isoforms of each gene as effective.

    Adds three columns to `df` in place and returns it:
    `round_perplexity` (perplexity rounded to 0 decimals),
    `isoform_rank` (descending rank of `pi` within the gene, ties broken
    by row order) and `effective_isoform` (rank <= rounded perplexity).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `gene_id`, `pi` and `perplexity` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # how many isoforms count as effective for the gene
    df['round_perplexity'] = df['perplexity'].round()

    # 1 = highest ratio within the gene
    within_gene_rank = df.groupby('gene_id')['pi'].rank(method='first',
                                                        ascending=False)
    df['isoform_rank'] = within_gene_rank.astype(int)

    df['effective_isoform'] = df['isoform_rank'].le(df['round_perplexity'])
    return df

In [159]:
def flatten_list(l):
    """
    Collapse one level of nesting: concatenate the items of every
    element of `l` into a single new list.

    Parameters
    ----------
    l : list
        Iterable whose elements are themselves iterable.

    Returns
    -------
    list
        The inner items in their original order.
    """
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat

def dummy_data_2():
    """
    Build a toy table for a single gene (A) with 8 transcripts, each
    listed once per sample (heart, brain, lungs, kidney).

    Returns
    -------
    pandas.DataFrame
        32 rows with columns `gene_id`, `transcript_id`, `sample`, `pi`.
    """
    tissues = ['heart', 'brain', 'lungs', 'kidney']
    n_transcripts = 8

    # one inner list per transcript, one value per tissue (same order as above)
    pi_by_transcript = [
        [0.5, 0.45, 0.2, 0.2],
        [0, 0, 0.1, 0.2],
        [0, 0, 0.08, 0.2],
        [0.5, 0.1, 0.02, 0.2],
        [0, 0, 0.5, 0],
        [0, 0, 0.04, 0.2],
        [0, 0.45, 0, 0],
        [0, 0, 0.06, 0],
    ]

    return pd.DataFrame({
        'gene_id': ['A'] * (n_transcripts * len(tissues)),
        'transcript_id': flatten_list([[f'A_{j+1}' for i in range(4)] for j in range(8)]),
        'sample': flatten_list([tissues for _ in range(n_transcripts)]),
        'pi': flatten_list(pi_by_transcript),
    })

In [160]:
# compute_gene_potential(df)
# perp_df = compute_perplexity(df)

In [228]:
# fake workflow for cross-dataset
# calculations
def workflow_1(df):
    """
    Run the metric steps in order on one table: gene potential, entropy,
    perplexity, then effective-isoform calling.

    Parameters
    ----------
    df : pandas.DataFrame
        Isoform table accepted by `compute_gene_potential`.

    Returns
    -------
    pandas.DataFrame
        The output of the last step, `call_effective_isoforms`.
    """
    # TODO: maybe filter first (e.g. pi > 0, or something more sophisticated)
    return (df.pipe(compute_gene_potential)
              .pipe(compute_entropy)
              .pipe(compute_perplexity)
              .pipe(call_effective_isoforms))

# workflow 2 -- compute these metrics per sample
def workflow_2(df, sample_col):
    """
    Run `workflow_1` separately on each sample, then summarise in how many
    samples every transcript was called effective.

    Parameters
    ----------
    df : pandas.DataFrame
        Isoform table containing `transcript_id`, `sample_col` and the
        columns `workflow_1` needs.
    sample_col : str
        Name of the column that identifies the sample of each row.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-sample results with two extra columns:
        `n_samples_effective` (number of distinct samples in which the
        transcript is effective; NaN when it never is) and
        `perc_effective_isoforms` (that count as a percentage of all
        samples; 0 when it never is).

    Raises
    ------
    ValueError
        If `df` contains no samples.
    """
    samples = df[sample_col].unique().tolist()
    if not samples:
        raise ValueError(f'no samples found in column {sample_col!r}')

    # Each sample is processed on its own deep copy so the per-sample steps
    # cannot touch the input; the pieces are concatenated once at the end
    # rather than growing a frame inside the loop.
    per_sample = [workflow_1(df.loc[df[sample_col] == s].copy(deep=True))
                  for s in samples]
    df = pd.concat(per_sample, axis=0)

    # number of distinct samples in which each transcript is effective
    n_samples = len(samples)
    n_effective = (df.loc[df['effective_isoform']]
                     .groupby('transcript_id')[sample_col]
                     .nunique()
                     .rename('n_samples_effective')
                     .reset_index())
    df = df.merge(n_effective, how='left', on='transcript_id')

    # Transcripts that are never effective have no match in the merge and
    # come out as NaN; report them as 0 %. The result of fillna is assigned
    # directly -- fillna(..., inplace=True) returns None and would blank
    # the whole column.
    df['perc_effective_isoforms'] = (
        (df['n_samples_effective'] / n_samples) * 100).fillna(0)

    return df

In [229]:
# Build the multi-sample dummy table and run the per-sample workflow on it.
df = dummy_data_2()
sample_col = 'sample'
df = workflow_2(df, sample_col)

# number of distinct samples each transcript has rows for
df['n_exp_samples'] = (df.groupby('transcript_id')[sample_col]
                       .transform('nunique'))
# # some chatgpt version that prevents returning nulls
# df['pi_std'] = df.groupby('transcript_id')['pi'].transform(
#     lambda x: 0.0 if len(x.dropna()) <= 1 else x.std(ddof=1, skipna=True))

# normal pandas version: sample standard deviation (ddof=1) of pi per
# transcript, broadcast back onto every row of that transcript
df['pi_std'] = (df.groupby('transcript_id')['pi']
                .transform(lambda x: x.std(ddof=1, skipna=True)))
# # numpy version
# df['pi_std'] = (df.groupby('transcript_id')['pi']
#                 .transform(lambda x: np.std(list(x), ddof=1)))
# df['pi_std'] = (
#     df.groupby(['gene_id','transcript_id'])['pi']
#       .transform(lambda x: 0 if x.notna().sum() <= 1 else x.std(ddof=1))
# )
# one row per (transcript, std) pair for display
df[['transcript_id', 'pi_std']].drop_duplicates()

# df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [230]:
# (transcript_id, pi) pairs sorted by transcript, then only the A_7 rows shown
temp = df.loc[:, ['transcript_id', 'pi']].sort_values(by='transcript_id')
temp[temp['transcript_id'].eq('A_7')]

Unnamed: 0,transcript_id,pi
14,A_7,0.45
30,A_7,0.0
6,A_7,0.0
22,A_7,0.0


In [231]:
# Sample standard deviation (ddof=1) of eight hand-typed 4-value lists,
# printed one per line.
pi_rows = [
    [0.5, 0.45, 0.2, 0.2],
    [0, 0, 0.1, 0.2],
    [0, 0, 0.08, 0.2],
    [0.5, 0.1, 0.02, 0.2],
    [0, 0, 0.5, 0],
    [0, 0, 0.04, 0.2],
    [0, 0.45, 0, 0],
    [0, 0, 0.06, 0],
]
for row in pi_rows:
    print(np.std(row, ddof=1))

0.16007810593582122
0.09574271077563382
0.09451631252505217
0.21
0.25
0.09521904571390467
0.22500000000000003
0.03


In [196]:
# Sample standard deviation (ddof=1) of four hand-typed lists, one per line.
first_four = (
    [0.5, 0.45, 0.2, 0.2],
    [0, 0, 0.1, 0.2],
    [0, 0, 0.08, 0.2],
    [0.5, 0.1, 0.02, 0.2],
)
stds = [np.std(arr, ddof=1) for arr in first_four]
print(*stds, sep='\n')

0.16007810593582122
0.09574271077563382
0.09451631252505217
0.21
