In [1]:
from utils import * 

In [2]:
def get_dummy_data_counts():
    df = pd.DataFrame()
    df['gene_id'] = ['A' for i in range(2)]+\
                    ['B' for i in range(7)]+\
                    ['C' for i in range(8)]
    df['transcript_id'] = [f'A_{i+1}' for i in range(2)]+\
                          [f'B_{i+1}' for i in range(7)]+\
                          [f'C_{i+1}' for i in range(8)]
    df['counts'] = ([47, 47]+ 
                    [i*28 for i in [0.4, 0.35, 0.10, 0.07, 0.04, 0.03, 0.01]]+
                    [0.125 for i in range(8)])
        
    return df

def dummy_counts_data_sample():
    """
    Generate a dummy multi-sample dataset with counts instead of pi values.
    
    Returns
    -------
    pd.DataFrame
        Columns: 'gene_id', 'transcript_id', 'sample', 'counts'
    """
    import pandas as pd

    genes = ['A']  # single gene for simplicity
    transcripts = [f'A_{i+1}' for i in range(8)]
    samples = ['heart', 'brain', 'lungs', 'kidney']
    
    # create repeated combinations
    df = pd.DataFrame({
        'gene_id': flatten_list([[g]*len(samples) for g in genes*len(transcripts)]),
        'transcript_id': flatten_list([[t]*len(samples) for t in transcripts]),
        'sample': samples * len(transcripts),
    })
    
    # assign dummy counts
    df['counts'] = [
        50, 45, 20, 20,   # A_1
        0, 0, 10, 20,     # A_2
        0, 0, 8, 20,      # A_3
        50, 10, 2, 20,    # A_4
        0, 0, 50, 0,      # A_5
        0, 0, 4, 20,      # A_6
        0, 45, 0, 0,      # A_7
        0, 0, 6, 0        # A_8
    ]
    
    return df


In [3]:
# # all positive counts
# df = get_dummy_data_counts()
# validate_counts_input(df)

In [4]:
# # w/ neg counts
# df = get_dummy_data_counts()
# df['counts'] = df['counts'].tolist()[:-1]+[-1]
# validate_counts_input(df)

In [5]:
# # w/o sample col when it needs one
# df = get_dummy_data_counts()
# validate_counts_input(df, sample_col='sample')

In [6]:
# # w/o gene id col when it needs one
# df = get_dummy_data_counts()
# validate_counts_input(df, sample_col='sample')

In [7]:
# # dummy data from figure 1A
# def get_dummy_data():
#     df = pd.DataFrame()
#     df['gene_id'] = ['A' for i in range(2)]+\
#                     ['B' for i in range(7)]+\
#                     ['C' for i in range(8)]
#     df['transcript_id'] = [f'A_{i+1}' for i in range(2)]+\
#                           [f'B_{i+1}' for i in range(7)]+\
#                           [f'C_{i+1}' for i in range(8)]
#     df['pi'] = [0.5, 0.5, 
#                  0.4, 0.35, 0.10, 0.07, 0.04, 0.03, 0.01]+\
#                  [0.125 for i in range(8)]
#     return df

In [8]:
df = get_dummy_data_counts()
df = compute_global_isoform_metrics(df)
print(df[['gene_id', 'gene_potential', 'entropy', 'perplexity', 'n_effective_isoforms']].drop_duplicates())
print()
print(df[['gene_id', 'transcript_id', 'pi', 'effective_isoform']])
df.to_csv('example_bulk_metrics.tsv', sep='\t')


  gene_id  gene_potential   entropy  perplexity  n_effective_isoforms
0       A               2  1.000000    2.000000                   2.0
2       B               7  2.063579    4.180221                   4.0
9       C               8  3.000000    8.000000                   8.0

   gene_id transcript_id     pi  effective_isoform
0        A           A_1  0.500               True
1        A           A_2  0.500               True
2        B           B_1  0.400               True
3        B           B_2  0.350               True
4        B           B_3  0.100               True
5        B           B_4  0.070               True
6        B           B_5  0.040              False
7        B           B_6  0.030              False
8        B           B_7  0.010              False
9        C           C_1  0.125               True
10       C           C_2  0.125               True
11       C           C_3  0.125               True
12       C           C_4  0.125               True
13   

In [9]:
30246531

30246531

In [10]:
def collapse_counts_by_feature(df,
                               feature_col=TRANSCRIPT_COL,
                               gene_col=GENE_COL,
                               sample_col=None):
    """
    Collapse counts by a feature (e.g. ORF, transcript) instead of transcript.

    Parameters
    ----------
    df : pd.DataFrame
        Input table with counts at transcript level.
    feature_col : str
        Alternative feature column to collapse to (e.g. 'orf_id').
    gene_col : str
        Column identifying genes.
    sample_col : str, optional
        Sample column. If None, assumes single-sample bulk.

    Returns
    -------
    pd.DataFrame
        Collapsed df with summed counts per feature.
    """
    group_cols = [gene_col, feature_col]
    if sample_col is not None:
        group_cols.append(sample_col)
        
    import pdb; pdb.set_trace()
        
    # Sum counts for all transcripts mapping to the same feature
    out = (df.groupby(group_cols, as_index=False)['counts']
          .sum())

    return out

In [11]:
df = dummy_counts_data_sample()
df = compute_multi_sample_isoform_metrics(df, 'sample')
print(df[['gene_id', 'sample', 'gene_potential', 'entropy', 'perplexity', 'n_effective_isoforms']].drop_duplicates())
print()
print(df[['gene_id', 'transcript_id','expression_breadth', 'expression_var']].drop_duplicates())
df.to_csv('example_sample_metrics.tsv', sep='\t')
df.head()

   gene_id  sample  gene_potential   entropy  perplexity  n_effective_isoforms
0        A   heart               8  1.000000    2.000000                   2.0
8        A   brain               8  1.368996    2.582907                   3.0
16       A   lungs               8  2.130252    4.377939                   4.0
24       A  kidney               8  2.321928    5.000000                   5.0

  gene_id transcript_id  expression_breadth  expression_var
0       A           A_1               100.0        0.160078
1       A           A_2                50.0        0.095743
2       A           A_3                50.0        0.094516
3       A           A_4                75.0        0.210000
4       A           A_5                25.0        0.250000
5       A           A_6                25.0        0.095219
6       A           A_7                25.0        0.225000
7       A           A_8                 0.0        0.030000


Unnamed: 0,gene_id,transcript_id,sample,counts,pi,gene_potential,entropy,perplexity,n_effective_isoforms,isoform_rank,effective_isoform,n_samples_effective,expression_breadth,n_exp_samples,expression_var,avg_transcript_id_counts,avg_gene_id_counts
0,A,A_1,heart,50,0.5,8,1.0,2.0,2.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
1,A,A_2,heart,0,0.0,8,1.0,2.0,2.0,3,False,2.0,50.0,4,0.095743,15.0,100.0
2,A,A_3,heart,0,0.0,8,1.0,2.0,2.0,4,False,2.0,50.0,4,0.094516,14.0,100.0
3,A,A_4,heart,50,0.5,8,1.0,2.0,2.0,2,True,3.0,75.0,4,0.21,20.5,100.0
4,A,A_5,heart,0,0.0,8,1.0,2.0,2.0,5,False,1.0,25.0,4,0.25,50.0,100.0


In [12]:
(50+45+20+20)/4


33.75

In [13]:
df.loc[df.transcript_id=='A_1']

Unnamed: 0,gene_id,transcript_id,sample,counts,pi,gene_potential,entropy,perplexity,n_effective_isoforms,isoform_rank,effective_isoform,n_samples_effective,expression_breadth,n_exp_samples,expression_var,avg_transcript_id_counts,avg_gene_id_counts
0,A,A_1,heart,50,0.5,8,1.0,2.0,2.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
8,A,A_1,brain,45,0.45,8,1.368996,2.582907,3.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
16,A,A_1,lungs,20,0.2,8,2.130252,4.377939,4.0,2,True,4.0,100.0,4,0.160078,33.75,100.0
24,A,A_1,kidney,20,0.2,8,2.321928,5.0,5.0,1,True,4.0,100.0,4,0.160078,33.75,100.0


In [14]:
df[['gene_id', 'perplexity', 'sample']].drop_duplicates()

Unnamed: 0,gene_id,perplexity,sample
0,A,2.0,heart
8,A,2.582907,brain
16,A,4.377939,lungs
24,A,5.0,kidney


In [16]:
# df[['gene_id', 'gene_counts', 'sample']].drop_duplicates()

In [17]:
df.loc[df['sample']=='heart']

Unnamed: 0,gene_id,transcript_id,sample,counts,pi,gene_potential,entropy,perplexity,n_effective_isoforms,isoform_rank,effective_isoform,n_samples_effective,expression_breadth,n_exp_samples,expression_var,avg_transcript_id_counts,avg_gene_id_counts
0,A,A_1,heart,50,0.5,8,1.0,2.0,2.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
1,A,A_2,heart,0,0.0,8,1.0,2.0,2.0,3,False,2.0,50.0,4,0.095743,15.0,100.0
2,A,A_3,heart,0,0.0,8,1.0,2.0,2.0,4,False,2.0,50.0,4,0.094516,14.0,100.0
3,A,A_4,heart,50,0.5,8,1.0,2.0,2.0,2,True,3.0,75.0,4,0.21,20.5,100.0
4,A,A_5,heart,0,0.0,8,1.0,2.0,2.0,5,False,1.0,25.0,4,0.25,50.0,100.0
5,A,A_6,heart,0,0.0,8,1.0,2.0,2.0,6,False,1.0,25.0,4,0.095219,12.0,100.0
6,A,A_7,heart,0,0.0,8,1.0,2.0,2.0,7,False,1.0,25.0,4,0.225,45.0,100.0
7,A,A_8,heart,0,0.0,8,1.0,2.0,2.0,8,False,,0.0,4,0.03,6.0,100.0


In [18]:
# # temp = df[[sample_col, feature_col, 'counts']].drop_duplicates().copy(deep=True)

# sample_col = 'sample'
# feature_col = 'gene_id'
# temp = df.copy(deep=True)

# # in case the values are not unique (ie feature = gene_id, orf_id, etc. 
# temp = df[[feature_col, sample_col, 'counts']]
# temp.head()
# temp  = (temp.groupby([feature_col, sample_col])
#         .sum()
#         .reset_index()
#         .rename({'counts':f'{feature_col}_counts'}, axis=1))
# temp.drop(sample_col, axis=1, inplace=True)
# temp = (temp.groupby(feature_col)
#            .mean()
#            .reset_index()
#            .rename({f'{feature_col}_counts':
#                     f'avg_{feature_col}_counts'}, axis=1))
# df = df.merge(temp,
#               how='left',
#               on=[feature_col]) 

In [19]:
df

Unnamed: 0,gene_id,transcript_id,sample,counts,pi,gene_potential,entropy,perplexity,n_effective_isoforms,isoform_rank,effective_isoform,n_samples_effective,expression_breadth,n_exp_samples,expression_var,avg_transcript_id_counts,avg_gene_id_counts
0,A,A_1,heart,50,0.5,8,1.0,2.0,2.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
1,A,A_2,heart,0,0.0,8,1.0,2.0,2.0,3,False,2.0,50.0,4,0.095743,15.0,100.0
2,A,A_3,heart,0,0.0,8,1.0,2.0,2.0,4,False,2.0,50.0,4,0.094516,14.0,100.0
3,A,A_4,heart,50,0.5,8,1.0,2.0,2.0,2,True,3.0,75.0,4,0.21,20.5,100.0
4,A,A_5,heart,0,0.0,8,1.0,2.0,2.0,5,False,1.0,25.0,4,0.25,50.0,100.0
5,A,A_6,heart,0,0.0,8,1.0,2.0,2.0,6,False,1.0,25.0,4,0.095219,12.0,100.0
6,A,A_7,heart,0,0.0,8,1.0,2.0,2.0,7,False,1.0,25.0,4,0.225,45.0,100.0
7,A,A_8,heart,0,0.0,8,1.0,2.0,2.0,8,False,,0.0,4,0.03,6.0,100.0
8,A,A_1,brain,45,0.45,8,1.368996,2.582907,3.0,1,True,4.0,100.0,4,0.160078,33.75,100.0
9,A,A_2,brain,0,0.0,8,1.368996,2.582907,3.0,4,False,2.0,50.0,4,0.095743,15.0,100.0


In [20]:
# we want to be able to collapse values across feature_col / sample_col
# to get global values
df = dummy_counts_data_sample()
feature_col = 'transcript_id'
sample_col = 'sample'

In [21]:
df = df[[feature_col, sample_col, 'counts']]
df.head()

Unnamed: 0,transcript_id,sample,counts
0,A_1,heart,50
1,A_1,brain,45
2,A_1,lungs,20
3,A_1,kidney,20
4,A_2,heart,0


In [22]:
df['depth'] = (df.groupby(sample_col)['counts']
               .transform('sum'))
df[f'{feature_col}_tpm'] = df['counts'] / ( df['depth'] * 1e6 )
df

Unnamed: 0,transcript_id,sample,counts,depth,transcript_id_tpm
0,A_1,heart,50,100,5e-07
1,A_1,brain,45,100,4.5e-07
2,A_1,lungs,20,100,2e-07
3,A_1,kidney,20,100,2e-07
4,A_2,heart,0,100,0.0
5,A_2,brain,0,100,0.0
6,A_2,lungs,10,100,1e-07
7,A_2,kidney,20,100,2e-07
8,A_3,heart,0,100,0.0
9,A_3,brain,0,100,0.0


In [23]:
EXP_COL = 'counts'
FEATURE_COL = TRANSCRIPT_COL = 'transcript_id'
GENE_COL = 'gene_id'

def collapse_counts_by_feature(df,
                               feature_col=TRANSCRIPT_COL,
                               gene_col=GENE_COL,
                               sample_col=None):
    """
    Collapse counts by a feature (e.g. ORF, transcript) instead of transcript.

    Parameters
    ----------
    df : pd.DataFrame
        Input table with counts at transcript level.
    feature_col : str
        Alternative feature column to collapse to (e.g. 'orf_id').
    gene_col : str
        Column identifying genes.
    sample_col : str, optional
        Sample column. If None, assumes single-sample bulk.

    Returns
    -------
    pd.DataFrame
        Collapsed df with summed counts per feature.
    """
    group_cols = [gene_col, feature_col]
    if sample_col is not None:
        group_cols.append(sample_col)

    # Sum counts for all transcripts mapping to the same feature
    out = (df.groupby(group_cols, as_index=False)['counts']
          .sum())

    return out

def dummy_data_with_orf():
    """
    Generate dummy multi-sample isoform counts
    with an additional ORF-level grouping.

    Returns
    -------
    pd.DataFrame
    """
    import pandas as pd
    
    genes = ['A', 'B']
    samples = ['heart', 'brain', 'lungs', 'kidney']
    
    # map transcripts to ORFs
    orf_map = {}
    for g in genes:
        orf_map[f'{g}_1'] = f'{g}_orf1'
        orf_map[f'{g}_2'] = f'{g}_orf1'
        orf_map[f'{g}_3'] = f'{g}_orf2'
        orf_map[f'{g}_4'] = f'{g}_orf2'

    rows = []
    for s in samples:
        for g in genes:
            for i in range(1, 5):
                t = f'{g}_{i}'
                counts = (hash(f"{s}_{t}") % 50)  # deterministic dummy counts
                rows.append({
                    'sample': s,
                    'gene_id': g,
                    'transcript_id': t,
                    'orf_id': orf_map[t],
                    'counts': counts
                })

    return pd.DataFrame(rows)


In [27]:
df = dummy_data_with_orf()
df2 = compute_avg_expression(df, 
                             feature_col='orf_id',
                               sample_col='sample')
# df2 = collapse_counts_by_feature(df,
#                                feature_col='orf_id',
#                                gene_col='gene_id',
#                                sample_col='sample')

In [28]:
df.loc[df.orf_id=='A_orf1']

Unnamed: 0,sample,gene_id,transcript_id,orf_id,counts
0,heart,A,A_1,A_orf1,33
1,heart,A,A_2,A_orf1,43
8,brain,A,A_1,A_orf1,34
9,brain,A,A_2,A_orf1,34
16,lungs,A,A_1,A_orf1,21
17,lungs,A,A_2,A_orf1,17
24,kidney,A,A_1,A_orf1,30
25,kidney,A,A_2,A_orf1,42


In [29]:
df2.loc[df2.orf_id=='A_orf1']

Unnamed: 0,sample,gene_id,transcript_id,orf_id,counts,avg_orf_id_counts
0,heart,A,A_1,A_orf1,33,63.5
1,heart,A,A_2,A_orf1,43,63.5
8,brain,A,A_1,A_orf1,34,63.5
9,brain,A,A_2,A_orf1,34,63.5
16,lungs,A,A_1,A_orf1,21,63.5
17,lungs,A,A_2,A_orf1,17,63.5
24,kidney,A,A_1,A_orf1,30,63.5
25,kidney,A,A_2,A_orf1,42,63.5


In [31]:
((33+43)+(34+34)+(21+17)+(30+42))/4

63.5