In [74]:
from utils import *
import scipy.stats
import pandas as pd
import numpy as np
from scipy import sparse
import h5py

In [75]:
def manuscript_sample_df():
    """
    Build the single-gene toy expression table used as a worked example.

    Gene ``A`` has eight transcripts (``A_1`` .. ``A_8``) measured in four
    samples. The ``pi`` values below are multiplied by 100 before being
    stored; within each sample the unscaled values add up to 1.

    Returns
    -------
    pandas.DataFrame
        Wide table with one row per transcript: ``gene_id``,
        ``transcript_id`` and one column per sample. ``pivot`` orders the
        sample columns alphabetically (brain, heart, kidney, lungs) and the
        columns' name is set to ``''``.
    """
    samples = ['heart', 'brain', 'lungs', 'kidney']

    # one row per transcript (A_1 .. A_8); entries follow the order of `samples`
    pi_by_transcript = [
        [0.5, 0.45, 0.2, 0.2],
        [0, 0, 0.1, 0.2],
        [0, 0, 0.08, 0.2],
        [0.5, 0.1, 0.02, 0.2],
        [0, 0, 0.5, 0],
        [0, 0, 0.04, 0.2],
        [0, 0.45, 0, 0],
        [0, 0, 0.06, 0],
    ]

    # long format first: one record per (transcript, sample) pair
    records = [
        {
            'gene_id': 'A',
            'transcript_id': f'A_{transcript_number}',
            'sample': sample,
            'pi': fraction * 100,
        }
        for transcript_number, fractions in enumerate(pi_by_transcript, start=1)
        for sample, fraction in zip(samples, fractions)
    ]
    df = pd.DataFrame(records, columns=['gene_id', 'transcript_id', 'sample', 'pi'])

    # pivot to wide format: one column per sample
    df = df.pivot(index=['gene_id', 'transcript_id'], columns='sample', values='pi')
    df = df.reset_index()
    df.columns.name = ''

    return df

# remove unexpressed transcripts and genes

In [76]:
def simple_counts_df():
    """Return a toy table of two genes (G1, G2) with two transcripts each.

    G1's transcripts have counts 100 and 50; both G2 transcripts have 0.
    """
    header = ["gene_id", "transcript_id", "global_counts"]
    rows = [
        ("G1", "T1", 100),
        ("G1", "T2", 50),
        ("G2", "T3", 0),
        ("G2", "T4", 0),
    ]
    return pd.DataFrame(rows, columns=header)

In [87]:
# Load the two-gene toy counts table and preview it.
df = simple_counts_df()
df.head()

Unnamed: 0,gene_id,transcript_id,global_counts
0,G1,T1,100
1,G1,T2,50
2,G2,T3,0
3,G2,T4,0


In [88]:

# Column configuration used by the helper calls in this cell.
gene_col = 'gene_id'
feature_col = 'transcript_id'
expression_type = 'counts'

# the gene/feature column names are passed to every helper below
id_cols = {'gene_col': gene_col, 'feature_col': feature_col}

validate_counts_input(df, **id_cols)
df = rename_sample_col(df, expression_type=expression_type, **id_cols)

# collapse counts if feature_col is not unique
df = collapse_counts_by_feature(
    df,
    expression_type=expression_type,
    sample_col=None,
    **id_cols,
)
df.head()

Unnamed: 0,gene_id,transcript_id,counts
0,G1,T1,100
1,G1,T2,50
2,G2,T3,0
3,G2,T4,0


In [89]:
# keep only the transcripts whose count is above zero
is_expressed = df['counts'] > 0
df = df.loc[is_expressed]
df.head()

Unnamed: 0,gene_id,transcript_id,counts
0,G1,T1,100
1,G1,T2,50


In [18]:
# Rebind `df` to the eight-transcript, four-sample toy table and preview it.
df = manuscript_sample_df()
df.head()

Unnamed: 0,gene_id,transcript_id,brain,heart,kidney,lungs
0,A,A_1,45.0,50.0,20.0,20.0
1,A,A_2,0.0,0.0,20.0,10.0
2,A,A_3,0.0,0.0,20.0,8.0
3,A,A_4,10.0,50.0,20.0,2.0
4,A,A_5,0.0,0.0,0.0,50.0


In [19]:
gene_col = 'gene_id'
feature_col = 'transcript_id'
expression_type = 'counts'

# The sample columns of `df` hold the toy pi*100 values; they are passed
# through as 'counts' here. The helper returns two tables.
sample_df, global_df = compute_multi_sample_isoform_metrics(
    df,
    gene_col=gene_col,
    feature_col=feature_col,
    expression_type=expression_type,
)

In [20]:
sample_df.head()

Unnamed: 0,gene_id,transcript_id,counts,tpm,pi,n_detected_features,entropy,perplexity,n_effective,feature_rank,effective,sample,n_samples_effective,expression_breadth,n_exp_samples,expression_var,avg_transcript_id_tpm,max_transcript_id_tpm,avg_gene_id_tpm,max_gene_id_tpm
0,A,A_1,45.0,450000.0,0.45,3,1.368996,2.582907,3.0,1,True,brain,4.0,100.0,4,0.160078,337500.0,500000.0,1000000.0,1000000.0
1,A,A_2,0.0,0.0,0.0,3,1.368996,2.582907,3.0,4,False,brain,2.0,50.0,4,0.095743,150000.0,200000.0,1000000.0,1000000.0
2,A,A_3,0.0,0.0,0.0,3,1.368996,2.582907,3.0,5,False,brain,2.0,50.0,4,0.094516,140000.0,200000.0,1000000.0,1000000.0
3,A,A_4,10.0,100000.0,0.1,3,1.368996,2.582907,3.0,3,True,brain,3.0,75.0,4,0.21,205000.0,500000.0,1000000.0,1000000.0
4,A,A_5,0.0,0.0,0.0,3,1.368996,2.582907,3.0,6,False,brain,1.0,25.0,4,0.25,500000.0,500000.0,1000000.0,1000000.0


In [72]:
feature_out = 'test_feature.h5'

def _encode_ids(labels):
    """Return `labels` as a NumPy bytes ('S') array; non-ASCII text falls back to UTF-8."""
    ids = np.array(labels)
    try:
        # plain conversion; NumPy encodes text as ASCII here
        return ids.astype('S')
    except UnicodeEncodeError:
        # labels with non-ASCII characters: encode explicitly instead of failing
        return np.char.encode(ids.astype(str), 'utf-8')


def create_h5_sparse_table(df, fname, mode, key):
    """
    Write a DataFrame into an HDF5 group as CSR sparse-matrix components.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to store. ``df.values`` is converted with
        ``scipy.sparse.csr_matrix``, so rows of the stored matrix follow
        ``df.index`` and columns follow ``df.columns``.
    fname : str
        Path of the HDF5 file, passed to ``h5py.File``.
    mode : str
        File mode passed to ``h5py.File`` (the notebook uses ``'w'`` for the
        first table and ``'a'`` for the following ones).
    key : str
        Name of the group to create, e.g. ``'feature/pi'``. The group is
        created with ``create_group``; h5py raises if it already exists.

    Notes
    -----
    Datasets written inside the group:

    * ``data``, ``indices``, ``indptr`` -- the CSR arrays
    * ``shape`` -- matrix shape (rows, columns)
    * ``obs`` -- ``df.columns`` labels as bytes
    * ``var`` -- ``df.index`` labels as bytes
    """
    # Convert everything before opening the file, so a failing conversion
    # cannot truncate an existing file or leave a half-written group.
    X = sparse.csr_matrix(df.values)
    obs_ids = _encode_ids(df.columns)
    var_ids = _encode_ids(df.index)

    with h5py.File(fname, mode) as f:
        g = f.create_group(key)

        # add these all as hierarchical keys
        g.create_dataset('data', data=X.data)
        g.create_dataset('indices', data=X.indices)
        g.create_dataset('indptr', data=X.indptr)
        g.create_dataset('shape', data=X.shape)
        g.create_dataset('obs', data=obs_ids)
        g.create_dataset('var', data=var_ids)

# first, make the isoform stack of DFs
# dict.fromkeys drops duplicates (expression_type may equal one of the fixed
# names) while keeping this order; set() order can change between runs, which
# made the write/print order non-reproducible
feature_cols = list(dict.fromkeys([expression_type, 'tpm', 'pi', 'feature_rank', 'effective']))
feature_ids = [feature_col, 'sample']
for i, c in enumerate(feature_cols):
    print(c)
    temp = sample_df[[c]+feature_ids]
    temp = temp.pivot(index=feature_col, columns='sample', values=c)
    temp.columns.name = ''

    # the first table opens the file with 'w'; later tables are appended with 'a'
    mode = 'w' if i == 0 else 'a'

    key = f'feature/{c}'
    create_h5_sparse_table(temp, feature_out, mode, key)

effective
tpm
pi
feature_rank
counts


In [70]:
temp.head()

Unnamed: 0,effective,transcript_id,sample
0,True,A_1,brain
1,False,A_2,brain
2,False,A_3,brain
3,True,A_4,brain
4,False,A_5,brain


In [56]:
# Read the 'pi' table back from the CSR components stored under 'feature/pi'.
# pd.read_hdf cannot be used here: the file is written with h5py as raw
# datasets under 'feature/<name>', not as a pandas/PyTables object at 'pi'.
with h5py.File(feature_out, 'r') as f:
    g = f['feature/pi']
    X = sparse.csr_matrix(
        (g['data'][:], g['indices'][:], g['indptr'][:]),
        shape=tuple(int(n) for n in g['shape'][:]),
    )
    # labels were stored as bytes; decode them back to text
    obs_ids = np.char.decode(g['obs'][:], 'utf-8')
    var_ids = np.char.decode(g['var'][:], 'utf-8')

# rows follow the stored 'var' labels, columns the stored 'obs' labels
temp = pd.DataFrame(X.toarray(),
                    index=pd.Index(var_ids, name=feature_col),
                    columns=obs_ids)
temp.head()

Unnamed: 0_level_0,brain,heart,kidney,lungs
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A_1,0.45,0.5,0.2,0.2
A_2,0.0,0.0,0.2,0.1
A_3,0.0,0.0,0.2,0.08
A_4,0.1,0.5,0.2,0.02
A_5,0.0,0.0,0.0,0.5


In [38]:
temp.index

RangeIndex(start=0, stop=8, step=1)

In [73]:
df

Unnamed: 0,gene_id,transcript_id
0,A,A_1
1,A,A_2
2,A,A_3
3,A,A_4
4,A,A_5
5,A,A_6
6,A,A_7
7,A,A_8
