## Generating a Table of the BioBombe Interpreted Features in a High Scoring Ensemble model of TP53 inactivation

**Gregory Way, 2019**

I use the previously identified top-performing ensemble model for predicting TP53 inactivation.
I then examine the BioBombe gene set enrichment scores for the features with the largest absolute coefficients in this model.

In [1]:
import os
import sys
import pandas as pd

## Load the Top Model Identified Previously

In [2]:
# Tab-separated table describing the top TP53 ensemble model kept for follow-up
model_file = os.path.join(
    "results",
    "top_model_ensemble_tp53_feature_for_followup.tsv",
)
top_model_df = pd.read_csv(model_file, sep="\t")
top_model_df

Unnamed: 0,gene,signal,z_dim,seed,algorithm,weight,num_features,percent_zero,auroc,aupr,data_type,grouping_,auroc_raw,aupr_raw,z_dim_shape
0,TP53,signal,200,ensemble,Model Ensemble,906,1022,0.886497,0.90534,0.875,cv,TP53signal,0.91836,0.88467,z >= 20


In [3]:
# Seed of the single compression run whose features are inspected below
seed = "165158"

# Latent dimensionality recorded in the first row of the top model table
z_dim = top_model_df["z_dim"].iloc[0]

## Load the BioBombe network projection results for Cancer Hallmarks

In [4]:
# BioBombe network projection gene set scores (TCGA, "gph" collection, signal models)
file = os.path.join(
    "..",
    "6.biobombe-projection",
    "results",
    "tcga",
    "gph",
    "signal",
    "tcga_z_200_GpH__geneset_scores.tsv.gz",
)

# Restrict to the chosen seed and latent dimension, then add two helper columns:
#   full_feature - "<algorithm>_<feature>" identifier for each compressed feature
#   abs_z_score  - magnitude of the gene set z score
scores_df = (
    pd.read_table(file)
    .query("seed == @seed")
    .query("z == @z_dim")
    .assign(
        full_feature=lambda df: df.algorithm.astype(str) + "_" + df.feature.astype(str),
        abs_z_score=lambda df: df.z_score.abs(),
    )
)
scores_df.head()

Unnamed: 0,model_type,variable,value,z_score,algorithm,feature,z,seed,full_feature,abs_z_score
8,real,HALLMARK_OXIDATIVE_PHOSPHORYLATION,-0.260304,-21.8954,ica,59,200,165158,ica_59,21.8954
20,real,HALLMARK_MYC_TARGETS_V1,-1.54717,-19.3482,pca,7,200,165158,pca_7,19.3482
29,real,HALLMARK_TNFA_SIGNALING_VIA_NFKB,-18.5156,-18.7641,dae,114,200,165158,dae_114,18.7641
30,real,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-5.4932,-18.4886,vae,197,200,165158,vae_197,18.4886
31,real,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-4.00936,-18.0103,vae,179,200,165158,vae_179,18.0103


## Load Model Coefficients

In [5]:
# Coefficients of the TP53 ensemble classifier trained on features from all algorithms
file = os.path.join(
    "results", "mutation_ensemble", "TP53", "TP53_ensemble_all_alg_coefficients.tsv.gz"
)

# Number of features to keep, ranked by the `abs` column
top_n_features = 10

coef_df = (
    pd.read_table(file)
    .query("seed == @seed")
    .query("z_dim == @z_dim")
    .query("signal == 'signal'")
    .sort_values(by='abs', ascending=False)
    .head(top_n_features)
    .reset_index(drop=True)
)

# Feature names are underscore-delimited; break each one into its five components
# and build the "<algorithm>_<number>" key used to match the BioBombe scores.
feature_parts = coef_df.feature.str.split('_').tolist()
coef_extract_df = pd.DataFrame(
    feature_parts,
    columns=['feature_alg', 'feature_num', 'feature_seed', 'feature_z', 'feature_signal'],
).assign(use_feature=lambda df: df.feature_alg + "_" + df.feature_num)

# Both frames share a fresh 0..n-1 index, so a column-wise concat lines rows up
coef_df = pd.concat([coef_df, coef_extract_df], axis=1)

use_features = coef_df.use_feature.tolist()
coef_df

Unnamed: 0,feature,weight,abs,signal,z_dim,seed,algorithm,gene,feature_alg,feature_num,feature_seed,feature_z,feature_signal,use_feature
0,ica_191_165158_200_signal,-0.16781,0.16781,signal,200,165158,all_ensemble,TP53,ica,191,165158,200,signal,ica_191
1,dae_126_165158_200_signal,0.16749,0.16749,signal,200,165158,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
2,pca_32_165158_200_signal,0.15968,0.15968,signal,200,165158,all_ensemble,TP53,pca,32,165158,200,signal,pca_32
3,pca_23_165158_200_signal,0.14462,0.14462,signal,200,165158,all_ensemble,TP53,pca,23,165158,200,signal,pca_23
4,vae_172_165158_200_signal,-0.12311,0.12311,signal,200,165158,all_ensemble,TP53,vae,172,165158,200,signal,vae_172
5,vae_136_165158_200_signal,-0.11075,0.11075,signal,200,165158,all_ensemble,TP53,vae,136,165158,200,signal,vae_136
6,pca_99_165158_200_signal,0.10979,0.10979,signal,200,165158,all_ensemble,TP53,pca,99,165158,200,signal,pca_99
7,nmf_97_165158_200_signal,-0.10714,0.10714,signal,200,165158,all_ensemble,TP53,nmf,97,165158,200,signal,nmf_97
8,dae_7_165158_200_signal,0.10229,0.10229,signal,200,165158,all_ensemble,TP53,dae,7,165158,200,signal,dae_7
9,vae_18_165158_200_signal,-0.099937,0.099937,signal,200,165158,all_ensemble,TP53,vae,18,165158,200,signal,vae_18


In [6]:
# Explore the BioBombe gene set scores for the top-weighted ensemble features.
# The selected features come from every algorithm in the ensemble, not only DAE.

# Number of gene sets (ranked by |z score|) to keep for each feature
top_n_features = 10

# Attach the classifier coefficient information to each gene set score that
# belongs to one of the selected features.
biobombe_df = (
    scores_df
    .query("full_feature in @use_features")
    .merge(coef_df,
           how='left',
           left_on=['full_feature', 'algorithm', 'seed'],
           right_on=['use_feature', 'feature_alg', 'seed'])
    .drop(['model_type', 'feature_x', 'feature_y', 'signal'], axis='columns')
    .sort_values(by=['abs', 'abs_z_score'], ascending=False)
    .reset_index(drop=True)
)

# For each feature keep only its highest |z score| gene sets.
# SeriesGroupBy.nlargest always yields a (full_feature, original row index)
# MultiIndex, so `level_1` exists after reset_index even when only one feature
# is present; DataFrameGroupBy.apply with a Series-returning lambda does not
# guarantee that shape and warns about grouping columns on recent pandas.
top_biobombe_df = (
    biobombe_df
    .groupby('full_feature')['abs_z_score']
    .nlargest(top_n_features)
    .reset_index()
    # Recover the remaining columns by joining back on the original row index
    .merge(biobombe_df
           .reset_index(),
           right_on=['index', 'abs_z_score', 'full_feature'],
           left_on=['level_1', 'abs_z_score', 'full_feature'])
    .drop(['level_1', 'index'], axis='columns')
    .sort_values(by=['weight', 'z_score'], ascending=False)
)

print(top_biobombe_df.shape)
top_biobombe_df

(100, 19)


Unnamed: 0,full_feature,abs_z_score,variable,value,z_score,algorithm_x,z,seed,weight,abs,z_dim,algorithm_y,gene,feature_alg,feature_num,feature_seed,feature_z,feature_signal,use_feature
0,dae_126,9.58473,HALLMARK_MYC_TARGETS_V1,3.797060,9.58473,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
1,dae_126,5.68817,HALLMARK_MTORC1_SIGNALING,3.365900,5.68817,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
2,dae_126,4.99144,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,4.776580,4.99144,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
5,dae_126,3.48989,HALLMARK_COAGULATION,2.101940,3.48989,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
8,dae_126,3.08240,HALLMARK_GLYCOLYSIS,3.331440,3.08240,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
9,dae_126,3.07642,HALLMARK_WNT_BETA_CATENIN_SIGNALING,-0.432020,-3.07642,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
7,dae_126,3.30858,HALLMARK_P53_PATHWAY,-1.755170,-3.30858,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
6,dae_126,3.43755,HALLMARK_ESTROGEN_RESPONSE_EARLY,-1.173030,-3.43755,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
4,dae_126,3.65614,HALLMARK_XENOBIOTIC_METABOLISM,-1.648150,-3.65614,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126
3,dae_126,4.86171,HALLMARK_ALLOGRAFT_REJECTION,0.007916,-4.86171,dae,200,165158,0.16749,0.16749,200,all_ensemble,TP53,dae,126,165158,200,signal,dae_126


In [7]:
# Save the top BioBombe gene set scores for the highest-weighted ensemble features
# (tab-separated, without the DataFrame index)
file = os.path.join(
    'results',
    'tcga_tp53_classify_top_biobombe_scores_ensemble_model_table.tsv',
)
top_biobombe_df.to_csv(file, sep='\t', index=False)