## Generating a Table of the BioBombe Interpreted Features in a High Scoring DAE model of TP53 inactivation

**Gregory Way, 2019**

I use the previously identified top-performing model for predicting TP53 inactivation.
For the features with the largest absolute coefficients in this model, I examine the BioBombe gene set enrichment scores.

In [1]:
import os
import sys
import pandas as pd

## Load the Top Model Identified Previously

In [2]:
# Path to the table describing the top TP53 DAE model chosen in an earlier step
model_file = os.path.join(
    "results",
    "top_dae_tp53_feature_for_followup.tsv",
)

# Tab-separated file; the frame is displayed so the selected model is visible
top_model_df = pd.read_table(filepath_or_buffer=model_file)
top_model_df

Unnamed: 0,gene,signal,z_dim,seed,algorithm,weight,num_features,percent_zero,auroc,aupr,data_type,grouping_,auroc_raw,aupr_raw,z_dim_shape
0,TP53,signal,200,908341,DAE,179,222,0.806306,0.86364,0.82745,cv,TP53signal,0.91836,0.88467,z >= 20


In [3]:
# Pull the identifying settings of the top model from the first row of the
# table; they are used below to filter the score and coefficient files.
seed = top_model_df["seed"].values[0]
z_dim = top_model_df["z_dim"].values[0]
# Lower-cased so it can be compared against the `algorithm` column of the
# downstream files (their displayed values are lower case, e.g. "dae").
algorithm = top_model_df["algorithm"].values[0].lower()

## Load the BioBombe network projection results for Cancer Hallmarks

In [4]:
# Gene set score file produced by the weight-analysis module.
# NOTE(review): the file name hard-codes z = 200 while `z_dim` is read from the
# top model table -- confirm they stay in sync if a different model is chosen.
file = os.path.join(
    "..", "6.analyze-weights", "results", "tcga", "gph", "signal",
    "tcga_z_200_GpH__geneset_scores.tsv.gz",
)

scores_df = (
    pd.read_table(file)
    # Keep only rows belonging to the selected top model
    .query("seed == @seed and z == @z_dim and algorithm == @algorithm")
    # full_feature: "<algorithm>_<feature>" label, the same form as the feature
    # names in the coefficient table (e.g. "dae_170");
    # abs_z_score: magnitude of the enrichment z score, ignoring direction
    .assign(
        full_feature=lambda df: df.algorithm.astype(str) + "_" + df.feature.astype(str),
        abs_z_score=lambda df: df.z_score.abs(),
    )
)
scores_df.head()

Unnamed: 0,model_type,variable,value,z_score,algorithm,feature,z,seed,full_feature,abs_z_score
17,real,HALLMARK_TNFA_SIGNALING_VIA_NFKB,-10.7069,-19.8077,dae,170,200,908341,dae_170,19.8077
27,real,HALLMARK_G2M_CHECKPOINT,-15.2559,-19.1039,dae,110,200,908341,dae_110,19.1039
32,real,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-12.6927,-17.7711,dae,99,200,908341,dae_99,17.7711
62,real,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-13.4749,-15.7488,dae,192,200,908341,dae_192,15.7488
88,real,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-11.1642,-14.7547,dae,73,200,908341,dae_73,14.7547


## Load Model Coefficients

In [5]:
file = os.path.join("results", "mutation", "TP53", "TP53_coefficients.tsv.gz")

# How many of the largest-magnitude coefficients to keep
top_n_features = 8

# Restrict the coefficient table to the selected model fit on real ("signal") data
coef_df = pd.read_table(file).query(
    "seed == @seed and z_dim == @z_dim and algorithm == @algorithm and signal == 'signal'"
)

# Rank by absolute coefficient and keep the strongest features
coef_df = coef_df.sort_values(by='abs', ascending=False).head(top_n_features)

# Feature names used to subset the gene set scores in the next step
use_features = coef_df.feature.tolist()
coef_df

Unnamed: 0,feature,weight,abs,signal,z_dim,seed,algorithm,gene
6201,dae_156,0.23558,0.23558,signal,200,908341,dae,TP53
6202,dae_46,-0.22864,0.22864,signal,200,908341,dae,TP53
6203,dae_122,0.19832,0.19832,signal,200,908341,dae,TP53
6204,dae_24,-0.18192,0.18192,signal,200,908341,dae,TP53
6205,log10_mut,0.17411,0.17411,signal,200,908341,dae,TP53
6206,dae_7,0.16581,0.16581,signal,200,908341,dae,TP53
6207,dae_130,-0.16391,0.16391,signal,200,908341,dae,TP53
6208,dae_88,-0.15494,0.15494,signal,200,908341,dae,TP53


In [6]:
# Explore the biobombe scores for specific DAE features
#
# NOTE: `top_n_features` is re-used from the coefficient cell with a different
# meaning: here it is the number of gene sets retained *per feature* (ranked by
# absolute z score), not the number of model features.
top_n_features = 10

# Attach each selected feature's model coefficient to its gene set scores.
# Features in `use_features` that have no rows in `scores_df` simply do not
# appear in the result.
biobombe_df = (
    scores_df
    .query("full_feature in @use_features")
    .merge(coef_df,
           how='left',
           left_on=['full_feature', 'algorithm', 'seed'],
           right_on=['feature', 'algorithm', 'seed'])
    # `feature` exists in both frames, so the merge suffixes it with _x / _y
    .drop(['model_type', 'feature_x', 'feature_y', 'signal'], axis='columns')
    .sort_values(by=['abs', 'abs_z_score'], ascending=False)
    .reset_index(drop=True)
)

# Keep the output table's column layout: feature label and score magnitude
# first, followed by the remaining columns in their existing order.
leading_columns = ['full_feature', 'abs_z_score']
column_order = leading_columns + [
    column for column in biobombe_df.columns if column not in leading_columns
]

# Select the `top_n_features` strongest gene sets for every feature.
# A stable sort followed by `groupby(...).head(n)` replaces the previous
# `groupby.apply(nlargest)` + index merge round trip, which depended on
# `apply` returning a MultiIndex Series (not guaranteed when only a single
# feature group is present) and on re-joining rows through their index.
top_biobombe_df = (
    biobombe_df
    # nlargest ignores missing scores; do the same explicitly
    .dropna(subset=['abs_z_score'])
    # mergesort is stable, so tied scores keep their existing relative order
    .sort_values(by='abs_z_score', ascending=False, kind='mergesort')
    .groupby('full_feature', sort=False)
    .head(top_n_features)
    .loc[:, column_order]
    # Present features from most positive to most negative coefficient and,
    # within a feature, gene sets from most positive to most negative z score
    .sort_values(by=['weight', 'z_score'], ascending=False)
    .reset_index(drop=True)
)

print(top_biobombe_df.shape)
top_biobombe_df

(70, 12)


Unnamed: 0,full_feature,abs_z_score,variable,value,z_score,algorithm,z,seed,weight,abs,z_dim,gene
21,dae_156,4.48129,HALLMARK_INTERFERON_GAMMA_RESPONSE,1.465570,4.48129,dae,200,908341,0.23558,0.23558,200,TP53
24,dae_156,3.16951,HALLMARK_HEME_METABOLISM,1.952970,3.16951,dae,200,908341,0.23558,0.23558,200,TP53
25,dae_156,3.08817,HALLMARK_INTERFERON_ALPHA_RESPONSE,0.569345,3.08817,dae,200,908341,0.23558,0.23558,200,TP53
27,dae_156,2.90375,HALLMARK_UV_RESPONSE_UP,1.273850,2.90375,dae,200,908341,0.23558,0.23558,200,TP53
28,dae_156,2.77715,HALLMARK_ESTROGEN_RESPONSE_LATE,0.828225,2.77715,dae,200,908341,0.23558,0.23558,200,TP53
29,dae_156,2.76895,HALLMARK_HYPOXIA,-1.659480,-2.76895,dae,200,908341,0.23558,0.23558,200,TP53
26,dae_156,3.04312,HALLMARK_OXIDATIVE_PHOSPHORYLATION,-1.667030,-3.04312,dae,200,908341,0.23558,0.23558,200,TP53
23,dae_156,3.64827,HALLMARK_COMPLEMENT,-1.425400,-3.64827,dae,200,908341,0.23558,0.23558,200,TP53
22,dae_156,4.11478,HALLMARK_KRAS_SIGNALING_UP,-1.470770,-4.11478,dae,200,908341,0.23558,0.23558,200,TP53
20,dae_156,4.92812,HALLMARK_COAGULATION,-2.637230,-4.92812,dae,200,908341,0.23558,0.23558,200,TP53


In [7]:
# Write the per-feature top gene set table to disk as a tab-separated file
# (the DataFrame index is not written)
file = os.path.join(
    'results',
    'tcga_tp53_classify_top_biobombe_scores_dae_table.tsv',
)
top_biobombe_df.to_csv(path_or_buf=file, index=False, sep='\t')