## Generating a Table of the BioBombe Interpreted Features in the All Feature Ensemble model of TP53 inactivation

**Gregory Way, 2019**

I load the previously identified all feature ensemble model that predicts TP53 inactivation.
For the features with nonzero coefficients in this model, I retrieve the BioBombe gene set enrichment scores and tabulate the top-scoring gene sets per feature.

In [1]:
import os
import sys
import pandas as pd

## Load the `All Feature` Ensemble Model

In [2]:
# Summary row describing the selected all-feature ensemble model
model_file = os.path.join(
    "results",
    "top_model_ensemble_all_features_tp53_feature_for_followup.tsv",
)
top_model_df = pd.read_csv(model_file, sep="\t")
top_model_df

Unnamed: 0,gene,signal,z_dim,seed,algorithm,weight,num_features,percent_zero,auroc,aupr,data_type
0,TP53,signal,all,ensemble_all_features,all_feature_ensemble,30555,30872,0.989732,0.91202,0.88231,cv


In [3]:
# Load the ensemble classifier coefficients and discard the identifier
# columns that are re-derived from the feature name in the next cell
coef_file = os.path.join(
    "results", "mutation_ensemble_all", "TP53",
    "TP53_ensemble_all_features_coefficients.tsv.gz"
)
redundant_id_columns = ['signal', 'z_dim', 'seed', 'algorithm']
coef_df = pd.read_table(coef_file).drop(columns=redundant_id_columns)
coef_df.head()

Unnamed: 0,feature,weight,abs,gene
0,vae_133_978124_200_signal,0.082262,0.082262,TP53
1,vae_6_908341_50_signal,0.075826,0.075826,TP53
2,vae_4_908341_150_signal,-0.065103,0.065103,TP53
3,dae_140_451283_150_signal,-0.062348,0.062348,TP53
4,nmf_97_486191_200_signal,-0.061847,0.061847,TP53


In [4]:
# Split each feature name on "_" into its five identifier fields
feature_id_columns = ['algorithm', 'individual_feature', 'seed', 'k', 'signal']
feature_id_parts = coef_df.feature.str.split("_").values.tolist()

# Attach the identifier fields to the coefficients, then keep only the
# nonzero-weight features whose `signal` field equals 'signal'
full_coef_id_df = (
    pd.concat(
        [pd.DataFrame(feature_id_parts, columns=feature_id_columns), coef_df],
        axis='columns',
    )
    .query("abs > 0")
    .query("signal == 'signal'")
)

print(full_coef_id_df.shape)
full_coef_id_df.head()

(317, 9)


Unnamed: 0,algorithm,individual_feature,seed,k,signal,feature,weight,abs,gene
0,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53
1,vae,6,908341,50,signal,vae_6_908341_50_signal,0.075826,0.075826,TP53
2,vae,4,908341,150,signal,vae_4_908341_150_signal,-0.065103,0.065103,TP53
3,dae,140,451283,150,signal,dae_140_451283_150_signal,-0.062348,0.062348,TP53
4,nmf,97,486191,200,signal,nmf_97_486191_200_signal,-0.061847,0.061847,TP53


## Load Network Projection Results

In [5]:
# Directory holding the network projection score files for the TCGA signal data
gph_dir = os.path.join("..",
                       "6.biobombe-projection",
                       "results",
                       "tcga",
                       "gph",
                       "signal")

# os.listdir returns entries in arbitrary order; sort them so the row order
# of the concatenated score table (and every displayed head()) is the same
# on every run and every filesystem
gph_files = sorted(os.listdir(gph_dir))

In [6]:
# Read every file found in the projection directory into its own DataFrame
all_scores_list = [
    pd.read_table(os.path.join(gph_dir, gph_file))
    for gph_file in gph_files
]

In [7]:
# Stack the per-file score tables row-wise into one long DataFrame
all_scores_df = pd.concat(all_scores_list, axis=0)

print(all_scores_df.shape)
all_scores_df.head()

(1542500, 8)


Unnamed: 0,model_type,variable,value,z_score,algorithm,feature,z,seed
0,real,HALLMARK_TNFA_SIGNALING_VIA_NFKB,-0.01189,-34.8593,ica,5,8,165158
1,real,HALLMARK_TNFA_SIGNALING_VIA_NFKB,-0.010828,-25.8855,ica,4,8,978124
2,real,HALLMARK_XENOBIOTIC_METABOLISM,-0.019012,-25.6744,ica,3,8,451283
3,real,HALLMARK_XENOBIOTIC_METABOLISM,-0.018991,-25.2048,ica,6,8,486191
4,real,HALLMARK_G2M_CHECKPOINT,-0.021909,-23.2724,ica,5,8,908341


In [8]:
# Rebuild the "<algorithm>_<feature>_<seed>_<z>_signal" identifier so the
# scores can be joined against the coefficient table's `feature` column,
# and record the magnitude of each z-score for ranking
score_feature_id = (
    all_scores_df.algorithm
    + "_" + all_scores_df.feature.astype(str)
    + "_" + all_scores_df.seed.astype(str)
    + "_" + all_scores_df.z.astype(str)
    + "_signal"
)
all_scores_df = all_scores_df.assign(
    big_feature_id=score_feature_id,
    abs_z_score=all_scores_df.z_score.abs(),
)

In [9]:
# Left-join so that every retained coefficient row is kept, matching the
# coefficient feature name against the rebuilt score identifier
merged_coef_scores_df = full_coef_id_df.merge(
    all_scores_df,
    how='left',
    left_on="feature",
    right_on="big_feature_id",
)

# Order by coefficient magnitude first, then by gene set z-score magnitude
all_coef_scores_df = (
    merged_coef_scores_df
    .sort_values(by=['abs', 'abs_z_score'], ascending=False)
    .reset_index(drop=True)
)

all_coef_scores_df.head()

Unnamed: 0,algorithm_x,individual_feature,seed_x,k,signal,feature_x,weight,abs,gene,model_type,variable,value,z_score,algorithm_y,feature_y,z,seed_y,big_feature_id,abs_z_score
0,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53,real,HALLMARK_ESTROGEN_RESPONSE_EARLY,-3.89942,-10.5085,vae,133,200,978124,vae_133_978124_200_signal,10.5085
1,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53,real,HALLMARK_ESTROGEN_RESPONSE_LATE,-4.346,-6.83035,vae,133,200,978124,vae_133_978124_200_signal,6.83035
2,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53,real,HALLMARK_P53_PATHWAY,-3.46517,-6.42001,vae,133,200,978124,vae_133_978124_200_signal,6.42001
3,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53,real,HALLMARK_COAGULATION,-2.60747,-6.38283,vae,133,200,978124,vae_133_978124_200_signal,6.38283
4,vae,133,978124,200,signal,vae_133_978124_200_signal,0.082262,0.082262,TP53,real,HALLMARK_XENOBIOTIC_METABOLISM,-3.64404,-6.15825,vae,133,200,978124,vae_133_978124_200_signal,6.15825


In [10]:
# For every model feature (grouped by big_feature_id), keep the gene sets
# with the largest absolute z-scores.
# NOTE: despite its name, top_n_features is the number of gene set rows
# retained per feature, not a number of features.
top_n_features = 5

biobombe_df = (
    all_coef_scores_df
    .groupby('big_feature_id')
    # Per group, select the top_n_features largest abs_z_score values; the
    # result is indexed by (big_feature_id, original row index)
    .apply(func=lambda x: x.abs_z_score.nlargest(top_n_features))
    # The original row index surfaces as the `level_1` column
    .reset_index()
    # Join back on the original row index (plus score and feature id) to
    # recover the remaining columns of the selected rows
    .merge(all_coef_scores_df
           .reset_index(),
           right_on=['index', 'abs_z_score', 'big_feature_id'],
           left_on=['level_1', 'abs_z_score', 'big_feature_id'])
    # Remove the join helpers and the duplicated identifier columns
    .drop(['level_1', 'index', 'feature_x',
           'algorithm_x', 'seed_x',
           'model_type', 'algorithm_y',
           'feature_y', 'seed_y', 'z'], axis='columns')
    # Strongest coefficients first, then strongest gene set scores
    .sort_values(by=['abs', 'abs_z_score'], ascending=False)
    .reset_index(drop=True)
)

print(biobombe_df.shape)
biobombe_df.head(20)

(1585, 11)


Unnamed: 0,big_feature_id,abs_z_score,individual_feature,k,signal,weight,abs,gene,variable,value,z_score
0,vae_133_978124_200_signal,10.5085,133,200,signal,0.082262,0.082262,TP53,HALLMARK_ESTROGEN_RESPONSE_EARLY,-3.89942,-10.5085
1,vae_133_978124_200_signal,6.83035,133,200,signal,0.082262,0.082262,TP53,HALLMARK_ESTROGEN_RESPONSE_LATE,-4.346,-6.83035
2,vae_133_978124_200_signal,6.42001,133,200,signal,0.082262,0.082262,TP53,HALLMARK_P53_PATHWAY,-3.46517,-6.42001
3,vae_133_978124_200_signal,6.38283,133,200,signal,0.082262,0.082262,TP53,HALLMARK_COAGULATION,-2.60747,-6.38283
4,vae_133_978124_200_signal,6.15825,133,200,signal,0.082262,0.082262,TP53,HALLMARK_XENOBIOTIC_METABOLISM,-3.64404,-6.15825
5,vae_6_908341_50_signal,13.4192,6,50,signal,0.075826,0.075826,TP53,HALLMARK_TNFA_SIGNALING_VIA_NFKB,-4.42049,-13.4192
6,vae_6_908341_50_signal,10.688,6,50,signal,0.075826,0.075826,TP53,HALLMARK_XENOBIOTIC_METABOLISM,-5.68437,-10.688
7,vae_6_908341_50_signal,7.52623,6,50,signal,0.075826,0.075826,TP53,HALLMARK_COAGULATION,-4.08482,-7.52623
8,vae_6_908341_50_signal,7.41978,6,50,signal,0.075826,0.075826,TP53,HALLMARK_MYC_TARGETS_V1,0.408717,7.41978
9,vae_6_908341_50_signal,6.68337,6,50,signal,0.075826,0.075826,TP53,HALLMARK_KRAS_SIGNALING_UP,-4.51147,-6.68337


In [11]:
# Write the per-feature top gene set table as a tab-separated file
file = os.path.join(
    'results',
    'tcga_tp53_classify_top_biobombe_scores_all_feature_ensemble_model_table.tsv',
)
biobombe_df.to_csv(path_or_buf=file, sep='\t', index=False)

## Detect the highest contributing variables

In [12]:
# Separate the rows by the sign of the model coefficient
neg_biobombe_df = biobombe_df[biobombe_df.weight < 0]
pos_biobombe_df = biobombe_df[biobombe_df.weight > 0]

# Total the coefficient weights per gene set, most extreme totals first
top_neg_variables_df = (
    neg_biobombe_df
    .groupby("variable")['weight']
    .sum()
    .sort_values(ascending=True)
)
top_pos_variables_df = (
    pos_biobombe_df
    .groupby("variable")['weight']
    .sum()
    .sort_values(ascending=False)
)

In [13]:
# Stack the positive and negative per-gene-set totals (a gene set can appear
# once for each sign) and rank them by magnitude
full_result_df = (
    pd.concat([top_pos_variables_df, top_neg_variables_df])
    .to_frame()
    .assign(abs_weight=lambda totals_df: totals_df.weight.abs())
    .sort_values(by='abs_weight', ascending=False)
)

full_result_df.head()

Unnamed: 0_level_0,weight,abs_weight
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
HALLMARK_MYC_TARGETS_V1,1.159295,1.159295
HALLMARK_G2M_CHECKPOINT,1.029499,1.029499
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,-1.028895,1.028895
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,0.977735,0.977735
HALLMARK_E2F_TARGETS,0.946652,0.946652


In [14]:
# Output the per-gene-set aggregate weights for the all feature ensemble model.
# The gene set names are stored in the `variable` index of full_result_df, so
# the index must be written; with index=False the file would contain only the
# weight columns with nothing identifying which gene set each row belongs to.
file = os.path.join('results', 'tcga_tp53_classify_aggregate_biobombe_scores_all_feature_ensemble.tsv')
full_result_df.to_csv(file, sep='\t', index=True)