# SHAP value analysis to identify biomarkers

In [1]:
import pandas as pd
import glob
import os
import sys
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Collect every per-run SHAP table: one `shap_values.csv` per sub-directory of
# python_rf_output. The loading loop below uses each file's parent directory
# name as the drug name.
# NOTE(review): absolute cluster path — the notebook only runs where this
# location is mounted.
python_baseline_files = glob.glob("/scratch/users/nphill22/projects/corsello_lab/20240313_prism_final_reruns/new_baseline/python_rf_output/*/shap_values.csv")

In [3]:
# Build one small table per file: the 20 columns with the largest mean |SHAP|
python_baseline_shap_values = []

for f in tqdm(python_baseline_files):
    # the parent directory name of each shap_values.csv is used as the drug name
    drug_name = os.path.basename(os.path.dirname(f))

    # magnitudes only: signs are discarded before averaging
    df = pd.read_csv(f).abs()

    shap_means = (
        df.mean(axis=0)                  # per-column mean
        .sort_values(ascending=False)    # largest mean first
        .iloc[:20]                       # keep the top 20 features
        .reset_index()                   # column labels -> 'index', means -> 0
        .assign(drug_name=drug_name)     # tag every row with its drug
    )

    python_baseline_shap_values.append(shap_means)

100%|██████████| 6512/6512 [20:56<00:00,  5.18it/s]


In [4]:
# Stack the per-drug top-20 tables into one long frame with columns
# 'index' (feature name), 0 (mean |SHAP|) and 'drug_name'.
df = pd.concat(python_baseline_shap_values)

In [5]:
# Reshape long -> wide: one row per drug_name, one column per feature name
# (the 'index' column produced by reset_index), cells taken from column 0
# (the mean |SHAP|). Drug/feature pairs that never made a top 20 become NaN.
df = df.pivot(index='drug_name', columns='index', values=0)

In [6]:
# A feature outside a drug's top 20 has no entry after the pivot; score it 0.
df = df.fillna(0)

# Analyze the SHAP values

In [17]:
# Embed the drug x feature matrix with UMAP (default settings otherwise).
import umap

# UMAP's optimisation is stochastic: without a fixed seed every kernel restart
# yields a different embedding. Pinning random_state makes the result
# repeatable (umap-learn documents that this disables its multi-threading).
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(df)

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
# Inspect a single drug: take the row labelled 'amg-232_2.5', order its
# features from largest to smallest value, and keep the first 20.
tmp = (
    df.loc['amg-232_2.5']
    .sort_values(ascending=False)
    .iloc[:20]
)
print(tmp)

index
MUT_hs_TP53          0.207149
XPR_MDM2             0.138196
XPR_TP53             0.120102
GE_EDA2R             0.063859
GE_CDKN1A            0.061045
GE_MDM2              0.051188
GE_FDXR              0.025711
GE_SPATA18           0.025352
XPR_PPM1D            0.020095
GE_SESN1             0.011625
PROT_Bax             0.010981
GE_HENMT1            0.009282
miRNA_nmiR00324.1    0.009000
GE_MARCKS            0.008230
GE_KANK1             0.007649
GE_ZNF736            0.007579
GE_KCNMB3            0.007417
GE_RPS27L            0.007180
GE_PTCHD4            0.006504
GE_CEP126            0.006451
Name: amg-232_2.5, dtype: float64


In [7]:
def sort_by_biomarker(df, biomarker_name):
    """Sort rows by every column whose name contains ``biomarker_name``.

    Matching is a plain, case-sensitive substring test against each column
    label, so one query can select several columns; rows are then ordered
    descending by the first matched column, with later matches breaking ties.
    The matched column names are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    biomarker_name : str
        Substring to look for in the column labels.

    Returns
    -------
    pandas.DataFrame
        A sorted copy; the input frame is not modified.
    """
    matching = list(filter(lambda label: biomarker_name in label, df.columns))
    print(matching)
    return df.sort_values(by=matching, ascending=False)

In [8]:
# Rank drugs by the columns whose name contains "TRIM21"; the sorted frame is
# the cell's last expression, so it is displayed (and not stored anywhere).
sort_by_biomarker(df, "TRIM21")

['GE_TRIM21']


index,CNA_AACS,CNA_AASDH,CNA_ABCA10,CNA_ABCC13,CNA_ABHD3,CNA_ABRA,CNA_AC004980.1,CNA_AC005358.1,CNA_AC005481.1,CNA_AC005550.2,...,miRNA_nmiR00790.1,miRNA_nmiR00791.1,miRNA_nmiR00792.1,miRNA_nmiR00793.1,miRNA_nmiR00794.1,miRNA_nmiR00795.1,miRNA_nmiR00797.1,miRNA_nmiR00798.1,miRNA_nmiR00799.1,miRNA_nmiR00800.1
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
prlx-93936_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bms-214662_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rs-56812_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nefopam_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-acetyl-4-methylpiperazine_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
emamectin_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
elvitegravir_2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eluxadoline_2.500010466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eltrombopag_2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Write the drug x feature matrix (row labels = drug_name) to a CSV in the
# kernel's current working directory.
df.to_csv("all_top_shap.csv")

In [None]:
# (scratch cell left empty: a stray `q` was removed — nothing in the visible
# notebook defines that name, so it would raise NameError under Run All)