update: 20230810;20241009

In [1]:
import numpy as np
import pandas as pd
import os, pickle
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
from sklearn.metrics import classification_report, confusion_matrix

# import tensorflow as tf # don't do this!!!
# from tensorflow import keras
from tensorflow.keras import layers, regularizers, utils, losses, optimizers
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import linalg, random

pd.set_option("display.precision", 3)

2024-10-09 14:02:21.115585: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# load model

model = load_model("../data_temp/out_model/model_20230810.mod") # main2_v1

2024-10-09 14:02:32.125013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# gdsc dict used for model training
# feature orders in TCGA should be the same as gdsc

with open('../data_temp/gdsc_dict.dict', 'rb') as f:
    gdsc_dict = pickle.load(f)

# TCGA data transfer

## data.load

In [4]:
# loada data from main1_v2

response_tcga_v2 = pd.read_csv("../data_temp/response_tcga_v2.csv")
pheno_tcga_v2 = pd.read_csv("../data_temp/pheno_tcga_v2.csv")
pheno_tcga_v2_dummy = pd.read_csv("../data_temp/pheno_tcga_v2_dummy.csv", index_col = 0)
snv_tcga_v1 = pd.read_csv("../data_temp/snv_tcga_v1.csv", index_col = 0)
tpm_tcga_v2 = pd.read_csv("../data_temp/tpm_tcga_v2.csv", index_col = 0)

df_drug = pd.read_csv("../data_temp/df_fingerprints_18drugs.csv", index_col = 0)

In [5]:
response_tcga_v2.head(2)

Unnamed: 0,Cancer,bcr_patient_barcode,drug_name,DrugBank ID,measure_of_response,days_to_drug_therapy_start,days_to_drug_therapy_end,days_to_initial_pathologic_diagnosis,method_of_sample_procurement,days_to_sample_procurement,days_to_new_tumor_event_after_initial_treatment,additional_pharmaceutical_therapy,new_tumor_event_additional_surgery_procedure,history_of_neoadjuvant_treatment
0,Breast invasive carcinoma (BRCA),TCGA-E9-A1NG,Tamoxifen,DB00675,Complete Response,34,786,0.0,Modified Radical Mastectomy,0.0,,,,No
1,Breast invasive carcinoma (BRCA),TCGA-BH-A2L8,Cyclophosphamide,DB00531,Complete Response,136,197,0.0,Simple Mastectomy,70.0,,,,No


## data.transform

In [6]:
# Question: DO drug_order in TCGA must be the same with that in gdsc?
drug_order_gdsc = gdsc_dict["y_gdsc"]['SIDM00003'].index
print(f"drug_order_gdsc.shape: {drug_order_gdsc.shape}")
print(f"drug_order_gdsc: {drug_order_gdsc}")
# Answer: I don't think so. the "id_sampel AND drug" pair is treated as unique id.

drug_order_gdsc.shape: (18,)
drug_order_gdsc: Index(['Bicalutamide', 'Cisplatin', 'Cyclophosphamide', 'Dasatinib',
       'Docetaxel', 'Doxorubicin', 'Erlotinib', 'Etoposide', 'Fluorouracil',
       'Gemcitabine', 'Oxaliplatin', 'Paclitaxel', 'Pazopanib', 'Sorafenib',
       'Tamoxifen', 'Temozolomide', 'Temsirolimus', 'Vinorelbine'],
      dtype='object')


In [7]:
# y_tcga: reponse
    
response_tcga_v2["response_status"] = np.select(
    [response_tcga_v2.measure_of_response == "Complete Response",
     response_tcga_v2.measure_of_response == "Partial Response",
     response_tcga_v2.measure_of_response == "Stable Disease",
     response_tcga_v2.measure_of_response == "Clinical Progressive Disease"],
    ["CR", "PR", "SD", "PD"]
)
print(response_tcga_v2.response_status.value_counts())


y_tcga = response_tcga_v2.loc[:, ["bcr_patient_barcode", "drug_name", "response_status"]]
y_tcga = pd.pivot_table(data = y_tcga,
                        index = "bcr_patient_barcode", 
                     columns = "drug_name", 
                     values = "response_status",
                     aggfunc = lambda x: x.unique()[0],
                     dropna = False,
                       fill_value = "No_DATA")
print(f"y_tcga.shape: {y_tcga.shape}")

# stat
drug_sum = y_tcga.apply(lambda x: (~x.isna()).sum())
pt_sum = y_tcga.apply(lambda x: (~x.isna()).sum(), axis = 1)
# double check
dc1 = (pt_sum == 1).all()
print(dc1)

# STACK
y_tcga = y_tcga.stack()
y_tcga.name = "response_status"
print(f"y_tcga.shape: {y_tcga.shape}")

CR    55
PD    35
SD     8
PR     7
Name: response_status, dtype: int64
y_tcga.shape: (105, 13)
False
y_tcga.shape: (1365,)


In [8]:
# TRANSFORM: snv, tpm, pheno
# extract the order of sample and drug
order_sample = y_tcga.index.get_level_values(0)
order_drug = y_tcga.index.get_level_values(1)
order_gene = gdsc_dict['x_snv'].columns
order_pheno = gdsc_dict['x_pheno'].columns
order_drug_features = gdsc_dict['x_drug'].columns

# x_snv
x_snv = snv_tcga_v1.loc[order_sample, order_gene]
print(f"x_snv.shape: {x_snv.shape}")

# gep
x_gep = tpm_tcga_v2.loc[order_sample, order_gene]
print(f"x_gep.shape: {x_gep.shape}")

# pheno
x_pheno = pheno_tcga_v2_dummy.loc[order_sample, order_pheno]
print(f"x_pheno.shape: {x_pheno.shape}")

# drug
x_drug = df_drug.loc[order_drug, order_drug_features]
print(f"x_drug.shape: {x_drug.shape}")

x_snv.shape: (1365, 776)
x_gep.shape: (1430, 776)
x_pheno.shape: (1365, 15)
x_drug.shape: (1365, 512)


In [9]:
x_pheno.head(2)

Unnamed: 0_level_0,Age,Sex_Female,Sex_Male,cancerType_TCGA_BRCA,cancerType_TCGA_COAD,cancerType_TCGA_ESCA,cancerType_TCGA_GBM,cancerType_TCGA_HNSC,cancerType_TCGA_KIRC,cancerType_TCGA_LUAD,cancerType_TCGA_OV,cancerType_TCGA_PAAD,cancerType_TCGA_SARC,cancerType_TCGA_SCLC,cancerType_TCGA_SKCM
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TCGA-2L-AAQA,76.0,0,1,0,0,0,0.0,0,0,0,0.0,1,0,0.0,0.0
TCGA-2L-AAQA,76.0,0,1,0,0,0,0.0,0,0,0,0.0,1,0,0.0,0.0


In [10]:
# save dict

tcga_dict = {
    "y_tcga": y_tcga,
    "x_pheno": x_pheno,
    "x_snv": x_snv,
    "x_gep": x_gep,
    "x_drug": x_drug
}

# save
with open('../data_temp/tcga_dict.dict', 'wb') as f:
    pickle.dump(tcga_dict, f)

# Model testing in TCGA

In [13]:
xs = ['x_pheno', 'x_snv', 'x_gep', 'x_drug']
xs = [tcga_dict[e].astype('float64').to_numpy() for e in xs]
y_pred_tcga = model.predict(xs, batch_size = 64)



In [15]:
tcga_dict['y_tcga'].reset_index().head(2)

Unnamed: 0,bcr_patient_barcode,drug_name,response_status
0,TCGA-2L-AAQA,Cisplatin,No_DATA
1,TCGA-2L-AAQA,Cyclophosphamide,No_DATA


In [23]:
# y_tcga_anno
ytrue_ypred_tcga = pd.concat([
    tcga_dict['y_tcga'].reset_index(),
    pd.Series(y_pred_tcga.flatten(), name = "ypred")], axis = 1)
print(f"ytrue_ypred_tcga.shape: {ytrue_ypred_tcga.shape}")

cols = ['bcr_patient_barcode', 'drug_name', 'response_status', 'Age', 'Sex', 'cancerType_TCGA', 'ypred']
ytrue_ypred_tcga = pd.merge(ytrue_ypred_tcga,
                      pheno_tcga_v2,
                       left_on = "bcr_patient_barcode",
                       right_on = "Patient ID",
                       how = "inner")
ytrue_ypred_tcga = ytrue_ypred_tcga.loc[:, cols]
print(f"y_tcga_anno.shape: {y_tcga_anno.shape}")

ytrue_ypred_tcga.shape: (1365, 4)
y_tcga_anno.shape: (1365, 104)


In [24]:
ytrue_ypred_tcga.head()

Unnamed: 0,bcr_patient_barcode,drug_name,response_status,Age,Sex,cancerType_TCGA,ypred
0,TCGA-2L-AAQA,Cisplatin,No_DATA,76.0,Male,PAAD,0.724
1,TCGA-2L-AAQA,Cyclophosphamide,No_DATA,76.0,Male,PAAD,0.949
2,TCGA-2L-AAQA,Docetaxel,No_DATA,76.0,Male,PAAD,-0.846
3,TCGA-2L-AAQA,Doxorubicin,No_DATA,76.0,Male,PAAD,-0.731
4,TCGA-2L-AAQA,Erlotinib,No_DATA,76.0,Male,PAAD,0.694


In [25]:
df_ypred_byDrug_tcga = ytrue_ypred_tcga.pivot_table(
    values = "ypred", 
    index = "bcr_patient_barcode", 
    columns = "drug_name")
df_ypred_byDrug_tcga.head()

drug_name,Cisplatin,Cyclophosphamide,Docetaxel,Doxorubicin,Erlotinib,Etoposide,Fluorouracil,Gemcitabine,Oxaliplatin,Paclitaxel,Pazopanib,Tamoxifen,Temsirolimus
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TCGA-2L-AAQA,0.724,0.949,-0.846,-0.731,0.694,0.281,0.955,-0.183,0.854,-0.846,0.609,0.809,0.099
TCGA-2L-AAQE,0.522,0.98,-0.954,-0.881,0.561,0.007,0.872,-0.449,0.74,-0.954,0.546,0.76,-0.065
TCGA-2L-AAQI,0.73,0.945,-0.84,-0.723,0.7,0.291,0.955,-0.171,0.855,-0.84,0.61,0.806,0.105
TCGA-2L-AAQJ,0.701,0.945,-0.852,-0.742,0.703,0.258,0.941,-0.199,0.829,-0.852,0.606,0.783,0.086
TCGA-3A-A9I9,0.728,0.943,-0.839,-0.724,0.702,0.29,0.953,-0.168,0.85,-0.839,0.603,0.795,0.097


In [26]:
x1 = ytrue_ypred_tcga.pivot_table(
    values = "response_status", 
    aggfunc = lambda x: np.unique(x)[0],
    index = "bcr_patient_barcode", 
    columns = "drug_name")
x1.head()

drug_name,Cisplatin,Cyclophosphamide,Docetaxel,Doxorubicin,Erlotinib,Etoposide,Fluorouracil,Gemcitabine,Oxaliplatin,Paclitaxel,Pazopanib,Tamoxifen,Temsirolimus
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TCGA-2L-AAQA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,PD,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA
TCGA-2L-AAQE,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,PD,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA
TCGA-2L-AAQI,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,PD,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA
TCGA-2L-AAQJ,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,PD,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA
TCGA-3A-A9I9,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA,SD,No_DATA,No_DATA,No_DATA,No_DATA,No_DATA


In [27]:
df_ypred_byDrug_tcga.apply(lambda x: df_ypred_byDrug_tcga.columns[np.argmin(x)], axis = 1).value_counts()

Docetaxel      85
Doxorubicin    20
dtype: int64

In [28]:
# save
ytrue_ypred_tcga.to_csv("../data_temp/ytrue_ypred_tcga.csv", index = False)
df_ypred_byDrug_tcga.to_csv("../data_temp/df_ypred_byDrug_tcga.csv", index = True)