In [3]:
import sys
sys.path.append("/scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/refract")
import argparse
import logging
import os
import pickle
import sys
import json
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold

from refract.trainers import AutoMLTrainer
from refract.utils import save_output

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from run_training import run

In [5]:
drug_name = 'birinapant'
response_dir = "/scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/responses"
feature_path = "/scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/processed_data/x-all.pkl"
output_dir = "/scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/outputs/small_test"
neighborhood_json = "/scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/refract/notebooks/20240706_feature_selection/neighbors.json"

In [6]:
# create output dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

logger = logging.getLogger(__name__)
logging.basicConfig(level="INFO")

# update logger to write to file
fh = logging.FileHandler(os.path.join(output_dir, "train.log"))
fh.setLevel(logging.INFO)
logger.addHandler(fh)


In [6]:
# load data
logger.info("Loading feature data...")
with open(feature_path, "rb") as f:
    feature_df = pickle.load(f)
feature_df.set_index("ccle_name", inplace=True)
feature_df.fillna(-1, inplace=True)
# drop low variance features
feature_df = feature_df.loc[:, feature_df.var() > 0]

# read the neighborhood json
logger.info("Loading neighborhood data...")
with open(neighborhood_json, "r") as f:
    neighborhood_dict = json.load(f)
    # get similar drugs from the neighborhood
similar_drugs = neighborhood_dict[drug_name]

INFO:__main__:Loading feature data...
  feature_df = pickle.load(f)
INFO:__main__:Loading neighborhood data...


In [7]:
logger.info("Loading response data...")
response_files = glob.glob(os.path.join(response_dir, "*.csv"))
response_data = {}
for response_file in response_files:
    response_name = os.path.basename(response_file).replace(".csv", "")
    response_data[response_name] = pd.read_csv(response_file)
# for every one, select LFC.CB, pert_name, ccle_name
response_data = {k: v.loc[:, ["LFC.cb", "pert_name", "ccle_name"]] for k, v in response_data.items()}
# concatenate all responses
response_data = pd.concat(response_data.values(), axis=0)
# drop duplicates on pert_name, ccle_name
response_data = response_data.drop_duplicates(subset=["pert_name", "ccle_name"])
# pivot so ccle_name is the columns and pert_name is the index
response_data = response_data.pivot(index="pert_name", columns="ccle_name", values="LFC.cb")    # get responses for all these
cluster_responses = response_data.loc[response_data.index.isin(similar_drugs), :]
# set columns as str
cluster_responses.columns = cluster_responses.columns.astype(str)
# drop column named nan
cluster_responses = cluster_responses.drop("nan", axis=1)
# transpose cluster_responses
cluster_responses = cluster_responses.T
# fill NaN with 0
cluster_responses = cluster_responses.fillna(0)
# melt cluster responses
cluster_responses = cluster_responses.reset_index().melt(id_vars="ccle_name", var_name="pert_name", value_name="LFC.cb")
# set ccle_name as index
cluster_responses = cluster_responses.set_index("ccle_name")


INFO:__main__:Loading response data...


In [7]:
cluster_responses

Unnamed: 0_level_0,pert_name,LFC.cb
ccle_name,Unnamed: 1_level_1,Unnamed: 2_level_1
143B_BONE,ar-42,0.000000
1618K_TESTIS,ar-42,0.000000
1777NRPMET_TESTIS,ar-42,0.000000
22RV1_PROSTATE,ar-42,-3.698678
2313287_STOMACH,ar-42,-4.350232
...,...,...
YD8_UPPER_AERODIGESTIVE_TRACT,lcl-161,1.255041
YH13_CENTRAL_NERVOUS_SYSTEM,lcl-161,-2.547366
YKG1_CENTRAL_NERVOUS_SYSTEM,lcl-161,0.774973
YSCCC_BILIARY_TRACT,lcl-161,0.000000


In [8]:
from refract.utils import get_fold_assignment

In [9]:
logger.info("Preparing for training...")
fold_assignment = get_fold_assignment(cluster_responses, drug_name)
cluster_responses = cluster_responses.loc[cluster_responses.index.isin(fold_assignment.keys()), :]
# merge all
df_all = cluster_responses.merge(feature_df, left_index=True, right_index=True, how='inner')
feature_cols = feature_df.columns
label_cols = cluster_responses.columns
df_all["fold"] = df_all.index.map(fold_assignment)

INFO:__main__:Preparing for training...


In [10]:
from sklearn.model_selection import GroupKFold

In [11]:
# START CV TRAIN
logger.info("Training...")
X_all = df_all.loc[:, feature_cols]
y_all = df_all.loc[:, label_cols]
groups = df_all["fold"]
outer_cv = GroupKFold(n_splits=10)
trainers = []
for i, (train_index, test_index) in enumerate(outer_cv.split(X_all, y_all, groups)):
    logger.info(f"Training fold {i}")
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]        
    # train one fold
    trainer = AutoMLTrainer(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        feature_cols=feature_cols,
        drug_name=drug_name, 
        fold_assignment=fold_assignment
    )
    break

INFO:__main__:Training...
INFO:__main__:Training fold 0


In [12]:
from refract.utils import get_correlated_features

In [13]:
trainer.select_features()

In [14]:
trainer.train()

[flaml.automl.logger: 07-07 23:55:45] {1680} INFO - task = regression
[flaml.automl.logger: 07-07 23:55:45] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 07-07 23:55:45] {1789} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 07-07 23:55:45] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lgbm']
[flaml.automl.logger: 07-07 23:55:45] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 07-07 23:55:47] {2345} INFO - Estimated sufficient time budget=12003s. Estimated necessary time budget=12s.
[flaml.automl.logger: 07-07 23:55:47] {2392} INFO -  at 1.2s,	estimator xgboost's best error=0.9781,	best estimator xgboost's best error=0.9781
[flaml.automl.logger: 07-07 23:55:47] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 07-07 23:55:49] {2392} INFO -  at 3.2s,	estimator lgbm's best error=0.9946,	best estimator xgboost's best error=0.9781
[flaml.automl.logger: 07-07 23:55:49] {2219} INFO - iteration 2, c

In [15]:
trainers = [trainer]

In [17]:
save_output(trainers, output_dir)

INFO:refract.utils:Aggregating SHAP values and predictions...
INFO:refract.utils:Saving training results to train_results.csv...
INFO:refract.utils:Plotting scatterplot to train_results.png...
INFO:refract.utils:Computing pearson correlation...
INFO:refract.utils:Overall test correlation: 0.2964903294059182
INFO:refract.utils:Saving SHAP summary plot to shap_summary_plot.png...
INFO:refract.utils:Getting top features...
INFO:refract.utils:Saving the SHAP values and top feature names...
INFO:refract.utils:Getting network interactions...
INFO:refract.utils:Saving training details...


In [18]:
# Lets configure some runs
test_cmd = f"python run_training.py --drug_name {drug_name} --response_dir {response_dir} --feature_path {feature_path} --output_dir {output_dir} --neighborhood_json {neighborhood_json}"

In [19]:
test_cmd

'python run_training.py --drug_name birinapant --response_dir /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/responses --feature_path /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/processed_data/x-all.pkl --output_dir /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/outputs/small_test --neighborhood_json /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/refract/notebooks/20240706_feature_selection/neighbors.json'

In [None]:
python run_training.py --drug_name birinapant --response_dir /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/responses --feature_path /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/data/processed_data/x-all.pkl --output_dir /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/outputs/small_test_end_to_end --neighborhood_json /scratch/users/nphill22/projects/corsello_lab/20240706_retrain_feature_select/refract/notebooks/20240706_feature_selection/neighbors.json

In [7]:
import pandas as pd
import numpy as np
import os
import sys

In [8]:
logger.info("Loading response data...")
response_files = glob.glob(os.path.join(response_dir, "*.csv"))
response_data = {}
for response_file in response_files:
    response_name = os.path.basename(response_file).replace(".csv", "")
    response_data[response_name] = pd.read_csv(response_file)
# for every one, select LFC.CB, pert_name, ccle_name
response_data = {k: v.loc[:, ["LFC.cb", "pert_name", "ccle_name"]] for k, v in response_data.items()}
# concatenate all responses
response_data = pd.concat(response_data.values(), axis=0)
# drop duplicates on pert_name, ccle_name
response_data = response_data.drop_duplicates(subset=["pert_name", "ccle_name"])
# pivot so ccle_name is the columns and pert_name is the index
response_data = response_data.pivot(index="pert_name", columns="ccle_name", values="LFC.cb")    # get responses for all these


INFO:__main__:Loading response data...


In [12]:
perts = list(set(response_data.index.values))

In [14]:
# take a random sample of 100
rand_perts = np.random.choice(perts, 100, replace=False)

In [17]:
# save to file
with open("med_test_drugs.txt", "w") as f:
    for pert in rand_perts:
        f.write(pert + "\n")

# Read the PRISM output and get a sampling of a range of pearson score drugs

In [9]:
model_out = glob.glob("/scratch/users/nphill22/projects/corsello_lab/20240313_prism_final_reruns/new_baseline/output/*/Model_table.csv")

In [10]:
prism_perf = pd.concat([pd.read_csv(f) for f in model_out], axis=0)

In [12]:
prism_perf = prism_perf.loc[prism_perf.model == "all", :]

In [14]:
prism_perf

Unnamed: 0,MSE,MSE.se,R2,PearsonScore,pert_mfc_id,pert_name,pert_idose,model
0,0.535591,0.034252,0.031246,0.203915,BRD:BRD-K49522529-001-02-9,alfacalcidol,2.530000,all
0,0.138923,0.010593,0.002905,0.149334,BRD:BRD-K94379058-001-06-8,bml-190,2.500000,all
0,0.091434,0.006196,-0.065392,0.034882,BRD:BRD-K57427145-001-01-6,ripazepam,2.500000,all
0,0.189311,0.016263,-0.077014,-0.030325,BRD:BRD-K80315159-051-02-0,dppe,2.500000,all
0,0.290285,0.018736,-0.034425,0.074175,BRD:BRD-K34073885-001-09-3,tanshinone-i,2.540000,all
...,...,...,...,...,...,...,...,...
0,0.096832,0.008312,-0.083503,0.048577,BRD:BRD-K21586122-001-01-9,fimasartan,2.500008,all
0,1.106020,0.082285,0.107805,0.325976,BRD:BRD-K24715592-406-02-1,ethacridine-lactate-monohydrate,2.530000,all
0,0.072736,0.006479,-0.038580,0.071842,BRD:BRD-K53438416-001-01-9,yz9,2.500000,all
0,0.240671,0.017085,0.029977,0.188852,BRD:BRD-K00003370-001-01-9,cadazolid,2.500000,all


In [15]:
# assign decile
prism_perf['decile'] = pd.qcut(prism_perf['PearsonScore'], 10, labels=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prism_perf['decile'] = pd.qcut(prism_perf['PearsonScore'], 10, labels=False)


In [17]:
# randomly sample 10 per decile
rand_perts = []
for i in range(10):
    decile_perts = prism_perf.loc[prism_perf.decile == i, "pert_name"].values
    rand_perts.extend(np.random.choice(decile_perts, 10, replace=False))

In [21]:
# verify these are in the response data
checks = []
for pert in rand_perts:
    checks.append(pert in response_data.index.values)

In [25]:
# any false?
all(checks)

True

In [27]:
# save to file
with open("sampled_drugs.txt", "w") as f:
    for pert in rand_perts:
        f.write(pert + "\n")