Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ameyabhope committed Jul 23, 2019
1 parent 8add112 commit 6ab3a1b
Show file tree
Hide file tree
Showing 6 changed files with 317 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Data/gdsc_dr.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Data/gdsc_tissue_by_sample_2.csv

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions Data/tcga_dr.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Data/tcga_tissue_by_sample2.csv

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions Results/GDSC_TCGA_lasso_tissue.csv

Large diffs are not rendered by default.

243 changes: 243 additions & 0 deletions lasso_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
import argparse
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
import os
import itertools
from multiprocessing import Pool
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import mean_squared_error

# CSV files names.
#TRAIN_GENE_EXPRESSION = 'gdsc_expr_postCB.csv'
#TEST_GENE_EXPRESSION = 'tcga_expr_postCB.csv'
#TRAIN_DRUG_RESPONSE = 'gdsc_dr.csv'
#TEST_DRUG_RESPONSE = 'tcga_dr.csv'
#
#GDSC_TISSUE_FNAME = 'gdsc_tissue_by_sample_2.csv'
#TCGA_TISSUE_FNAME = 'tcga_tissue_by_sample2.csv'
#
#OUTPUT_FNAME = 'GDSC_TCGA_lasso_tissue.csv'


def parse_args():
"""
Parse the arguments.
Parse the command line arguments/options using the argparse module
and return the parsed arguments (as an argparse.Namespace object,
as returned by argparse.parse_args()).
Returns:
argparse.Namespace: the parsed arguments
"""
parser = argparse.ArgumentParser(description='Arguments for running TG_LASSO')
parser.add_argument('-id', '--input_directory', default='./Data', help = 'Address of the directory containing the input files')
parser.add_argument('-od', '--output_directory', default = './Results', help = 'output directory adddress')
parser.add_argument('-trg','--train_gene_expression', type = str, help = 'name of the input gene expression file from GDSC', default = 'gdsc_expr_postCB.csv')
parser.add_argument('-teg','--test_gene_expression',type = str,
help = 'name of the gene expression file from TCGA to be tested',default = 'tcga_expr_postCB.csv')
parser.add_argument('-trd','--train_drug_response', type = str,
help = 'name of the input drug response file from GDSC',default = 'gdsc_dr.csv')
parser.add_argument('-ted','--test_drug_response', type = str,
help = 'name of the drug response file from TCGA to be tested',default = 'tcga_dr.csv')
parser.add_argument('-gt','--gdsc_tissue', type = str,
help = 'name of the input gdsc tissue by sample file',default ='gdsc_tissue_by_sample_2.csv')
parser.add_argument('-tt','--tcga_tissue', type = str,
help = 'name of the input tcga tissue by sample file',default ='tcga_tissue_by_sample2.csv')
parser.add_argument('-of','--output_file', type = str,
help = 'name of the output file',default = 'GDSC_TCGA_lasso_tissue.csv')

args = parser.parse_args()
return args

def get_drug_lasso_alpha(train_expr, train_resp):
"""
Trains a lasso CV for each drug, and returns dictionary.
Key: drug name -> str
Value: alpha -> float
"""

drug_alpha_dct = {}


X_train = train_expr.values.T
# Train a lasso CV for each drug.
with warnings.catch_warnings():
for drug in train_resp.index.values:
y_train_tmp = train_resp.loc[drug].values
not_nan_ind = ~np.isnan(y_train_tmp)
y_train_tmp = y_train_tmp[not_nan_ind]
X_train_tmp = X_train[not_nan_ind,:]

warnings.filterwarnings("ignore")
reg_model = LassoCV(n_jobs=-1)

reg_model.fit(X_train_tmp, y_train_tmp)

drug_alpha_dct[drug] = reg_model.alpha_

return drug_alpha_dct

def fit_model(*args):

X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp, alpha = args[0]
reg_model = Lasso(alpha=alpha, random_state=0)
reg_model.fit(X_train_tmp, y_train_tmp)

y_pred = reg_model.predict(X_test_tmp)
rmse = mean_squared_error(y_test_tmp, y_pred)
return rmse

def tune_alpha(train_expr_non_tissue, train_resp_non_tissue, train_expr_tissue,
train_resp_tissue, all_tissue_drug_alpha_dct):
"""
Given the training expression and training response as well as a tissue
type, find the alpha that trains on all other tissue types and predicts on
the given tissue type.
"""

drug_alpha_dct = {}

X_train = train_expr_non_tissue.values.T
X_test = train_expr_tissue.values.T
# Get the best alpha for each drug.
for drug in train_resp_non_tissue.index.values:
# Get the training set.
y_train_tmp = train_resp_non_tissue.loc[drug].values
not_nan_ind = ~np.isnan(y_train_tmp)
y_train_tmp = y_train_tmp[not_nan_ind]
X_train_tmp = X_train[not_nan_ind,:]

# Get the validation set.
y_test_tmp = train_resp_tissue.loc[drug].values
not_nan_ind = ~np.isnan(y_test_tmp)
y_test_tmp = y_test_tmp[not_nan_ind]
X_test_tmp = X_test[not_nan_ind,:]

# If there are no non-nan samples for this tissue in GDDSC, then use the
# alpha from training on all of GDSC.
if X_test_tmp.shape[0] == 0:
drug_alpha_dct[drug] = all_tissue_drug_alpha_dct[drug]
continue

# Initialize the alpha dictionary for the current drug.
best_alpha_dct = {}
pool = Pool(processes=24)
alpha_space = np.logspace(-2, -1, 100)
# Check which alpha yields the highest performance.
rmse_lst = pool.map(fit_model, zip(itertools.repeat(X_train_tmp),
itertools.repeat(y_train_tmp), itertools.repeat(X_test_tmp),
itertools.repeat(y_test_tmp), alpha_space))
for i, e in enumerate(alpha_space):
best_alpha_dct[e] = rmse_lst[i]
pool.close()
pool.join()
# Get the alpha corresponding to the lowest rmse.
drug_alpha_dct[drug] = min(best_alpha_dct, key=best_alpha_dct.get)
return drug_alpha_dct

def main():

args = parse_args()

train_gene_expression = os.path.join(args.input_directory,args.train_gene_expression)
address_out_dir = args.output_directory
if not os.path.exists(address_out_dir):
os.makedirs(address_out_dir)
test_gene_expression = os.path.join(args.input_directory,args.test_gene_expression)
train_drug_response = os.path.join(args.input_directory,args.train_drug_response)
test_drug_response = os.path.join(args.input_directory,args.test_drug_response)
gdsc_tissue = os.path.join(args.input_directory,args.gdsc_tissue)
tcga_tissue = os.path.join(args.input_directory,args.tcga_tissue)

output_file = os.path.join(args.output_directory,args.output_file)
print('Importing done successfully')

# Read CSV files.
train_expr = pd.read_csv(train_gene_expression, index_col=0)
test_expr = pd.read_csv(test_gene_expression, index_col=0)
train_resp = pd.read_csv(train_drug_response, index_col=0)
test_resp = pd.read_csv(test_drug_response, index_col=0)

# Get binary matrices indicating sample-tissue membership.
gdsc_tissue = pd.read_csv(gdsc_tissue, index_col=0)
tcga_tissue = pd.read_csv(tcga_tissue, index_col=0)

all_tissue_drug_alpha_dct = get_drug_lasso_alpha(train_expr, train_resp)


# This is the drug response dictionary. Keys are (drug, sample) pairs.
predicted_dr_dct = {}


# Loop through the TCGA tissue types. We predict on all of them.
for tissue in tcga_tissue.index.values:
# If tissue in GDSC, train on GDSC samples in tissue.
if tissue in gdsc_tissue.index.values:
gdsc_tissue_samples = gdsc_tissue.loc[tissue]
gdsc_tissue_samples = gdsc_tissue_samples.iloc[gdsc_tissue_samples.to_numpy().nonzero()].index.values
# Get the samples corresponding to the tissue.
train_expr_tissue = train_expr[gdsc_tissue_samples]
train_resp_tissue = train_resp[gdsc_tissue_samples]
# Get all samples excluding current tissue.
train_expr_non_tissue = train_expr.drop(gdsc_tissue_samples, axis=1)
train_resp_non_tissue = train_resp.drop(gdsc_tissue_samples, axis=1)
# Find the best alpha with the tissue samples as the validation set.
drug_alpha_dct = tune_alpha(train_expr_non_tissue, train_resp_non_tissue,
train_expr_tissue, train_resp_tissue, all_tissue_drug_alpha_dct)
# Otherwise, train on all GDSC samples.
else:
# This dictionary is trained from lasso_tissue.py
drug_alpha_dct = all_tissue_drug_alpha_dct
# Train on all GDSC samples rather than just the tissue.
train_expr_tissue = train_expr
train_resp_tissue = train_resp

# Do the same for TCGA.
tcga_tissue_samples = tcga_tissue.loc[tissue]
tcga_tissue_samples = tcga_tissue_samples.iloc[tcga_tissue_samples.to_numpy().nonzero()].index.values

test_expr_tissue = test_expr[tcga_tissue_samples]
test_resp_tissue = test_resp[tcga_tissue_samples]

# Go through the drug names.
for drug in list(train_resp.index):
print("Predicting the response of '%s' tissue for '%s' drug"%(tissue, drug))
y_train_tmp = train_resp_tissue.loc[drug].values
not_nan_ind = ~np.isnan(y_train_tmp)
y_train_tmp = y_train_tmp[not_nan_ind]
X_train_tmp = train_expr_tissue.values.T[not_nan_ind,:]

# Remake the training set to the entire set if the tissue has no non-nan values.
if X_train_tmp.shape[0] == 0:
y_train_tmp = train_resp.loc[drug].values
not_nan_ind = ~np.isnan(y_train_tmp)
y_train_tmp = y_train_tmp[not_nan_ind]
X_train_tmp = train_expr.values.T[not_nan_ind,:]


# Use the alpha learned from training on all samples.
clf = Lasso(alpha=drug_alpha_dct[drug], random_state=0)
clf.fit(X_train_tmp, y_train_tmp)

# Write out the tissue lasso weights.
with open('%s_%s_lasso_weights' % (tissue, drug), 'wb') as fp:
pickle.dump(clf.coef_, fp)

# Predict on the test values.
y_test_hat_tmp = clf.predict(test_expr_tissue.values.T)
for i, ic50 in enumerate(y_test_hat_tmp):
predicted_dr_dct[(drug, tcga_tissue_samples[i])] = ic50


# Unstack the dictionary into a dataframe.
dr_matrix = pd.Series(predicted_dr_dct).unstack()
dr_matrix = dr_matrix[test_resp.columns]
# Write out the results.
dr_matrix.to_csv(output_file)

if __name__ == '__main__':
main()

0 comments on commit 6ab3a1b

Please sign in to comment.