# Docetaxel Sensitivity: Machine Learning using Gene Expression
---------------------------

In [18]:
import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset

# Connect to the Azure ML workspace described by the local Azure ML configuration.
# `ws` is reused by the dataset lookup, the compute-target lookup and the experiment below.
ws = Workspace.from_config()

In [19]:
## Get Sensitivity Data
# Look up the dataset registered in the workspace under the name 'docetaxel_sensitivity'.
# The dataset handle is kept in `doce_data` because the HyperDrive script config passes it as a named input.
doce_data = Dataset.get_by_name(ws, name='docetaxel_sensitivity')
# Materialise the dataset as a pandas DataFrame; `doce_data_df` is what gets exported to CSV for train.py.
doce_data_df = doce_data.to_pandas_dataframe()
# Last expression of the cell: rendered as a (truncated) table preview.
doce_data_df

Unnamed: 0,L10_IC_50,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,ARHGAP11B,AC004593.2,AC090517.4,AL160269.1,ABCF2-H2BE1,POLR2J3,H2BE1,AL445238.1,GET1-SH3BGR,AC113348.1
0,-1.664372,2.643856,0.0,6.219556,3.418190,4.659925,0.014355,0.111031,5.846243,7.057883,...,2.111031,0.056584,0.042644,0.056584,1.124328,5.069960,0.0,0.000000,1.803227,0.0
1,-2.265796,2.985500,0.0,6.778734,4.130931,3.778209,0.000000,0.298658,7.433794,6.689299,...,1.735522,0.344828,0.367371,0.124328,2.192194,4.358959,0.0,0.042644,0.097611,0.0
2,-2.194771,4.574707,0.0,6.632414,1.937344,3.401903,0.028569,0.575312,5.775577,3.320485,...,2.477677,1.220330,0.111031,0.000000,2.841973,3.615887,0.0,0.000000,1.189034,0.0
3,-2.816851,5.868637,0.0,6.636045,2.046142,4.996389,0.176323,1.655352,6.200457,3.498251,...,0.773996,0.028569,0.137504,0.070389,1.176323,6.022812,0.0,0.000000,0.056584,0.0
4,-2.370916,5.026800,0.0,6.966130,1.899176,3.531069,0.000000,3.910733,6.371385,4.693208,...,1.952334,0.042644,0.042644,0.042644,3.104337,5.934281,0.0,0.000000,0.378512,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,-2.450142,0.000000,0.0,6.231125,2.684819,4.898208,3.026800,0.028569,3.347666,3.772941,...,2.533563,0.000000,0.056584,0.000000,1.906891,5.387500,0.0,0.056584,0.400538,0.0
677,-0.244988,5.649328,0.0,6.674828,3.085765,3.790772,0.226509,3.878725,6.167519,5.101818,...,2.084064,0.201634,0.000000,0.000000,2.375735,4.361768,0.0,0.000000,0.333424,0.0
678,-2.664806,4.863938,0.0,5.977967,2.553361,3.772941,0.275007,7.178814,6.867279,3.959770,...,3.310340,0.000000,0.000000,0.000000,3.748461,6.263973,0.0,0.000000,0.111031,0.0
679,-3.034370,5.914086,0.0,6.749668,2.809414,4.175525,0.176323,5.859224,6.535275,4.598127,...,2.263034,0.163499,0.097611,0.495695,3.379898,6.094869,0.0,0.000000,1.028569,0.0


In [20]:
# Export the data next to train.py so it is uploaded with the source-directory snapshot.
# index=False: the pandas row index is not data; writing it would add an unnamed first
# column that pd.read_csv() in train.py re-reads as a spurious 'Unnamed: 0' column.
doce_data_df.to_csv('docetaxel_sensitivity.csv', index=False)

## Create Training Script

In [21]:
%%writefile train.py
## Import libraries
import argparse
import joblib
import os
from azureml.core import Dataset, Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

## Set the input parameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_estimators", type=int, dest='n_estimators', help='Number of gradient boosted trees.')
parser.add_argument("--max_depth", type=float, dest='max_depth', help='Maximum tree depth for base learners.')
parser.add_argument("--booster", type=str, dest='booster', help='Boosting method.')
args = parser.parse_args()

n_estimators = args.n_estimators
max_depth = int(args.max_depth)
booster = args.booster

## Get the experiment run context
run = Run.get_context()
ws = run.experiment.workspace

## Log run options
run.log('n_estimators', str(n_estimators))
run.log('max_depth', str(max_depth))
run.log('booster', str(booster))

## load the dataset(s)
print("Loading Data...")
# dataset = run.input_datasets['doce_data'].to_pandas_dataframe() # Get the training data from the estimator input
dataset = pd.read_csv('docetaxel_sensitivity.csv')
run.log("Input data shape: ", str(dataset.shape))


## Separate features and labels
X = dataset[dataset.columns[1:]]
y = dataset[['L10_IC_50']].values

## Get X as numpy array
X = X.values

## Scale the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)


## Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25, 
    random_state = 1337
)

## Initialize Algorithm
xgb = XGBRegressor(objective ='reg:linear',
                   n_estimators = n_estimators,
                   max_depth = max_depth,
                   booster = booster,
                   seed = 1337)

## Train Model w/ Cross-Validation (5-fold)
scores = cross_validate(xgb,
                        X_train, y_train,
                        cv=5, scoring=('r2'),
                        return_estimator=True)

model = scores['estimator'][0]

## Score data
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

## Log model performance
run.log('r2', r2_score(y_train, y_train_hat))
run.log('r2_test', r2_score(y_test, y_test_hat))

## Note: Files saved in the outputs folder is automatically uploaded into experiment record
os.makedirs('outputs', exist_ok=True)

## Save model
model_name = f'model_{filtering_method}_{scaling_method}_{algorithm}'
joblib.dump(value=model, filename = f'outputs/{model_name}.pkl')

run.complete()

Overwriting train.py


## Compute Target

In [23]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ml-cluster1"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If it doesn't already exist, create it.
    # Provisioning errors are deliberately NOT swallowed: if creation fails,
    # `training_cluster` would stay undefined and the failure would only surface
    # later as an unrelated NameError in the HyperDrive configuration cell.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=4)
    training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    training_cluster.wait_for_completion(show_output=True)

Found existing cluster.


## Configure Hyperdrive

In [24]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice, uniform


# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed (scikit-learn, Azure ML defaults, Azure ML dataprep and XGBoost)
packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults','azureml-dataprep[pandas]', 'xgboost'])
sklearn_env.python.conda_dependencies = packages

## Create a script config
## The whole current directory (train.py and docetaxel_sensitivity.csv) is used as the
## source directory; the registered dataset is passed as the named input 'doce_data'.
script_config = ScriptRunConfig(source_directory = ".",
                                script='train.py',
                                arguments = ['--doce_data', doce_data.as_named_input('doce_data')], # Reference to dataset
                                environment = sklearn_env,
                                compute_target = training_cluster)

## Sample a range of parameter values
## NOTE(review): max_depth is drawn from 100-5000, which is far deeper than trees can
## grow on a few hundred rows - confirm this range is intended.
## train.py parses --max_depth as float and truncates it to int, so a continuous
## distribution is acceptable here.
params = RandomParameterSampling(
    {
        '--n_estimators': choice(1, 5, 100),
        '--max_depth': uniform(100, 5000),
        '--booster': choice('gbtree', 'gblinear', 'dart')
    }
)

## Configure hyperdrive settings
## The sweep maximises 'r2_test', the R^2 that train.py logs for the held-out 25% split.
## The 'r2' metric is computed on the training rows and would reward overfitting.
hyperdrive = HyperDriveConfig(run_config = script_config,
                              hyperparameter_sampling = params,
                              policy = None,  # no early termination
                              primary_metric_name = 'r2_test',
                              primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs = 10,
                              max_concurrent_runs = 4)

## Run the Experiment

In [None]:
# Create (or reuse) the experiment that groups all HyperDrive child runs, then submit the sweep.
experiment = Experiment(ws, 'docetaxel_sensitivity_training_hyperdrive')
run = experiment.submit(hyperdrive)

In [9]:
from azureml.widgets import RunDetails

## Render the live monitoring widget for the submitted run, then block until it finishes
run_widget = RunDetails(run)
run_widget.show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

KeyboardInterrupt: 