# Docetaxel Sensitivity: Machine Learning using Gene Expression
---------------------------

In [1]:
import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset

## Connect to the Azure ML workspace using Workspace.from_config();
## `ws` is reused by the dataset, compute-target and experiment cells below.
ws = Workspace.from_config()

In [None]:
## Get Sensitivity Data
## Look up the registered 'docetaxel_sensitivity' dataset in the workspace and
## materialise it as a pandas DataFrame.
doce_data = Dataset.get_by_name(ws, name='docetaxel_sensitivity')
doce_data_df = doce_data.to_pandas_dataframe()

## Preview only the first rows instead of dumping the entire frame into the
## notebook output.
doce_data_df.head()

In [None]:
## Write a local copy that is uploaded with the source directory and read by train.py.
## index=False: do not emit the pandas row index as an extra unnamed first column,
## otherwise train.py's "skip the first column" logic skips the index instead of
## the first real data column.
doce_data_df.to_csv('docetaxel_sensitivity.csv', index=False)

## Create Training Script

In [None]:
%%writefile train.py
## Import libraries
import argparse
import joblib
import os
from azureml.core import Dataset, Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

## Set the input parameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_estimators", type=int, dest='n_estimators', help='Number of gradient boosted trees.')
parser.add_argument("--max_depth", type=float, dest='max_depth', help='Maximum tree depth for base learners.')
parser.add_argument("--booster", type=str, dest='booster', help='Boosting method.')
args = parser.parse_args()

n_estimators = args.n_estimators
max_depth = int(args.max_depth)
booster = args.booster

## Get the experiment run context
run = Run.get_context()
ws = run.experiment.workspace

## Log run options
run.log('n_estimators', str(n_estimators))
run.log('max_depth', str(max_depth))
run.log('booster', str(booster))

## load the dataset(s)
print("Loading Data...")
# dataset = run.input_datasets['doce_data'].to_pandas_dataframe() # Get the training data from the estimator input
dataset = pd.read_csv('docetaxel_sensitivity.csv')
run.log("Input data shape: ", str(dataset.shape))


## Separate features and labels
X = dataset[dataset.columns[1:]]
y = dataset[['L10_IC_50']].values

## Get X as numpy array
X = X.values

## Scale the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)


## Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25, 
    random_state = 1337
)

## Initialize Algorithm
xgb = XGBRegressor(objective ='reg:linear',
                   n_estimators = n_estimators,
                   max_depth = max_depth,
                   booster = booster,
                   seed = 1337)

## Train Model w/ Cross-Validation (5-fold)
scores = cross_validate(xgb,
                        X_train, y_train,
                        cv=5, scoring=('r2'),
                        return_estimator=True)

model = scores['estimator'][0]

## Score data
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

## Log model performance
run.log('r2', r2_score(y_train, y_train_hat))
run.log('r2_test', r2_score(y_test, y_test_hat))

## Note: Files saved in the outputs folder is automatically uploaded into experiment record
os.makedirs('outputs', exist_ok=True)

## Save model
joblib.dump(value=model, filename = f'outputs/model.pkl')

run.complete()

## Compute Target

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ml-cluster1"

## Reuse the compute cluster if it already exists in the workspace; otherwise
## provision it. Provisioning errors are allowed to propagate: swallowing them
## would leave `training_cluster` undefined and only surface later as a
## NameError in the cell that builds the ScriptRunConfig.
try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=4)
    training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    training_cluster.wait_for_completion(show_output=True)

## Configure Hyperdrive

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice, uniform


# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed
# (scikit-learn, Azure ML defaults, Azure ML dataprep with pandas support, and XGBoost)
packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults','azureml-dataprep[pandas]', 'xgboost'])
sklearn_env.python.conda_dependencies = packages

## Create a script config
## The whole current directory is uploaded, which is how train.py gets access to
## the local docetaxel_sensitivity.csv file.
script_config = ScriptRunConfig(source_directory = ".",
                                script='train.py',
                                environment = sklearn_env,
                                compute_target = training_cluster)

## Sample a range of parameter values
## max_depth is an integer tree depth (train.py truncates it to int), so it is
## drawn from a discrete set of practical depths instead of a continuous
## 100-5000 range.
params = RandomParameterSampling(
    {
        '--n_estimators': choice(1, 5, 100),
        '--max_depth': choice(range(2, 11)),
        '--booster': choice('gbtree', 'gblinear', 'dart')
    }
)

## Configure hyperdrive settings
## train.py logs 'r2' (R^2 on the training split) and 'r2_test' (R^2 on the
## held-out split). The sweep maximises the held-out score, because maximising
## the training score simply rewards the most overfitted configuration.
hyperdrive = HyperDriveConfig(run_config = script_config,
                              hyperparameter_sampling = params,
                              policy = None,
                              primary_metric_name = 'r2_test',
                              primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs = 10,
                              max_concurrent_runs = 4)

## Run the Experiment

In [None]:
## Create (or reuse) the named experiment in the workspace, then submit the
## HyperDrive configuration; `run` is the handle to the parent sweep run.
experiment = Experiment(
    workspace=ws,
    name='docetaxel_sensitivity_training_hyperdrive',
)
run = experiment.submit(config=hyperdrive)

In [9]:
from azureml.widgets import RunDetails

## Display the live status widget for the submitted run, then block this cell
## until the run has finished.
run_widget = RunDetails(run)
run_widget.show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

KeyboardInterrupt: 