# Diabetes Model Training
## Train in a Compute Cluster, Track in Azure ML

In [1]:
from azureml.core import Workspace, Dataset, Experiment, Run, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

In [2]:
## Connect to Workspace
# Build the workspace handle from a saved workspace configuration instead of
# hardcoding subscription details (per the Azure ML SDK this is typically a
# config.json in or above the working directory -- confirm for your setup).
# `ws` is reused below for the dataset lookup, compute target and experiment.
ws = Workspace.from_config()

In [4]:
## Load in Diabetes Dataset
# Fetch the registered tabular dataset and materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(ws, name='diabetes_ta')
df = dataset.to_pandas_dataframe()

# Write the training file that train.py reads from the source directory.
# index=False keeps the RangeIndex out of the CSV; otherwise read_csv() sees it
# as an extra leading 'Unnamed: 0' column and every positional column offset
# in downstream code shifts by one.
df.to_csv('diabetes_train.csv', index=False)
df

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
8010,1469198,6,95,85,37,267,18.497542,0.660240,31,0
8011,1432736,0,55,51,7,50,21.865341,0.086589,34,0
8012,1410962,5,99,59,47,67,30.774018,2.301594,43,1
8013,1958653,0,145,67,30,21,18.811861,0.789572,26,0


In [16]:
%%writefile train.py
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
import argparse
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics


## Get the experiment run context
run = Run.get_context()

## load the diabetes dataset
df = pd.read_csv('diabetes_train.csv')

## Separate features and labels
X = df[df.columns[1:9]]
y = df[['Diabetic']].values

## Scale the X variables
scaler = StandardScaler()
X = scaler.fit_transform(X)

## Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25, 
    random_state = 1337
)

## Initialize algorithm
lr = LogisticRegression(random_state = 0)


## Train w/ Cross-Validation (k-fold)
scores = cross_validate(lr,
                        X_train, y_train.ravel(),
                        cv=5, scoring=('roc_auc'),
                        return_estimator=True)

model = scores['estimator'][0]

## Score data
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

## Generate Metrics and Log
train_fpr, train_tpr, train_thresholds = metrics.roc_curve(y_train, y_train_hat, pos_label=1)
test_fpr, test_tpr, test_thresholds = metrics.roc_curve(y_test, y_test_hat, pos_label=1)

run.log('auc_train', metrics.auc(train_fpr, train_tpr))
run.log('auc_test', metrics.auc(test_fpr, test_tpr))

os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting train.py


In [None]:
## Create Compute Cluster (Optional)
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Azure ML compute names may only contain letters, digits and hyphens and are
# length-limited; the previous underscore-separated 27-character name could
# never be provisioned.
cluster_name = "instr-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D4S_V3', max_nodes=1)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except ComputeTargetException as ex:
        # This cell is optional, so a provisioning failure is reported rather
        # than raised -- but the name is always bound so later cells can test
        # for `training_cluster is None` instead of hitting a NameError.
        training_cluster = None
        print(f'Could not create compute cluster {cluster_name!r}: {ex}')

In [10]:
## Create Environment for sklearn
# pip packages for the run: scikit-learn and pandas are imported by train.py;
# azureml-defaults lets the script import azureml.core for run logging.
packages = CondaDependencies.create(
    pip_packages=['scikit-learn', 'pandas', 'azureml-defaults'],
)

# Attach the dependency set to a named environment.
sklearn_env = Environment(name="sklearn-env")
sklearn_env.python.conda_dependencies = packages

In [17]:
## Create a script config
# No compute_target is passed, so the run is submitted to the local target;
# pass compute_target=<ComputeTarget> here to train on a cluster instead.
script_config = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    arguments=[],
    environment=sklearn_env,
)

In [18]:
## Submit the experiment
# Runs are recorded under the 'instructor_diabetes' experiment in the workspace.
experiment = Experiment(ws, 'instructor_diabetes')
run = experiment.submit(script_config)

In [19]:
## See Run Details
# Show the monitoring widget, then block until the submitted run finishes.
# The value returned by wait_for_completion() is the cell's displayed output.
details_widget = RunDetails(run)
details_widget.show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'instructor_diabetes_1644938226_3ac77c5a',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2022-02-15T15:17:09.06588Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'a82e8208-04dd-40a4-a6d7-dcb53fb5443e'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datacaches': [],
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'instanceTypes': [],
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'sklearn-env',
   'version': 'Autosave_2022-02-15T15:14:14Z_11381047',
   'python': {'interpreterPath': 'python',
    'userManagedDependencies': False,
    'condaDependencies': {'channels