# Diabetes Model Training
## Train in the Compute Instance, Track in Azure ML

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import pandas as pd
import numpy as np

from azureml.core import Workspace, Dataset, Experiment, Run

In [2]:
## Connect to Workspace
# Builds the Workspace handle via `Workspace.from_config()`; `ws` is reused
# below to create the experiment and to look up the registered dataset.
# NOTE(review): presumably this reads a workspace config file available on the
# compute instance — confirm it exists before running on a fresh machine.
ws = Workspace.from_config()

In [21]:
## Create Experiment
# All runs started from this notebook are grouped under this experiment name
# in the workspace referenced by `ws`.
experiment = Experiment(
    workspace=ws,
    name='instructor_diabetes',
)

In [23]:
## Create Run
# Start an interactive run on the experiment; metrics are attached to it later
# with `run.log(...)` and it is closed at the end with `run.complete()`.
run = experiment.start_logging(
    outputs=None,
    snapshot_directory=".",
    display_name="logistic_regression_01",
)

In [24]:
## Load in Diabetes Dataset
# Look up the dataset registered as 'diabetes_ta' in the workspace and
# materialize it as a pandas DataFrame for the steps below.
dataset = Dataset.get_by_name(workspace=ws, name='diabetes_ta')
df = dataset.to_pandas_dataframe()

# Last expression of the cell: render the frame as the cell output.
df

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
8010,1469198,6,95,85,37,267,18.497542,0.660240,31,0
8011,1432736,0,55,51,7,50,21.865341,0.086589,34,0
8012,1410962,5,99,59,47,67,30.774018,2.301594,43,1
8013,1958653,0,145,67,30,21,18.811861,0.789572,26,0


In [25]:
## Separate features and labels
# Select the feature columns by name instead of by position: a positional
# slice silently picks the wrong columns (e.g. the PatientID identifier) if
# the dataset gains, loses, or reorders a column.
FEATURE_COLUMNS = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]
LABEL_COLUMN = 'Diabetic'

X = df[FEATURE_COLUMNS]
# Double brackets keep the label as a 2-D, single-column array, matching what
# the later cells expect (they call `.ravel()` where a 1-D vector is needed).
y = df[[LABEL_COLUMN]].values

In [26]:
## Split data into training set and test set, then scale the X variables
# The split happens BEFORE scaling so the scaler's mean/variance are estimated
# from the training rows only. Fitting the scaler on the full dataset would
# leak test-set statistics into the training features and bias the test metric.
# `X` itself is left unscaled; only `X_train` / `X_test` are used downstream.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state = 1337
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn scaling parameters on train only
X_test = scaler.transform(X_test)        # reuse the train parameters on test

In [28]:
## Initialize algorithm
# Logistic regression with a fixed seed so repeated runs are comparable.
lr = LogisticRegression(
    random_state=0,
)

In [29]:
## Train w/ Cross-Validation (k-fold)
# 5-fold cross-validation on the training split, scored with ROC AUC.
# `scores` holds the per-fold results (and the per-fold fitted estimators,
# because return_estimator=True) for inspection.
scores = cross_validate(lr,
                        X_train, y_train.ravel(),
                        cv=5, scoring='roc_auc',
                        return_estimator=True)

# Final model: refit on the WHOLE training split. The per-fold estimators in
# `scores['estimator']` were each trained on only 4/5 of the training data, so
# picking one of them (e.g. index 0) would be an arbitrary, under-trained model.
# cross_validate works on clones, so `lr` is still unfitted at this point.
model = lr.fit(X_train, y_train.ravel())

In [30]:
## Score data
# Score with the predicted probability of the positive class (column 1, i.e.
# Diabetic == 1) rather than hard 0/1 labels. The ROC curve / AUC computed
# below needs a continuous score; with hard labels the curve collapses to a
# single threshold and the logged AUC is not the model's real ranking quality.
y_train_hat = model.predict_proba(X_train)[:, 1]
y_test_hat = model.predict_proba(X_test)[:, 1]

In [31]:
## Generate Metrics and Log
# Training split: build the ROC curve from the scores, then record its area
# on the run under 'auc_train'.
train_fpr, train_tpr, train_thresholds = metrics.roc_curve(
    y_train,
    y_train_hat,
    pos_label=1,
)
auc_train = metrics.auc(train_fpr, train_tpr)
run.log('auc_train', auc_train)

# Test split: same computation, recorded under 'auc_test'.
test_fpr, test_tpr, test_thresholds = metrics.roc_curve(
    y_test,
    y_test_hat,
    pos_label=1,
)
auc_test = metrics.auc(test_fpr, test_tpr)
run.log('auc_test', auc_test)

In [32]:
## Complete Run
# Close the run opened by `experiment.start_logging(...)` now that both AUC
# metrics have been logged.
run.complete()