# Create an XGBoost Model using MLflow on Azure ML Compute

In [58]:
from azureml.core import Workspace, Experiment

# Names of the workspace assets this notebook looks up.
EXPERIMENT_NAME = "diabetes-regression"
DATASET_NAME = 'diabetes-files'
COMPUTE_NAME = 'cpu-cluster'
ENVIRONMENT_NAME = "MLFlow-XGBoost"

# Obtain the workspace via Workspace.from_config() and create the experiment
# handle that runs are submitted to.
ws = Workspace.from_config()
experiment = Experiment(ws, EXPERIMENT_NAME)

# Look up the dataset, compute target and environment by name in the workspace.
dataset = ws.datasets[DATASET_NAME]
cpu_cluster = ws.compute_targets[COMPUTE_NAME]
environ = ws.environments[ENVIRONMENT_NAME]

In [59]:
%%writefile train.py
import mlflow
from argparse import ArgumentParser
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import xgboost as xgb

def parse_args():
    """Parse the training script's command-line options.

    Options:
        --data-path (str, required): path to read the data files from.
        --random-seed (int, default 42): seed for data splitting and XGBoost.
        --colsample-bytree (float, default 1.0): column subsample ratio per tree.
        --subsample (float, default 1.0): subsample ratio of training instances.

    Returns:
        argparse.Namespace with attributes ``data_path``, ``random_seed``,
        ``colsample_bytree`` and ``subsample``.
    """
    # (flag, add_argument keyword settings), in the order they are registered.
    option_table = (
        ("--data-path", dict(
            type=str,
            required=True,
            help="The path to read the data files from",
        )),
        ("--random-seed", dict(
            type=int,
            default=42,
            help="The random seed to use for data splitting and XGBoost",
        )),
        ('--colsample-bytree', dict(
            type=float,
            default=1.0,
            help='subsample ratio of columns when constructing each tree (default: 1.0)',
        )),
        ('--subsample', dict(
            type=float,
            default=1.0,
            help='subsample ratio of the training instances (default: 1.0)',
        )),
    )

    cli = ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()

def main(data_path, random_seed, colsample_bytree, subsample):
    """Train an XGBoost regressor on parquet data and log the results to MLflow.

    Args:
        data_path: Location handed to ``pd.read_parquet``. The loaded frame
            must contain a ``target`` column, which is used as the label;
            every other column is used as a feature.
        random_seed: Seed for the train/test split and for XGBoost.
        colsample_bytree: Value for XGBoost's ``colsample_bytree`` parameter.
        subsample: Value for XGBoost's ``subsample`` parameter.

    Side effects:
        Opens an MLflow run, enables XGBoost autologging, and logs the
        metrics ``val_r2``, ``val_rmse`` and ``val_mae`` computed on the
        20% hold-out split.
    """
    with mlflow.start_run():
        df = pd.read_parquet(data_path)

        # Separate the label from the features without mutating the loaded frame.
        X = df.copy()
        y = pd.DataFrame(X.pop('target'))

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # train model
        params = {
            'objective': 'reg:squarederror',
            # 'mlogloss' is a multiclass metric; use a regression metric that
            # matches the squared-error objective.
            'eval_metric': 'rmse',
            # Use the function's own parameters rather than the module-level
            # `args`, so main() also works when imported and called directly.
            'colsample_bytree': colsample_bytree,
            'subsample': subsample,
            'seed': random_seed,
        }
        
        mlflow.xgboost.autolog()
        model = xgb.train(params, dtrain)

        # evaluate model
        y_pred = model.predict(dtest)
        mae = mean_absolute_error(y_test, y_pred)
        # Take the square root of the MSE explicitly instead of relying on the
        # `squared=False` keyword, which newer scikit-learn releases deprecate
        # and then remove.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        r2 = r2_score(y_test, y_pred)

        mlflow.log_metrics({'val_r2': r2, 'val_rmse': rmse, 'val_mae': mae})
    
if __name__ == "__main__":
    # Script entry point: read the command-line options and forward each one
    # to main() by keyword.
    args = parse_args()
    main(
        data_path=args.data_path,
        random_seed=args.random_seed,
        colsample_bytree=args.colsample_bytree,
        subsample=args.subsample,
    )
    


Overwriting train.py


In [60]:
from azureml.train.estimator import Estimator

# Hand the dataset to train.py as a mount through its --data-path option.
script_arguments = {"--data-path": dataset.as_mount()}

# Estimator that runs train.py from the current directory on the selected
# compute target, using the environment looked up from the workspace.
estimator = Estimator(
    source_directory='.',
    entry_script='train.py',
    script_params=script_arguments,
    compute_target=cpu_cluster,
    environment_definition=environ,
)

In [61]:
# Submit the estimator to the experiment; the trailing expression displays
# the returned run object as the cell output.
run = experiment.submit(estimator)
run



Experiment,Id,Type,Status,Details Page,Docs Page
diabetes-regression,diabetes-regression_1604509591_ca4dcf57,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [62]:
from azureml.widgets import RunDetails
# Show the run-details widget for the submitted run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [64]:
# Submitting the run and showing the widget do not block, so on a
# Restart & Run All this cell can be reached before the training job has
# produced its "model" artifact. Block until the run finishes before registering.
run.wait_for_completion(show_output=False)
run.register_model("diabetes-xgboost", model_path="model")

Model(workspace=Workspace.create(name='ws-aml-dbx-demo', subscription_id='02559c12-8982-43b3-8180-f4e41779489b', resource_group='rg-aml-dbx-demo'), name=diabetes-xgboost, id=diabetes-xgboost:1, version=1, tags={}, properties={})