# Introduction to Automated Machine Learning (AutoML)

This sample shows how you can use AutoML to automate the process of training custom ML models.

In this case, we want to train an ML model that automatically applies a label to GitHub issues.

## Install NuGet packages

In [20]:
#r "nuget: Microsoft.ML.AutoML, 0.21.0-preview.23266.6"

## Add using statements

In [21]:
using System.Threading;
using System.IO;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML;

## Initialize MLContext

In [22]:
// Shared ML.NET context; every later cell goes through `ctx`.
var ctx = new MLContext();

## Use AutoML to infer column information

In [23]:
// Absolute path of the tab-separated training file.
string dataPath = Path.GetFullPath(@"../Data/issues_train.tsv");

// Let AutoML infer the column layout; "Area" is the label column and
// columns are kept individual rather than grouped.
var columnInference = ctx.Auto().InferColumns(
    path: dataPath,
    labelColumnName: "Area",
    separatorChar: '\t',
    groupColumns: false);

In [24]:
// Trailing expression without a semicolon: its value is rendered as the cell output.
columnInference

index,value
index,value
index,value
index,value
index,value
,
,
TextLoaderOptions,"Microsoft.ML.Data.TextLoader+OptionsAllowQuotingTrueAllowSparseFalseInputSize<null>Separators[ ]DecimalMarker.Columnsindexvalue0Microsoft.ML.Data.TextLoader+ColumnDataKindSingleNameIDSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>1Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameAreaSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>2Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameTitleSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>3Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameDescriptionSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>TrimWhitespaceFalseHasHeaderTrueUseThreadsTrueReadMultilinesFalseHeaderFile<null>MaxRows<null>EscapeChar""MissingRealsAsNaNsFalse"
,
AllowQuoting,True
AllowSparse,False
InputSize,<null>
Separators,[ ]
DecimalMarker,.
Columns,indexvalue0Microsoft.ML.Data.TextLoader+ColumnDataKindSingleNameIDSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>1Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameAreaSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>2Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameTitleSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>3Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameDescriptionSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>

index,value
index,value
index,value
index,value
index,value
,
AllowQuoting,True
AllowSparse,False
InputSize,<null>
Separators,[ ]
DecimalMarker,.
Columns,indexvalue0Microsoft.ML.Data.TextLoader+ColumnDataKindSingleNameIDSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>1Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameAreaSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>2Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameTitleSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>3Microsoft.ML.Data.TextLoader+ColumnDataKindStringNameDescriptionSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>
index,value
0,Microsoft.ML.Data.TextLoader+ColumnDataKindSingleNameIDSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>
,

index,value
index,value
index,value
index,value
index,value
,
0,Microsoft.ML.Data.TextLoader+ColumnDataKindSingleNameIDSourceindexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalseKeyCount<null>
,
DataKind,Single
Name,ID
Source,indexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
index,value
0,Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,0

index,value
,
DataKind,Single
Name,ID
Source,indexvalue0Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
index,value
0,Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,0
Max,0
AutoEnd,False

index,value
,
0,Microsoft.ML.Data.TextLoader+RangeMin0Max0AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,0
Max,0
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

Unnamed: 0,Unnamed: 1
Min,0
Max,0
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

index,value
,
DataKind,String
Name,Area
Source,indexvalue0Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
index,value
0,Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,1
Max,1
AutoEnd,False

index,value
,
0,Microsoft.ML.Data.TextLoader+RangeMin1Max1AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,1
Max,1
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

Unnamed: 0,Unnamed: 1
Min,1
Max,1
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

index,value
,
DataKind,String
Name,Title
Source,indexvalue0Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
index,value
0,Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,2
Max,2
AutoEnd,False

index,value
,
0,Microsoft.ML.Data.TextLoader+RangeMin2Max2AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,2
Max,2
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

Unnamed: 0,Unnamed: 1
Min,2
Max,2
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

index,value
,
DataKind,String
Name,Description
Source,indexvalue0Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
index,value
0,Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,3
Max,3
AutoEnd,False

index,value
,
0,Microsoft.ML.Data.TextLoader+RangeMin3Max3AutoEndFalseVariableEndFalseAllOtherFalseForceVectorFalse
,
Min,3
Max,3
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

Unnamed: 0,Unnamed: 1
Min,3
Max,3
AutoEnd,False
VariableEnd,False
AllOther,False
ForceVector,False

Unnamed: 0,Unnamed: 1
LabelColumnName,Area
UserIdColumnName,<null>
GroupIdColumnName,<null>
ItemIdColumnName,<null>
ExampleWeightColumnName,<null>
SamplingKeyColumnName,<null>
CategoricalColumnNames,
NumericColumnNames,[ ID ]
TextColumnNames,"[ Title, Description ]"
IgnoredColumnNames,


## Load data into IDataView

In [25]:
// Build a loader from the options produced by column inference.
var loader = ctx.Data.CreateTextLoader(columnInference.TextLoaderOptions);

// Read the training file through that loader.
var data = loader.Load(dataPath);

## Remove columns

In [26]:
// Columns to remove from the loaded data.
var columnsToExclude = new[]{"ID","Description"};

// Drop the listed columns and continue with the transformed view.
// The statement is terminated with ';' (like every other assignment cell) so the
// cell does not return and render the resulting IDataView, and so the code stays
// valid if further statements are appended.
data = ctx.Transforms.DropColumns(columnsToExclude)
    .Fit(data)
    .Transform(data);

## Split data into train / validation

The dataset is split so that 80% is used for training and the remaining 20% is held out for validation (tuning).

In [27]:
// Hold out 20% of the rows; the remainder is the training set.
DataOperationsCatalog.TrainTestData trainValidationData =
    ctx.Data.TrainTestSplit(data, testFraction: 0.2);

In [28]:
// Sweepable pipeline assembled from four stages, in order:
//   1. Featurizer built from `data` and the inferred column information.
//   2. MapValueToKey on the inferred label column.
//   3. AutoML multiclass classification stage using the same label column.
//   4. MapKeyToValue on "PredictedLabel".
// NOTE(review): "ID" and "Description" were dropped from `data` earlier, while
// columnInference.ColumnInformation still lists them — confirm the featurizer
// ignores columns that are absent from the data.
SweepablePipeline pipeline =
    ctx.Auto().Featurizer(data, columnInformation: columnInference.ColumnInformation)
        .Append(ctx.Transforms.Conversion.MapValueToKey(columnInference.ColumnInformation.LabelColumnName))
        .Append(ctx.Auto().MultiClassification(labelColumnName: columnInference.ColumnInformation.LabelColumnName))
        .Append(ctx.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

## Create AutoML experiment

In [29]:
// Empty AutoML experiment; it is configured in the following cells.
var experiment = ctx.Auto().CreateExperiment();

## Configure experiment settings

In [30]:
// Configure the experiment through one chained call sequence.
experiment
// Pipeline whose stages and parameters the experiment will try.
.SetPipeline(pipeline)
// Multiclass metric: MacroAccuracy, computed against the inferred label column.
.SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, labelColumn: columnInference.ColumnInformation.LabelColumnName)
// Training time budget, in seconds.
.SetTrainingTimeInSeconds(120)
// Train/validation split created earlier.
.SetDataset(trainValidationData);

## Configure experiment monitor

In [31]:
/// <summary>
/// Experiment monitor that writes one console line per completed or failed trial
/// and keeps every completed <see cref="TrialResult"/> in memory.
/// </summary>
public class AutoMLMonitor : IMonitor
{
    private readonly List<TrialResult> _completedTrials;
    private readonly SweepablePipeline _pipeline;

    /// <summary>Creates a monitor for trials of the given pipeline.</summary>
    /// <param name="pipeline">Pipeline used to render each trial's parameters as text.</param>
    public AutoMLMonitor(SweepablePipeline pipeline)
    {
        _completedTrials = new List<TrialResult>();
        _pipeline = pipeline;
    }

    /// <summary>Returns the trials reported as completed so far, in reporting order.</summary>
    public IEnumerable<TrialResult> GetCompletedTrials() => _completedTrials;

    /// <summary>Intentionally does nothing; best trials are not reported separately.</summary>
    public void ReportBestTrial(TrialResult result)
    {
        return;
    }

    /// <summary>Logs the trial id, duration and pipeline description, then records the result.</summary>
    /// <param name="result">Result of the trial that just finished.</param>
    public void ReportCompletedTrial(TrialResult result)
    {
        var trialId = result.TrialSettings.TrialId;
        var timeToTrain = result.DurationInMilliseconds;
        var pipeline = _pipeline.ToString(result.TrialSettings.Parameter);
        Console.WriteLine($"Trial {trialId} finished training in {timeToTrain}ms with pipeline {pipeline}");
        _completedTrials.Add(result);
    }

    /// <summary>
    /// Logs a failed trial. A failure whose message contains "Operation was canceled."
    /// is reported only as a cancellation; anything else is reported as a failure.
    /// </summary>
    /// <param name="settings">Settings of the trial that failed.</param>
    /// <param name="exception">Failure cause; may be null because the parameter is optional.</param>
    public void ReportFailTrial(TrialSettings settings, Exception exception = null)
    {
        // The parameter defaults to null, so it must not be dereferenced unconditionally.
        var message = exception?.Message;

        if (message != null && message.Contains("Operation was canceled."))
        {
            Console.WriteLine($"{settings.TrialId} cancelled. Time budget exceeded.");
            // A cancellation is not also reported as a failure.
            return;
        }

        var details = message ?? "<no exception provided>";
        Console.WriteLine($"{settings.TrialId} failed with exception {details}");
    }

    /// <summary>Intentionally does nothing; running trials are not reported.</summary>
    public void ReportRunningTrial(TrialSettings setting)
    {
        return;
    }
}

In [32]:
// Attach a console monitor that knows how to describe trials of `pipeline`.
AutoMLMonitor monitor = new AutoMLMonitor(pipeline);
experiment.SetMonitor(monitor);

## Train the model

In [33]:
// Token source whose token is handed to the experiment run.
CancellationTokenSource cts = new CancellationTokenSource();

// Run the experiment and wait for its resulting trial.
var experimentResults = await experiment.RunAsync(cts.Token);

Trial 0 finished training in 11009ms with pipeline FeaturizeText=>Concatenate=>Unknown=>FastTreeOva=>Unknown
Trial 1 finished training in 12438ms with pipeline FeaturizeText=>Concatenate=>Unknown=>FastForestOva=>Unknown
Trial 2 finished training in 723ms with pipeline FeaturizeText=>Concatenate=>Unknown=>SdcaMaximumEntropyMulti=>Unknown
Trial 3 finished training in 12862ms with pipeline FeaturizeText=>Concatenate=>Unknown=>FastForestOva=>Unknown
Trial 4 finished training in 5418ms with pipeline FeaturizeText=>Concatenate=>Unknown=>LbfgsLogisticRegressionOva=>Unknown
Trial 5 finished training in 1120ms with pipeline FeaturizeText=>Concatenate=>Unknown=>LightGbmMulti=>Unknown
Trial 6 finished training in 8544ms with pipeline FeaturizeText=>Concatenate=>Unknown=>FastTreeOva=>Unknown
Trial 7 finished training in 9082ms with pipeline FeaturizeText=>Concatenate=>Unknown=>LbfgsMaximumEntropyMulti=>Unknown
Trial 8 finished training in 8459ms with pipeline FeaturizeText=>Concatenate=>Unknown=>S

## Get the best model

In [34]:
// Trained model carried by the experiment result.
ITransformer bestModel = experimentResults.Model;

// Trailing expression: rendered as the cell output.
bestModel

## Display the metric for the best model

In [35]:
// Trailing expression without a semicolon: the metric value of the returned trial
// is rendered as the cell output.
experimentResults.Metric

## Try out the model

### Make predictions

In [36]:
// Apply the best model to the held-out test split.
IDataView predictions = bestModel.Transform(trainValidationData.TestSet);

### Display prediction results

In [37]:
// Trailing expression without a semicolon: the row view of a preview of the
// predictions is rendered as the cell output.
predictions.Preview().RowView

## Save the best model

In [38]:
// Persist the best model with the schema of the data it was trained on.
ctx.Model.Save(
    model: bestModel,
    inputSchema: data.Schema,
    filePath: "model.mlnet");