Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
using System;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
// This example demonstrates hashing of categorical string and integer data
// types by using the Hash transform's advanced options API.
public static class HashWithOptions
{
    /// <summary>
    /// Hashes a string column and an integer column of a small in-memory
    /// dataset, prints the hashed rows to the console, and then prints the
    /// hash-to-original-value mapping that was kept for the string column.
    /// </summary>
    public static void Example()
    {
        // The ML context is the entry point for all ML.NET operations. It is
        // used for exception tracking and logging, and it is the source of
        // randomness (fixed here through the seed).
        var mlContext = new MLContext(seed: 1);

        // A handful of in-memory samples to run the transform on.
        var samples = new[]
        {
            new DataPoint() { Category = "MLB", Age = 18 },
            new DataPoint() { Category = "NFL", Age = 14 },
            new DataPoint() { Category = "NFL", Age = 15 },
            new DataPoint() { Category = "MLB", Age = 18 },
            new DataPoint() { Category = "MLS", Age = 14 },
        };

        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Describe the two hashing operations. Each entry names the output
        // column, the input column and the number of bits to hash into. The
        // first entry hashes the string column, the second one the integer
        // column.
        //
        // Hashing cannot be reversed, so the original value cannot be
        // recovered from the hashed value alone. For debugging or model
        // explainability it is still useful to know which original values
        // produced which hashes, because downstream algorithms mostly work
        // on the hashed values. The Hash transform can therefore record the
        // mapping from original values to hashed values in the Annotations
        // of the newly created (hashed) column.
        //
        // Passing -1 for maximumNumberOfInverts keeps the full map. Leaving
        // it at its default value of 0 keeps no mapping at all.
        HashingEstimator.ColumnOptions[] hashColumns =
        {
            new HashingEstimator.ColumnOptions(
                "CategoryHashed",
                "Category",
                16,
                useOrderedHashing: false,
                maximumNumberOfInverts: -1),

            new HashingEstimator.ColumnOptions(
                "AgeHashed",
                "Age",
                8,
                useOrderedHashing: false),
        };

        var pipeline = mlContext.Transforms.Conversion.Hash(hashColumns);

        // Fit the pipeline and apply it to the very same data.
        var transformedData = pipeline.Fit(data).Transform(data);

        // Turn the resulting IDataView back into an
        // IEnumerable<TransformedDataPoint> so it is easy to iterate over.
        var hashedRows = mlContext.Data.CreateEnumerable<TransformedDataPoint>(
            transformedData, reuseRowObject: true);

        Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
        foreach (var row in hashedRows)
        {
            Console.WriteLine(
                $"{row.Category}\t {row.CategoryHashed}\t\t {row.Age}\t {row.AgeHashed}");
        }

        // Expected data after the transformation.
        //
        // Category CategoryHashed   Age     AgeHashed
        // MLB      36206            18      127
        // NFL      19015            14      62
        // NFL      19015            15      43
        // MLB      36206            18      127
        // MLS      6013             14      62

        // Because maximumNumberOfInverts was set for the Category column,
        // the original category names and their correspondence with the
        // generated hash values are preserved in the column's Annotations,
        // stored as a pair of arrays: indices and values. Each entry of the
        // values array holds the original value for the entry at the same
        // position in the indices array.
        //
        // Note that the indices printed below are one less than the
        // CategoryHashed values printed above (e.g. 36205 vs. 36206).
        // NOTE(review): presumably because the hashed column is a key type
        // whose raw value 0 is reserved for "missing" - confirm.
        var keyValues = default(VBuffer<ReadOnlyMemory<char>>);
        transformedData.Schema["CategoryHashed"].Annotations.GetValue(
            "KeyValues", ref keyValues);

        var hashIndices = keyValues.GetIndices();
        var originalNames = keyValues.GetValues();

        for (int slot = 0; slot < hashIndices.Length; slot++)
        {
            Console.WriteLine(
                $"The original value of the {hashIndices[slot]} category is {originalNames[slot]}");
        }

        // Output Data
        //
        // The original value of the 6012 category is MLS
        // The original value of the 19014 category is NFL
        // The original value of the 36205 category is MLB
    }

    /// <summary>
    /// One input row: a categorical string and an integer value.
    /// </summary>
    public class DataPoint
    {
        public string Category { get; set; }
        public uint Age { get; set; }
    }

    /// <summary>
    /// One output row: the input columns plus the two hashed columns
    /// produced by the pipeline.
    /// </summary>
    public class TransformedDataPoint : DataPoint
    {
        public uint CategoryHashed { get; set; }
        public uint AgeHashed { get; set; }
    }

}
}
3 changes: 3 additions & 0 deletions src/DefaultGenApiDocIds.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// These attributes should be excluded from reference assemblies.

T:Microsoft.ML.BestFriendAttribute
1 change: 1 addition & 0 deletions src/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
API missing from old) -->
<RunApiCompatForSrc>true</RunApiCompatForSrc>
<RunMatchingRefApiCompat>false</RunMatchingRefApiCompat>
<ApiCompatExcludeAttributeList>$(MSBuildThisFileDirectory)DefaultGenApiDocIds.txt</ApiCompatExcludeAttributeList>
</PropertyGroup>

<ItemGroup>
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -517,11 +517,11 @@ internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDa
// instead of having two hash transformations.
var origStratCol = samplingKeyColumn;
samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
HashingEstimator.ColumnOptions columnOptions;
HashingEstimator.ColumnOptionsInternal columnOptions;
if (seed.HasValue)
columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, (uint)seed.Value);
columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30, (uint)seed.Value);
else
columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30);
data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
}
else
Expand Down
20 changes: 14 additions & 6 deletions src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

namespace Microsoft.ML
{
using static Microsoft.ML.Transforms.HashingEstimator;
using ConvertDefaults = TypeConvertingEstimator.Defaults;
using HashDefaults = HashingEstimator.Defaults;

Expand Down Expand Up @@ -46,18 +47,25 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, numberOfBits, maximumNumberOfInverts);

/// <summary>
/// Create a <see cref="HashingEstimator"/>, which hashes the input column's data type <see cref="InputOutputColumnPair.InputColumnName" />
/// to a new column: <see cref="InputOutputColumnPair.OutputColumnName" />.
/// Create a <see cref="HashingEstimator"/>, which hashes the input column's data type <see cref="ColumnOptions.InputColumnName" />
/// to a new column: <see cref="ColumnOptions.Name" />.
/// </summary>
/// <remarks>This transform can operate over several columns.</remarks>
/// <param name="catalog">The transform's catalog.</param>
/// <param name="columns">The input and output columns.
/// <param name="columns">Advanced options for the estimator that also contain the input and output column names.
/// This estimator operates over text, numeric, boolean, key and <see cref="DataViewRowId"/> data types.
/// The new column's data type will be a vector of <see cref="System.UInt32"/>, or a <see cref="System.UInt32"/> based on whether the input column data types
/// are vectors or scalars.</param>
[BestFriend]
internal static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params HashingEstimator.ColumnOptions[] columns)
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), columns);
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[Hash](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/HashWithOptions.cs)]
/// ]]></format>
/// </example>
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params ColumnOptions[] columns)
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog),
columns.Select(x => new ColumnOptionsInternal(x.Name, x.InputColumnName, x.NumberOfBits, x.Seed,
x.UseOrderedHashing, x.MaximumNumberOfInverts)).ToArray());

/// <summary>
/// Create a <see cref="TypeConvertingEstimator"/>, which converts the type of the data to the type specified in <paramref name="outputKind"/>.
Expand Down
Loading