diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index bc2a797c15..b5e7e1b5e3 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -139,6 +139,16 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.DnnImageFeatur EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.EntryPoints", "src\Microsoft.ML.EntryPoints\Microsoft.ML.EntryPoints.csproj", "{7504D46F-E4B3-43CB-9B1C-82F3131F1C99}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.StaticPipe", "src\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.csproj", "{6B1B93D0-142A-4111-A20E-62B55A3E36A3}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TensorFlow.StaticPipe", "src\Microsoft.ML.TensorFlow.StaticPipe\Microsoft.ML.TensorFlow.StaticPipe.csproj", "{F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.HalLearners.StaticPipe", "src\Microsoft.ML.HalLearners.StaticPipe\Microsoft.ML.HalLearners.StaticPipe.csproj", "{2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxTransform.StaticPipe", "src\Microsoft.ML.OnnxTransform.StaticPipe\Microsoft.ML.OnnxTransform.StaticPipe.csproj", "{D1324668-9568-40F4-AA55-30A9A516C230}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.LightGBM.StaticPipe", "src\Microsoft.ML.LightGBM.StaticPipe\Microsoft.ML.LightGBM.StaticPipe.csproj", "{22C51B08-ACAE-47B2-A312-462DC239A23B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -531,6 +541,46 @@ Global {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release|Any CPU.Build.0 = Release|Any CPU {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + 
{6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release|Any CPU.Build.0 = Release|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release|Any CPU.Build.0 = Release|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release|Any CPU.Build.0 
= Release|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Release|Any CPU.Build.0 = Release|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {D1324668-9568-40F4-AA55-30A9A516C230}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release|Any CPU.Build.0 = Release|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -589,6 +639,11 @@ Global {4805129D-78C8-46D4-9519-0AD9B0574D6D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {DB7CEB5E-8BE6-48A7-87BE-B91D9AE96F71} = {09EADF06-BE25-4228-AB53-95AE3E15B530} 
{7504D46F-E4B3-43CB-9B1C-82F3131F1C99} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {6B1B93D0-142A-4111-A20E-62B55A3E36A3} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {D1324668-9568-40F4-AA55-30A9A516C230} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {22C51B08-ACAE-47B2-A312-462DC239A23B} = {09EADF06-BE25-4228-AB53-95AE3E15B530} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 414e271b86..44b673640f 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -8,11 +8,13 @@ + + diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs index 53899d95fe..ce02dc4864 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs @@ -1,8 +1,6 @@ -using Microsoft.ML.Runtime.Data; +using Microsoft.ML.LightGBM.StaticPipe; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.StaticPipe; -using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Categorical; -using Microsoft.ML.Transforms.FeatureSelection; using System; namespace Microsoft.ML.Samples.Static diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs index 9ab90eaf95..505663190b 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs @@ -1,6 +1,6 @@ using Microsoft.ML.Runtime.Data; 
using Microsoft.ML.Runtime.LightGBM; -using Microsoft.ML.StaticPipe; +using Microsoft.ML.LightGBM.StaticPipe; using System; namespace Microsoft.ML.Samples.Static diff --git a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs index 50b698c5b9..352ca8bb2f 100644 --- a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs @@ -41,4 +41,10 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Transforms" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow.StaticPipe" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.HalLearners.StaticPipe" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransform.StaticPipe" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGBM.StaticPipe" + PublicKey.Value)] + [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.Data/Microsoft.ML.Data.csproj b/src/Microsoft.ML.Data/Microsoft.ML.Data.csproj index 982f5b07e9..f3e7d96b59 100644 --- a/src/Microsoft.ML.Data/Microsoft.ML.Data.csproj +++ b/src/Microsoft.ML.Data/Microsoft.ML.Data.csproj @@ -7,14 +7,6 @@ CORECLR - - - True - True - TermStaticExtensions.tt - - - @@ -27,34 +19,8 @@ - - - TextTemplatingFileGenerator - ConvertStaticExtensions.cs - - - TextTemplatingFileGenerator - TermStaticExtensions.cs - - - - - - True - True - ConvertStaticExtensions.tt - - - True - True - TermStaticExtensions.tt - TextTemplatingFileGenerator - TermStaticExtensions.cs - - - \ No newline at end of file diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index 50febaa558..9d001f1a7f 100644 --- 
a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -38,4 +38,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Transforms" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] + [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs index 81df252d58..0dad71cd36 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs @@ -121,213 +121,4 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(result.Values); } } - - /// - /// The extension methods and implementation support for concatenating columns together. - /// - public static class ConcatStaticExtensions - { - /// - /// Given a scalar vector, produce a vector of length one. - /// - /// The value type. - /// The scalar column. - /// The vector column, whose single item has the same value as the input. - public static Vector AsVector(this Scalar me) - => new Impl(Join(me, (PipelineColumn[])null)); - - /// - /// Given a bunch of normalized vectors, concatenate them together into a normalized vector. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static NormVector ConcatWith(this NormVector me, params NormVector[] others) - => new ImplNorm(Join(me, others)); - - /// - /// Given a set of columns, concatenate them together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. 
- public static Vector ConcatWith(this Scalar me, params ScalarOrVector[] others) - => new Impl(Join(me, others)); - - /// - /// Given a set of columns, concatenate them together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static Vector ConcatWith(this Vector me, params ScalarOrVector[] others) - => new Impl(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static VarVector ConcatWith(this Scalar me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static VarVector ConcatWith(this Vector me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. 
- public static VarVector ConcatWith(this VarVector me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - private interface IContainsColumn - { - PipelineColumn WrappedColumn { get; } - } - - /// - /// A wrapping object for the implicit conversions in - /// and other related methods. - /// - /// The value type. - public sealed class ScalarOrVector : ScalarOrVectorOrVarVector - { - private ScalarOrVector(PipelineColumn col) : base(col) { } - public static implicit operator ScalarOrVector(Scalar c) => new ScalarOrVector(c); - public static implicit operator ScalarOrVector(Vector c) => new ScalarOrVector(c); - public static implicit operator ScalarOrVector(NormVector c) => new ScalarOrVector(c); - } - - /// - /// A wrapping object for the implicit conversions in - /// and other related methods. - /// - /// The value type. - public class ScalarOrVectorOrVarVector : IContainsColumn - { - public PipelineColumn WrappedColumn { get; } - - private protected ScalarOrVectorOrVarVector(PipelineColumn col) - { - Contracts.CheckValue(col, nameof(col)); - WrappedColumn = col; - } - - public static implicit operator ScalarOrVectorOrVarVector(VarVector c) - => new ScalarOrVectorOrVarVector(c); - } - - #region Implementation support - private sealed class Rec : EstimatorReconciler - { - /// - /// For the moment the concat estimator can only do one at a time, so I want to apply these operations - /// one at a time, which means a separate reconciler. Otherwise there may be problems with name overwriting. - /// If that is ever adjusted, then we can make a slightly more efficient reconciler, though this is probably - /// not that important of a consideration from a runtime perspective. 
- /// - public static Rec Inst => new Rec(); - - private Rec() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - // For the moment, the concat estimator can only do one concatenation at a time. - // So we will chain the estimators. - Contracts.AssertNonEmpty(toOutput); - IEstimator est = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var ccol = (IConcatCol)toOutput[i]; - string[] inputs = ccol.Sources.Select(s => inputNames[s]).ToArray(); - var localEst = new ColumnConcatenatingEstimator (env, outputNames[toOutput[i]], inputs); - if (i == 0) - est = localEst; - else - est = est.Append(localEst); - } - return est; - } - } - - private static PipelineColumn[] Join(PipelineColumn col, IContainsColumn[] cols) - { - if (Utils.Size(cols) == 0) - return new[] { col }; - var retVal = new PipelineColumn[cols.Length + 1]; - retVal[0] = col; - for (int i = 0; i < cols.Length; ++i) - retVal[i + 1] = cols[i].WrappedColumn; - return retVal; - } - - private static PipelineColumn[] Join(PipelineColumn col, PipelineColumn[] cols) - { - if (Utils.Size(cols) == 0) - return new[] { col }; - var retVal = new PipelineColumn[cols.Length + 1]; - retVal[0] = col; - Array.Copy(cols, 0, retVal, 1, cols.Length); - return retVal; - } - - private interface IConcatCol - { - PipelineColumn[] Sources { get; } - } - - private sealed class Impl : Vector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public Impl(PipelineColumn[] cols) - : base(Rec.Inst, cols) - { - Sources = cols; - } - } - - private sealed class ImplVar : VarVector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public ImplVar(PipelineColumn[] cols) - : base(Rec.Inst, cols) - { - Sources = cols; - } - } - - private sealed class ImplNorm : NormVector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public ImplNorm(PipelineColumn[] cols) 
- : base(Rec.Inst, cols) - { - Sources = cols; - } - } - #endregion - } } diff --git a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs index b5de375360..2af09c5eef 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs @@ -533,117 +533,4 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(result.Values); } } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class KeyToValueStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - private sealed class OutKeyColumn : Key, IColInput - { - public PipelineColumn Input { get; } - - public OutKeyColumn(Key> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutScalarColumn : Scalar, IColInput - { - public PipelineColumn Input { get; } - - public OutScalarColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new (string input, string output)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (IColInput)toOutput[i]; - cols[i] = 
(inputNames[outCol.Input], outputNames[toOutput[i]]); - } - return new KeyToValueMappingEstimator(env, cols); - } - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static Key ToValue(this Key> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutKeyColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static Scalar ToValue(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalarColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static Vector ToValue(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static VarVector ToValue(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - } } diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeUtils.cs b/src/Microsoft.ML.Data/Transforms/NormalizeUtils.cs index 0f1ea0ef62..50b59f02e3 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeUtils.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeUtils.cs @@ -53,6 +53,7 @@ public interface IColumnAggregator void Finish(); } + [BestFriend] internal interface IColumnFunction : ICanSaveModel { Delegate GetGetter(Row input, int icol); diff --git a/src/Microsoft.ML.Data/Transforms/Normalizer.cs b/src/Microsoft.ML.Data/Transforms/Normalizer.cs index 1ba236e3da..80c1a9435c 100644 --- a/src/Microsoft.ML.Data/Transforms/Normalizer.cs +++ b/src/Microsoft.ML.Data/Transforms/Normalizer.cs @@ -28,6 +28,7 @@ namespace Microsoft.ML.Transforms.Normalizers { public sealed class NormalizingEstimator : IEstimator { + [BestFriend] internal static class Defaults { public const bool FixZero = true; @@ -352,6 +353,7 @@ public 
ColumnFunctionAccessor(ImmutableArray infos) } /// An accessor of the column functions within . + [BestFriend] internal readonly IReadOnlyList ColumnFunctions; public readonly ImmutableArray Columns; diff --git a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs index 74eb46f367..87c0142f07 100644 --- a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs +++ b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs @@ -577,64 +577,4 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(result.Values); } } - - public static partial class ConvertStaticExtensions - { - - private interface IConvertCol - { - PipelineColumn Input { get; } - DataKind Kind { get; } - } - - private sealed class ImplScalar : Scalar, IConvertCol - { - public PipelineColumn Input { get; } - public DataKind Kind { get; } - public ImplScalar(PipelineColumn input, DataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class ImplVector : Vector, IConvertCol - { - public PipelineColumn Input { get; } - public DataKind Kind { get; } - public ImplVector(PipelineColumn input, DataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class ImplVarVector : VarVector, IConvertCol - { - public PipelineColumn Input { get; } - public DataKind Kind { get; } - public ImplVarVector(PipelineColumn input, DataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new TypeConvertingTransformer.ColumnInfo[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (IConvertCol)toOutput[i]; - 
infos[i] = new TypeConvertingTransformer.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], tcol.Kind); - } - return new TypeConvertingEstimator(env, infos); - } - } - } } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index 6381b27654..cc7d0559b7 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -123,104 +123,4 @@ internal ToKeyFitResult(ValueToKeyMappingTransformer.TermMap map) { } } - - public static partial class TermStaticExtensions - { - // I am not certain I see a good way to cover the distinct types beyond complete enumeration. - // Raw generics would allow illegal possible inputs, for example, Scalar. So, this is a partial - // class, and all the public facing extension methods for each possible type are in a T4 generated result. - - private const KeyValueOrder DefSort = (KeyValueOrder)ValueToKeyMappingEstimator.Defaults.Sort; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; - - private readonly struct Config - { - public readonly KeyValueOrder Order; - public readonly int Max; - public readonly Action OnFit; - - public Config(KeyValueOrder order, int max, Action onFit) - { - Order = order; - Max = max; - OnFit = onFit; - } - } - - private static Action Wrap(ToKeyFitResult.OnFit onFit) - { - if (onFit == null) - return null; - // The type T asociated with the delegate will be the actual value type once #863 goes in. - // However, until such time as #863 goes in, it would be too awkward to attempt to extract the metadata. - // For now construct the useless object then pass it into the delegate. 
- return map => onFit(new ToKeyFitResult(map)); - } - - private interface ITermCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Key, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVector : Vector>, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVarVector : VarVector>, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVarVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new ValueToKeyMappingTransformer.ColumnInfo[toOutput.Length]; - Action onFit = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ITermCol)toOutput[i]; - infos[i] = new ValueToKeyMappingTransformer.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], - tcol.Config.Max, (ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); - if (tcol.Config.OnFit != null) - { - int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. 
- onFit += tt => tcol.Config.OnFit(tt.GetTermMap(ii)); - } - } - var est = new ValueToKeyMappingEstimator(env, infos); - if (onFit == null) - return est; - return est.WithOnFitDelegate(onFit); - } - } - } } diff --git a/src/Microsoft.ML.HalLearners.StaticPipe/Microsoft.ML.HalLearners.StaticPipe.csproj b/src/Microsoft.ML.HalLearners.StaticPipe/Microsoft.ML.HalLearners.StaticPipe.csproj new file mode 100644 index 0000000000..239dbee4b3 --- /dev/null +++ b/src/Microsoft.ML.HalLearners.StaticPipe/Microsoft.ML.HalLearners.StaticPipe.csproj @@ -0,0 +1,12 @@ + + + + netcoreapp2.1 + + + + + + + + diff --git a/src/Microsoft.ML.HalLearners/TransformsStatic.cs b/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs similarity index 90% rename from src/Microsoft.ML.HalLearners/TransformsStatic.cs rename to src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs index 87ea5a2c0f..2adaf92e13 100644 --- a/src/Microsoft.ML.HalLearners/TransformsStatic.cs +++ b/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs @@ -4,16 +4,17 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe; using Microsoft.ML.StaticPipe.Runtime; using Microsoft.ML.Transforms.Projections; using System.Collections.Generic; -namespace Microsoft.ML.StaticPipe +namespace Microsoft.ML.HalLearners.StaticPipe { /// /// Extensions for statically typed Whitening estimator. /// - public static class VectorWhiteningExtensions + public static class VectorWhiteningStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -57,7 +58,7 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// + /// /// The column to which the transform will be applied. /// Whitening constant, prevents division by zero when scaling the data by inverse of eigenvalues. /// Maximum number of rows used to train the transform. 
@@ -68,7 +69,7 @@ public static Vector PcaWhitening(this Vector input, int pcaNum = VectorWhiteningTransformer.Defaults.PcaNum) => new OutPipelineColumn(input, WhiteningKind.Pca, eps, maxRows, pcaNum); - /// + /// /// The column to which the transform will be applied. /// Whitening constant, prevents division by zero. /// Maximum number of rows used to train the transform. diff --git a/src/Microsoft.ML.HalLearners/Properties/AssemblyInfo.cs b/src/Microsoft.ML.HalLearners/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..694c53c65f --- /dev/null +++ b/src/Microsoft.ML.HalLearners/Properties/AssemblyInfo.cs @@ -0,0 +1,10 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using Microsoft.ML; + +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.HalLearners.StaticPipe" + PublicKey.Value)] + +[assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.HalLearners/VectorWhitening.cs b/src/Microsoft.ML.HalLearners/VectorWhitening.cs index 5279ae54cd..48a31b00ab 100644 --- a/src/Microsoft.ML.HalLearners/VectorWhitening.cs +++ b/src/Microsoft.ML.HalLearners/VectorWhitening.cs @@ -45,6 +45,7 @@ public enum WhiteningKind /// public sealed class VectorWhiteningTransformer : OneToOneTransformerBase { + [BestFriend] internal static class Defaults { public const WhiteningKind Kind = WhiteningKind.Zca; diff --git a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs similarity index 98% rename from src/Microsoft.ML.LightGBM/LightGbmStatic.cs rename to src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs index 3cdad93257..354abfd075 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs +++ b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs @@ -3,20 +3,19 @@ // See the 
LICENSE file in the project root for more information. using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.Internallearn; -using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.LightGBM; +using Microsoft.ML.StaticPipe; using Microsoft.ML.StaticPipe.Runtime; using Microsoft.ML.Trainers; using System; -namespace Microsoft.ML.StaticPipe +namespace Microsoft.ML.LightGBM.StaticPipe { /// /// Regression trainer estimators. /// - public static class LightGbmTrainers + public static class LightGbmStaticExtensions { /// /// Predict a target using a tree regression model trained with the . diff --git a/src/Microsoft.ML.LightGBM.StaticPipe/Microsoft.ML.LightGBM.StaticPipe.csproj b/src/Microsoft.ML.LightGBM.StaticPipe/Microsoft.ML.LightGBM.StaticPipe.csproj new file mode 100644 index 0000000000..acf7b19db3 --- /dev/null +++ b/src/Microsoft.ML.LightGBM.StaticPipe/Microsoft.ML.LightGBM.StaticPipe.csproj @@ -0,0 +1,12 @@ + + + + netcoreapp2.1 + + + + + + + + diff --git a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs index 54812d1a6b..044afe77ef 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs @@ -82,8 +82,10 @@ private static string GetArgName(string name) return strBuf.ToString(); } + [BestFriend] internal static class Defaults { + [BestFriend] internal const int NumBoostRound = 100; } diff --git a/src/Microsoft.ML.LightGBM/Properties/AssemblyInfo.cs b/src/Microsoft.ML.LightGBM/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..99672010c6 --- /dev/null +++ b/src/Microsoft.ML.LightGBM/Properties/AssemblyInfo.cs @@ -0,0 +1,10 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.CompilerServices; +using Microsoft.ML; + +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGBM.StaticPipe" + PublicKey.Value)] + +[assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.OnnxTransform.StaticPipe/DnnImageFeaturizerStaticExtensions.cs b/src/Microsoft.ML.OnnxTransform.StaticPipe/DnnImageFeaturizerStaticExtensions.cs new file mode 100644 index 0000000000..2762a29084 --- /dev/null +++ b/src/Microsoft.ML.OnnxTransform.StaticPipe/DnnImageFeaturizerStaticExtensions.cs @@ -0,0 +1,67 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.StaticPipe; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.OnnxTransform.StaticPipe +{ + public static class DnnImageFeaturizerStaticExtensions + { + private sealed class OutColumn : Vector + { + public PipelineColumn Input { get; } + + public OutColumn(Vector input, Func> modelFactory) + : base(new Reconciler(modelFactory), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly Func> _modelFactory; + + public Reconciler(Func> modelFactory) + { + _modelFactory = modelFactory; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var outCol = (OutColumn)toOutput[0]; + return new DnnImageFeaturizerEstimator(env, _modelFactory, inputNames[outCol.Input], outputNames[outCol]); + } + } + + /// + /// Creates and applies a DnnImageFeaturizer transform to be used by the 
static API. + /// for more information about how the transformation works. + /// + /// Vector of image pixel weights. + /// An extension method on the that creates a chain of two + /// s (one for preprocessing and one with a pretrained image DNN) with specific models + /// included in a package together with that extension method. + /// For an example, see Microsoft.ML.DnnImageFeaturizer.ResNet18 + /// A vector of float feature weights based on the input image. + public static Vector DnnImageFeaturizer(this Vector input, Func> modelFactory) + { + Contracts.CheckValue(input, nameof(input)); + return new OutColumn(input, modelFactory); + } + } +} diff --git a/src/Microsoft.ML.OnnxTransform.StaticPipe/Microsoft.ML.OnnxTransform.StaticPipe.csproj b/src/Microsoft.ML.OnnxTransform.StaticPipe/Microsoft.ML.OnnxTransform.StaticPipe.csproj new file mode 100644 index 0000000000..ed9adecae2 --- /dev/null +++ b/src/Microsoft.ML.OnnxTransform.StaticPipe/Microsoft.ML.OnnxTransform.StaticPipe.csproj @@ -0,0 +1,12 @@ + + + + netcoreapp2.1 + + + + + + + + diff --git a/src/Microsoft.ML.OnnxTransform.StaticPipe/OnnxStaticExtensions.cs b/src/Microsoft.ML.OnnxTransform.StaticPipe/OnnxStaticExtensions.cs new file mode 100644 index 0000000000..fe1f245f79 --- /dev/null +++ b/src/Microsoft.ML.OnnxTransform.StaticPipe/OnnxStaticExtensions.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms; +using System.Collections.Generic; + +namespace Microsoft.ML.OnnxTransform.StaticPipe +{ + public static class OnnxStaticExtensions + { + + private sealed class OutColumn : Vector + { + public PipelineColumn Input { get; } + + public OutColumn(Vector input, string modelFile) + : base(new Reconciler(modelFile), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly string _modelFile; + + public Reconciler(string modelFile) + { + Contracts.AssertNonEmpty(modelFile); + _modelFile = modelFile; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var outCol = (OutColumn)toOutput[0]; + return new OnnxScoringEstimator(env, _modelFile, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); + } + } + + /// + /// Run a Onnx model on the input column and extract one output column. + /// The inputs and outputs are matched to Onnx graph nodes by name. 
+ /// + public static Vector ApplyOnnxModel(this Vector input, string modelFile) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); + return new OutColumn(input, modelFile); + } + } +} diff --git a/src/Microsoft.ML.OnnxTransform/DnnImageFeaturizerTransform.cs b/src/Microsoft.ML.OnnxTransform/DnnImageFeaturizerTransform.cs index 626cbfab69..a18159d94e 100644 --- a/src/Microsoft.ML.OnnxTransform/DnnImageFeaturizerTransform.cs +++ b/src/Microsoft.ML.OnnxTransform/DnnImageFeaturizerTransform.cs @@ -85,56 +85,4 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return _modelChain.GetOutputSchema(inputSchema); } } - - public static class DnnImageFeaturizerStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, Func> modelFactory) - : base(new Reconciler(modelFactory), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly Func> _modelFactory; - - public Reconciler(Func> modelFactory) - { - _modelFactory = modelFactory; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - return new DnnImageFeaturizerEstimator(env, _modelFactory, inputNames[outCol.Input], outputNames[outCol]); - } - } - - /// - /// Creates and applies a DnnImageFeaturizer transform to be used by the static API. - /// for more information about how the transformation works. - /// - /// Vector of image pixel weights. - /// An extension method on the that creates a chain of two - /// s (one for preprocessing and one with a pretrained image DNN) with specific models - /// included in a package together with that extension method. 
- /// For an example, see Microsoft.ML.DnnImageFeaturizer.ResNet18 - /// A vector of float feature weights based on the input image. - public static Vector DnnImageFeaturizer(this Vector input, Func> modelFactory) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, modelFactory); - } - } } diff --git a/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs b/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs index c52b2a6c8e..c496c650db 100644 --- a/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs +++ b/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs @@ -502,54 +502,5 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(resultDic.Values); } } - - public static class OnnxStaticExtensions - { - - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, string modelFile) - : base(new Reconciler(modelFile), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly string _modelFile; - - public Reconciler(string modelFile) - { - Contracts.AssertNonEmpty(modelFile); - _modelFile = modelFile; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - return new OnnxScoringEstimator(env, _modelFile, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); - } - } - - /// - /// Run a Onnx model on the input column and extract one output column. - /// The inputs and outputs are matched to Onnx graph nodes by name. 
- /// - public static Vector ApplyOnnxModel(this Vector input, string modelFile) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); - return new OutColumn(input, modelFile); - } - } } diff --git a/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs new file mode 100644 index 0000000000..07e45be1e7 --- /dev/null +++ b/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs @@ -0,0 +1,173 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms.Categorical; +using System.Collections.Generic; + +namespace Microsoft.ML.StaticPipe +{ + public static class CategoricalHashStaticExtensions + { + public enum OneHotHashVectorOutputKind : byte + { + /// + /// Output is a bag (multi-set) vector + /// + Bag = 1, + + /// + /// Output is an indicator vector + /// + Ind = 2, + + /// + /// Output is binary encoded + /// + Bin = 4, + } + + public enum OneHotHashScalarOutputKind : byte + { + /// + /// Output is an indicator vector + /// + Ind = 2, + + /// + /// Output is binary encoded + /// + Bin = 4, + } + + private const OneHotHashVectorOutputKind DefOut = (OneHotHashVectorOutputKind)OneHotHashEncodingEstimator.Defaults.OutputKind; + private const int DefHashBits = OneHotHashEncodingEstimator.Defaults.HashBits; + private const uint DefSeed = OneHotHashEncodingEstimator.Defaults.Seed; + private const bool DefOrdered = OneHotHashEncodingEstimator.Defaults.Ordered; + private const int DefInvertHash = OneHotHashEncodingEstimator.Defaults.InvertHash; + + private readonly struct Config + { + public readonly int HashBits; + public readonly uint Seed; + public readonly 
bool Ordered; + public readonly int InvertHash; + public readonly OneHotHashVectorOutputKind OutputKind; + + public Config(OneHotHashVectorOutputKind outputKind, int hashBits, uint seed, bool ordered, int invertHash) + { + OutputKind = outputKind; + HashBits = hashBits; + Seed = seed; + Ordered = ordered; + InvertHash = invertHash; + } + } + + private interface ICategoricalCol + { + PipelineColumn Input { get; } + Config Config { get; } + } + + private sealed class ImplScalar : Vector, ICategoricalCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class ImplVector : Vector, ICategoricalCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class Rec : EstimatorReconciler + { + public static readonly Rec Inst = new Rec(); + + public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) + { + var infos = new OneHotHashEncodingEstimator.ColumnInfo[toOutput.Length]; + for (int i = 0; i < toOutput.Length; ++i) + { + var tcol = (ICategoricalCol)toOutput[i]; + infos[i] = new OneHotHashEncodingEstimator.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], (OneHotEncodingTransformer.OutputKind)tcol.Config.OutputKind, + tcol.Config.HashBits, tcol.Config.Seed, tcol.Config.Ordered, tcol.Config.InvertHash); + } + return new OneHotHashEncodingEstimator(env, infos); + } + } + + /// + /// Converts the categorical value into an indicator array by hashing categories into certain value and using that value as the index in the array. + /// + /// Incoming data. 
+ /// Specify the output type of indicator array: array or binary encoded data. + /// Amount of bits to use for hashing. + /// Seed value used for hashing. + /// Whether the position of each term should be included in the hash. + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values. -1 retains all input values mapping to each hash. + public static Vector OneHotHashEncoding(this Scalar input, OneHotHashScalarOutputKind outputKind = (OneHotHashScalarOutputKind)DefOut, + int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) + { + Contracts.CheckValue(input, nameof(input)); + return new ImplScalar(input, new Config((OneHotHashVectorOutputKind)outputKind, hashBits, seed, ordered, invertHash)); + } + + /// + /// Converts the categorical values into an indicator array by hashing each category to a value and using that value as the index in the array. + /// + /// Incoming data. + /// Specify the output type of indicator array: array or binary encoded data. + /// Amount of bits to use for hashing. + /// Seed value used for hashing. + /// Whether the position of each term should be included in the hash. + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values.
-1 retains all input values mapping to each hash. + public static Vector OneHotHashEncoding(this Vector input, OneHotHashVectorOutputKind outputKind = DefOut, + int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) + { + Contracts.CheckValue(input, nameof(input)); + return new ImplVector(input, new Config(outputKind, hashBits, seed, ordered, invertHash)); + } + + /// + /// Converts the categorical values into an indicator array by hashing each category to a value and using that value as the index in the array. + /// + /// Incoming data. + /// Specify the output type of indicator array: array or binary encoded data. + /// Amount of bits to use for hashing. + /// Seed value used for hashing. + /// Whether the position of each term should be included in the hash. + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values. -1 retains all input values mapping to each hash.
+ public static Vector OneHotHashEncoding(this VarVector input, OneHotHashVectorOutputKind outputKind = DefOut, + int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) + { + Contracts.CheckValue(input, nameof(input)); + return new ImplVector(input, new Config(outputKind, hashBits, seed, ordered, invertHash)); + } + } +} diff --git a/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs new file mode 100644 index 0000000000..5e7327c9ca --- /dev/null +++ b/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs @@ -0,0 +1,163 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms.Categorical; +using Microsoft.ML.Transforms.Conversions; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.StaticPipe +{ + public static class CategoricalStaticExtensions + { + public enum OneHotVectorOutputKind : byte + { + /// + /// Output is a bag (multi-set) vector + /// + Bag = 1, + + /// + /// Output is an indicator vector + /// + Ind = 2, + + /// + /// Output is binary encoded + /// + Bin = 4, + } + + public enum OneHotScalarOutputKind : byte + { + /// + /// Output is an indicator vector + /// + Ind = 2, + + /// + /// Output is binary encoded + /// + Bin = 4, + } + + private const KeyValueOrder DefSort = (KeyValueOrder)ValueToKeyMappingEstimator.Defaults.Sort; + private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; + private const OneHotVectorOutputKind DefOut = (OneHotVectorOutputKind)OneHotEncodingEstimator.Defaults.OutKind; + + private readonly struct Config + { + public readonly KeyValueOrder Order; + public readonly int Max; + 
public readonly OneHotVectorOutputKind OutputKind; + public readonly Action OnFit; + + public Config(OneHotVectorOutputKind outputKind, KeyValueOrder order, int max, Action onFit) + { + OutputKind = outputKind; + Order = order; + Max = max; + OnFit = onFit; + } + } + + private static Action Wrap(ToKeyFitResult.OnFit onFit) + { + if (onFit == null) + return null; + // The type T associated with the delegate will be the actual value type once #863 goes in. + // However, until such time as #863 goes in, it would be too awkward to attempt to extract the metadata. + // For now construct the useless object then pass it into the delegate. + return map => onFit(new ToKeyFitResult(map)); + } + + private interface ICategoricalCol + { + PipelineColumn Input { get; } + Config Config { get; } + } + + private sealed class ImplScalar : Vector, ICategoricalCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class ImplVector : Vector, ICategoricalCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class Rec : EstimatorReconciler + { + public static readonly Rec Inst = new Rec(); + + public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) + { + var infos = new OneHotEncodingEstimator.ColumnInfo[toOutput.Length]; + Action onFit = null; + for (int i = 0; i < toOutput.Length; ++i) + { + var tcol = (ICategoricalCol)toOutput[i]; + infos[i] = new OneHotEncodingEstimator.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], (OneHotEncodingTransformer.OutputKind)tcol.Config.OutputKind, + tcol.Config.Max,
(ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); + if (tcol.Config.OnFit != null) + { + int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. + onFit += tt => tcol.Config.OnFit(tt.GetTermMap(ii)); + } + } + var est = new OneHotEncodingEstimator(env, infos); + if (onFit != null) + est.WrapTermWithDelegate(onFit); + return est; + } + } + + /// + /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. + /// + /// Incoming data. + /// Specify the output type of indicator array: array or binary encoded data. + /// How the Id for each value would be assigned: by occurrence or by value. + /// Maximum number of ids to keep during data scanning. + /// Called upon fitting with the learnt enumeration on the dataset. + public static Vector OneHotEncoding(this Scalar input, OneHotScalarOutputKind outputKind = (OneHotScalarOutputKind)DefOut, KeyValueOrder order = DefSort, + int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) + { + Contracts.CheckValue(input, nameof(input)); + return new ImplScalar(input, new Config((OneHotVectorOutputKind)outputKind, order, maxItems, Wrap(onFit))); + } + + /// + /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. + /// + /// Incoming data. + /// Specify the output type of indicator array: Multiarray, array or binary encoded data. + /// How the Id for each value would be assigned: by occurrence or by value. + /// Maximum number of ids to keep during data scanning. + /// Called upon fitting with the learnt enumeration on the dataset.
+ public static Vector OneHotEncoding(this Vector input, OneHotVectorOutputKind outputKind = DefOut, KeyValueOrder order = DefSort, int maxItems = DefMax, + ToKeyFitResult>.OnFit onFit = null) + { + Contracts.CheckValue(input, nameof(input)); + return new ImplVector(input, new Config(outputKind, order, maxItems, Wrap(onFit))); + } + } +} diff --git a/src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs similarity index 99% rename from src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.cs rename to src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs index cb91106bf6..1389e60e55 100644 --- a/src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs @@ -6,7 +6,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.StaticPipe; -namespace Microsoft.ML.Transforms.Conversions +namespace Microsoft.ML.StaticPipe { public static partial class ConvertStaticExtensions { diff --git a/src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.tt b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt similarity index 97% rename from src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.tt rename to src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt index 78169dfb6d..993039eb78 100644 --- a/src/Microsoft.ML.Data/Transforms/ConvertStaticExtensions.tt +++ b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt @@ -12,7 +12,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.StaticPipe; -namespace Microsoft.ML.Transforms.Conversions +namespace Microsoft.ML.StaticPipe { public static partial class ConvertStaticExtensions { diff --git a/src/Microsoft.ML.Transforms/Text/LdaStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs similarity index 98% rename from src/Microsoft.ML.Transforms/Text/LdaStaticExtensions.cs rename to src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs index 
05acdca178..6e40636a73 100644 --- a/src/Microsoft.ML.Transforms/Text/LdaStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs @@ -4,7 +4,6 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Data; using Microsoft.ML.StaticPipe.Runtime; using Microsoft.ML.Transforms.Text; using System; @@ -137,7 +136,7 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// + /// /// A vector of floats representing the document. /// The number of topics. /// Dirichlet prior on document-topic vectors. diff --git a/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs new file mode 100644 index 0000000000..44f733a571 --- /dev/null +++ b/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs @@ -0,0 +1,64 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms.Projections; +using System.Collections.Generic; + +namespace Microsoft.ML.StaticPipe +{ + /// + /// Extensions for statically typed . 
+ /// + public static class LpNormalizerStaticExtensions + { + private sealed class OutPipelineColumn : Vector + { + public readonly Vector Input; + + public OutPipelineColumn(Vector input, LpNormalizingEstimatorBase.NormalizerKind normKind, bool subMean) + : base(new Reconciler(normKind, subMean), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly LpNormalizingEstimatorBase.NormalizerKind _normKind; + private readonly bool _subMean; + + public Reconciler(LpNormalizingEstimatorBase.NormalizerKind normKind, bool subMean) + { + _normKind = normKind; + _subMean = subMean; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var pairs = new List<(string input, string output)>(); + foreach (var outCol in toOutput) + pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); + + return new LpNormalizingEstimator(env, pairs.ToArray(), _normKind, _subMean); + } + } + + /// + /// The column to apply to. + /// Type of norm to use to normalize each sample. + /// Subtract mean from each value before normalizing. 
+ public static Vector LpNormalize(this Vector input, + LpNormalizingEstimatorBase.NormalizerKind normKind = LpNormalizingEstimatorBase.Defaults.NormKind, + bool subMean = LpNormalizingEstimatorBase.Defaults.LpSubstractMean) => new OutPipelineColumn(input, normKind, subMean); + } +} diff --git a/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj b/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj new file mode 100644 index 0000000000..3273c610a4 --- /dev/null +++ b/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj @@ -0,0 +1,43 @@ + + + + netcoreapp2.1 + + + + + + + + + + + ConvertStaticExtensions.tt + True + True + + + TermStaticExtensions.tt + TermStaticExtensions.cs + True + TextTemplatingFileGenerator + True + + + + + + ConvertStaticExtensions.cs + TextTemplatingFileGenerator + + + TermStaticExtensions.cs + TextTemplatingFileGenerator + + + + + + + + diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs similarity index 99% rename from src/Microsoft.ML.Data/Transforms/NormalizerStaticExtensions.cs rename to src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs index 733158b94e..1493ba78f3 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. 
using Microsoft.ML.Core.Data; -using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.Utilities; diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs b/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs similarity index 99% rename from src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs rename to src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs index 2549206ec8..0ed811ca96 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs +++ b/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.Calibration; using Microsoft.ML.Runtime.Learners; using Microsoft.ML.StaticPipe.Runtime; @@ -15,7 +14,7 @@ namespace Microsoft.ML.StaticPipe /// /// Extension methods and utilities for instantiating SDCA trainer estimators inside statically typed pipelines. /// - public static class SdcaExtensions + public static class SdcaStaticExtensions { /// /// Predict a target using a linear regression model trained with the SDCA trainer. 
diff --git a/src/Microsoft.ML.Data/Transforms/TermStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs similarity index 99% rename from src/Microsoft.ML.Data/Transforms/TermStaticExtensions.cs rename to src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs index e148698a91..4016234300 100644 --- a/src/Microsoft.ML.Data/Transforms/TermStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs @@ -5,8 +5,9 @@ using System; using Microsoft.ML.StaticPipe; using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms.Conversions; -namespace Microsoft.ML.Transforms.Conversions +namespace Microsoft.ML.StaticPipe { public static partial class TermStaticExtensions { diff --git a/src/Microsoft.ML.Data/Transforms/TermStaticExtensions.tt b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt similarity index 97% rename from src/Microsoft.ML.Data/Transforms/TermStaticExtensions.tt rename to src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt index d219abbef0..6aaf5aa48f 100644 --- a/src/Microsoft.ML.Data/Transforms/TermStaticExtensions.tt +++ b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt @@ -11,8 +11,9 @@ using System; using Microsoft.ML.StaticPipe; using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms.Conversions; -namespace Microsoft.ML.Transforms.Conversions +namespace Microsoft.ML.StaticPipe { public static partial class TermStaticExtensions { diff --git a/src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs similarity index 97% rename from src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs rename to src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 4091243062..9a176bbfca 100644 --- a/src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -14,7 +14,7 @@ namespace Microsoft.ML.StaticPipe /// /// Extensions for statically typed word tokenizer. 
/// - public static class WordTokenizerExtensions + public static class WordTokenizerStaticExtensions { private sealed class OutPipelineColumn : VarVector { @@ -63,7 +63,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// Extensions for statically typed character tokenizer. /// - public static class CharacterTokenizerExtensions + public static class CharacterTokenizerStaticExtensions { private sealed class OutPipelineColumn : VarVector> { @@ -117,7 +117,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// Extensions for statically typed stop word remover. /// - public static class StopwordRemoverExtensions + public static class StopwordRemoverStaticExtensions { private sealed class OutPipelineColumn : VarVector { @@ -172,7 +172,7 @@ public static VarVector RemoveStopwords(this VarVector input, /// /// Extensions for statically typed text normalizer. /// - public static class TextNormalizerExtensions + public static class TextNormalizerStaticExtensions { private sealed class OutPipelineColumn : Scalar { @@ -243,7 +243,7 @@ public static Scalar NormalizeText(this Scalar input, /// /// Extensions for statically typed bag of word converter. /// - public static class WordBagEstimatorExtensions + public static class WordBagEstimatorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -326,7 +326,7 @@ public static Vector ToBagofWords(this Scalar input, /// /// Extensions for statically typed bag of wordhash converter. /// - public static class WordHashBagEstimatorExtensions + public static class WordHashBagEstimatorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -422,7 +422,7 @@ public static Vector ToBagofHashedWords(this Scalar input, /// /// Extensions for statically typed ngram estimator. 
/// - public static class NgramEstimatorExtensions + public static class NgramEstimatorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -487,8 +487,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given tokenized text. /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. /// - /// /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Ngram length. @@ -508,7 +508,7 @@ public static Vector ToNgrams(this VarVector> inp /// /// Extensions for statically typed ngram hash estimator. /// - public static class NgramHashEstimatorExtensions + public static class NgramHashEstimatorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -573,8 +573,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text. /// It does so by hashing each ngram and using the hash value as the index in the bag. /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Number of bits to hash into. Must be between 1 and 30, inclusive. 
diff --git a/src/Microsoft.ML.Transforms/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs similarity index 66% rename from src/Microsoft.ML.Transforms/TransformsStatic.cs rename to src/Microsoft.ML.StaticPipe/TransformsStatic.cs index cebf9a6fbb..fed8bdbfdb 100644 --- a/src/Microsoft.ML.Transforms/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -4,73 +4,24 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.StaticPipe.Runtime; using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions; using Microsoft.ML.Transforms.FeatureSelection; using Microsoft.ML.Transforms.Projections; +using Microsoft.ML.Transforms.Text; using System; using System.Collections.Generic; +using System.Linq; namespace Microsoft.ML.StaticPipe { - /// - /// Extensions for statically typed . - /// - public static class LpNormalizerExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, LpNormalizingEstimatorBase.NormalizerKind normKind, bool subMean) - : base(new Reconciler(normKind, subMean), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly LpNormalizingEstimatorBase.NormalizerKind _normKind; - private readonly bool _subMean; - - public Reconciler(LpNormalizingEstimatorBase.NormalizerKind normKind, bool subMean) - { - _normKind = normKind; - _subMean = subMean; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string input, string output)>(); - foreach (var outCol in toOutput) - 
pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); - - return new LpNormalizingEstimator(env, pairs.ToArray(), _normKind, _subMean); - } - } - - /// - /// The column to apply to. - /// Type of norm to use to normalize each sample. - /// Subtract mean from each value before normalizing. - public static Vector LpNormalize(this Vector input, - LpNormalizingEstimatorBase.NormalizerKind normKind = LpNormalizingEstimatorBase.Defaults.NormKind, - bool subMean = LpNormalizingEstimatorBase.Defaults.LpSubstractMean) => new OutPipelineColumn(input, normKind, subMean); - } - /// /// Extensions for statically typed . /// - public static class GlobalContrastNormalizerExtensions + public static class GlobalContrastNormalizerStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -112,7 +63,7 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// + /// /// The column to apply to. /// Subtract mean from each value before normalizing. /// Normalize by standard deviation rather than L2 norm. @@ -124,9 +75,9 @@ public static Vector GlobalContrastNormalize(this Vector input, } /// - /// Extensions for statically typed . + /// Extensions for statically typed . /// - public static class MutualInformationFeatureSelectorExtensions + public static class MutualInformationFeatureSelectorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -175,7 +126,7 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// + /// /// Name of the input column. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. 
@@ -193,7 +144,7 @@ public static Vector SelectFeaturesBasedOnMutualInformation( int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - /// + /// /// Name of the input column. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. @@ -211,7 +162,7 @@ public static Vector SelectFeaturesBasedOnMutualInformation( int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - /// + /// /// Name of the input column. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. @@ -229,7 +180,7 @@ public static Vector SelectFeaturesBasedOnMutualInformation( int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - /// + /// /// Name of the input column. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. @@ -247,7 +198,7 @@ public static Vector SelectFeaturesBasedOnMutualInformation( int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - /// + /// /// Name of the input column. /// Name of the column to use for labels. 
/// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. @@ -265,7 +216,7 @@ public static Vector SelectFeaturesBasedOnMutualInformation( int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - /// + /// /// Name of the input column. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. @@ -285,9 +236,9 @@ public static Vector SelectFeaturesBasedOnMutualInformation( } /// - /// Extensions for statically typed . + /// Extensions for statically typed . /// - public static class CountFeatureSelectorExtensions + public static class CountFeatureSelectorStaticExtensions { private sealed class OutPipelineColumn : Vector { @@ -325,7 +276,7 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// + /// /// Name of the input column. /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. /// @@ -338,7 +289,7 @@ public override IEstimator Reconcile(IHostEnvironment env, public static Vector SelectFeaturesBasedOnCount(this Vector input, long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); - /// + /// /// Name of the input column. /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. /// @@ -351,7 +302,7 @@ public static Vector SelectFeaturesBasedOnCount(this Vector input, public static Vector SelectFeaturesBasedOnCount(this Vector input, long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); - /// + /// /// Name of the input column. 
/// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. /// @@ -365,159 +316,10 @@ public static Vector SelectFeaturesBasedOnCount(this Vector inpu long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); } - public static class CategoricalStaticExtensions - { - public enum OneHotVectorOutputKind : byte - { - /// - /// Output is a bag (multi-set) vector - /// - Bag = 1, - - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - public enum OneHotScalarOutputKind : byte - { - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - private const KeyValueOrder DefSort = (KeyValueOrder)ValueToKeyMappingEstimator.Defaults.Sort; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; - private const OneHotVectorOutputKind DefOut = (OneHotVectorOutputKind)OneHotEncodingEstimator.Defaults.OutKind; - - private readonly struct Config - { - public readonly KeyValueOrder Order; - public readonly int Max; - public readonly OneHotVectorOutputKind OutputKind; - public readonly Action OnFit; - - public Config(OneHotVectorOutputKind outputKind, KeyValueOrder order, int max, Action onFit) - { - OutputKind = outputKind; - Order = order; - Max = max; - OnFit = onFit; - } - } - - private static Action Wrap(ToKeyFitResult.OnFit onFit) - { - if (onFit == null) - return null; - // The type T asociated with the delegate will be the actual value type once #863 goes in. - // However, until such time as #863 goes in, it would be too awkward to attempt to extract the metadata. - // For now construct the useless object then pass it into the delegate. 
- return map => onFit(new ToKeyFitResult(map)); - } - - private interface ICategoricalCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVector : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new OneHotEncodingEstimator.ColumnInfo[toOutput.Length]; - Action onFit = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ICategoricalCol)toOutput[i]; - infos[i] = new OneHotEncodingEstimator.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], (OneHotEncodingTransformer.OutputKind)tcol.Config.OutputKind, - tcol.Config.Max, (ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); - if (tcol.Config.OnFit != null) - { - int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. - onFit += tt => tcol.Config.OnFit(tt.GetTermMap(ii)); - } - } - var est = new OneHotEncodingEstimator(env, infos); - if (onFit != null) - est.WrapTermWithDelegate(onFit); - return est; - } - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. - /// - /// Incoming data. 
- /// Specify the output type of indicator array: array or binary encoded data. - /// How the Id for each value would be assigined: by occurrence or by value. - /// Maximum number of ids to keep during data scanning. - /// Called upon fitting with the learnt enumeration on the dataset. - public static Vector OneHotEncoding(this Scalar input, OneHotScalarOutputKind outputKind = (OneHotScalarOutputKind)DefOut, KeyValueOrder order = DefSort, - int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplScalar(input, new Config((OneHotVectorOutputKind)outputKind, order, maxItems, Wrap(onFit))); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. - /// - /// Incoming data. - /// Specify the output type of indicator array: Multiarray, array or binary encoded data. - /// How the Id for each value would be assigined: by occurrence or by value. - /// Maximum number of ids to keep during data scanning. - /// Called upon fitting with the learnt enumeration on the dataset. - public static Vector OneHotEncoding(this Vector input, OneHotVectorOutputKind outputKind = DefOut, KeyValueOrder order = DefSort, int maxItems = DefMax, - ToKeyFitResult>.OnFit onFit = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, order, maxItems, Wrap(onFit))); - } - } - /// /// Extension methods for the static-pipeline over objects. /// - public static class KeyToBinaryVectorExtensions + public static class KeyToBinaryVectorStaticExtensions { private interface IColInput { @@ -676,7 +478,7 @@ public static VarVector ToBinaryVector(this VarVector> in /// /// Extension methods for the static-pipeline over objects. 
/// <summary>
/// Implementation support for the statically typed type-conversion columns. This part of the
/// partial class holds only the private column wrappers and their shared reconciler.
/// </summary>
public static partial class ConvertStaticExtensions
{
    /// <summary>
    /// What the reconciler needs from every conversion output column: the column it
    /// was derived from and the <see cref="DataKind"/> it should be converted to.
    /// </summary>
    private interface IConvertCol
    {
        PipelineColumn Input { get; }
        DataKind Kind { get; }
    }

    /// <summary>Scalar output column of a conversion.</summary>
    private sealed class ImplScalar<T> : Scalar<float>, IConvertCol
    {
        public PipelineColumn Input { get; }
        public DataKind Kind { get; }

        public ImplScalar(PipelineColumn input, DataKind kind)
            : base(Rec.Inst, input)
            => (Input, Kind) = (input, kind);
    }

    /// <summary>Fixed-size vector output column of a conversion.</summary>
    private sealed class ImplVector<T> : Vector<float>, IConvertCol
    {
        public PipelineColumn Input { get; }
        public DataKind Kind { get; }

        public ImplVector(PipelineColumn input, DataKind kind)
            : base(Rec.Inst, input)
            => (Input, Kind) = (input, kind);
    }

    /// <summary>Variable-size vector output column of a conversion.</summary>
    private sealed class ImplVarVector<T> : VarVector<float>, IConvertCol
    {
        public PipelineColumn Input { get; }
        public DataKind Kind { get; }

        public ImplVarVector(PipelineColumn input, DataKind kind)
            : base(Rec.Inst, input)
            => (Input, Kind) = (input, kind);
    }

    /// <summary>
    /// Shared reconciler: turns every requested conversion column into one
    /// <see cref="TypeConvertingTransformer.ColumnInfo"/> and wraps them all in a single
    /// <see cref="TypeConvertingEstimator"/>.
    /// </summary>
    private sealed class Rec : EstimatorReconciler
    {
        public static readonly Rec Inst = new Rec();

        public override IEstimator<ITransformer> Reconcile(
            IHostEnvironment env,
            PipelineColumn[] toOutput,
            IReadOnlyDictionary<PipelineColumn, string> inputNames,
            IReadOnlyDictionary<PipelineColumn, string> outputNames,
            IReadOnlyCollection<string> usedNames)
        {
            int count = toOutput.Length;
            var columnInfos = new TypeConvertingTransformer.ColumnInfo[count];
            for (int slot = 0; slot < count; ++slot)
            {
                PipelineColumn outCol = toOutput[slot];
                var convertCol = (IConvertCol)outCol;
                columnInfos[slot] = new TypeConvertingTransformer.ColumnInfo(
                    inputNames[convertCol.Input], outputNames[outCol], convertCol.Kind);
            }

            return new TypeConvertingEstimator(env, columnInfos);
        }
    }
}
+ return map => onFit(new ToKeyFitResult(map)); + } + + private interface ITermCol + { + PipelineColumn Input { get; } + Config Config { get; } + } + + private sealed class ImplScalar : Key, ITermCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class ImplVector : Vector>, ITermCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class ImplVarVector : VarVector>, ITermCol + { + public PipelineColumn Input { get; } + public Config Config { get; } + public ImplVarVector(PipelineColumn input, Config config) : base(Rec.Inst, input) + { + Input = input; + Config = config; + } + } + + private sealed class Rec : EstimatorReconciler + { + public static readonly Rec Inst = new Rec(); + + public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) + { + var infos = new ValueToKeyMappingTransformer.ColumnInfo[toOutput.Length]; + Action onFit = null; + for (int i = 0; i < toOutput.Length; ++i) + { + var tcol = (ITermCol)toOutput[i]; + infos[i] = new ValueToKeyMappingTransformer.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], + tcol.Config.Max, (ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); + if (tcol.Config.OnFit != null) + { + int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. 
/// <summary>
/// Extension methods for the static-pipeline over <see cref="PipelineColumn"/> objects.
/// They convert key-typed columns into columns containing the corresponding values.
/// </summary>
public static class KeyToValueStaticExtensions
{
    /// <summary>
    /// Lets the reconciler find the source column of any output column produced here.
    /// </summary>
    private interface IColInput
    {
        PipelineColumn Input { get; }
    }

    /// <summary>Output column for a key whose values are themselves keys.</summary>
    private sealed class OutKeyColumn<TOuterKey, TInnerKey> : Key<TInnerKey>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutKeyColumn(Key<TOuterKey, Key<TInnerKey>> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>Output column for a scalar key input.</summary>
    private sealed class OutScalarColumn<TKey, TValue> : Scalar<TValue>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutScalarColumn(Key<TKey, TValue> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>Output column for a fixed-size vector of keys.</summary>
    private sealed class OutVectorColumn<TKey, TValue> : Vector<TValue>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutVectorColumn(Vector<Key<TKey, TValue>> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>Output column for a variable-size vector of keys.</summary>
    private sealed class OutVarVectorColumn<TKey, TValue> : VarVector<TValue>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutVarVectorColumn(VarVector<Key<TKey, TValue>> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>
    /// Collects all key-to-value output columns into (input, output) name pairs and creates
    /// a single <see cref="KeyToValueMappingEstimator"/> for them.
    /// </summary>
    private sealed class Reconciler : EstimatorReconciler
    {
        // Singleton shared by every output column; readonly so it cannot be swapped out
        // after the columns that reference it have been created.
        public static readonly Reconciler Inst = new Reconciler();

        private Reconciler() { }

        public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
            PipelineColumn[] toOutput,
            IReadOnlyDictionary<PipelineColumn, string> inputNames,
            IReadOnlyDictionary<PipelineColumn, string> outputNames,
            IReadOnlyCollection<string> usedNames)
        {
            var cols = new (string input, string output)[toOutput.Length];
            for (int i = 0; i < toOutput.Length; ++i)
            {
                var outCol = (IColInput)toOutput[i];
                cols[i] = (inputNames[outCol.Input], outputNames[toOutput[i]]);
            }
            return new KeyToValueMappingEstimator(env, cols);
        }
    }

    /// <summary>
    /// Convert a key column to a column containing the corresponding value.
    /// </summary>
    /// <param name="input">The key column whose values are themselves keys.</param>
    /// <returns>A key column holding the value each input key maps to.</returns>
    public static Key<TInnerKey> ToValue<TOuterKey, TInnerKey>(this Key<TOuterKey, Key<TInnerKey>> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutKeyColumn<TOuterKey, TInnerKey>(input);
    }

    /// <summary>
    /// Convert a key column to a column containing the corresponding value.
    /// </summary>
    /// <param name="input">The scalar key column.</param>
    /// <returns>A scalar column holding the value each input key maps to.</returns>
    public static Scalar<TValue> ToValue<TKey, TValue>(this Key<TKey, TValue> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutScalarColumn<TKey, TValue>(input);
    }

    /// <summary>
    /// Convert a key column to a column containing the corresponding value.
    /// </summary>
    /// <param name="input">The fixed-size vector column of keys.</param>
    /// <returns>A fixed-size vector column holding the value each input key maps to.</returns>
    public static Vector<TValue> ToValue<TKey, TValue>(this Vector<Key<TKey, TValue>> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVectorColumn<TKey, TValue>(input);
    }

    /// <summary>
    /// Convert a key column to a column containing the corresponding value.
    /// </summary>
    /// <param name="input">The variable-size vector column of keys.</param>
    /// <returns>A variable-size vector column holding the value each input key maps to.</returns>
    public static VarVector<TValue> ToValue<TKey, TValue>(this VarVector<Key<TKey, TValue>> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVarVectorColumn<TKey, TValue>(input);
    }
}
+ public static Vector ConcatWith(this Scalar me, params ScalarOrVector[] others) + => new Impl(Join(me, others)); + + /// + /// Given a set of columns, concatenate them together into a vector valued column of the same type. + /// + /// The value type. + /// The first input column. + /// Subsequent input columns. + /// The result of concatenating all input columns together. + public static Vector ConcatWith(this Vector me, params ScalarOrVector[] others) + => new Impl(Join(me, others)); + + /// + /// Given a set of columns including at least one variable sized vector column, concatenate them + /// together into a vector valued column of the same type. + /// + /// The value type. + /// The first input column. + /// Subsequent input columns. + /// The result of concatenating all input columns together. + public static VarVector ConcatWith(this Scalar me, params ScalarOrVectorOrVarVector[] others) + => new ImplVar(Join(me, others)); + + /// + /// Given a set of columns including at least one variable sized vector column, concatenate them + /// together into a vector valued column of the same type. + /// + /// The value type. + /// The first input column. + /// Subsequent input columns. + /// The result of concatenating all input columns together. + public static VarVector ConcatWith(this Vector me, params ScalarOrVectorOrVarVector[] others) + => new ImplVar(Join(me, others)); + + /// + /// Given a set of columns including at least one variable sized vector column, concatenate them + /// together into a vector valued column of the same type. + /// + /// The value type. + /// The first input column. + /// Subsequent input columns. + /// The result of concatenating all input columns together. 
+ public static VarVector ConcatWith(this VarVector me, params ScalarOrVectorOrVarVector[] others) + => new ImplVar(Join(me, others)); + + private interface IContainsColumn + { + PipelineColumn WrappedColumn { get; } + } + + /// + /// A wrapping object for the implicit conversions in + /// and other related methods. + /// + /// The value type. + public sealed class ScalarOrVector : ScalarOrVectorOrVarVector + { + private ScalarOrVector(PipelineColumn col) : base(col) { } + public static implicit operator ScalarOrVector(Scalar c) => new ScalarOrVector(c); + public static implicit operator ScalarOrVector(Vector c) => new ScalarOrVector(c); + public static implicit operator ScalarOrVector(NormVector c) => new ScalarOrVector(c); + } + + /// + /// A wrapping object for the implicit conversions in + /// and other related methods. + /// + /// The value type. + public class ScalarOrVectorOrVarVector : IContainsColumn + { + public PipelineColumn WrappedColumn { get; } + + private protected ScalarOrVectorOrVarVector(PipelineColumn col) + { + Contracts.CheckValue(col, nameof(col)); + WrappedColumn = col; + } + + public static implicit operator ScalarOrVectorOrVarVector(VarVector c) + => new ScalarOrVectorOrVarVector(c); + } + + #region Implementation support + private sealed class Rec : EstimatorReconciler + { + /// + /// For the moment the concat estimator can only do one at a time, so I want to apply these operations + /// one at a time, which means a separate reconciler. Otherwise there may be problems with name overwriting. + /// If that is ever adjusted, then we can make a slightly more efficient reconciler, though this is probably + /// not that important of a consideration from a runtime perspective. 
+ /// + public static Rec Inst => new Rec(); + + private Rec() { } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + // For the moment, the concat estimator can only do one concatenation at a time. + // So we will chain the estimators. + Contracts.AssertNonEmpty(toOutput); + IEstimator est = null; + for (int i = 0; i < toOutput.Length; ++i) + { + var ccol = (IConcatCol)toOutput[i]; + string[] inputs = ccol.Sources.Select(s => inputNames[s]).ToArray(); + var localEst = new ColumnConcatenatingEstimator(env, outputNames[toOutput[i]], inputs); + if (i == 0) + est = localEst; + else + est = est.Append(localEst); + } + return est; + } + } + + private static PipelineColumn[] Join(PipelineColumn col, IContainsColumn[] cols) + { + if (Utils.Size(cols) == 0) + return new[] { col }; + var retVal = new PipelineColumn[cols.Length + 1]; + retVal[0] = col; + for (int i = 0; i < cols.Length; ++i) + retVal[i + 1] = cols[i].WrappedColumn; + return retVal; + } + + private static PipelineColumn[] Join(PipelineColumn col, PipelineColumn[] cols) + { + if (Utils.Size(cols) == 0) + return new[] { col }; + var retVal = new PipelineColumn[cols.Length + 1]; + retVal[0] = col; + Array.Copy(cols, 0, retVal, 1, cols.Length); + return retVal; + } + + private interface IConcatCol + { + PipelineColumn[] Sources { get; } + } + + private sealed class Impl : Vector, IConcatCol + { + public PipelineColumn[] Sources { get; } + public Impl(PipelineColumn[] cols) + : base(Rec.Inst, cols) + { + Sources = cols; + } + } + + private sealed class ImplVar : VarVector, IConcatCol + { + public PipelineColumn[] Sources { get; } + public ImplVar(PipelineColumn[] cols) + : base(Rec.Inst, cols) + { + Sources = cols; + } + } + + private sealed class ImplNorm : NormVector, IConcatCol + { + public PipelineColumn[] Sources { get; } + public ImplNorm(PipelineColumn[] cols) 
/// <summary>
/// Extension methods for the static-pipeline over <see cref="PipelineColumn"/> objects.
/// They produce boolean columns flagging which entries of the input column were missing.
/// </summary>
public static class NAIndicatorStaticExtensions
{
    /// <summary>
    /// Lets the reconciler find the source column of any output column produced here.
    /// </summary>
    private interface IColInput
    {
        PipelineColumn Input { get; }
    }

    /// <summary>Boolean scalar output for a scalar input.</summary>
    private sealed class OutScalar<TValue> : Scalar<bool>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutScalar(Scalar<TValue> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>Boolean fixed-size vector output for a fixed-size vector input.</summary>
    private sealed class OutVectorColumn<TValue> : Vector<bool>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutVectorColumn(Vector<TValue> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>Boolean variable-size vector output for a variable-size vector input.</summary>
    private sealed class OutVarVectorColumn<TValue> : VarVector<bool>, IColInput
    {
        public PipelineColumn Input { get; }

        public OutVarVectorColumn(VarVector<TValue> input)
            : base(Reconciler.Inst, input)
        {
            Input = input;
        }
    }

    /// <summary>
    /// Collects all missing-value-indicator output columns into (input, output) name pairs and
    /// creates a single <see cref="MissingValueIndicatorEstimator"/> for them.
    /// </summary>
    private sealed class Reconciler : EstimatorReconciler
    {
        // Singleton shared by every output column; readonly so it cannot be swapped out
        // after the columns that reference it have been created.
        public static readonly Reconciler Inst = new Reconciler();

        private Reconciler() { }

        public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
            PipelineColumn[] toOutput,
            IReadOnlyDictionary<PipelineColumn, string> inputNames,
            IReadOnlyDictionary<PipelineColumn, string> outputNames,
            IReadOnlyCollection<string> usedNames)
        {
            var columnPairs = new (string input, string output)[toOutput.Length];
            for (int i = 0; i < toOutput.Length; ++i)
            {
                var col = (IColInput)toOutput[i];
                columnPairs[i] = (inputNames[col.Input], outputNames[toOutput[i]]);
            }
            return new MissingValueIndicatorEstimator(env, columnPairs);
        }
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static Scalar<bool> IsMissingValue(this Scalar<float> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutScalar<float>(input);
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static Scalar<bool> IsMissingValue(this Scalar<double> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutScalar<double>(input);
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static Vector<bool> IsMissingValue(this Vector<float> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVectorColumn<float>(input);
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static Vector<bool> IsMissingValue(this Vector<double> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVectorColumn<double>(input);
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static VarVector<bool> IsMissingValue(this VarVector<float> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVarVectorColumn<float>(input);
    }

    /// <summary>
    /// Produces a column of boolean entries indicating whether input column entries were missing.
    /// </summary>
    /// <param name="input">The input column.</param>
    /// <returns>A column indicating whether input column entries were missing.</returns>
    public static VarVector<bool> IsMissingValue(this VarVector<double> input)
    {
        Contracts.CheckValue(input, nameof(input));
        return new OutVarVectorColumn<double>(input);
    }
}
+ /// Delegate which allows you to set transformation settings. + /// + public static Vector FeaturizeText(this Scalar input, Scalar[] otherInputs = null, Action advancedSettings = null) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckValueOrNull(otherInputs); + otherInputs = otherInputs ?? new Scalar[0]; + return new TextFeaturizingEstimator.OutPipelineColumn(new[] { input }.Concat(otherInputs), advancedSettings); + } + } } diff --git a/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs new file mode 100644 index 0000000000..6438af5aca --- /dev/null +++ b/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs @@ -0,0 +1,91 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms.Text; +using System.Collections.Generic; + +namespace Microsoft.ML.StaticPipe +{ + public static class WordEmbeddingsStaticExtensions + { + /// + /// Vector of tokenized text. + /// The pretrained word embedding model. + /// + public static Vector WordEmbeddings(this VarVector input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) + { + Contracts.CheckValue(input, nameof(input)); + return new OutColumn(input, modelKind); + } + + /// + /// Vector of tokenized text. + /// The custom word embedding model file. 
+ public static Vector WordEmbeddings(this VarVector input, string customModelFile) + { + Contracts.CheckValue(input, nameof(input)); + return new OutColumn(input, customModelFile); + } + + private sealed class OutColumn : Vector + { + public PipelineColumn Input { get; } + + public OutColumn(VarVector input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) + : base(new Reconciler(modelKind), input) + { + Input = input; + } + + public OutColumn(VarVector input, string customModelFile = null) + : base(new Reconciler(customModelFile), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly WordEmbeddingsExtractingTransformer.PretrainedModelKind? _modelKind; + private readonly string _customLookupTable; + + public Reconciler(WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) + { + _modelKind = modelKind; + _customLookupTable = null; + } + + public Reconciler(string customModelFile) + { + _modelKind = null; + _customLookupTable = customModelFile; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var cols = new WordEmbeddingsExtractingTransformer.ColumnInfo[toOutput.Length]; + for (int i = 0; i < toOutput.Length; ++i) + { + var outCol = (OutColumn)toOutput[i]; + cols[i] = new WordEmbeddingsExtractingTransformer.ColumnInfo(inputNames[outCol.Input], outputNames[outCol]); + } + + bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable); + if (customLookup) + return new WordEmbeddingsExtractingEstimator(env, _customLookupTable, cols); + else + return new WordEmbeddingsExtractingEstimator(env, _modelKind.Value, cols); + } + } + } +} diff --git 
a/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj b/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj new file mode 100644 index 0000000000..dd18ff9216 --- /dev/null +++ b/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj @@ -0,0 +1,12 @@ + + + + netcoreapp2.1 + + + + + + + + diff --git a/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs b/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs new file mode 100644 index 0000000000..c93dc72d99 --- /dev/null +++ b/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs @@ -0,0 +1,96 @@ +using Microsoft.ML.Core.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.StaticPipe; +using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms; +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.TensorFlow.StaticPipe +{ + public static class TensorFlowStaticExtensions + { + private sealed class OutColumn : Vector + { + public PipelineColumn Input { get; } + + public OutColumn(Vector input, string modelFile) + : base(new Reconciler(modelFile), input) + { + Input = input; + } + + public OutColumn(Vector input, TensorFlowModelInfo tensorFlowModel) + : base(new Reconciler(tensorFlowModel), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly string _modelFile; + private readonly TensorFlowModelInfo _tensorFlowModel; + + public Reconciler(string modelFile) + { + Contracts.AssertNonEmpty(modelFile); + _modelFile = modelFile; + _tensorFlowModel = null; + } + + public Reconciler(TensorFlowModelInfo tensorFlowModel) + { + Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); + + _modelFile = null; + _tensorFlowModel = tensorFlowModel; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + 
IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var outCol = (OutColumn)toOutput[0]; + if (_modelFile == null) + { + return new TensorFlowEstimator(env, _tensorFlowModel, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); + } + else + { + return new TensorFlowEstimator(env, _modelFile, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); + } + } + } + + // REVIEW: this method only covers one use case of using TensorFlow models: consuming one + // input and producing one output of floats. + // We could consider selectively adding some more extensions to enable common scenarios. + /// + /// Load the TensorFlow model from and run it on the input column and extract one output column. + /// The inputs and outputs are matched to TensorFlow graph nodes by name. + /// + public static Vector ApplyTensorFlowGraph(this Vector input, string modelFile) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); + return new OutColumn(input, modelFile); + } + + /// + /// Run a TensorFlow model provided through on the input column and extract one output column. + /// The inputs and outputs are matched to TensorFlow graph nodes by name. 
+ /// + public static Vector ApplyTensorFlowGraph(this Vector input, TensorFlowModelInfo tensorFlowModel) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); + return new OutColumn(input, tensorFlowModel); + } + } +} diff --git a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs index b4d9bd0993..cf47583df8 100644 --- a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs +++ b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs @@ -1137,89 +1137,4 @@ public TensorFlowTransform Fit(IDataView input) return _transformer; } } - - public static class TensorFlowStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, string modelFile) - : base(new Reconciler(modelFile), input) - { - Input = input; - } - - public OutColumn(Vector input, TensorFlowModelInfo tensorFlowModel) - : base(new Reconciler(tensorFlowModel), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly string _modelFile; - private readonly TensorFlowModelInfo _tensorFlowModel; - - public Reconciler(string modelFile) - { - Contracts.AssertNonEmpty(modelFile); - _modelFile = modelFile; - _tensorFlowModel = null; - } - - public Reconciler(TensorFlowModelInfo tensorFlowModel) - { - Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); - - _modelFile = null; - _tensorFlowModel = tensorFlowModel; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - if (_modelFile == null) - { - return new TensorFlowEstimator(env, _tensorFlowModel, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); - } - else - { - 
return new TensorFlowEstimator(env, _modelFile, new[] { inputNames[outCol.Input] }, new[] { outputNames[outCol] }); - } - } - } - - // REVIEW: this method only covers one use case of using TensorFlow models: consuming one - // input and producing one output of floats. - // We could consider selectively adding some more extensions to enable common scenarios. - /// - /// Load the TensorFlow model from and run it on the input column and extract one output column. - /// The inputs and outputs are matched to TensorFlow graph nodes by name. - /// - public static Vector ApplyTensorFlowGraph(this Vector input, string modelFile) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); - return new OutColumn(input, modelFile); - } - - /// - /// Run a TensorFlow model provided through on the input column and extract one output column. - /// The inputs and outputs are matched to TensorFlow graph nodes by name. - /// - public static Vector ApplyTensorFlowGraph(this Vector input, TensorFlowModelInfo tensorFlowModel) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); - return new OutColumn(input, tensorFlowModel); - } - } } diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index cdf377b365..3bef7d9188 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -29,6 +29,7 @@ public sealed class CountFeatureSelectingEstimator : IEstimator private readonly IHost _host; private readonly ColumnInfo[] _columns; + [BestFriend] internal static class Defaults { public const long Count = 1; diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 9410535b5f..d1a8e9c574 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -758,6 
+758,7 @@ public enum NormalizerKind : byte LInf = 3 } + [BestFriend] internal static class Defaults { public const NormalizerKind NormKind = NormalizerKind.L2Norm; diff --git a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs index 8e2580accb..ce572c26a4 100644 --- a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs +++ b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs @@ -477,136 +477,4 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(result.Values); } } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class NAIndicatorExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - private sealed class OutScalar : Scalar, IColInput - { - public PipelineColumn Input { get; } - - public OutScalar(Scalar input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - - public OutVarVectorColumn(VarVector input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var columnPairs = new (string input, string output)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (IColInput)toOutput[i]; - columnPairs[i] = (inputNames[col.Input], outputNames[toOutput[i]]); 
- } - return new MissingValueIndicatorEstimator(env, columnPairs); - } - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Scalar IsMissingValue(this Scalar input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Scalar IsMissingValue(this Scalar input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Vector IsMissingValue(this Vector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Vector IsMissingValue(this Vector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static VarVector IsMissingValue(this VarVector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. 
- /// A column indicating whether input column entries were missing. - public static VarVector IsMissingValue(this VarVector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - } } \ No newline at end of file diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index 7e357259d4..f15040eeff 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -168,6 +168,7 @@ public OneHotEncodingTransformer(ValueToKeyMappingEstimator term, IEstimator public sealed class OneHotEncodingEstimator : IEstimator { + [BestFriend] internal static class Defaults { public const OneHotEncodingTransformer.OutputKind OutKind = OneHotEncodingTransformer.OutputKind.Ind; @@ -272,6 +273,7 @@ public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, public OneHotEncodingTransformer Fit(IDataView input) => new OneHotEncodingTransformer(_term, _toSomething, input); + [BestFriend] internal void WrapTermWithDelegate(Action onFit) { _term = (ValueToKeyMappingEstimator)_term.WithOnFitDelegate(onFit); diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index 61e810219f..aca4ee8802 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -200,6 +200,7 @@ internal OneHotHashEncoding(HashingEstimator hash, IEstimator keyT /// public sealed class OneHotHashEncodingEstimator : IEstimator { + [BestFriend] internal static class Defaults { public const int HashBits = 16; @@ -318,165 +319,4 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col public OneHotHashEncoding Fit(IDataView input) => new OneHotHashEncoding(_hash, _toSomething, input); } - - public static class CategoricalHashStaticExtensions - { - public enum OneHotHashVectorOutputKind : byte - { - /// - /// Output is a 
bag (multi-set) vector - /// - Bag = 1, - - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - public enum OneHotHashScalarOutputKind : byte - { - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - private const OneHotHashVectorOutputKind DefOut = (OneHotHashVectorOutputKind)OneHotHashEncodingEstimator.Defaults.OutputKind; - private const int DefHashBits = OneHotHashEncodingEstimator.Defaults.HashBits; - private const uint DefSeed = OneHotHashEncodingEstimator.Defaults.Seed; - private const bool DefOrdered = OneHotHashEncodingEstimator.Defaults.Ordered; - private const int DefInvertHash = OneHotHashEncodingEstimator.Defaults.InvertHash; - - private readonly struct Config - { - public readonly int HashBits; - public readonly uint Seed; - public readonly bool Ordered; - public readonly int InvertHash; - public readonly OneHotHashVectorOutputKind OutputKind; - - public Config(OneHotHashVectorOutputKind outputKind, int hashBits, uint seed, bool ordered, int invertHash) - { - OutputKind = outputKind; - HashBits = hashBits; - Seed = seed; - Ordered = ordered; - InvertHash = invertHash; - } - } - - private interface ICategoricalCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVector : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public 
override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new OneHotHashEncodingEstimator.ColumnInfo[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ICategoricalCol)toOutput[i]; - infos[i] = new OneHotHashEncodingEstimator.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], (OneHotEncodingTransformer.OutputKind)tcol.Config.OutputKind, - tcol.Config.HashBits, tcol.Config.Seed, tcol.Config.Ordered, tcol.Config.InvertHash); - } - return new OneHotHashEncodingEstimator(env, infos); - } - } - - /// - /// Converts the categorical value into an indicator array by hashing categories into certain value and using that value as the index in the array. - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector OneHotHashEncoding(this Scalar input, OneHotHashScalarOutputKind outputKind = (OneHotHashScalarOutputKind)DefOut, - int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplScalar(input, new Config((OneHotHashVectorOutputKind)outputKind, hashBits, seed, ordered, invertHash)); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector OneHotHashEncoding(this Vector input, OneHotHashVectorOutputKind outputKind = DefOut, - int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, hashBits, seed, ordered, invertHash)); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. 
- /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector OneHotHashEncoding(this VarVector input, OneHotHashVectorOutputKind outputKind = DefOut, - int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, hashBits, seed, ordered, invertHash)); - } - } } diff --git a/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..443f2304db --- /dev/null +++ b/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs @@ -0,0 +1,10 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.CompilerServices; +using Microsoft.ML; + +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] + +[assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs index 583111ab65..42b7e79f89 100644 --- a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs @@ -342,6 +342,7 @@ internal LdaSummary(ImmutableArray> w } } + [BestFriend] internal LdaSummary GetLdaDetails(int iinfo) { Contracts.Assert(0 <= iinfo && iinfo < _ldas.Length); @@ -1075,6 +1076,7 @@ private protected override IRowMapper MakeRowMapper(Schema schema) /// public sealed class LatentDirichletAllocationEstimator : IEstimator { + [BestFriend] internal static class Defaults { public const int NumTopic = 100; diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 2584708cc8..7b7bf1f085 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -623,6 +623,7 @@ private static VersionInfo GetVersionInfo() } } + [BestFriend] internal sealed class OutPipelineColumn : Vector { public readonly Scalar[] Inputs; @@ -657,25 +658,4 @@ public override IEstimator Reconcile(IHostEnvironment env, } } } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class TextFeaturizerStaticPipe - { - /// - /// Accept text data and converts it to array which represent combinations of ngram/skip-gram token counts. - /// - /// Input data. - /// Additional data. - /// Delegate which allows you to set transformation settings. 
- /// - public static Vector FeaturizeText(this Scalar input, Scalar[] otherInputs = null, Action advancedSettings = null) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckValueOrNull(otherInputs); - otherInputs = otherInputs ?? new Scalar[0]; - return new TextFeaturizingEstimator.OutPipelineColumn(new[] { input }.Concat(otherInputs), advancedSettings); - } - } } diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs index ea85bfb3d4..6d04a38df2 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs @@ -869,83 +869,4 @@ public WordEmbeddingsExtractingTransformer Fit(IDataView input) return new WordEmbeddingsExtractingTransformer(_host, _modelKind.Value, _columns); } } - - public static class WordEmbeddingsStaticExtensions - { - /// - /// Vector of tokenized text. - /// The pretrained word embedding model. - /// - public static Vector WordEmbeddings(this VarVector input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, modelKind); - } - - /// - /// Vector of tokenized text. - /// The custom word embedding model file. 
- public static Vector WordEmbeddings(this VarVector input, string customModelFile) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, customModelFile); - } - - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(VarVector input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) - : base(new Reconciler(modelKind), input) - { - Input = input; - } - - public OutColumn(VarVector input, string customModelFile = null) - : base(new Reconciler(customModelFile), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly WordEmbeddingsExtractingTransformer.PretrainedModelKind? _modelKind; - private readonly string _customLookupTable; - - public Reconciler(WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe) - { - _modelKind = modelKind; - _customLookupTable = null; - } - - public Reconciler(string customModelFile) - { - _modelKind = null; - _customLookupTable = customModelFile; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var cols = new WordEmbeddingsExtractingTransformer.ColumnInfo[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (OutColumn)toOutput[i]; - cols[i] = new WordEmbeddingsExtractingTransformer.ColumnInfo(inputNames[outCol.Input], outputNames[outCol]); - } - - bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable); - if (customLookup) - return new WordEmbeddingsExtractingEstimator(env, _customLookupTable, cols); - else - return new WordEmbeddingsExtractingEstimator(env, _modelKind.Value, cols); - } - } - } } diff --git 
a/test/Microsoft.ML.OnnxTransformTest/DnnImageFeaturizerTest.cs b/test/Microsoft.ML.OnnxTransformTest/DnnImageFeaturizerTest.cs index d5d071c0e5..5b86c0bd85 100644 --- a/test/Microsoft.ML.OnnxTransformTest/DnnImageFeaturizerTest.cs +++ b/test/Microsoft.ML.OnnxTransformTest/DnnImageFeaturizerTest.cs @@ -4,6 +4,7 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Data; +using Microsoft.ML.OnnxTransform.StaticPipe; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.ImageAnalytics; diff --git a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj index 68d4b3f192..32a0d50b43 100644 --- a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj +++ b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj @@ -3,6 +3,7 @@ + diff --git a/test/Microsoft.ML.OnnxTransformTest/OnnxTransformTests.cs b/test/Microsoft.ML.OnnxTransformTest/OnnxTransformTests.cs index 6fb551a5ac..0542aee0b2 100644 --- a/test/Microsoft.ML.OnnxTransformTest/OnnxTransformTests.cs +++ b/test/Microsoft.ML.OnnxTransformTest/OnnxTransformTests.cs @@ -4,6 +4,7 @@ using Microsoft.ML.Core.Data; using Microsoft.ML.Data; +using Microsoft.ML.OnnxTransform.StaticPipe; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.ImageAnalytics; @@ -259,7 +260,7 @@ public void OnnxModelScenario() } }); - var onnx = OnnxTransform.Create(env, dataView, modelFile, + var onnx = Transforms.OnnxTransform.Create(env, dataView, modelFile, new[] { "data_0" }, new[] { "softmaxout_1" }); @@ -297,7 +298,7 @@ public void OnnxModelMultiInput() } }); - var onnx = OnnxTransform.Create(env, dataView, modelFile, + var onnx = Transforms.OnnxTransform.Create(env, dataView, modelFile, new[] { "ina", "inb" }, new[] { "outa", "outb" }); diff --git a/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj 
b/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj index ae02e94492..06bb2aef5b 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj +++ b/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj @@ -4,10 +4,13 @@ + + + diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 8761899855..9ade06e5cb 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.HalLearners.StaticPipe; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.Internal.Utilities; @@ -10,9 +11,6 @@ using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework; using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Categorical; -using Microsoft.ML.Transforms.Conversions; -using Microsoft.ML.Transforms.FeatureSelection; using Microsoft.ML.Transforms.Projections; using Microsoft.ML.Transforms.Text; using System; diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index 1d43e5579e..877fc6bbd1 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using Microsoft.ML.LightGBM.StaticPipe; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.FactorizationMachine; @@ -15,8 +16,6 @@ using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers.KMeans; using Microsoft.ML.Trainers.Recommender; -using Microsoft.ML.Transforms.Categorical; -using Microsoft.ML.Transforms.Conversions; using System; using System.Linq; using Xunit; diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index ddaea902c6..4cab6dba5e 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -18,6 +18,8 @@ + + diff --git a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs index 925ace12b5..6c072b9ca1 100644 --- a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs @@ -9,6 +9,7 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.Runtime.Tools; +using Microsoft.ML.TensorFlow.StaticPipe; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.TensorFlow; using System; diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 8b24dd01f9..0ef86d7aa2 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -7,6 +7,7 @@ using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.Runtime.Tools; +using Microsoft.ML.StaticPipe; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions;