diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
new file mode 100644
index 0000000000..f2cbec7c28
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
@@ -0,0 +1,74 @@
+using Microsoft.ML.Data;
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using System;
+using System.Collections.Generic;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    /// <summary>
+    /// Sample showing how to featurize text with the Latent Dirichlet Allocation (LDA) transform.
+    /// </summary>
+    public class LdaTransformExample
+    {
+        /// <summary>
+        /// Builds a word-bag + LDA pipeline over the sample topics data, fits it,
+        /// and prints the resulting "LdaFeatures" vector of every row to the console.
+        /// </summary>
+        public static void LdaTransform()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var ml = new MLContext();
+
+            // Get a small dataset as an IEnumerable.
+            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
+            var trainData = ml.CreateStreamingDataView(data);
+
+            // Preview of one of the columns of the topics data.
+            // The Review column contains the keys associated with a particular body of text.
+            //
+            // Review
+            // "animals birds cats dogs fish horse"
+            // "horse birds house fish duck cats"
+            // "car truck driver bus pickup"
+            // "car truck driver bus pickup horse"
+
+            string review = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review);
+            string ldaFeatures = "LdaFeatures";
+
+            // A pipeline for featurizing the "Review" column: word bags are produced first,
+            // then LDA reads that column and writes its output to the "LdaFeatures" column.
+            var pipeline = ml.Transforms.Text.ProduceWordBags(review).
+                Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numTopic:3));
+
+            // The transformed data
+            var transformer = pipeline.Fit(trainData);
+            var transformedData = transformer.Transform(trainData);
+
+            // Column obtained after processing the input.
+            var ldaFeaturesColumn = transformedData.GetColumn<VBuffer<float>>(ml, ldaFeatures);
+
+            Console.WriteLine($"{ldaFeatures} column obtained post-transformation.");
+            foreach (var featureRow in ldaFeaturesColumn)
+            {
+                foreach (var value in featureRow.GetValues())
+                {
+                    Console.Write($"{value} ");
+                }
+
+                Console.WriteLine("");
+            }
+
+            Console.WriteLine("===================================================");
+
+            // LdaFeatures column obtained post-transformation.
+            // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3.
+
+            //0.1818182 0.4545455 0.3636364
+            //0.3636364 0.1818182 0.4545455
+            //0.2222222 0.2222222 0.5555556
+            //0.2727273 0.09090909 0.6363636
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
index 2b9041c51b..abea9f3e8d 100644
--- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
+++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -17,6 +17,7 @@ + false
diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs
index d8730391b2..83a7e8c2e8 100644
--- a/docs/samples/Microsoft.ML.Samples/Program.cs
+++ b/docs/samples/Microsoft.ML.Samples/Program.cs
@@ -6,7 +6,7 @@ internal static class Program
     {
         static void Main(string[] args)
         {
-            MatrixFactorizationExample.MatrixFactorizationInMemoryData();
+            LdaTransformExample.LdaTransform();
         }
     }
 }
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index aaed670b0b..52cf642d2f 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -507,6 +507,13 @@ public static NgramHashEstimator ProduceHashedNgrams(this TransformsCatalog.Text
         /// The number of words to summarize the topic.
         /// The number of burn-in iterations.
         /// Reset the random number generator for each document.
+ /// + /// + /// + /// + /// public static LatentDirichletAllocationEstimator LatentDirichletAllocation(this TransformsCatalog.TextTransforms catalog, string inputColumn, string outputColumn = null,