In [42]:
#r "nuget:Microsoft.ML"

In [43]:
using Microsoft.ML.Transforms.Text;
using Microsoft.ML;
using Microsoft.ML.Data;

In [44]:
// Fully spelled-out featurization settings: text clean-up, stop-word removal,
// word/character n-gram extraction and row normalization.
var featurizeTextOptions = new TextFeaturizingEstimator.Options
{
    // Text normalization settings.
    CaseMode = TextNormalizingEstimator.CaseMode.Lower,
    KeepDiacritics = false, // a diacritic is a mark added to a letter to indicate a special pronunciation
    KeepPunctuations = false,
    KeepNumbers = true,

    // Emit the cleaned-up tokens as their own column, e.g. as possible input
    // to the word embedding transform.
    OutputTokensColumnName = "OutputTokens",

    // Stop-word removal for English text.
    StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options
    {
        Language = TextFeaturizingEstimator.Language.English,
    },

    // Word n-grams: length 2 with all shorter lengths, i.e. unigrams and bigrams,
    // weighted by TF-IDF.
    WordFeatureExtractor = new WordBagEstimator.Options
    {
        Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf,
        UseAllLengths = true,
        NgramLength = 2,
    },

    // Character n-grams: tri-chargrams only (no single/double characters),
    // weighted by TF-IDF.
    CharFeatureExtractor = new WordBagEstimator.Options
    {
        Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf,
        UseAllLengths = false,
        NgramLength = 3,
    },

    // Row-wise L2 normalization (see: NormalizeLpNorm).
    Norm = TextFeaturizingEstimator.NormFunction.L2,
};

In [45]:
// Shared ML.NET context; the later cells use it to load data and build transforms.
var mlContext = new MLContext();

// Featurization pipeline
//var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label", "Label") // Needed for multi-class to convert string labels to the Key type
            //.Append(mlContext.Transforms.Text.FeaturizeText("FeaturesText", featurizeTextOptions, new[] { "SentimentText" })) // Use above options object; output is named "FeaturesText" so the Concatenate step below can find it
            //.Append(mlContext.Transforms.Concatenate("Features", new[] { "FeaturesText" })) // Concat is incase there are additional columns
            //.Append(mlContext.Transforms.NormalizeMinMax("Features", "Features")) // Generally unneeded if only using the output from FeaturizeText as it's row-wise normalized w/ a L2-norm
            //.AppendCacheCheckpoint(mlContext); // Cache the featurized dataset in memory for added speed

In [46]:
/// <summary>
/// Input row for the text featurization sample: one raw text string per instance.
/// </summary>
private class TextData
{
    /// <summary>Raw text to be featurized.</summary>
    public string Text { get; set; }
}

/// <summary>
/// Output row produced by the prediction engine: the original <see cref="TextData.Text"/>
/// plus the values computed by the featurization transform.
/// </summary>
/// <remarks>
/// NOTE(review): the property names match the "Features" and "OutputTokens" column
/// names used by the featurizer elsewhere in this notebook; presumably ML.NET binds
/// them by name, so renaming either property needs a matching column change.
/// </remarks>
private class TransformedTextData : TextData
{
    /// <summary>Numeric feature vector computed for the text.</summary>
    public float[] Features { get; set; }
    /// <summary>Tokens extracted from the text by the featurizer.</summary>
    public string[] OutputTokens { get; set; }
}

In [47]:
// In-memory sample sentences that are featurized by the later cells.
var samples = new List<TextData>
{
    new TextData
    {
        Text = "ML.NET's FeaturizeText API uses a " +
               "composition of several basic transforms to convert text into " +
               "numeric features."
    },
    new TextData
    {
        Text = "This API can be used as a featurizer to " +
               "perform text classification."
    },
    new TextData
    {
        Text = "There are a number of approaches to text " +
               "classification."
    },
    new TextData
    {
        Text = "One of the simplest and most common " +
               "approaches is called “Bag of Words”."
    },
    new TextData
    {
        Text = "Text classification can be used for a " +
               "wide variety of tasks"
    },
    new TextData
    {
        Text = "such as sentiment analysis, topic " +
               "detection, intent identification etc."
    },
};

In [48]:
 var options = new TextFeaturizingEstimator.Options
{
    // Lower-case the text.
    CaseMode = TextNormalizingEstimator.CaseMode.Lower,

    // Also expose the tokenized words in their own output column.
    OutputTokensColumnName = "OutputTokens",

    // Use ML.NET's built-in stop word remover, configured for English.
    StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options
    {
        Language = TextFeaturizingEstimator.Language.English,
    },

    // Word n-grams of length 2, including all shorter lengths.
    WordFeatureExtractor = new WordBagEstimator.Options
    {
        UseAllLengths = true,
        NgramLength = 2,
    },

    // Character n-grams of length 3 only.
    CharFeatureExtractor = new WordBagEstimator.Options
    {
        UseAllLengths = false,
        NgramLength = 3,
    },
};

In [49]:
// Wrap the in-memory samples so the estimator can consume them.
IDataView dataview = mlContext.Data.LoadFromEnumerable(samples);

// Text featurization estimator over the "Text" column, producing "Features",
// configured by the options object.
TextFeaturizingEstimator textPipeline =
    mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

// Fit to data.
ITransformer textTransformer = textPipeline.Fit(dataview);

// Create the prediction engine used to get the features extracted from the text,
// one TextData row at a time.
PredictionEngine<TextData, TransformedTextData> predictionEngine =
    mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

In [50]:
// Trainer 
//var trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(labelColumnName: "Label", numberOfIterations: 10, featureColumnName: "Features"), labelColumnName: "Label")
            //.Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel", "PredictedLabel"));
//var trainingPipeline = pipeline.Append(trainer);

In [51]:
/// <summary>
/// Input row for a sentiment data set: a text and its boolean label.
/// In this notebook it is only referenced by commented-out loading code.
/// </summary>
public class SentimentData
{
    /// <summary>Text of the example; loaded from column index 0.</summary>
    [LoadColumn(0)]
    public string SentimentText;

    /// <summary>Label of the example; loaded from column index 1 and exposed under the column name "Label".</summary>
    [LoadColumn(1), ColumnName("Label")]
    public bool Sentiment;
}

/// <summary>
/// Prediction row for sentiment classification: the <see cref="SentimentData"/> input
/// fields plus the model outputs.
/// </summary>
public class SentimentPrediction : SentimentData
{

    /// <summary>Predicted label; mapped to the "PredictedLabel" column.</summary>
    [ColumnName("PredictedLabel")]
    public bool Prediction { get; set; }

    /// <summary>NOTE(review): presumably the model's probability for the predicted label — confirm against the trainer used.</summary>
    public float Probability { get; set; }

    /// <summary>NOTE(review): presumably the raw model score — confirm against the trainer used.</summary>
    public float Score { get; set; }
}


In [52]:
//IDataView dataview = mlContext.Data.LoadFromTextFile<SentimentData>("yelp_labelled.txt", hasHeader: false);
//DataOperationsCatalog.TrainTestData splitDataView = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.2);

In [59]:
// Featurize every sample and print, per sample: the feature vector length,
// the leading feature values, and the extracted tokens.
foreach (var sample in samples)
{
    // Convert the text into numeric features.
    var prediction = predictionEngine.Predict(sample);

    // Print the length of the feature vector.
    Console.WriteLine($"Number of Features: {prediction.Features.Length}");

    // Print at most the first 10 feature values. Clamping to the vector length
    // avoids an IndexOutOfRangeException when fewer than 10 features exist.
    int shownCount = Math.Min(10, prediction.Features.Length);
    Console.Write("Features: ");
    for (int i = 0; i < shownCount; i++)
    {
        Console.Write($"{prediction.Features[i]:F4}  ");
    }
    Console.WriteLine();

    // Print each token wrapped in square brackets on a single line.
    foreach (var token in prediction.OutputTokens)
    {
        Console.Write($"[{token}]");
    }
    Console.WriteLine();
}



Number of Features: 282
Features: 0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.1881  
[ml.net's][featurizetext][api][uses][composition][basic][transforms][convert][text][numeric][features.]
Number of Features: 282
Features: 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.1443  
[api][used][featurizer][perform][text][classification.]
Number of Features: 282
Features: 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
[number][approaches][text][classification.]
Number of Features: 282
Features: 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
[simplest][common][approaches][called][“bag][words”.]
Number of Features: 282
Features: 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
[text][classification][used][wide][variety][tasks]
Number of Features: 282
Features: 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
[