In [1]:
#r "nuget: Microsoft.ML"
#r "nuget: LemmaSharpPreBuilt-std"
#r "NeuterBot.StandardSearch.dll"
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;
using NeuterBot.Search;
// Single shared ML.NET context; the later cells use it to build the text pipeline,
// load the data and create the prediction engine.
var mlContext = new MLContext();



Installed package Microsoft.ML version 1.5.2

Installed package LemmaSharpPreBuilt-std version 1.0.1

# Outline

1. Context

2. Ideas contest

3. How Pat Works



 # What Pat Does and Does Not Do
 

![what pat does](beans.png) 

Pat does _not_:

* Search the website
* Synthesize answers 
* Extract passages from answers
* Retain much context between questions
* Escalate to humans
* Learn automatically
 

# Ideas Contest

What _simple_ impactful changes can we make to Pat to better help veterans and their families?

So far:

* Nudge users to enter shorter questions (Monita Lal's idea, implemented in production)
* Disambiguate providers and veterans (Nick Miller's idea, currently in test)
* Suggest common questions (Kristen Robertson's idea, currently in test)
* Socratic-style dialogs, especially around hearing (Tegan Sloan's idea, currently in development)

Email: content.hub@dva.gov.au with your idea by COB 17 December.

Prize for any idea Kristen agrees to seek approval to implement: glory and renown, recognition and status.


# How Pat Works

* Currently Pat uses a simple algorithm called TF-IDF search.

* A Brit by the name of Karen Spärck Jones figured out the guts of it in 1972.

[1972 Paper](https://www.emerald.com/insight/content/doi/10.1108/eb026526/full/html)

![Karen_Spärck](ks.jpeg)


# Two simple steps

1. Build the knowledge base (on startup)
2. Infer an answer to a query (at runtime)

## Steps to Build the Knowledge Base

 1. Start with sets of multiple answers to one question.
 1. Flatten the input data to pairs of questions and answers.
 1. Normalise the questions (remove punctuation and convert to lower case).
 1. Split to tokens.
 1. Remove stop words.
 1. Lemmatize.
 1. Make a vocab of all ngrams in the knowledge base.
 1. Find TF vectors.
 1. Find IDF vectors.
 1. Multiply the TF and IDF vectors element by element to get a TFIDF vector for each question.
    
    ![TFIDF Formulas](idfFormula.png)




In [2]:
/// <summary>
/// One knowledge-base entry as authored: several wordings of a question that share one answer.
/// </summary>
class QnASet
{
    /// <summary>Alternative wordings of the same question.</summary>
    public IEnumerable<string> Questions { get; set; }

    /// <summary>The answer given for any of the wordings in <see cref="Questions"/>.</summary>
    public string Answer { get; set; }
}

// Sample knowledge base: two entries whose questions share most of their words.
var qnaSet1 = new QnASet
{
    Questions = new[] { "What eats rats?", "What preys on rats?" },
    Answer = "Cats."
};
var qnaSet2 = new QnASet
{
    Questions = new[] { "What do rats eat?" },
    Answer = "Everything."
};


In [3]:
// Render both raw question sets as the cell output (step 1: the un-flattened input).
display(qnaSet1);
display(qnaSet2);

Questions,Answer
"[ What eats rats?, What preys on rats? ]",Cats.


Questions,Answer
[ What do rats eat? ],Everything.


In [4]:
/// <summary>
/// One flattened knowledge-base row: a single question wording, its answer, and a slot for
/// the output of every stage of the text pipeline.
/// </summary>
class QnaPair
{
    /// <summary>The question exactly as written.</summary>
    public string Question { get; set; }

    /// <summary>The answer associated with <see cref="Question"/>.</summary>
    public string Answer { get; set; }

    /// <summary>Output of the text-normalisation stage.</summary>
    public string NormalisedQuestion { get; set; }

    /// <summary>Output of the tokenisation stage.</summary>
    public string[] Tokens { get; set; }

    /// <summary>Tokens left after stop-word removal.</summary>
    public string[] TokensWithStopWordsRemoved { get; set; }

    /// <summary>Lemmatized form of <see cref="TokensWithStopWordsRemoved"/>.</summary>
    public string[] LemmatizedTokens { get; set; }

    /// <summary>Term-frequency vector.</summary>
    public float[] TFVector { get; set; }

    /// <summary>Inverse-document-frequency vector.</summary>
    public float[] IDFVector { get; set; }

    /// <summary>TF-IDF vector; the one compared against queries.</summary>
    public float[] TFIDFVector { get; set; }
}
// Flatten the question sets: one QnaPair per question wording, each carrying its set's answer.
// The sequence is lazy, so new QnaPair objects are created every time it is enumerated.
var pairs =
    from qnaset in new[] { qnaSet1, qnaSet2 }
    from question in qnaset.Questions
    select new QnaPair { Question = question, Answer = qnaset.Answer };




In [5]:
// Render the flattened pairs; the pipeline columns are still unset at this point.
display(pairs);

index,Question,Answer,NormalisedQuestion,Tokens,TokensWithStopWordsRemoved,LemmatizedTokens,TFVector,IDFVector,TFIDFVector
0,What eats rats?,Cats.,<null>,<null>,<null>,<null>,<null>,<null>,<null>
1,What preys on rats?,Cats.,<null>,<null>,<null>,<null>,<null>,<null>,<null>
2,What do rats eat?,Everything.,<null>,<null>,<null>,<null>,<null>,<null>,<null>


In [11]:
// No built-in lemmatizer in ML.NET, so we have to add our own as a custom mapping.
//
// x is the incoming row and y is the row the mapping emits. Both are QnaPair, so every
// pass-through field has to be copied from x to y by hand; LemmatizedTokens is the only
// value actually computed here.
Action<QnaPair, QnaPair> customLemmatizerFunction = (x, y) =>
{
    // Copy the pass-through fields unconditionally. Previously they were only copied when
    // tokens were present, so a row without tokens did not get its own Question/Answer/etc.
    // written to the output row.
    y.Question = x.Question;
    y.Answer = x.Answer;
    y.NormalisedQuestion = x.NormalisedQuestion;
    y.Tokens = x.Tokens;
    y.TokensWithStopWordsRemoved = x.TokensWithStopWordsRemoved;

    if (x.TokensWithStopWordsRemoved == null)
    {
        // Nothing to lemmatize for this row.
        y.LemmatizedTokens = null;
        return;
    }

    // NOTE(review): a new lemmatizer is built for every row, as in the original. Hoisting it
    // out of the lambda would be cheaper but needs WrappedLemmatizer to be safe to share — confirm.
    var lemmatizer = new WrappedLemmatizer();
    y.LemmatizedTokens = x.TokensWithStopWordsRemoved.Select(t => lemmatizer.Lemmatize(t)).ToArray();
};

// Text pipeline that turns each Question into TF, IDF and TF-IDF vectors.
// Stage 1: lower-case Question into NormalisedQuestion.
// NOTE(review): the three positional booleans are presumably keepDiacritics, keepPunctuations
// and keepNumbers (i.e. drop diacritics and punctuation, keep numbers) — confirm against the
// NormalizeText signature of the installed ML.NET version.
var textPipeline = mlContext.Transforms.Text.NormalizeText("NormalisedQuestion", "Question", TextNormalizingEstimator.CaseMode.Lower, false, false, true)
    // Stage 2: split the normalised text into word tokens.
    .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalisedQuestion"))
    // Stage 3: drop ML.NET's default stop words.
    .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("TokensWithStopWordsRemoved","Tokens"))
    // Stage 4: lemmatize via the custom mapping, which fills LemmatizedTokens.
    .Append(mlContext.Transforms.CustomMapping<QnaPair, QnaPair>(customLemmatizerFunction, "CMContract"))
    // Stages 5 and 6: n-gram bags over the lemmatized tokens (ngramLength 2 with
    // useAllLengths, so unigrams and bigrams), once with TF weighting and once with IDF weighting.
    .Append(mlContext.Transforms.Text.ProduceWordBags(outputColumnName:  "TFVector", inputColumnName: "LemmatizedTokens", ngramLength: 2, skipLength: 0, useAllLengths: true, maximumNgramsCount: 100000, weighting: NgramExtractingEstimator.WeightingCriteria.Tf ))
    .Append(mlContext.Transforms.Text.ProduceWordBags(outputColumnName:  "IDFVector", inputColumnName: "LemmatizedTokens", ngramLength: 2, skipLength: 0, useAllLengths: true, maximumNgramsCount: 100000, weighting: NgramExtractingEstimator.WeightingCriteria.Idf ))
    // Stage 7: map the lemmatized tokens to keys; the keyed column is the input of ProduceNgrams below.
    .Append(mlContext.Transforms.Conversion.MapValueToKey("LemmatizedTokensKeyed","LemmatizedTokens"))
    
    // Stage 8: the TF-IDF weighted n-gram vector that is later compared against queries.
    .Append(mlContext.Transforms.Text.ProduceNgrams("TFIDFVector",
        "LemmatizedTokensKeyed",
        ngramLength: 2,
        useAllLengths: true,
        weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf));

// Build the knowledge base: fit the pipeline on the flattened pairs, then apply the fitted
// transformer to that same data.
var dataView = mlContext.Data.LoadFromEnumerable(pairs);
var textTransformer = textPipeline.Fit(dataView);
var transformedDataView = textTransformer.Transform(dataView);

// Recover the vocabulary: the slot names of the TFIDFVector column are the n-grams,
// in the same order as the vector's elements.
var featuresColumn = transformedDataView.Schema["TFIDFVector"];
VBuffer<ReadOnlyMemory<char>> slotNames = default;
featuresColumn.GetSlotNames(ref slotNames);
var slotNamesMap = slotNames.Items();
List<string> vocab = slotNamesMap
    .Select(ngram => ngram.Value.ToString())
    .ToList();

// Materialise the transformed rows as QnaPair objects so later cells can work on them directly.
var enumerated = mlContext.Data
    .CreateEnumerable<QnaPair>(transformedDataView, false)
    .ToList();








### Completed Knowledge Base

In [13]:
// Render the finished knowledge base: every pair with all pipeline columns filled in.
display(enumerated);

index,Question,Answer,NormalisedQuestion,Tokens,TokensWithStopWordsRemoved,LemmatizedTokens,TFVector,IDFVector,TFIDFVector
0,What eats rats?,Cats.,what eats rats,"[ what, eats, rats ]","[ eats, rats ]","[ eat, rat ]","[ 1, 1, 1, 0, 0, 0 ]","[ 0.4054651, 1.0986123, 0, 0, 0, 0 ]","[ 0.4054651, 1.0986123, 0, 0, 0, 0 ]"
1,What preys on rats?,Cats.,what preys on rats,"[ what, preys, on, rats ]","[ preys, rats ]","[ prey, rat ]","[ 0, 0, 1, 1, 1, 0 ]","[ 0, 0, 0, 1.0986123, 1.0986123, 0 ]","[ 0, 0, 0, 1.0986123, 1.0986123, 0 ]"
2,What do rats eat?,Everything.,what do rats eat,"[ what, do, rats, eat ]","[ rats, eat ]","[ rat, eat ]","[ 1, 0, 1, 0, 0, 1 ]","[ 0.4054651, 0, 0, 0, 0, 1.0986123 ]","[ 0.4054651, 0, 0, 0, 0, 1.0986123 ]"


### Vocab

In [12]:
// Render the n-gram vocabulary; index i names element i of the TFIDFVector column.
display(vocab);

index,value
0,eat
1,eat|rat
2,rat
3,prey
4,prey|rat
5,rat|eat


## Steps to Query the Knowledge Base

1. Get the TFIDF Vector for the query, using the vocab in the knowledge base.
1. Find the questions with the most _similar_ TFIDF vectors.

In [7]:
/// <summary>
/// A user query together with the TF-IDF vector the prediction engine produces for it.
/// </summary>
class Query
{
    /// <summary>The query text as typed.</summary>
    public string QueryText { get; set; }

    /// <summary>TF-IDF vector of the query.</summary>
    public float[] TFIDFVector { get; set; }
}

// Three sample queries: an exact match, a partial match, and one with no vocabulary overlap.
var testQueries =
    from text in new[]
    {
        "What do rats eat?",
        "What do I eat?",
        "Can I have a cake?"
    }
    select new Query { QueryText = text };

// Reuse the fitted knowledge-base transformer: a query is fed in as a QnaPair that only has
// Question set, and the engine's output is read back into a Query.
var predictionEngine = mlContext.Model.CreatePredictionEngine<QnaPair, Query>(textTransformer);

// QueryText is set on the output object before calling Predict.
// NOTE(review): presumably Predict only fills TFIDFVector and leaves QueryText alone,
// since the pipeline has no QueryText column — confirm.
List<Query> predictionResults = testQueries
    .Select(q =>
    {
        var prediction = new Query { QueryText = q.QueryText };
        predictionEngine.Predict(new QnaPair { Question = q.QueryText }, ref prediction);
        return prediction;
    })
    .ToList();



In [8]:
display(predictionResults);

index,QueryText,TFIDFVector
0,What do rats eat?,"[ 0.4054651, 0, 0, 0, 0, 1.0986123 ]"
1,What do I eat?,"[ 0.4054651, 0, 0, 0, 0, 0 ]"
2,Can I have a cake?,"[ 0, 0, 0, 0, 0, 0 ]"



![Cosine Similarity formula](cosignSimilarityFormula.png)

![Cosine Similarity diagram](cosignDiagram.png)

[Source](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a)



In [9]:
/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided by the product
/// of their magnitudes.
/// </summary>
/// <param name="queryVector">Vector of the query.</param>
/// <param name="documentVector">Vector of the knowledge-base question; must have the same length as <paramref name="queryVector"/>.</param>
/// <returns>
/// The cosine similarity, or 0 when the division yields NaN (for example when either vector
/// is all zeros).
/// </returns>
/// <exception cref="ArgumentNullException">Either vector is null.</exception>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
double CalculateSimilarity(float[] queryVector, float[] documentVector)
{
    if (queryVector == null) throw new ArgumentNullException(nameof(queryVector));
    if (documentVector == null) throw new ArgumentNullException(nameof(documentVector));

    // Vectors of different lengths come from different vocabularies and cannot be compared.
    // Previously a longer documentVector was silently truncated and a shorter one failed
    // with an IndexOutOfRangeException part-way through the loop.
    if (queryVector.Length != documentVector.Length)
    {
        throw new ArgumentException(
            $"Vectors must have the same length (got {queryVector.Length} and {documentVector.Length}).",
            nameof(documentVector));
    }

    // Accumulate in float, as before, so results for valid inputs are numerically unchanged.
    float dotProduct = 0;
    for (int i = 0; i < queryVector.Length; i++)
    {
        dotProduct += queryVector[i] * documentVector[i];
    }

    double Magnitude(float[] v) => Math.Sqrt(v.Sum(f => f * f));

    var css = dotProduct / (Magnitude(queryVector) * Magnitude(documentVector));

    // 0 / 0 for an all-zero vector gives NaN; report that as "no similarity".
    return Double.IsNaN(css) ? 0 : css;
}


// Score every knowledge-base question against every query.
// The query vector is passed first and the knowledge-base vector second, matching
// CalculateSimilarity's (queryVector, documentVector) parameters. They were previously
// passed the other way round; the scores are the same either way because cosine similarity is symmetric.
// Both projections are lazy: scores are computed when the result is enumerated.
var comp = predictionResults.Select(query => new { 
    Query = query.QueryText,  
    ComparisonScores = enumerated.Select(r => new {Question = r.Question, Similarity = CalculateSimilarity(query.TFIDFVector, r.TFIDFVector)})
    });




In [10]:
// Render, for each query, its similarity to every knowledge-base question.
display(comp);

index,Query,ComparisonScores
0,What do rats eat?,"[ { Question = What eats rats?, Similarity = 0.11988320132103283 }, { Question = What preys on rats?, Similarity = 0 }, { Question = What do rats eat?, Similarity = 0.9999999999999999 } ]"
1,What do I eat?,"[ { Question = What eats rats?, Similarity = 0.3462415361002097 }, { Question = What preys on rats?, Similarity = 0 }, { Question = What do rats eat?, Similarity = 0.3462415361002097 } ]"
2,Can I have a cake?,"[ { Question = What eats rats?, Similarity = 0 }, { Question = What preys on rats?, Similarity = 0 }, { Question = What do rats eat?, Similarity = 0 } ]"
