Skip to content

Commit

Permalink
Adding helpful comments to Ngram + NgramCounter
Browse files Browse the repository at this point in the history
  • Loading branch information
ruddzw committed Apr 1, 2009
1 parent a08e897 commit fbdf8f5
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 16 deletions.
54 changes: 39 additions & 15 deletions src/Ngram.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,16 @@ public class Ngram
public Set<String> samples; // Set of sample sentences to train from
public int n; // (as in n-gram)
public NgramCounter ngc; // The data structure for holding n-gram counts
public Set<String> wordSet;
public double vocabSize;
public int numTrainingNgrams;
public HashMap<Double, Double> numberOfNgramsWithCount;

// For add-one smoothing
public Set<String> wordSet; // Used to find the vocabulary
public double vocabSize; // Size of the vocabulary

// For Good Turing Smoothing
public double numTrainingNgrams; // The size of the training set (# non-distinct words)
public HashMap<Double, Double> numberOfNgramsWithCount; // The number of ngrams that occur x times
public boolean goodTuringCountsAvailable = false; // True when good turing counts are available

public final String START = ":S"; // The sentence start symbol

public static void main(String[] args)
Expand All @@ -36,10 +42,6 @@ public static void main(String[] args)

System.out.println("Done training.");

System.out.println("Making good turing counts...");
n.makeGoodTuringCounts();
System.out.println("Done making good turing counts.");

NgramParser test = new NgramParser(args[1], true);
HashSet<String> testset = test.parse();
System.out.println("Perplexity of the test set: " + n.perplexity(testset));
Expand All @@ -64,7 +66,9 @@ public Ngram(HashSet<String> samples, int n)
this.n = n;
this.numberOfNgramsWithCount = new HashMap<Double, Double>();
this.ngc = new NgramCounter(n, numberOfNgramsWithCount);

this.wordSet = new HashSet<String>();

this.numTrainingNgrams = 0;
}

Expand All @@ -79,6 +83,8 @@ public void train()
while (matcher.find()) {
String match = matcher.group();
sampleWords.add(match);

// Add to vocab for +1 smoothing
wordSet.add(match);
}

Expand All @@ -99,16 +105,18 @@ public void train()
}
nWords[n-1] = word;

numTrainingNgrams++;
// Add to the size of the training set for gt-smoothing
numTrainingNgrams += 1;

// Insert the words into the counter
// Insert the words into the counter and receive count for this ngram
double countForNgram = ngc.insert(nWords);

// Update numberOfNgramsWithCount
// Decrement the number of ngrams with old countForNgram for gt-smoothing
if (countForNgram != 1.0) {
numberOfNgramsWithCount.put(countForNgram-1,
numberOfNgramsWithCount.get(countForNgram-1) - 1.0);
}
// Increment the number of ngrams with the new countForNgram for gt-smoothing
if (!numberOfNgramsWithCount.containsKey(countForNgram)) {
numberOfNgramsWithCount.put(countForNgram, 1.0);
} else {
Expand All @@ -118,35 +126,49 @@ public void train()
}
}

// Set the vocab size so we don't have to call wordSet.size() more than once
vocabSize = wordSet.size();
}

// Maximum-likelihood (unsmoothed) probability of the ngram in words:
// count(w1..wn) / count(w1..wn-1), or 0.0 for an ngram never seen in training.
public double unsmoothedProbability(String[] words)
{
    double ngramCount = ngc.count(words);
    // Seen ngrams get their relative frequency; unseen ngrams get zero.
    return (ngramCount > 0) ? ngramCount / ngc.level1Count(words) : 0.0;
}

// Laplace (add-one) smoothed probability of the ngram in words:
// (count(Wn) + 1) / (count(Wn-1) + V), where V is the vocabulary size.
public double addOneSmoothedProbability(String[] words)
{
    double numerator = ngc.count(words) + 1.0;
    double denominator = ngc.level1Count(words) + vocabSize;
    return numerator / denominator;
}

// Good-Turing smoothed probability of the ngram in words.
// Lazily builds the Good-Turing counts on first use so callers don't have
// to remember to call makeGoodTuringCounts() themselves.
public double goodTuringSmoothedProbability(String[] words)
{
    if (!goodTuringCountsAvailable) {
        System.out.println("Making good turing counts...");
        makeGoodTuringCounts();
        System.out.println("Done making good turing counts.");
    }

    // If this ngram has occurred, return good turing probability
    double gtcount = ngc.gtcount(words);
    if (gtcount > 0) {
        return gtcount / ngc.level1GTCount(words);
    }
    // Otherwise, return N1/N as per book (page 101?).
    // Guard against a missing entry: if no ngram occurred exactly once,
    // get(1.0) returns null and unboxing it would throw a
    // NullPointerException — treat that case as probability 0.
    Double singletonCount = numberOfNgramsWithCount.get(1.0);
    if (singletonCount == null) {
        return 0.0;
    }
    return singletonCount / numTrainingNgrams;
}

// Build the Good-Turing counts and mark them available so that
// goodTuringSmoothedProbability() only triggers this work once.
public void makeGoodTuringCounts()
{
    // Generate good turing counts in the NgramCounter
    ngc.makeGoodTuringCounts();
    // Flag is set after the counts exist, so readers never see it true early
    goodTuringCountsAvailable = true;
}

public String getSentence()
Expand Down Expand Up @@ -186,8 +208,8 @@ public String getSentence()

public double perplexity(Set<String> testSamples)
{
int wordCount = 0;
Stack<Double> probabilities = new Stack<Double>();
int wordCount = 0; // size of the test set
Stack<Double> probabilities = new Stack<Double>(); // collection of probabilities to multiply

String regexp = "('?\\w+|\\p{Punct})";
Pattern pattern = Pattern.compile(regexp);
Expand All @@ -201,6 +223,7 @@ public double perplexity(Set<String> testSamples)

while (matcher.find()) {
String match = matcher.group();
// For each match, nWords is the ngram ending in match
for (int i = 0; i < n-1; i++) {
nWords[i] = nWords[i+1];
}
Expand All @@ -212,12 +235,13 @@ public double perplexity(Set<String> testSamples)
}
}

// perplexity = ((P1 * P2 * ... * Pn) ^ (1/n)) ^ -1
// = (P1^(1/n) * P2^(1/n) * ... * Pn^(1/n)) ^ -1
double product = 1;
double power = 1.0/wordCount;
while (!probabilities.empty()) {
product *= Math.pow(probabilities.pop(), power);
}

double perplexity = 1 / product;
return perplexity;
}
Expand Down
10 changes: 9 additions & 1 deletion src/NgramCounter.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ public class NgramCounter
public int level; // level into the tree (root = highest numbered level)
public HashMap<String, NgramCounter> map; // links to child nodes, each link is the next word
public double count; // leaf node's count for an n-gram

// For Good Turing Smoothing Counts
public double gtcount; // leaf nodes' good-turing count for an n-gram
public HashMap<Double, Double> numberOfNgramsWithCount;
public HashMap<Double, Double> numberOfNgramsWithCount; // The number of ngrams that occur x times

public NgramCounter(int level, HashMap<Double, Double> numberOfNgramsWithCount)
{
this.level = level;
this.numberOfNgramsWithCount = numberOfNgramsWithCount;

if (level == 0) {
// There are no links to child nodes, we are a leaf node
this.map = null;
Expand Down Expand Up @@ -101,8 +104,10 @@ public String generateNextWord(String[] ngram)
return map.get(ngram[ngram.length-level]).generateNextWord(ngram);
}

// Generate Good Turing Counts based on original counts and the numberOfNgramsWithCount map
public void makeGoodTuringCounts()
{
// One level above leaf nodes, do the same as for any other non-leaf, but set the level 1 gtcount
if (level == 1) {
gtcount = 0;
for (NgramCounter ngc : map.values()) {
Expand All @@ -112,14 +117,17 @@ public void makeGoodTuringCounts()
return;
}

// On leaf level, set the gtcount
if (level == 0) {
if (!numberOfNgramsWithCount.containsKey(count+1)) {
numberOfNgramsWithCount.put(count+1, 0.0);
}
// c* = (c+1) * N(c+1) / N(c)
gtcount = (count+1)*(numberOfNgramsWithCount.get(count+1.0))/(numberOfNgramsWithCount.get(count));
return;
}

// Recursive step - Recurse to each child
for (NgramCounter ngc : map.values()) {
ngc.makeGoodTuringCounts();
}
Expand Down

0 comments on commit fbdf8f5

Please sign in to comment.