Skip to content

Commit

Permalink
Adding helpful comments to Ngram + NgramCounter
Browse files Browse the repository at this point in the history
  • Loading branch information
ruddzw committed Apr 1, 2009
1 parent a08e897 commit fbdf8f5
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 16 deletions.
54 changes: 39 additions & 15 deletions src/Ngram.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,16 @@ public class Ngram
public Set<String> samples; // Set of sample sentences to train from
public int n; // (as in n-gram)
public NgramCounter ngc; // The data structure for holding n-gram counts
public Set<String> wordSet;
public double vocabSize;
public int numTrainingNgrams;
public HashMap<Double, Double> numberOfNgramsWithCount;

// For add-one smoothing
public Set<String> wordSet; // Used to find the vocabulary
public double vocabSize; // Size of the vocabulary

// For Good Turing Smoothing
public double numTrainingNgrams; // The size of the training set (# non-distinct words)
public HashMap<Double, Double> numberOfNgramsWithCount; // The number of ngrams that occur x times
public boolean goodTuringCountsAvailable = false; // True when good turing counts are available

public final String START = ":S"; // The sentence start symbol

public static void main(String[] args)
Expand All @@ -36,10 +42,6 @@ public static void main(String[] args)

System.out.println("Done training.");

System.out.println("Making good turing counts...");
n.makeGoodTuringCounts();
System.out.println("Done making good turing counts.");

NgramParser test = new NgramParser(args[1], true);
HashSet<String> testset = test.parse();
System.out.println("Perplexity of the test set: " + n.perplexity(testset));
Expand All @@ -64,7 +66,9 @@ public Ngram(HashSet<String> samples, int n)
this.n = n;
this.numberOfNgramsWithCount = new HashMap<Double, Double>();
this.ngc = new NgramCounter(n, numberOfNgramsWithCount);

this.wordSet = new HashSet<String>();

this.numTrainingNgrams = 0;
}

Expand All @@ -79,6 +83,8 @@ public void train()
while (matcher.find()) {
String match = matcher.group();
sampleWords.add(match);

// Add to vocab for +1 smoothing
wordSet.add(match);
}

Expand All @@ -99,16 +105,18 @@ public void train()
}
nWords[n-1] = word;

numTrainingNgrams++;
// Add to the size of the training set for gt-smoothing
numTrainingNgrams += 1;

// Insert the words into the counter
// Insert the words into the counter and receive count for this ngram
double countForNgram = ngc.insert(nWords);

// Update numberOfNgramsWithCount
// Decrement the number of ngrams with old countForNgram for gt-smoothing
if (countForNgram != 1.0) {
numberOfNgramsWithCount.put(countForNgram-1,
numberOfNgramsWithCount.get(countForNgram-1) - 1.0);
}
// Increment the number of ngrams with the new countForNgram for gt-smoothing
if (!numberOfNgramsWithCount.containsKey(countForNgram)) {
numberOfNgramsWithCount.put(countForNgram, 1.0);
} else {
Expand All @@ -118,35 +126,49 @@ public void train()
}
}

// Set the vocab size so we don't have to call wordSet.size() more than once
vocabSize = wordSet.size();
}

// Maximum-likelihood (unsmoothed) probability of the ngram in words:
// count(w1..wn) / count(w1..wn-1), or 0.0 for an ngram never seen in training.
public double unsmoothedProbability(String[] words)
{
    double ngramCount = ngc.count(words);
    // Seen ngrams get their relative frequency; unseen ngrams get zero.
    return (ngramCount > 0) ? ngramCount / ngc.level1Count(words) : 0.0;
}

// Laplace (add-one) smoothed probability of the ngram in words:
// (count(Wn) + 1) / (count(Wn-1) + V), where V is the vocabulary size.
public double addOneSmoothedProbability(String[] words)
{
    double numerator = ngc.count(words) + 1.0;
    double denominator = ngc.level1Count(words) + vocabSize;
    return numerator / denominator;
}

// Good-Turing smoothed probability of the ngram in words.
// Lazily builds the Good-Turing counts on first use so callers don't have
// to remember to call makeGoodTuringCounts() themselves.
public double goodTuringSmoothedProbability(String[] words)
{
    if (!goodTuringCountsAvailable) {
        System.out.println("Making good turing counts...");
        makeGoodTuringCounts();
        System.out.println("Done making good turing counts.");
    }

    // If this ngram has occurred, return good turing probability
    double gtcount = ngc.gtcount(words);
    if (gtcount > 0) {
        return gtcount / ngc.level1GTCount(words);
    }
    // Otherwise, return N1/N as per book (page 101?).
    // Guard against a missing entry: if no ngram occurred exactly once,
    // get(1.0) returns null and unboxing it would throw a
    // NullPointerException — treat that case as probability 0.
    Double singletonCount = numberOfNgramsWithCount.get(1.0);
    if (singletonCount == null) {
        return 0.0;
    }
    return singletonCount / numTrainingNgrams;
}

// Build the Good-Turing counts and mark them available so that
// goodTuringSmoothedProbability() only triggers this work once.
public void makeGoodTuringCounts()
{
    // Generate good turing counts in the NgramCounter
    ngc.makeGoodTuringCounts();
    // Flag is set after the counts exist, so readers never see it true early
    goodTuringCountsAvailable = true;
}

public String getSentence()
Expand Down Expand Up @@ -186,8 +208,8 @@ public String getSentence()

public double perplexity(Set<String> testSamples)
{
int wordCount = 0;
Stack<Double> probabilities = new Stack<Double>();
int wordCount = 0; // size of the test set
Stack<Double> probabilities = new Stack<Double>(); // collection of probabilities to multiply

String regexp = "('?\\w+|\\p{Punct})";
Pattern pattern = Pattern.compile(regexp);
Expand All @@ -201,6 +223,7 @@ public double perplexity(Set<String> testSamples)

while (matcher.find()) {
String match = matcher.group();
// For each match, nWords is the ngram ending in match
for (int i = 0; i < n-1; i++) {
nWords[i] = nWords[i+1];
}
Expand All @@ -212,12 +235,13 @@ public double perplexity(Set<String> testSamples)
}
}

// perplexity = ((P1 * P2 * ... * Pn) ^ (1/n)) ^ -1
// = (P1^(1/n) * P2^(1/n) * ... * Pn^(1/n)) ^ -1
double product = 1;
double power = 1.0/wordCount;
while (!probabilities.empty()) {
product *= Math.pow(probabilities.pop(), power);
}

double perplexity = 1 / product;
return perplexity;
}
Expand Down
10 changes: 9 additions & 1 deletion src/NgramCounter.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ public class NgramCounter
public int level; // level into the tree (root = highest numbered level)
public HashMap<String, NgramCounter> map; // links to child nodes, each link is the next word
public double count; // leaf node's count for an n-gram

// For Good Turing Smoothing Counts
public double gtcount; // leaf nodes' good-turing count for an n-gram
public HashMap<Double, Double> numberOfNgramsWithCount;
public HashMap<Double, Double> numberOfNgramsWithCount; // The number of ngrams that occur x times

public NgramCounter(int level, HashMap<Double, Double> numberOfNgramsWithCount)
{
this.level = level;
this.numberOfNgramsWithCount = numberOfNgramsWithCount;

if (level == 0) {
// There are no links to child nodes, we are a leaf node
this.map = null;
Expand Down Expand Up @@ -101,8 +104,10 @@ public String generateNextWord(String[] ngram)
return map.get(ngram[ngram.length-level]).generateNextWord(ngram);
}

// Generate Good Turing Counts based on original counts and the numberOfNgramsWithCount map
public void makeGoodTuringCounts()
{
// One level above leaf nodes, do the same as for any other non-leaf, but set the level 1 gtcount
if (level == 1) {
gtcount = 0;
for (NgramCounter ngc : map.values()) {
Expand All @@ -112,14 +117,17 @@ public void makeGoodTuringCounts()
return;
}

// On leaf level, set the gtcount
if (level == 0) {
if (!numberOfNgramsWithCount.containsKey(count+1)) {
numberOfNgramsWithCount.put(count+1, 0.0);
}
// c* = (c+1) * N(c+1) / N(c)
gtcount = (count+1)*(numberOfNgramsWithCount.get(count+1.0))/(numberOfNgramsWithCount.get(count));
return;
}

// Recursive step - Recurse to each child
for (NgramCounter ngc : map.values()) {
ngc.makeGoodTuringCounts();
}
Expand Down

0 comments on commit fbdf8f5

Please sign in to comment.