From a34f07f82e63aee34e4f7b1bfed0d037b9b39a73 Mon Sep 17 00:00:00 2001 From: Rudd Zwolinski Date: Fri, 8 May 2009 12:16:57 -0400 Subject: [PATCH] Good Turing smoothing implemented. I think. It's not very good, though. Lost a lot of performance! --- scoring/gtscore.html | 47 +++++++++++++++++++++++++++ src/HMM.java | 77 ++++++++++++++++++++++++++++++++++++++++++++ src/HMMParser.java | 3 ++ 3 files changed, 127 insertions(+) create mode 100644 scoring/gtscore.html diff --git a/scoring/gtscore.html b/scoring/gtscore.html new file mode 100644 index 0000000..6e1d4b2 --- /dev/null +++ b/scoring/gtscore.html @@ -0,0 +1,47 @@ +Part of Speech Scoring
Agree: 163912
Disagree: 62303
Percentage Right: 72.45850186769223%
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
''UHRBWP$CD#-RRB-FWJJVBG-LRB-MDNNVBPWRBRPINVBDSYM$TO:VBNNNPSRBRVBZDTVBNNSLSJJSRBSNNP<s>,POSCCPRP$JJRPDTWP.WDTEXPRP``
''17490000000000000000000000000000000000100000000000
UH01600000000001000000000000020000020000000000050
RB0785288010013081008843547350050000301300845606631700055060160022910
WP$00049000000000000000000000000000000000000000000
CD0000703500313000382006800005290002003514251001030280000000001880
#00000260000000000000000000000000000000000000000
-RRB-000000317000000000000000000000000000000000000000
FW00200001800000000450000000010000040000000000060
JJ00846033001485108700136093035173790005028381141337504548582042323000132150400350
VBG000000041042539004710007840002110009610000171000000000030
-LRB-000000000031600000000000000000000000000000000000
MD000000000001709158000000000000002600000180000000000000
NN0422802800886687450132023712780141462300134111300271545294310543530201180500460
VBP0270000002600158987001855000021102312930000280000000000000
WRB001000000000004910000000000010000000000000000010
RP0096000051000100321195000000021003000010000000000000
IN0016150000526560005440664112256010000067502732000090003600000680000
VBD00501000750006071003237560000343800422815000121000000000000
SYM00000000000000000013000000010000000000000000000
$0000000000000000000164200000000000000000000000000
TO00000000700000000136900035380000000000000000000000000
:3000000000000000000001155000000000000000000000000
VBN0090000121300027110036121800012859000741392000120000000010040
NNPS0000100010009000200001027700200270001830000000000020
RBR0010000010003200000000000213008001200000001570000000
VBZ00001000100062300221300006010352118069600041001830000000020
DT0852730990012471630001186800147101385000000090902500300298702440140020600259010
VB096011000670094802051014321500001350002431150000220000020000040
NNS002201400138000178800884000281470151917811119700010220020000000150
LS00000000000000000000000000000130030100000000000
JJS00000000100060000000000000200030512640000000000000
RBS000000000000000000000000001000158910000000000000
NNP0038022005933725017529910618970203629438219807853018201870741024200100450
<s>0000000000000000000000000000000009145000000000000
,00000000000010000000000000000000001145400000000000
POS13200000000000000000000000044900000061300457000000004220
CC005900001130001003154202670000000801000130003629003000000
PRP$000000003000000000000000000000003000016620000002100
JJR0040000110006000200000003430812001301000003530000010
PDT001000003000000000000000002100000000000059000000
WP0000000000000000000000000000000000000000443049000
.0000000000000000000000000000000000000000090200000
WDT000000000000030035900000000019000000000000040578000
EX001000000000000000000000000000000000000000020800
PRP0000670090000027000000001003000000210030500000037440
``0000000000000000000000000000000000000000000001806
\ No newline at end of file diff --git a/src/HMM.java b/src/HMM.java index b53a07f..d7c7462 100644 --- a/src/HMM.java +++ b/src/HMM.java @@ -22,10 +22,16 @@ public static void main(String[] args){ HashMap> wordCounts; HashMap> tagBigramCounts; HashMap> tagForWordCounts; + HashMap> goodTuringTagBigramCounts; + HashMap goodTuringTagUnigramCounts; + HashMap numberOfBigramsWithCount; + boolean goodTuringCountsAvailable = false; + int numTrainingBigrams; String mostFreqTag; FileWriter writer; final boolean ADDONE = true; + final boolean GOODTURING = false; public HMM(HMMParser p){ this.tagCounts = p.tagCounts; @@ -33,6 +39,11 @@ public HMM(HMMParser p){ this.tagBigramCounts = p.tagBigramCounts; this.tagForWordCounts = p.tagForWordCounts; this.mostFreqTag = p.mostFreqTag; + + this.goodTuringTagBigramCounts = new HashMap>(); + this.goodTuringTagUnigramCounts = new HashMap(); + this.numberOfBigramsWithCount = new HashMap(); + this.numTrainingBigrams = p.numTrainingBigrams; try { writer = new FileWriter(new File("data/output.pos")); } catch (Exception e) { @@ -51,6 +62,57 @@ private int counts(HashMap> map, String key1, St return (map.containsKey(key1))? counts(map.get(key1), key2) : 0; } + //returns map[key] + private double counts(HashMap map, String key){ + return (map.containsKey(key)) ? map.get(key) : 0.0; + } + + //returns map[key1][key2] + private double counts(HashMap> map, String key1, String key2){ + return (map.containsKey(key1))? counts(map.get(key1), key2) : 0.0; + } + + private int numberOfBigramsWithCount(int count){ + if (numberOfBigramsWithCount.containsKey(count)) { + return numberOfBigramsWithCount.get(count); + } else { + return 0; + } + } + + private void makeGoodTuringCounts(){ + // Fill numberOfBigramsWithCount + for (String tag1 : tagBigramCounts.keySet()) { + HashMap innerMap = tagBigramCounts.get(tag1); + for (String tag2 : innerMap.keySet()) { + int count = innerMap.get(tag2); + if (numberOfBigramsWithCount.containsKey(count)) { + numberOfBigramsWithCount.put(count, 1+numberOfBigramsWithCount.get(count)); + } else { + numberOfBigramsWithCount.put(count, 1); + } + } + } + + // Fill goodTuringTagBigramCounts + for (String tag1 : tagBigramCounts.keySet()) { + HashMap innerMap = tagBigramCounts.get(tag1); + HashMap innerGTMap = new HashMap(); + goodTuringTagBigramCounts.put(tag1, innerGTMap); + + double unigramCount = 0; + for (String tag2 : innerMap.keySet()) { + int count = innerMap.get(tag2); + // c* = (c+1) * N(c+1) / N(c) + double newCount = ((double)count+1.0)*((double)numberOfBigramsWithCount(count+1))/((double)numberOfBigramsWithCount(count)); + innerGTMap.put(tag2, newCount); + unigramCount += newCount; + } + goodTuringTagUnigramCounts.put(tag1, unigramCount); + } + goodTuringCountsAvailable = true; + } + /* * Calculates P(word|tag) */ @@ -58,6 +120,8 @@ public double calcLikelihood(String tag, String word){ if(ADDONE){ int vocabSize = tagForWordCounts.keySet().size(); return (double) (counts(wordCounts,tag,word)+1) / (double) (counts(tagCounts,tag)+vocabSize); + } else if(GOODTURING) { + return (double) counts(wordCounts,tag,word) / (double) counts(tagCounts,tag); } else { return (double) counts(wordCounts,tag,word) / (double) counts(tagCounts,tag); } @@ -70,6 +134,19 @@ public double calcPriorProb(String tag1, String tag2){ if(ADDONE) { int vocabSize = tagCounts.keySet().size(); return (double) (counts(tagBigramCounts,tag1,tag2)+1) / (double) (counts(tagCounts,tag1)+vocabSize); + } else if(GOODTURING) { + if(!goodTuringCountsAvailable) { + System.out.println("Making good turing counts..."); + makeGoodTuringCounts(); + System.out.println("Done making good turing counts."); + } + double gtcount = counts(goodTuringTagBigramCounts, tag1, tag2); + // If this bigram has occurred, return good turing probability + if (gtcount > 0.0) { + return gtcount / counts(goodTuringTagUnigramCounts, tag1); + } + // Otherwise, return N1/N as per book (page 101) + return numberOfBigramsWithCount(1) / (double)numTrainingBigrams; } else { return (double) counts(tagBigramCounts,tag1,tag2) / (double) counts(tagCounts,tag1); } diff --git a/src/HMMParser.java b/src/HMMParser.java index cf1ab3c..7da9dbb 100644 --- a/src/HMMParser.java +++ b/src/HMMParser.java @@ -21,6 +21,7 @@ public static void main(String[] args){ HashMap> tagForWordCounts = new HashMap>(); String mostFreqTag = ""; int mostFreqTagCount = 0; + int numTrainingBigrams = 0; public HMMParser(String filename){ @@ -50,6 +51,8 @@ public void parseTrainer(){ mostFreqTag = currentTag; } + numTrainingBigrams++; + prevTag = currentTag; } }