Permalink
Browse files

Adding evaluation to DB indexing.

  • Loading branch information...
1 parent d8ac36f commit 33cb9a5eeb858339ece1acd1ae9b41e9d6a549b8 @jodaiber jodaiber committed Apr 5, 2013
View
@@ -131,7 +131,7 @@ echo "Loading Wikipedia dump into HDFS..."
if [ eval = "" ]; then # NOTE(review): compares the literal word "eval" with "", which is never true, so the else branch always runs; probably meant "$eval" - confirm against the option parsing
curl -# "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | hadoop fs -put - ${LANGUAGE}wiki-latest-pages-articles.xml
else # same download, but piped through split_train_test.py before the HDFS upload
- curl -# "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_WDIR/pig/utilities/split_train_test.py 0.02 $WDIR/heldout.txt | hadoop fs -put - ${LANGUAGE}wiki-latest-pages-articles.xml
+ curl -# "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_WDIR/pig/utilities/split_train_test.py 12000 $WDIR/heldout.txt | hadoop fs -put - ${LANGUAGE}wiki-latest-pages-articles.xml
fi
#Load the stopwords into HDFS:
@@ -1,7 +1,7 @@
package org.dbpedia.spotlight.db.tokenize
import org.dbpedia.spotlight.model.{TokenType, Feature, Token, Text}
-import org.dbpedia.spotlight.db.model.{TextTokenizer, Stemmer, TokenTypeStore}
+import org.dbpedia.spotlight.db.model.{StringTokenizer, TextTokenizer, Stemmer, TokenTypeStore}
abstract class BaseTextTokenizer(tokenTypeStore: TokenTypeStore, stemmer: Stemmer) extends TextTokenizer {
@@ -15,6 +15,6 @@ abstract class BaseTextTokenizer(tokenTypeStore: TokenTypeStore, stemmer: Stemme
protected def getStemmedTokenType(token: String): TokenType = tokenTypeStore.getTokenType(stemmer.stem(token))
- def getStringTokenizer: BaseStringTokenizer
+ def getStringTokenizer: StringTokenizer
}
@@ -81,7 +81,22 @@ object EvalSpotter {
}
}
- def evalSpotting(annotatedTextSource: AnnotatedTextSource,
+ def evalSpotter(annotatedTextSource: AnnotatedTextSource,
+ spotter: Spotter,
+ expected: Traversable[SurfaceFormOccurrence]) {
+ // Evaluates an already-built spotter: runs it over every paragraph of annotatedTextSource and hands the expected and extracted occurrences to printResults.
+ // Run spotting: merge the occurrences extracted from each paragraph into a single Set.
+ var actual = Set[SurfaceFormOccurrence]() // starts empty; reassigned on every iteration
+ for (paragraph <- annotatedTextSource) {
+ actual = JavaConversions.asScalaBuffer(spotter.extract(paragraph.text)).toSet union actual // extract() presumably returns a java.util.List, hence the conversion - TODO confirm
+ }
+
+ // Compare expected against actual; the label passed to printResults names the spotter and the corpus.
+ printResults("%s and corpus %s".format(spotter.getName, annotatedTextSource.name), expected, actual)
+ }
+
+
+ private def evalSpotting(annotatedTextSource: AnnotatedTextSource,
indexSpotter: Traversable[SurfaceForm] => Spotter,
expected: Traversable[SurfaceFormOccurrence]) {
// index spotter
@@ -10,7 +10,7 @@ class EvaluateSpotlightModel {
def main(args: Array[String]) {
- val model = SpotlightModel.fromFolder(args(1))
+ val model = SpotlightModel.fromFolder(new File(args(1)))
val heldout = new File(args(2))
val corpus = WikipediaHeldoutCorpus.fromFile(heldout)
@@ -20,10 +20,10 @@ class EvaluateSpotlightModel {
//Spotting:
val expected = EvalSpotter.getExpectedResult(corpus)
- EvalSpotter.evalSpotting(corpus, spotter, expected)
+ EvalSpotter.evalSpotter(corpus, spotter, expected)
//Disambiguation
- EvaluateParagraphDisambiguator.evaluate(corpus, disambiguator, List(), List())
+ EvaluateParagraphDisambiguator.evaluate(corpus, disambiguator.disambiguator, List(), List())
}

0 comments on commit 33cb9a5

Please sign in to comment.