
Added tuning for OpenNLP Spotter, SpotlightModel for running a DB-backed server.
1 parent b3655b8, commit 48fc19d9002c496a3c39c816dce3816b885fcbf4, @jodaiber committed on Jan 8, 2013
6 core/pom.xml
@@ -242,12 +242,18 @@
<artifactId>spotter-aho</artifactId>
<version>0.1</version>
</dependency>
+
<dependency>
<groupId>org.scalaz</groupId>
<artifactId>scalaz-core_${scala.compiler.version}</artifactId>
<version>6.0.4</version>
</dependency>
+ <dependency>
+ <groupId>org.scalanlp</groupId>
+ <artifactId>breeze-learn_2.9.2</artifactId>
+ </dependency>
+
</dependencies>
</project>
1 core/src/main/scala/org/dbpedia/spotlight/db/DBTwoStepDisambiguator.scala
@@ -165,6 +165,7 @@ class DBTwoStepDisambiguator(
.map( kv =>
kv._2.head)
.toList
+ .sortBy(_.textOffset)
}
def name = "Database-backed 2 Step disambiguator (%s, %s)".format(contextSimilarity.getClass.getSimpleName, mixture.toString)
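
The added sortBy returns the disambiguated occurrences in document order rather than in the arbitrary order left by the preceding grouping step. A self-contained sketch of the effect (Occ is a hypothetical stand-in for DBpediaResourceOccurrence, reduced to the sort key):

    case class Occ(textOffset: Int)
    val unordered = List(Occ(42), Occ(7), Occ(19))
    unordered.sortBy(_.textOffset)  // List(Occ(7), Occ(19), Occ(42))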
94 core/src/main/scala/org/dbpedia/spotlight/db/DefaultAnnotator.scala
@@ -1,94 +0,0 @@
-/**
- * Copyright 2011 Pablo Mendes, Max Jakob
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.dbpedia.spotlight.db
-
-import org.apache.commons.logging.LogFactory
-import org.dbpedia.spotlight.model._
-import org.dbpedia.spotlight.spot.Spotter
-import org.dbpedia.spotlight.exceptions.InputException
-import scala.collection.JavaConversions._
-import org.dbpedia.spotlight.disambiguate.{ParagraphDisambiguatorJ, Disambiguator}
-import org.dbpedia.spotlight.db.model.Tokenizer
-import org.dbpedia.spotlight.annotate.{ParagraphAnnotator, Annotator}
-
-/**
- * Annotates a text with DBpedia Resources.
- * This is just an example of how to wire the steps in our pipeline with default configurations.
- *
- * @author maxjakob, pablomendes
- */
-class DefaultAnnotator(val tokenizer: Tokenizer, val spotter : Spotter, val disambiguator: Disambiguator) extends Annotator {
-
- private val LOG = LogFactory.getLog(this.getClass)
-
- def this(spotter: Spotter, disambiguator: Disambiguator) {
- this(null, spotter, disambiguator)
- }
-
- @throws(classOf[InputException])
- def annotate(text : String) : java.util.List[DBpediaResourceOccurrence] = {
-
- val textObject = new Text(text)
-
- if (tokenizer != null) {
- LOG.info("Tokenizing input text...")
- val tokens = tokenizer.tokenize(textObject)
- textObject.setFeature(new Feature("tokens", tokens))
- }
-
- LOG.info("Spotting... ("+spotter.getName()+")")
- val spottedSurfaceForms : java.util.List[SurfaceFormOccurrence] = spotter.extract(textObject)
-
- LOG.info("Disambiguating... ("+disambiguator.name+")")
- val disambiguatedOccurrences : java.util.List[DBpediaResourceOccurrence] = disambiguator.disambiguate(spottedSurfaceForms)
-
- LOG.info("Done.")
- disambiguatedOccurrences
- }
-
-}
-
-class DefaultParagraphAnnotator(val tokenizer: Tokenizer, val spotter: Spotter, val disambiguator: ParagraphDisambiguatorJ) extends ParagraphAnnotator {
-
- def this(spotter: Spotter, disambiguator: ParagraphDisambiguatorJ) {
- this(null, spotter, disambiguator)
- }
-
- private val LOG = LogFactory.getLog(this.getClass)
-
- @throws(classOf[InputException])
- def annotate(text : String) : java.util.List[DBpediaResourceOccurrence] = {
-
- val textObject = new Text(text)
-
- if (tokenizer != null) {
- LOG.info("Tokenizing input text...")
- val tokens = tokenizer.tokenize(textObject)
- textObject.setFeature(new Feature("tokens", tokens))
- }
-
- LOG.info("Spotting... ("+spotter.getName()+")")
- val spottedSurfaceForms : List[SurfaceFormOccurrence] = asBuffer(spotter.extract(textObject)).toList
-
- LOG.info("Disambiguating... ("+disambiguator.name+")")
- val disambiguatedOccurrences : java.util.List[DBpediaResourceOccurrence] = disambiguator.disambiguate(Factory.Paragraph.from(spottedSurfaceForms))
-
- LOG.info("Done.")
- disambiguatedOccurrences
- }
-
-}
3 core/src/main/scala/org/dbpedia/spotlight/db/DefaultTokenizer.scala
@@ -9,9 +9,6 @@ import opennlp.tools.util.Span
/**
* @author Joachim Daiber
- *
- *
- *
*/
class DefaultTokenizer(
85 core/src/main/scala/org/dbpedia/spotlight/db/SpotlightModel.scala
@@ -0,0 +1,85 @@
+package org.dbpedia.spotlight.db
+
+import memory.MemoryStore
+import model._
+import org.tartarus.snowball.SnowballProgram
+import opennlp.tools.tokenize.{TokenizerModel, TokenizerME}
+import opennlp.tools.sentdetect.{SentenceModel, SentenceDetectorME}
+import opennlp.tools.postag.{POSModel, POSTaggerME}
+import org.dbpedia.spotlight.disambiguate.mixtures.UnweightedMixture
+import similarity.GenerativeContextSimilarity
+import org.dbpedia.spotlight.spot.opennlp.OpenNLPChunkerSpotterDB
+import scala.collection.JavaConverters._
+import org.dbpedia.spotlight.model.SpotterConfiguration.SpotterPolicy
+import org.dbpedia.spotlight.model.SpotlightConfiguration.DisambiguationPolicy
+import org.dbpedia.spotlight.disambiguate.ParagraphDisambiguatorJ
+import org.dbpedia.spotlight.spot.Spotter
+import java.io.{File, FileInputStream}
+import java.util.Properties
+import breeze.linalg.DenseVector
+
+class SpotlightModel(val tokenizer: Tokenizer,
+ val spotters: java.util.Map[SpotterPolicy, Spotter],
+ val disambiguators: java.util.Map[DisambiguationPolicy, ParagraphDisambiguatorJ],
+ val properties: Properties)
+
+object SpotlightModel {
+
+ def loadStopwords(modelFolder: File): Set[String] = scala.io.Source.fromFile(new File(modelFolder, "stopwords.list")).getLines().map(_.trim()).toSet
+ def loadSpotterThresholds(file: File): Seq[Double] = scala.io.Source.fromFile(file).getLines().next().split(" ").map(_.toDouble)
+
+ def fromFolder(modelFolder: File): SpotlightModel = {
+
+ val modelDataFolder = new File(modelFolder, "model")
+ val tokenTypeStore = MemoryStore.loadTokenTypeStore(new FileInputStream(new File(modelDataFolder, "tokens.mem")))
+ val sfStore = MemoryStore.loadSurfaceFormStore(new FileInputStream(new File(modelDataFolder, "sf.mem")))
+ val resStore = MemoryStore.loadResourceStore(new FileInputStream(new File(modelDataFolder, "res.mem")))
+ val candMapStore = MemoryStore.loadCandidateMapStore(new FileInputStream(new File(modelDataFolder, "candmap.mem")), resStore)
+ val contextStore = MemoryStore.loadContextStore(new FileInputStream(new File(modelDataFolder, "context.mem")), tokenTypeStore)
+
+ val stopwords = loadStopwords(modelFolder)
+
+ val properties = new Properties()
+ properties.load(new FileInputStream(new File(modelFolder, "model.properties")))
+
+ //Load the stemmer from the model file:
+ val stemmer: SnowballProgram = properties.getProperty("stemmer") match {
+ case s: String if s equals "None" => null
+ case s: String => Class.forName("org.tartarus.snowball.ext.%s".format(s)).newInstance().asInstanceOf[SnowballProgram]
+ }
+
+ //Create the tokenizer:
+ val tokenizer: Tokenizer = new DefaultTokenizer(
+ new TokenizerME(new TokenizerModel(new FileInputStream(new File(modelFolder, "opennlp/token.bin")))),
+ stopwords,
+ stemmer,
+ new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelFolder, "opennlp/sent.bin")))),
+ new POSTaggerME(new POSModel(new FileInputStream(new File(modelFolder, "opennlp/pos-maxent.bin")))),
+ tokenTypeStore
+ )
+
+ val searcher = new DBCandidateSearcher(resStore, sfStore, candMapStore)
+ val disambiguator = new ParagraphDisambiguatorJ(new DBTwoStepDisambiguator(
+ tokenTypeStore,
+ sfStore,
+ resStore,
+ searcher,
+ contextStore,
+ new UnweightedMixture(Set("P(e)", "P(c|e)", "P(s|e)")),
+ new GenerativeContextSimilarity(tokenTypeStore)
+ ))
+
+ val spotter = new OpenNLPChunkerSpotterDB(
+ new FileInputStream(new File(modelFolder, "opennlp/chunker.bin")),
+ sfStore,
+ stopwords,
+ Some(loadSpotterThresholds(new File(modelFolder, "opennlp_chunker_thresholds.txt"))),
+ Set("NP", "MWU", "PP"), "N"
+ ).asInstanceOf[Spotter]
+
+ val spotters: java.util.Map[SpotterPolicy, Spotter] = Map(SpotterPolicy.Default -> spotter).asJava
+ val disambiguators: java.util.Map[DisambiguationPolicy, ParagraphDisambiguatorJ] = Map(DisambiguationPolicy.Default -> disambiguator).asJava
+
+ new SpotlightModel(tokenizer, spotters, disambiguators, properties)
+ }
+}
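
The new SpotlightModel.fromFolder wires the memory stores, the OpenNLP tokenizer, the chunker-based spotter and the two-step disambiguator from a single model folder, so a server can be started from one directory. A minimal usage sketch, assuming a model folder produced by CreateSpotlightModel (the path and the example class are hypothetical):

    import java.io.File
    import org.dbpedia.spotlight.db.SpotlightModel
    import org.dbpedia.spotlight.model.Text
    import org.dbpedia.spotlight.model.SpotterConfiguration.SpotterPolicy

    object RunModelExample {
      def main(args: Array[String]) {
        val model = SpotlightModel.fromFolder(new File("/data/spotlight/models/en"))  // hypothetical path
        val text = new Text("Berlin is the capital of Germany.")
        model.tokenizer.tokenizeMaybe(text)                      // attaches the "tokens" feature once
        val spotter = model.spotters.get(SpotterPolicy.Default)  // the OpenNLP chunker spotter
        val spots = spotter.extract(text)
        System.err.println("Found %d spots.".format(spots.size()))
      }
    }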
8 core/src/main/scala/org/dbpedia/spotlight/db/model/Tokenizer.scala
@@ -1,6 +1,6 @@
package org.dbpedia.spotlight.db.model
-import org.dbpedia.spotlight.model.{Token, Text}
+import org.dbpedia.spotlight.model.{Feature, Token, Text}
/**
@@ -11,7 +11,7 @@ import org.dbpedia.spotlight.model.{Token, Text}
* @author Joachim Daiber
*/
-trait Tokenizer {
+abstract class Tokenizer {
/**
* Tokenize the text, return the Token objects. Features may be assigned to the [[org.dbpedia.spotlight.model.Text]]
@@ -21,4 +21,8 @@ trait Tokenizer {
*/
def tokenize(text: Text): List[Token]
+ def tokenizeMaybe(text: Text) {
+ if(text.feature("tokens").isEmpty)
+ text.setFeature(new Feature("tokens", tokenize(text)))
+ }
}
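
Tokenizer changes from a trait to an abstract class so it can carry the concrete helper tokenizeMaybe, which runs the tokenizer only if the Text does not already have a "tokens" feature. A sketch of the intended call pattern (tok stands for any concrete Tokenizer, e.g. a DefaultTokenizer):

    import org.dbpedia.spotlight.model.{Text, Token}

    val text = new Text("Some input text.")
    tok.tokenizeMaybe(text)  // tokenizes and sets the "tokens" feature
    tok.tokenizeMaybe(text)  // no-op: the feature already exists
    val tokens = text.featureValue[List[Token]]("tokens").get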
60 core/src/main/scala/org/dbpedia/spotlight/spot/opennlp/OpenNLPChunkerSpotterDB.scala
@@ -4,12 +4,13 @@ package org.dbpedia.spotlight.spot.opennlp
import org.dbpedia.spotlight.spot.Spotter
import java.util.LinkedList
import scala.util.control.Breaks._
-import java.io.{InputStream, FileInputStream}
+import java.io.InputStream
import org.dbpedia.spotlight.model.{Token, SurfaceForm, SurfaceFormOccurrence, Text}
import collection.mutable.ListBuffer
import opennlp.tools.chunker.{ChunkerModel, ChunkerME, Chunker}
import org.dbpedia.spotlight.exceptions.SurfaceFormNotFoundException
import org.dbpedia.spotlight.db.model.SurfaceFormStore
+import breeze.linalg.DenseVector
/**
@@ -25,33 +26,40 @@ class OpenNLPChunkerSpotterDB(
chunkerModel: InputStream,
surfaceFormStore: SurfaceFormStore,
stopwords: Set[String],
- npTag: String = "NP",
+ spotFeatureWeights: Option[Seq[Double]],
+ phraseTags: Set[String] = Set("NP"),
nnTag: String = "NN"
) extends Spotter {
- val MIN_ANNOTATION_PROBABILITY = 0.5
+ val spotFeatureWeightVector: Option[DenseVector[Double]] = spotFeatureWeights match {
+ case Some(w) => Some(DenseVector(w.toArray:_*))
+ case None => None
+ }
val chunker: Chunker =
new ChunkerME(new ChunkerModel(chunkerModel))
def extract(text: Text): java.util.List[SurfaceFormOccurrence] = {
val spots = new LinkedList[SurfaceFormOccurrence]
- val sentences: List[List[Token]] = tokensToSentences(text.feature("tokens").asInstanceOf[List[Token]])
+ val sentences: List[List[Token]] = tokensToSentences(text.featureValue[List[Token]]("tokens").get)
//Go through all sentences
sentences.foreach{ sentence: List[Token] =>
val tokens = sentence.map(_.token).toArray
val tags = sentence.map(_.featureValue[String]("pos").get).toArray
//Go through all chunks
+ //System.err.println(chunker.chunkAsSpans(tokens, tags).map(_.getType).mkString(","))
chunker.chunkAsSpans(tokens, tags)
//Only look at NPs
- .filter(chunkSpan => chunkSpan.getType.equals(npTag))
+ .filter(chunkSpan => phraseTags.contains(chunkSpan.getType))
.foreach(chunkSpan => {
breakable {
+
val firstToken = chunkSpan.getStart
val lastToken = chunkSpan.getEnd-1
+ System.err.println("Chunk:" + tokens.slice(firstToken, lastToken+1).mkString(" ") )
//Taking away a left member in each step, look for the longest sub-chunk in the SF dictionary
(firstToken to lastToken).foreach(startToken => {
@@ -62,7 +70,7 @@ class OpenNLPChunkerSpotterDB(
if (surfaceFormMatch(spot)) {
if ( !((lastToken == startToken) && !tags(startToken).toUpperCase.startsWith(nnTag) || stopwords.contains(spot.toLowerCase))) {
//The sub-chunk is in the dictionary, finish the processing of this chunk
- spots.add(new SurfaceFormOccurrence(new SurfaceForm(spot), text, startOffset))
+ spots.add(new SurfaceFormOccurrence(surfaceFormStore.getSurfaceForm(spot), text, startOffset))
break()
}
}
@@ -75,14 +83,6 @@ class OpenNLPChunkerSpotterDB(
spots
}
- private def surfaceFormMatch(spot: String): Boolean = {
- try {
- val sf = surfaceFormStore.getSurfaceForm(spot)
- sf.annotationProbability >= MIN_ANNOTATION_PROBABILITY
- } catch {
- case e: SurfaceFormNotFoundException => false
- }
- }
private var name = "Spotter based on an OpenNLP NP chunker and a simple spot dictionary."
def getName = name
@@ -110,4 +110,36 @@ class OpenNLPChunkerSpotterDB(
sentences.toList
}
+ private def surfaceFormMatch(spot: String): Boolean = {
+ try {
+ spotFeatureWeightVector match {
+ case Some(weights) => {
+ System.err.println("Checking %s: %s, %s".format(spot,
+ OpenNLPChunkerSpotterDB.spotFeatures(surfaceFormStore.getSurfaceForm(spot)).toString,
+ (weights dot OpenNLPChunkerSpotterDB.spotFeatures(surfaceFormStore.getSurfaceForm(spot))).toString))
+
+ (weights dot OpenNLPChunkerSpotterDB.spotFeatures(surfaceFormStore.getSurfaceForm(spot))) > 0.45 //we are being generous!
+ }
+ case None => surfaceFormStore.getSurfaceForm(spot) != null
+ }
+ } catch {
+ case e: SurfaceFormNotFoundException => false
+ }
+ }
+
+}
+
+object OpenNLPChunkerSpotterDB {
+ def spotFeatures(spot: SurfaceForm): DenseVector[Double] =
+ DenseVector(
+ //Annotation probability:
+ spot.annotationProbability,
+
+ //Abbreviations:
+ if(spot.name.toUpperCase.equals(spot.name) && spot.name.size < 5) 1.0 else 0.0,
+
+ //Bias:
+ 1.0
+ )
+
}
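
The fixed MIN_ANNOTATION_PROBABILITY cutoff is replaced by a learned linear model: spotFeatures maps a surface form to the vector (annotation probability, abbreviation indicator, bias), and surfaceFormMatch accepts a spot when the dot product with the tuned weights exceeds 0.45. A minimal sketch of that decision rule, with hypothetical weights:

    import breeze.linalg.DenseVector

    // Hypothetical weights, in the order produced by spotFeatures:
    // (annotation probability, abbreviation indicator, bias)
    val weights = DenseVector(1.2, 0.3, -0.2)

    def accept(features: DenseVector[Double]): Boolean =
      (weights dot features) > 0.45  // the same "generous" threshold as in surfaceFormMatch

    // A non-abbreviation spot with annotation probability 0.6:
    accept(DenseVector(0.6, 0.0, 1.0))  // true: 1.2*0.6 + 0.3*0.0 - 0.2*1.0 = 0.52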
5 eval/src/main/scala/org/dbpedia/spotlight/evaluation/EvaluateParagraphDisambiguator.scala
@@ -23,12 +23,13 @@ import org.dbpedia.spotlight.disambiguate._
import java.io.{PrintWriter, File}
import org.dbpedia.spotlight.corpus.{PredoseCorpus, MilneWittenCorpus, AidaCorpus}
-import scalaj.collection.Imports._
-
import org.dbpedia.spotlight.model._
import org.dbpedia.spotlight.filter.occurrences.{UriWhitelistFilter, RedirectResolveFilter, OccurrenceFilter}
import scala.Some
+import scalaj.collection.Imports._
+
+
/**
* Evaluation for disambiguators that take one paragraph at a time, instead of one occurrence at a time.
*
6 index/pom.xml
@@ -232,6 +232,12 @@
<artifactId>commons-compress</artifactId>
<version>1.4.1</version>
</dependency>
+
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+
<dependency>
<groupId>weka</groupId>
<artifactId>weka</artifactId>
178 index/src/main/scala/org/dbpedia/spotlight/db/CreateSpotlightModel.scala
@@ -0,0 +1,178 @@
+package org.dbpedia.spotlight.db
+
+import io._
+import java.io.{FileOutputStream, FileInputStream, File}
+import memory.MemoryStore
+import model.Tokenizer
+import scala.io.Source
+import org.tartarus.snowball.SnowballProgram
+import java.util.Properties
+import org.dbpedia.spotlight.io.WikipediaHeldoutCorpus
+import org.apache.commons.io.FileUtils
+import opennlp.tools.tokenize.{TokenizerModel, TokenizerME}
+import opennlp.tools.sentdetect.{SentenceModel, SentenceDetectorME}
+import opennlp.tools.postag.{POSModel, POSTaggerME}
+import org.dbpedia.spotlight.spot.opennlp.OpenNLPChunkerSpotterDB
+
+/**
+ * @author Joachim Daiber
+ */
+
+object CreateSpotlightModel {
+
+ def main(args: Array[String]) {
+
+ val (rawDataFolder: File, outputFolder: File, opennlpFolder: File, stopwordsFile: File, stemmer: SnowballProgram) = try {
+ (
+ new File(args(0)),
+ new File(args(1)),
+ new File(args(2)),
+ new File(args(3)),
+ if (args(4) equals "None") null else Class.forName("org.tartarus.snowball.ext.%s".format(args(4))).newInstance()
+ )
+ } catch {
+ case e: Exception => {
+ e.printStackTrace()
+ System.err.println("Usage:")
+ System.err.println(" - English: mvn scala:run -DmainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args=\"/data/input /data/output /data/opennlp /data/stopwords.list EnglishStemmer\"")
+ System.err.println(" - no stemmer: mvn scala:run -DmainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args=\"/data/input /data/output /data/opennlp /data/stopwords.list None\"")
+ System.exit(1)
+ }
+ }
+
+ if(!outputFolder.mkdir()) {
+ System.err.println("Folder %s already exists, I am too afraid to overwrite it!".format(outputFolder.toString))
+ System.exit(1)
+ }
+
+ FileUtils.copyFile(stopwordsFile, new File(outputFolder, "stopwords.list"))
+
+ val opennlpModels = opennlpFolder.listFiles()
+ def getModel(name: String) = opennlpModels.filter(_.getName.endsWith(name)).headOption
+
+ val opennlpOut = new File(outputFolder, "opennlp")
+ opennlpOut.mkdir()
+
+ try {
+ FileUtils.copyFile(getModel("-sent.bin").get, new File(opennlpOut, "sent.bin"))
+ FileUtils.copyFile(getModel("-token.bin").get, new File(opennlpOut, "token.bin"))
+ FileUtils.copyFile(getModel("-pos-maxent.bin").get, new File(opennlpOut, "pos-maxent.bin"))
+
+ getModel("-chunker.bin") match {
+ case Some(model) => FileUtils.copyFile(model, new File(opennlpOut, "chunker.bin"))
+ case _ =>
+ }
+ } catch {
+ case _: Exception => {
+ System.err.println(
+ """Problem with OpenNLP models:
+ | You need to have at least the following model files in your opennlp folder:
+ | *-sent.bin
+ | *-token.bin
+ | *-pos-maxent.bin
+ |
+ | For the best result, you should also have:
+ | *-chunker.bin
+ """.stripMargin)
+ System.exit(1)
+ }
+ }
+ //TODO: add NER
+
+ //Set default properties
+ val defaultProperties = new Properties()
+ defaultProperties.setProperty("stemmer", args(4))
+
+ defaultProperties.store(new FileOutputStream(new File(outputFolder, "model.properties")), null)
+
+ //Create models:
+ val modelDataFolder = new File(outputFolder, "model")
+ modelDataFolder.mkdir()
+
+ val memoryIndexer = new MemoryStoreIndexer(modelDataFolder)
+ //val diskIndexer = new JDBMStoreIndexer(new File("data/"))
+
+ val wikipediaToDBpediaClosure = new WikipediaToDBpediaClosure(
+ new FileInputStream(new File(rawDataFolder, "redirects.nt")),
+ new FileInputStream(new File(rawDataFolder, "disambiguations.nt"))
+ )
+
+ memoryIndexer.addResources(
+ DBpediaResourceSource.fromPigFiles(
+ wikipediaToDBpediaClosure,
+ new File(rawDataFolder, "uriCounts"),
+ null //new File("raw_data/pig/instanceTypes.tsv")
+ )
+ )
+
+ val resStore = MemoryStore.loadResourceStore(new FileInputStream(new File(modelDataFolder, "res.mem")))
+
+ memoryIndexer.addSurfaceForms(
+ SurfaceFormSource.fromPigFiles(
+ new File(rawDataFolder, "sfAndTotalCounts"),
+ wikiClosure=wikipediaToDBpediaClosure,
+ resStore
+ )
+ )
+
+ val sfStore = MemoryStore.loadSurfaceFormStore(new FileInputStream(new File(modelDataFolder, "sf.mem")))
+
+ memoryIndexer.addCandidatesByID(
+ CandidateMapSource.fromPigFiles(
+ new File(rawDataFolder, "pairCounts"),
+ wikipediaToDBpediaClosure,
+ resStore,
+ sfStore
+ ),
+ sfStore.size
+ )
+
+ memoryIndexer.addTokenTypes(
+ TokenSource.fromPigFile(
+ new File(rawDataFolder, "token_counts")
+ )
+ )
+
+ val tokenStore = MemoryStore.loadTokenTypeStore(new FileInputStream(new File(modelDataFolder, "tokens.mem")))
+
+ memoryIndexer.createContextStore(resStore.size)
+ memoryIndexer.addTokenOccurrences(
+ TokenOccurrenceSource.fromPigFile(
+ new File(rawDataFolder, "token_counts"),
+ tokenStore,
+ wikipediaToDBpediaClosure,
+ resStore
+ )
+ )
+ memoryIndexer.writeTokenOccurrences()
+
+ val stopwords = SpotlightModel.loadStopwords(outputFolder)
+
+ //Tune Spotter:
+ val tokenizer: Tokenizer = new DefaultTokenizer(
+ new TokenizerME(new TokenizerModel(new FileInputStream(new File(opennlpOut, "token.bin")))),
+ stopwords,
+ stemmer,
+ new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(opennlpOut, "sent.bin")))),
+ new POSTaggerME(new POSModel(new FileInputStream(new File(opennlpOut, "pos-maxent.bin")))),
+ tokenStore
+ )
+
+ val spotter = new OpenNLPChunkerSpotterDB(
+ new FileInputStream(new File(opennlpOut, "chunker.bin")),
+ sfStore,
+ stopwords,
+ None,
+ "NP", "N"
+ )
+
+ SpotterTuner.tuneOpenNLP(
+ new WikipediaHeldoutCorpus(Source.fromFile(new File(rawDataFolder, "test.txt")).getLines()),
+ tokenizer,
+ spotter,
+ new File(outputFolder, "opennlp_chunker_thresholds.txt")
+ )
+
+ }
+
+}
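
CreateSpotlightModel reads all of its input from the raw-data folder passed as the first argument; based on the files referenced above, a run needs roughly this layout (the folder name is the one from the usage message):

    /data/input/
      redirects.nt            Wikipedia redirects
      disambiguations.nt      disambiguation pages
      uriCounts               Pig output: resource counts
      sfAndTotalCounts        Pig output: surface form counts
      pairCounts              Pig output: (surface form, resource) counts
      token_counts            Pig output: token counts per resource
      test.txt                held-out Wikipedia corpus for spotter tuning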
81 index/src/main/scala/org/dbpedia/spotlight/db/ImportPig.scala
@@ -1,81 +0,0 @@
-package org.dbpedia.spotlight.db
-
-import io._
-import java.io.{FileInputStream, File}
-import memory.MemoryStore
-import model.ResourceStore
-
-/**
- * @author Joachim Daiber
- *
- *
- *
- */
-
-object ImportPig {
-
- def main(args: Array[String]) {
- val rawDataFolder = new File("/data/spotlight/processed/")
- val modelDataFolder = new File("/data/spotlight/models/")
-
- val memoryIndexer = new MemoryStoreIndexer(modelDataFolder)
- //val diskIndexer = new JDBMStoreIndexer(new File("data/"))
-
- val wikipediaToDBpediaClosure = new WikipediaToDBpediaClosure(
- new FileInputStream(new File(rawDataFolder, "pig/redirects.nt")),
- new FileInputStream(new File(rawDataFolder, "pig/disambiguations.nt"))
- )
-
- memoryIndexer.addResources(
- DBpediaResourceSource.fromPigFiles(
- wikipediaToDBpediaClosure,
- new File(rawDataFolder, "pig/uriCounts"),
- null //new File("raw_data/pig/instanceTypes.tsv")
- )
- )
-
- val resStore = MemoryStore.loadResourceStore(new FileInputStream(new File(modelDataFolder, "res.mem")))
-
- memoryIndexer.addSurfaceForms(
- SurfaceFormSource.fromPigFiles(
- new File(rawDataFolder, "pig/sfAndTotalCounts"),
- wikiClosure=wikipediaToDBpediaClosure,
- resStore
- )
- )
-
- val sfStore = MemoryStore.loadSurfaceFormStore(new FileInputStream(new File(modelDataFolder, "sf.mem")))
-
- memoryIndexer.addCandidatesByID(
- CandidateMapSource.fromPigFiles(
- new File(rawDataFolder, "pig/pairCounts"),
- wikipediaToDBpediaClosure,
- resStore,
- sfStore
- ),
- sfStore.size
- )
-
- memoryIndexer.addTokenTypes(
- TokenSource.fromPigFile(
- new File(rawDataFolder, "pig/token_counts")
- )
- )
-
- val tokenStore = MemoryStore.loadTokenTypeStore(new FileInputStream(new File(modelDataFolder, "tokens.mem")))
-
- memoryIndexer.createContextStore(resStore.size)
- memoryIndexer.addTokenOccurrences(
- TokenOccurrenceSource.fromPigFile(
- new File(rawDataFolder, "pig/token_counts"),
- tokenStore,
- wikipediaToDBpediaClosure,
- resStore
- )
- )
- memoryIndexer.writeTokenOccurrences()
-
-
- }
-
-}
106 index/src/main/scala/org/dbpedia/spotlight/db/SpotterTuner.scala
@@ -0,0 +1,106 @@
+package org.dbpedia.spotlight.db
+
+import memory.MemoryStore
+import model.{Tokenizer, SurfaceFormStore}
+import org.dbpedia.spotlight.spot.opennlp.OpenNLPChunkerSpotterDB.spotFeatures
+import breeze.linalg.{DenseMatrix, DenseVector}
+import org.dbpedia.spotlight.io.AnnotatedTextSource
+import breeze.regress.LinearRegression
+import java.io.{FileInputStream, File}
+import scala.io.Source
+import org.dbpedia.spotlight.model.{SurfaceForm, SurfaceFormOccurrence, AnnotatedParagraph, DBpediaResourceOccurrence}
+import org.dbpedia.spotlight.io.WikipediaHeldoutCorpus
+import org.apache.commons.io.FileUtils
+import org.dbpedia.spotlight.exceptions.SurfaceFormNotFoundException
+import org.dbpedia.spotlight.spot.opennlp.OpenNLPChunkerSpotterDB
+import opennlp.tools.tokenize.{TokenizerModel, TokenizerME}
+import opennlp.tools.sentdetect.{SentenceModel, SentenceDetectorME}
+import opennlp.tools.postag.{POSModel, POSTaggerME}
+import org.dbpedia.spotlight.spot.Spotter
+import org.tartarus.snowball.ext.DutchStemmer
+import scala.collection.JavaConversions._
+
+object SpotterTuner {
+
+ def tuneOpenNLP(corpus: AnnotatedTextSource, tokenizer: Tokenizer, spotter: OpenNLPChunkerSpotterDB, outputFile: File) {
+
+ System.err.println("Tuning Spotter model...")
+
+ val activeCorpus = corpus.take(20000)
+
+ val allSpots = activeCorpus.map{ par: AnnotatedParagraph => {
+ tokenizer.tokenizeMaybe(par.text)
+
+ /* We assume the tuning material comes from Wikipedia, where
+ usually only the first link to an entity is annotated even
+ though all mentions with the same surface form in the article
+ should be annotated. For this reason, we use only the surface
+ form string to identify a spot.
+ */
+ (par.occurrences.map(_.surfaceForm.name).toSet, spotter.extract(par.text))
+ }}
+
+
+ val (nx, ny) = (spotFeatures(new SurfaceForm("test")).activeSize, allSpots.map(_._2.size()).sum)
+
+ val x = DenseMatrix.zeros[Double](ny, nx)
+ val y = DenseVector.zeros[Double](ny)
+
+ var i = 0
+ allSpots.foreach{
+ case(goldSpotSet: Set[String], spots: java.util.List[SurfaceFormOccurrence]) => {
+ spots.foreach{ spot: SurfaceFormOccurrence =>
+ x(i,::) := spotFeatures(spot.surfaceForm).t
+ y(i) = ( if(goldSpotSet.contains(spot.surfaceForm.name)) 1.0 else 0.0 )
+ i += 1
+ }
+ }
+
+ }
+
+ //System.err.println(x)
+
+ FileUtils.write(
+ outputFile,
+ LinearRegression.regress(x, y).activeValuesIterator.mkString(" ")
+ )
+
+ }
+
+
+ def main(args: Array[String]) {
+ System.err.println("Run!")
+
+ val modelFolder = new File("/data/spotlight/models")
+
+ val tokenTypeStore = MemoryStore.loadTokenTypeStore(new FileInputStream(new File(modelFolder, "tokens.mem")))
+ val sfStore = MemoryStore.loadSurfaceFormStore(new FileInputStream(new File(modelFolder, "sf.mem")))
+
+ val stopwords = scala.io.Source.fromFile(new File(modelFolder, "stopwords.list")).getLines().map(_.trim()).toSet
+ val stemmer = new DutchStemmer()
+
+ val tokenizer: Tokenizer = new DefaultTokenizer(
+ new TokenizerME(new TokenizerModel(new FileInputStream(new File(modelFolder, "opennlp/token.bin")))),
+ stopwords,
+ stemmer,
+ new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelFolder, "opennlp/sent.bin")))),
+ new POSTaggerME(new POSModel(new FileInputStream(new File(modelFolder, "opennlp/pos-maxent.bin")))),
+ tokenTypeStore
+ )
+
+ val spotter = new OpenNLPChunkerSpotterDB(
+ new FileInputStream(new File(modelFolder, "opennlp/chunker.bin")),
+ sfStore,
+ stopwords,
+ None,
+ "NP", "N"
+ )
+
+ tuneOpenNLP(
+ new WikipediaHeldoutCorpus(Source.fromFile(new File("/data/spotlight/import/", "nl_test.txt")).getLines()),
+ tokenizer,
+ spotter,
+ new File("/data/spotlight/modelsspotting/chunking/threshholds.txt")
+ )
+ }
+}
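
The tuning step is a least-squares fit: every spot extracted from the held-out corpus contributes a row of spotFeatures to the design matrix x, with y set to 1.0 when the spot's surface form also appears in the gold annotations, and the resulting weights are written out for loadSpotterThresholds. A toy illustration of the fitting step (the data values are made up):

    import breeze.linalg.{DenseMatrix, DenseVector}
    import breeze.regress.LinearRegression

    // Rows: spots; columns: (annotation probability, abbreviation indicator, bias).
    val x = DenseMatrix.zeros[Double](3, 3)
    x(0, ::) := DenseVector(0.9, 0.0, 1.0).t  // frequently annotated, not an abbreviation
    x(1, ::) := DenseVector(0.1, 0.0, 1.0).t  // rarely annotated
    x(2, ::) := DenseVector(0.8, 1.0, 1.0).t  // short abbreviation
    val y = DenseVector(1.0, 0.0, 1.0)        // 1.0 = spot is in the gold annotations

    // The weight line later read by SpotlightModel.loadSpotterThresholds:
    println(LinearRegression.regress(x, y).activeValuesIterator.mkString(" "))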
3 ...light/corpus/WikipediaHeldoutCorpus.scala → ...spotlight/io/WikipediaHeldoutCorpus.scala
@@ -1,6 +1,5 @@
-package org.dbpedia.spotlight.corpus
+package org.dbpedia.spotlight.io
-import org.dbpedia.spotlight.io.{WikiOccurrenceSource, AnnotatedTextSource}
import org.dbpedia.spotlight.model.{Text, DBpediaResourceOccurrence, AnnotatedParagraph}
/**
12 pom.xml
@@ -342,6 +342,12 @@
</dependency>
<dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ </dependency>
+
+ <dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>r07</version>
@@ -512,6 +518,12 @@
<version>1.9.8</version>
</dependency>
+ <dependency>
+ <groupId>org.scalanlp</groupId>
+ <artifactId>breeze-learn_2.9.2</artifactId>
+ <version>0.1</version>
+ </dependency>
+
</dependencies>
</dependencyManagement>
