Permalink
Browse files

Creating a new spotter AhoCorasickSpotter

  • Loading branch information...
sandroacoelho
sandroacoelho committed Nov 17, 2012
1 parent 6386044 commit 6e4abb1c9d8c1bda4dd803d44ae39427f400cc6d
View
@@ -68,6 +68,9 @@ org.dbpedia.spotlight.spot.opennlp.location=http://dbpedia.org/ontology/Place
# EXPERIMENTAL! Path to Kea Model
org.dbpedia.spotlight.spot.kea.model = /data/spotlight/3.7/kea/keaModel-1-3-1
+# EXPERIMENTAL! AhoCorasick Spotter: path to the surface forms file (one surface form per line)
+org.dbpedia.spotlight.spot.ahocorasick.surfaceforms=/data/spotlight/surfaceforms.set
+
#----- CANDIDATE SELECTION -------
View
@@ -232,6 +232,22 @@
<artifactId>mahout-collections</artifactId>
</dependency>
+ <!--
+ Fixed dependency: scala-aho-corasick implementation
+ -->
+ <dependency>
+ <groupId>scala.ahocorasick</groupId>
+ <artifactId>library</artifactId>
+ <version>0.1</version>
+ <systemPath>${project.basedir}/../lib/scala-aho-corasick-0.1.jar</systemPath>
+ <scope>system</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.scalaz</groupId>
+ <artifactId>scalaz-core_${scala.compiler.version}</artifactId>
+ <version>6.0.4</version>
+ </dependency>
+
</dependencies>
</project>
@@ -49,6 +49,8 @@
private Map<String, String> openNLPModelsURI = new HashMap<String, String>(3);
+ private String spotterSurfaceForms = "";
+
public enum SpotterPolicy {Default,
LingPipeSpotter,
AtLeastOneNounSelector,
@@ -57,7 +59,8 @@
KeyphraseSpotter,
OpenNLPChunkerSpotter,
WikiMarkupSpotter,
- SpotXmlParser
+ SpotXmlParser,
+ AhoCorasickSpotter
}
@@ -136,6 +139,16 @@ public SpotterConfiguration(String fileName) throws ConfigurationException {
setOpenNLPModelsURI();
}
+ //Validate AhoCorasickSpotter
+ if(spotters.contains(SpotterPolicy.AhoCorasickSpotter))
+ {
+ //Load spotter configuration:
+ spotterSurfaceForms = config.getProperty("org.dbpedia.spotlight.spot.ahocorasick.surfaceforms").trim();
+ if(!new File(spotterSurfaceForms).isFile()) {
+ throw new ConfigurationException("Cannot find surfaceForms file "+spotterSurfaceForms);
+ }
+ }
+
}
@@ -233,5 +246,9 @@ public int getKeaCutoff() {
return policies;
}
+ /**
+  * Path of the surface forms file used by the AhoCorasick spotter.
+  *
+  * @return the trimmed value of org.dbpedia.spotlight.spot.ahocorasick.surfaceforms, or the
+  *         empty string when the AhoCorasickSpotter policy is not among the configured spotters
+  */
+ public String getSpotterSurfaceForms() {
+ return this.spotterSurfaceForms;
+ }
+
}
@@ -26,6 +26,7 @@ import org.dbpedia.spotlight.disambiguate._
import org.dbpedia.spotlight.spot.lingpipe.LingPipeSpotter
import java.io.File
import org.dbpedia.spotlight.spot._
+import ahocorasick.AhoCorasickSpotter
import opennlp.{ProbabilisticSurfaceFormDictionary, OpenNLPChunkerSpotter}
import org.dbpedia.spotlight.filter.annotations.CombineAllAnnotationFilters
import org.dbpedia.spotlight.tagging.lingpipe.{LingPipeTextUtil, LingPipeTaggedTokenProvider, LingPipeFactory}
@@ -38,6 +39,7 @@ import org.dbpedia.spotlight.lucene.search.{LuceneCandidateSearcher, MergedOccur
import com.aliasi.util.AbstractExternalizable
import com.aliasi.dict.Dictionary
import org.dbpedia.spotlight.exceptions.ConfigurationException
+import io.Source
/**
* This class contains many of the "defaults" for DBpedia Spotlight.
@@ -104,6 +106,13 @@ class SpotlightFactory(val configuration: SpotlightConfiguration) {
SpotterWithSelector.getInstance(innerSpotter, new ChainedSelector(spotSelectors))
}
defaultSpotter
+ } else if(policy == SpotterConfiguration.SpotterPolicy.AhoCorasickSpotter) {
+ val overlap = configuration.getSpotterConfiguration.config.getOrElse("org.dbpedia.spotlight.spot.allowOverlap", "false").equals("true")
+ val caseSensitive = configuration.getSpotterConfiguration.config.getOrElse("org.dbpedia.spotlight.spot.caseSensitive", "false").equals("true")
+ val sourceChunks = Source.fromFile(configuration.getSpotterConfiguration.getSpotterSurfaceForms)
+ val spotter = AhoCorasickSpotter.fromSurfaceForms(sourceChunks.getLines(), caseSensitive, overlap)
+ sourceChunks.close
+ spotters.getOrElse(policy,spotter)
} else if(policy == SpotterConfiguration.SpotterPolicy.LingPipeSpotter) {
val overlap = configuration.getSpotterConfiguration.config.getOrElse("org.dbpedia.spotlight.spot.allowOverlap", "false").equals("true")
val caseSensitive = configuration.getSpotterConfiguration.config.getOrElse("org.dbpedia.spotlight.spot.caseSensitive", "false").equals("true")
@@ -0,0 +1,187 @@
+/**
+ * Copyright 2012
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.spot.ahocorasick
+
+import collection.mutable.ListBuffer
+import com.corruptmemory.aho_corasick.AhoCorasickBuilder.Data
+import com.corruptmemory.aho_corasick.{Match, AhoCorasickBuilder}
+import util.Sorting
+import org.dbpedia.spotlight.spot.Spotter
+import org.apache.commons.logging.LogFactory
+import org.dbpedia.spotlight.model.{SurfaceForm, Text, SurfaceFormOccurrence}
+import scala.collection.JavaConversions._
+
+/**
+ * Spotter that looks up surface forms in a text with an Aho-Corasick automaton.
+ *
+ * The automaton is built once from `builder` when the spotter is constructed.
+ *
+ * @param builder AhoCorasickBuilder instance holding the surface forms
+ * @param overlap if true, every match reported by the automaton is returned; if false, only
+ *                whole-word, non-overlapping matches are kept
+ * @param pattern regex that a single character has to match to count as a word boundary
+ */
+class AhoCorasickSpotter(val builder: AhoCorasickBuilder[String], val overlap: Boolean, pattern: String = "\\s|\\n|\\t|[,.:;¿?¡!()\\-'\"]") extends Spotter {
+
+  private val LOG = LogFactory.getLog(this.getClass)
+  private val finder = builder.build()
+  private var name = ""
+
+  LOG.debug("Allow overlap: " + overlap)
+
+  /**
+   * Spots surface forms in the given text.
+   *
+   * @param text the text to spot surface forms in
+   * @return one occurrence per selected match, carrying the matched text and its start offset
+   */
+  def extract(text: Text): java.util.List[SurfaceFormOccurrence] = {
+    val results = finder.find(text.text)
+    val selected: Seq[Match[String]] = if (overlap) results else filter(results, text.text, pattern)
+
+    // Materialise eagerly so the returned Java list never wraps a lazy collection.
+    val occurrences = selected.map(found =>
+      new SurfaceFormOccurrence(new SurfaceForm(found.actual), text, found.start)
+    ).toList
+
+    seqAsJavaList[SurfaceFormOccurrence](occurrences)
+  }
+
+
+  /**
+   * Orders matches by start offset (ascending) and, for equal starts, by the length of the
+   * matched text (descending), so the longest match at a given position comes first.
+   */
+  object StartLengthOrdering extends Ordering[Match[String]] {
+    def compare(matchA: Match[String], matchB: Match[String]): Int = {
+      // The lengths are deliberately swapped between the two tuples to get descending length.
+      Ordering.Tuple2(Ordering.Int, Ordering.Int).compare((matchA.start, matchB.actual.length), (matchB.start, matchA.actual.length))
+    }
+  }
+
+
+  /**
+   * Used when overlap is false: keeps only the most relevant matches.
+   *
+   * The automaton reports every dictionary entry found at every position. For a text that
+   * contains the name "Dilma Rousseff" at offset 13, scala-aho-corasick found, among others:
+   * Match(13,DI,Di,)
+   * Match(13,D,D,)
+   * Match(13,Dilma Rousseff,Dilma Rousseff,)
+   * Match(13,Dilma,Dilma,)
+   * Match(14,ILMA,ilma,)
+   * Match(19,R,R,)
+   * Match(19,Rousseff,Rousseff,)
+   * Match(26,F,f,)
+   * Only the complete match, Match(13,Dilma Rousseff,Dilma Rousseff,), is relevant.
+   *
+   * Matches are visited in StartLengthOrdering order. A match is kept when it is delimited on
+   * both sides by the start/end of the text or by a character matching `pattern`, and when it
+   * starts at or after the end of the previously kept match.
+   *
+   * @param result       all matches reported by the automaton
+   * @param originalText the text the matches were found in
+   * @param pattern      regex that a single character has to match to count as a word boundary
+   * @return the whole-word, non-overlapping matches, ordered by start offset
+   */
+  private def filter(result: Seq[Match[String]], originalText: String, pattern: String): Seq[Match[String]] = {
+    // Compile the boundary regex once per call instead of once per character test.
+    val boundary = java.util.regex.Pattern.compile(pattern)
+
+    // Positions outside the text count as boundaries.
+    def isBoundaryAt(index: Int): Boolean =
+      index < 0 || index >= originalText.length ||
+        boundary.matcher(originalText.charAt(index).toString).matches()
+
+    val kept: ListBuffer[Match[String]] = ListBuffer()
+    // Exclusive end offset of the last kept match; 0 while nothing has been kept yet.
+    var lastEnd = 0
+
+    result.sorted(StartLengthOrdering).foreach(candidate => {
+      val end = candidate.start + candidate.actual.length
+      val wholeWord = isBoundaryAt(candidate.start - 1) && isBoundaryAt(end)
+
+      // `end` is exclusive, so a candidate starting exactly at lastEnd does not overlap.
+      if (wholeWord && candidate.start >= lastEnd) {
+        kept.append(candidate)
+        lastEnd = end
+      }
+    })
+
+    kept.toList
+  }
+
+  /**
+   * Every spotter has a name that describes its strategy
+   * (for comparing multiple spotters during evaluation).
+   *
+   * @return the name given through setName, or a default derived from the overlap setting
+   */
+  def getName(): String = {
+    if (name == "") {
+      val allMatches = if (overlap) "overlapping" else "non-overlapping"
+      "AhoCorasickSpotter[" + allMatches + "]"
+    } else {
+      name
+    }
+  }
+
+  /**
+   * Overrides the default name; passing the empty string restores the default.
+   */
+  def setName(newName: String): Unit = {
+    name = newName
+  }
+}
+
+object AhoCorasickSpotter {
+  /**
+   * Builds an AhoCorasickSpotter whose automaton contains the given surface forms.
+   *
+   * The surface forms are read completely before this method returns, so a caller that
+   * streams them from a file can close it right afterwards. Entries that are empty or
+   * contain only whitespace are skipped.
+   *
+   * @param surfaceForms  the surface forms, one per element
+   * @param caseSensitive case sensitive: true or false? When false, characters are
+   *                      lower-cased before they are handed to the automaton
+   * @param overlap       passed on to the spotter: return overlapping matches or not
+   * @return a spotter using the default word-boundary pattern
+   */
+  def fromSurfaceForms(surfaceForms: TraversableOnce[String], caseSensitive: Boolean, overlap: Boolean): AhoCorasickSpotter = {
+    // toList forces the traversal here; depending on the Scala version, toSeq on an
+    // iterator-backed source may be lazy and would then be read after the caller closed it.
+    val entries = surfaceForms.toIterator
+      .filter(_.trim.length > 0)
+      .map(Data(_, ""))
+      .toList
+
+    val builder = AhoCorasickBuilder[String](entries,
+      if (caseSensitive) _.toChar else _.toLower)
+
+    new AhoCorasickSpotter(builder, overlap)
+  }
+
+}
Oops, something went wrong.

0 comments on commit 6e4abb1

Please sign in to comment.