Permalink
Browse files

FSASpotter was using RawTokenizer; install Spotlight before pignlproc while indexing.
  • Loading branch information...
1 parent 08182a2 commit f68723061d0f6dd3acd2d975f5975e4702d8f92a @jodaiber jodaiber committed Mar 15, 2013
View
@@ -84,6 +84,23 @@ curl -# http://downloads.dbpedia.org/current/$LANGUAGE/redirects_$LANGUAGE.nt.bz
curl -# http://downloads.dbpedia.org/current/$LANGUAGE/disambiguations_$LANGUAGE.nt.bz2 | bzcat > disambiguations.nt
curl -# http://downloads.dbpedia.org/current/$LANGUAGE/instance_types_$LANGUAGE.nt.bz2 | bzcat > instance_types.nt
+
+#Set up Spotlight:
+# Build DBpedia Spotlight from source so its artifacts are installed into the
+# local Maven repository before pignlproc is built (pignlproc depends on them;
+# previously FSASpotter fell back to RawTokenizer when Spotlight was missing).
+cd $BASE_WDIR
+
+if [ -d dbpedia-spotlight ]; then
+ echo "Updating DBpedia Spotlight..."
+ cd dbpedia-spotlight
+# Discard any local modifications before pulling to guarantee a clean tree.
+ git reset --hard HEAD
+ git pull
+ mvn -q clean install
+else
+ echo "Setting up DBpedia Spotlight..."
+# Shallow clone (--depth 1): only the latest revision is needed for building.
+ git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight.git
+ cd dbpedia-spotlight
+# 'install' (not just 'package') publishes the jars to the local ~/.m2 repo.
+ mvn -q clean install
+fi
+
cd $BASE_DIR
#Set up pig:
@@ -103,19 +120,6 @@ else
mvn -q assembly:assembly -Dmaven.test.skip=true
fi
-#Set up Spotlight:
-cd $BASE_WDIR
-
-if [ -d dbpedia-spotlight ]; then
- echo "Updating DBpedia Spotlight..."
- cd dbpedia-spotlight
- git reset --hard HEAD
- git pull
-else
- echo "Setting up DBpedia Spotlight..."
- git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight.git
-fi
-
#Load the dump into HDFS:
echo "Loading Wikipedia dump into HDFS..."
@@ -2,10 +2,11 @@ package org.dbpedia.spotlight.db
import memory.MemoryStore
import org.dbpedia.spotlight.model._
-import model.{RawTokenizer, SurfaceFormStore}
+import model.{AnnotationTokenizer, RawTokenizer, SurfaceFormStore}
import opennlp.tools.util.Span
import collection.mutable.ArrayBuffer
import collection.mutable.Map
+import tokenize.LanguageIndependentTokenizer
/**
@@ -69,7 +70,7 @@ object FSASpotter {
//State ID for None
val REJECTING_STATE = -2
- def buildDictionary(sfStore: SurfaceFormStore, tokenizer: RawTokenizer): FSADictionary = {
+ def buildDictionary(sfStore: SurfaceFormStore, tokenizer: LanguageIndependentTokenizer): FSADictionary = {
//Temporary FSA DSs:
val transitions: ArrayBuffer[Map[Int, Int]] = ArrayBuffer[Map[Int, Int]]()
@@ -193,7 +193,7 @@ object CreateSpotlightModel {
memoryIndexer.writeTokenOccurrences()
if(opennlpFolder.isEmpty) {
- val fsaDict = FSASpotter.buildDictionary(sfStore, rawTokenizer)
+ val fsaDict = FSASpotter.buildDictionary(sfStore, new LanguageIndependentTokenizer(SpotlightModel.loadStopwords(outputFolder), stemmer, locale, tokenStore))
MemoryStore.dump(fsaDict, new File(outputFolder, "fsa_dict.mem"))
}

0 comments on commit f687230

Please sign in to comment.