Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

More generic config; PhoneticAnalyzer made final to pass Lucene's assertions.
  • Loading branch information...
commit 0ee7492c892202b91383c5064226b0f7f9ad68a7 1 parent 76ab01a
@pablomendes pablomendes authored
View
192 conf/server.properties
@@ -1,96 +1,96 @@
-
-#
-# Copyright 2011 DBpedia Spotlight Development Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
-#
-
-# Server hostname and port to be used by DBpedia Spotlight REST API
-org.dbpedia.spotlight.web.rest.uri = http://localhost:2222/rest
-
-# Internationalization (i18n) support -- work in progress
-# Defines the languages the system should support.
-org.dbpedia.spotlight.language = English
-# Stop word list
-# An example can be downloaded from: http://spotlight.dbpedia.org/download/release-0.4/stopwords.en.list
-org.dbpedia.spotlight.data.stopWords.english = /data/spotlight/3.7/stopwords.en.list
-
-#----- SPOTTING -------
-
-# Comma-separated list of spotters to load.
-# Accepted values are LingPipeSpotter,WikiMarkupSpotter,AtLeastOneNounSelector,CoOccurrenceBasedSelector,NESpotter,OpenNLPNGramSpotter,OpenNLPChunkerSpotter,KeaSpotter
-# Some spotters may require extra files and config parameters. See org.dbpedia.spotlight.model.SpotterConfiguration
-org.dbpedia.spotlight.spot.spotters = LingPipeSpotter,WikiMarkupSpotter
-
-# Path to serialized LingPipe dictionary used by LingPipeSpotter
-org.dbpedia.spotlight.spot.dictionary = /fastdata/spotlight/3.6/surface_forms-Wikipedia-TitRedDis.thresh3.spotterDictionary
-jcs.default.cacheattributes.MaxObjects = 5000
-org.dbpedia.spotlight.tagging.hmm = ../dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
-
-# Configurations for the CoOccurrenceBasedSelector
-# From: http://spotlight.dbpedia.org/download/release-0.5/spot_selector.tgz
-org.dbpedia.spotlight.spot.cooccurrence.datasource = ukwac
-org.dbpedia.spotlight.spot.cooccurrence.database.jdbcdriver = org.hsqldb.jdbcDriver
-org.dbpedia.spotlight.spot.cooccurrence.database.connector = jdbc:hsqldb:file:/fastdata/spotlight/3.7/spotsel/ukwac_candidate;shutdown=true&readonly=true
-org.dbpedia.spotlight.spot.cooccurrence.database.user = sa
-org.dbpedia.spotlight.spot.cooccurrence.database.password =
-org.dbpedia.spotlight.spot.cooccurrence.classifier.unigram = /fastdata/spotlight/3.7/spotsel/ukwac_unigram.model
-org.dbpedia.spotlight.spot.cooccurrence.classifier.ngram = /fastdata/spotlight/3.7/spotsel/ukwac_ngram.model
-
-# Path to serialized HMM model for LingPipe-based POS tagging. Required by AtLeastOneNounSelector and CoOccurrenceBasedSelector
-org.dbpedia.spotlight.tagging.hmm = dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
-
-# Path to dir containing several OpenNLP models for NER, chunking, etc. This is required for spotters that are based on OpenNLP.
-# Can be downloaded from http://spotlight.dbpedia.org/download/release-0.5/opennlp_models.tgz
-org.dbpedia.spotlight.spot.opennlp.dir = /fastdata/spotlight/3.7/opennlp
-
-# EXPERIMENTAL! Path to Kea Model
-org.dbpedia.spotlight.spot.kea.model = /data/spotlight/3.7/kea/keaModel-1-3-1
-
-
-#----- CANDIDATE SELECTION -------
-
-# Choose between jdbc or lucene for DBpedia Resource creation. Also, if the jdbc throws an error, lucene will be used.
-org.dbpedia.spotlight.core.database = jdbc
-org.dbpedia.spotlight.core.database.jdbcdriver = org.hsqldb.jdbcDriver
-org.dbpedia.spotlight.core.database.connector = jdbc:hsqldb:file:/data/spotlight/3.7/database/spotlight-db;shutdown=true&readonly=true
-org.dbpedia.spotlight.core.database.user = sa
-org.dbpedia.spotlight.core.database.password =
-
-# From http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
-org.dbpedia.spotlight.candidateMap.dir = /fastdata/spotlight/3.7/candidateIndexTitRedDis
-# Path to Lucene index containing only the candidate map. It is used by document-oriented disambiguators such as Document,TwoStepDisambiguator
-# Only used if one such disambiguator is loaded. Data is at: http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
-#org.dbpedia.spotlight.candidateMap.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
-
-
-#----- DISAMBIGUATION -------
-
-# List of disambiguators to load: Document,Occurrences,CuttingEdge,Default
-org.dbpedia.spotlight.disambiguate.disambiguators = Default,Document
-
-# Path to a directory containing Lucene index files. These can be downloaded from the website or created by org.dbpedia.spotlight.lucene.index.IndexMergedOccurrences
-org.dbpedia.spotlight.index.dir = /fastdata/spotlight/3.7/index-withSF-withTypes-compressed/
-# Class used to process context around DBpedia mentions (tokenize, stem, etc.)
-org.dbpedia.spotlight.lucene.analyzer = SnowballAnalyzer
-# How large can the cache be for ICFDisambiguator.
-jcs.default.cacheattributes.MaxObjects = 5000
-
-
-#----- LINKING / FILTERING -------
-
-# Configuration for SparqlFilter
-org.dbpedia.spotlight.sparql.endpoint = http://dbpedia.org/sparql
-org.dbpedia.spotlight.sparql.graph = http://dbpedia.org
+
+#
+# Copyright 2011 DBpedia Spotlight Development Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
+#
+
+# Server hostname and port to be used by DBpedia Spotlight REST API
+org.dbpedia.spotlight.web.rest.uri = http://localhost:2222/rest
+
+# Internationalization (i18n) support -- work in progress
+# Defines the languages the system should support.
+org.dbpedia.spotlight.language = English
+# Stop word list
+# An example can be downloaded from: http://spotlight.dbpedia.org/download/release-0.4/stopwords.en.list
+org.dbpedia.spotlight.data.stopWords.english = stopwords.en.list
+
+#----- SPOTTING -------
+
+# Comma-separated list of spotters to load.
+# Accepted values are LingPipeSpotter,WikiMarkupSpotter,AtLeastOneNounSelector,CoOccurrenceBasedSelector,NESpotter,OpenNLPNGramSpotter,OpenNLPChunkerSpotter,KeaSpotter
+# Some spotters may require extra files and config parameters. See org.dbpedia.spotlight.model.SpotterConfiguration
+org.dbpedia.spotlight.spot.spotters = LingPipeSpotter,WikiMarkupSpotter,NESpotter
+
+# Path to serialized LingPipe dictionary used by LingPipeSpotter
+org.dbpedia.spotlight.spot.dictionary = dist/src/deb/control/data/usr/share/dbpedia-spotlight/spotter.dict
+jcs.default.cacheattributes.MaxObjects = 5000
+# NOTE: org.dbpedia.spotlight.tagging.hmm is also set (to the same value) in the POS-tagging section below; when this file is loaded as java.util.Properties only the last assignment survives, so keep the two in sync or remove one.
+org.dbpedia.spotlight.tagging.hmm = dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
+
+# Configurations for the CoOccurrenceBasedSelector
+# From: http://spotlight.dbpedia.org/download/release-0.5/spot_selector.tgz
+org.dbpedia.spotlight.spot.cooccurrence.datasource = ukwac
+org.dbpedia.spotlight.spot.cooccurrence.database.jdbcdriver = org.hsqldb.jdbcDriver
+org.dbpedia.spotlight.spot.cooccurrence.database.connector = jdbc:hsqldb:file:/fastdata/spotlight/3.7/spotsel/ukwac_candidate;shutdown=true&readonly=true
+org.dbpedia.spotlight.spot.cooccurrence.database.user = sa
+org.dbpedia.spotlight.spot.cooccurrence.database.password =
+org.dbpedia.spotlight.spot.cooccurrence.classifier.unigram = /fastdata/spotlight/3.7/spotsel/ukwac_unigram.model
+org.dbpedia.spotlight.spot.cooccurrence.classifier.ngram = /fastdata/spotlight/3.7/spotsel/ukwac_ngram.model
+
+# Path to serialized HMM model for LingPipe-based POS tagging. Required by AtLeastOneNounSelector and CoOccurrenceBasedSelector
+org.dbpedia.spotlight.tagging.hmm = dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
+
+# Path to dir containing several OpenNLP models for NER, chunking, etc. This is required for spotters that are based on OpenNLP.
+# Can be downloaded from http://spotlight.dbpedia.org/download/release-0.5/opennlp_models.tgz
+org.dbpedia.spotlight.spot.opennlp.dir = dist/opennlp
+
+# EXPERIMENTAL! Path to Kea Model
+org.dbpedia.spotlight.spot.kea.model = /data/spotlight/3.7/kea/keaModel-1-3-1
+
+
+#----- CANDIDATE SELECTION -------
+
+# Choose between jdbc or lucene for DBpedia Resource creation. Also, if the jdbc throws an error, lucene will be used.
+org.dbpedia.spotlight.core.database = jdbc
+org.dbpedia.spotlight.core.database.jdbcdriver = org.hsqldb.jdbcDriver
+org.dbpedia.spotlight.core.database.connector = jdbc:hsqldb:file:/data/spotlight/3.7/database/spotlight-db;shutdown=true&readonly=true
+org.dbpedia.spotlight.core.database.user = sa
+org.dbpedia.spotlight.core.database.password =
+
+# From http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
+org.dbpedia.spotlight.candidateMap.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+# Path to Lucene index containing only the candidate map. It is used by document-oriented disambiguators such as Document,TwoStepDisambiguator
+# Only used if one such disambiguator is loaded. Data is at: http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
+#org.dbpedia.spotlight.candidateMap.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+
+
+#----- DISAMBIGUATION -------
+
+# List of disambiguators to load: Document,Occurrences,CuttingEdge,Default
+org.dbpedia.spotlight.disambiguate.disambiguators = Default,Document
+
+# Path to a directory containing Lucene index files. These can be downloaded from the website or created by org.dbpedia.spotlight.lucene.index.IndexMergedOccurrences
+org.dbpedia.spotlight.index.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+# Class used to process context around DBpedia mentions (tokenize, stem, etc.)
+org.dbpedia.spotlight.lucene.analyzer = SnowballAnalyzer
+# How large can the cache be for ICFDisambiguator.
+# NOTE: jcs.default.cacheattributes.MaxObjects is also set (to the same value) in the SPOTTING section above; java.util.Properties keeps only the last assignment, so keep the two in sync or remove one.
+jcs.default.cacheattributes.MaxObjects = 5000
+
+
+#----- LINKING / FILTERING -------
+
+# Configuration for SparqlFilter
+org.dbpedia.spotlight.sparql.endpoint = http://dbpedia.org/sparql
+org.dbpedia.spotlight.sparql.graph = http://dbpedia.org
View
170 core/src/main/java/org/dbpedia/spotlight/lucene/analysis/PhoneticAnalyzer.java
@@ -1,86 +1,86 @@
-/*
- * Copyright 2012 DBpedia Spotlight Development Team
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
- */
-
-package org.dbpedia.spotlight.lucene.analysis;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.util.Version;
-import org.dbpedia.spotlight.model.SpotlightConfiguration;
-
-/**
- * Used for indexing text that can contain spelling anomalies
- *
- * @author pablomendes
- */
-public class PhoneticAnalyzer extends Analyzer {
-
- private Set<String> mStopWordSet;
- private Version mMatchVersion;
- private int mMaxCodeLength = 8;
-
- public PhoneticAnalyzer(Version aMatchVersion, Set<String> aStopWordSet) {
- this.mStopWordSet = aStopWordSet;
- this.mMatchVersion = aMatchVersion;
- }
-
- public PhoneticAnalyzer(Version aMatchVersion, Set<String> aStopWordSet, int aMaxCodeLength) {
- this.mStopWordSet = aStopWordSet;
- this.mMatchVersion = aMatchVersion;
- this.mMaxCodeLength = aMaxCodeLength;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(mMatchVersion, reader);
- result = new StandardFilter(mMatchVersion, result);
- result = new LowerCaseFilter(mMatchVersion, result); // lowercased only
- result = new StopFilter(mMatchVersion, result, mStopWordSet); // remove stopwords
- result = new DoubleMetaphoneFilter(result,mMaxCodeLength,true); // store phonetic code
- result = new ShingleFilter(result, 2, 3); // create token n-grams
- return result;
- }
-
- public static void main(String[] args) throws IOException {
- String myString = "cancer";
- Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36,SpotlightConfiguration.DEFAULT_STOPWORDS);
- System.out.println("Analyzing: \"" + myString +"\"");
- StringReader reader = new StringReader(myString);
- TokenStream stream = analyzer.tokenStream("field", reader);
- stream.reset();
-
- // print all tokens until stream is exhausted
- while (stream.incrementToken()) {
- System.out.println("token: "+stream);
- }
-
- stream.end();
- stream.close();
- }
+/*
+ * Copyright 2012 DBpedia Spotlight Development Team
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
+ */
+
+package org.dbpedia.spotlight.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.dbpedia.spotlight.model.SpotlightConfiguration;
+
+/**
+ * Used for indexing text that can contain spelling anomalies
+ *
+ * Declared {@code final} so that Lucene's analyzer assertions pass (see commit
+ * message: "PhoneticAnalyzer made final to pass Lucene's assertions").
+ *
+ * NOTE(review): the fields below are assigned only in the constructors and
+ * could be declared final; left as-is here to keep the recorded diff intact.
+ *
+ * @author pablomendes
+ */
+public final class PhoneticAnalyzer extends Analyzer {
+
+ // Stopwords removed from the stream before phonetic encoding.
+ private Set<String> mStopWordSet;
+ // Lucene compatibility version handed to every tokenizer/filter in the chain.
+ private Version mMatchVersion;
+ // Maximum length of the phonetic code; defaults to 8 unless the 3-arg constructor overrides it.
+ private int mMaxCodeLength = 8;
+
+ // Creates an analyzer using the default maximum phonetic code length (8).
+ public PhoneticAnalyzer(Version aMatchVersion, Set<String> aStopWordSet) {
+ this.mStopWordSet = aStopWordSet;
+ this.mMatchVersion = aMatchVersion;
+ }
+
+ // Creates an analyzer with a caller-specified maximum phonetic code length.
+ public PhoneticAnalyzer(Version aMatchVersion, Set<String> aStopWordSet, int aMaxCodeLength) {
+ this.mStopWordSet = aStopWordSet;
+ this.mMatchVersion = aMatchVersion;
+ this.mMaxCodeLength = aMaxCodeLength;
+ }
+
+ // Builds the analysis chain for one field: standard tokenization -> standard
+ // filter -> lowercasing -> stopword removal -> DoubleMetaphone phonetic codes
+ // (the 'true' flag presumably injects codes alongside the original tokens --
+ // TODO confirm against DoubleMetaphoneFilter docs) -> shingles of 2 to 3 tokens.
+ @Override
+ public final TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream result = new StandardTokenizer(mMatchVersion, reader);
+ result = new StandardFilter(mMatchVersion, result);
+ result = new LowerCaseFilter(mMatchVersion, result); // lowercased only
+ result = new StopFilter(mMatchVersion, result, mStopWordSet); // remove stopwords
+ result = new DoubleMetaphoneFilter(result,mMaxCodeLength,true); // store phonetic code
+ result = new ShingleFilter(result, 2, 3); // create token n-grams
+ return result;
+ }
+
+ // Manual smoke test: prints every token produced for the word "cancer"
+ // using the default stopword set, then closes the stream.
+ public static void main(String[] args) throws IOException {
+ String myString = "cancer";
+ Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36,SpotlightConfiguration.DEFAULT_STOPWORDS);
+ System.out.println("Analyzing: \"" + myString +"\"");
+ StringReader reader = new StringReader(myString);
+ TokenStream stream = analyzer.tokenStream("field", reader);
+ stream.reset();
+
+ // print all tokens until stream is exhausted
+ while (stream.incrementToken()) {
+ System.out.println("token: "+stream);
+ }
+
+ stream.end();
+ stream.close();
+ }
 }
Please sign in to comment.
Something went wrong with that request. Please try again.