Initial commit of topical stuff

0 parents commit 8c612321de0045f55008a8ba01a988a02d7f3dc1 @dirkweissenborn committed Jul 25, 2012
Showing with 9,891 additions and 0 deletions.
  1. +9 −0 .gitignore
  2. +116 −0 README.md
  3. +3 −0 bin/getSurfaceFormMapFromOccs.sh
  4. +55 −0 bin/index.sh
  5. +13 −0 bin/package.sh
  6. +1 −0 bin/run.sh
  7. +5 −0 bin/stopwords.sh
  8. +67 −0 conf/indexing.properties
  9. +44 −0 conf/indexing.properties.default
  10. +106 −0 conf/server.properties
  11. +9 −0 conf/server.properties.default
  12. +307 −0 core/pom.xml
  13. +39 −0 core/src/main/java/org/dbpedia/spotlight/annotate/Annotator.java
  14. +43 −0 core/src/main/java/org/dbpedia/spotlight/annotate/ParagraphAnnotator.java
  15. +180 −0 core/src/main/java/org/dbpedia/spotlight/disambiguate/CustomScoresDisambiguator.java
  16. +103 −0 core/src/main/java/org/dbpedia/spotlight/disambiguate/Disambiguator.java
  17. +84 −0 core/src/main/java/org/dbpedia/spotlight/disambiguate/RandomDisambiguator.java
  18. +38 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/AnnotationException.java
  19. +38 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/AuthenticationException.java
  20. +32 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/ConfigurationException.java
  21. +37 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/DisambiguationException.java
  22. +37 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/IndexException.java
  23. +35 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/InitializationException.java
  24. +38 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/InputException.java
  25. +35 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/ItemNotFoundException.java
  26. +38 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/OutputException.java
  27. +37 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/SearchException.java
  28. +39 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/SparqlExecutionException.java
  29. +39 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/SpottingException.java
  30. +34 −0 core/src/main/java/org/dbpedia/spotlight/exceptions/TimeoutException.java
  31. +198 −0 core/src/main/java/org/dbpedia/spotlight/io/DataLoader.java
  32. +321 −0 core/src/main/java/org/dbpedia/spotlight/io/DatasetSplitter.java
  33. +55 −0 core/src/main/java/org/dbpedia/spotlight/lucene/LuceneFeatureVector.java
  34. +932 −0 core/src/main/java/org/dbpedia/spotlight/lucene/LuceneManager.java
  35. +67 −0 core/src/main/java/org/dbpedia/spotlight/lucene/analysis/NGramAnalyzer.java
  36. +86 −0 core/src/main/java/org/dbpedia/spotlight/lucene/analysis/PhoneticAnalyzer.java
  37. +171 −0 core/src/main/java/org/dbpedia/spotlight/lucene/disambiguate/LucenePriorDisambiguator.java
  38. +231 −0 core/src/main/java/org/dbpedia/spotlight/lucene/disambiguate/MergedOccurrencesDisambiguator.java
  39. +103 −0 core/src/main/java/org/dbpedia/spotlight/lucene/disambiguate/MixedWeightsDisambiguator.java
  40. +461 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/BaseSearcher.java
  41. +166 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/CandidateResourceQuery.java
  42. +141 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/LuceneCandidateSearcher.java
  43. +457 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/MergedOccurrencesContextSearcher.java
  44. +120 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/SeparateOccurrencesContextSearcher.java
  45. +236 −0 core/src/main/java/org/dbpedia/spotlight/lucene/search/TermScorer.java
  46. +198 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/CachedInvCandFreqSimilarity.java
  47. +37 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/CachedSimilarity.java
  48. +114 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/InvCandFreqSimilarity.java
  49. +139 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/JCSTermCache.java
  50. +144 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/NewSimilarity.java
  51. +181 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/TermCache.java
  52. +132 −0 core/src/main/java/org/dbpedia/spotlight/lucene/similarity/TermsFilter.java
  53. +50 −0 core/src/main/java/org/dbpedia/spotlight/model/CandidateSearcher.java
  54. +36 −0 core/src/main/java/org/dbpedia/spotlight/model/ContextSearcher.java
  55. +100 −0 core/src/main/java/org/dbpedia/spotlight/model/DisambiguatorConfiguration.java
  56. +277 −0 core/src/main/java/org/dbpedia/spotlight/model/SpotlightConfiguration.java
  57. +204 −0 core/src/main/java/org/dbpedia/spotlight/model/SpotterConfiguration.java
  58. +73 −0 core/src/main/java/org/dbpedia/spotlight/model/TopicalClassificationConfiguration.java
  59. +192 −0 core/src/main/java/org/dbpedia/spotlight/sparql/SparqlQueryExecuter.java
  60. +249 −0 core/src/main/java/org/dbpedia/spotlight/spot/CoOccurrenceBasedSelector.java
  61. +129 −0 core/src/main/java/org/dbpedia/spotlight/spot/KeaSpotter.java
  62. +217 −0 core/src/main/java/org/dbpedia/spotlight/spot/NESpotter.java
  63. +82 −0 core/src/main/java/org/dbpedia/spotlight/spot/NGram.java
  64. +280 −0 core/src/main/java/org/dbpedia/spotlight/spot/OpenNLPNGramSpotter.java
  65. +204 −0 core/src/main/java/org/dbpedia/spotlight/spot/OpenNLPUtil.java
  66. +41 −0 core/src/main/java/org/dbpedia/spotlight/spot/RandomSelector.java
  67. +50 −0 core/src/main/java/org/dbpedia/spotlight/spot/Spotter.java
  68. +130 −0 core/src/main/java/org/dbpedia/spotlight/spot/SpotterWithSelector.java
  69. +64 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/ClassifierFactory.java
  70. +89 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/InstanceBuilderFactory.java
  71. +11 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/classification/SpotClass.java
  72. +49 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/classification/SpotClassification.java
  73. +123 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/classification/SpotClassifier.java
  74. +356 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/features/CandidateFeatures.java
  75. +87 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/features/data/CandidateData.java
  76. +87 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/features/data/CoOccurrenceData.java
  77. +59 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/features/data/OccurrenceDataProvider.java
  78. +150 −0 ...rc/main/java/org/dbpedia/spotlight/spot/cooccurrence/features/data/OccurrenceDataProviderSQL.java
  79. +81 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/filter/Filter.java
  80. +70 −0 core/src/main/java/org/dbpedia/spotlight/spot/cooccurrence/filter/FilterPOS.java
Sorry, we could not display the entire diff because too many files (474) changed.
9 .gitignore
@@ -0,0 +1,9 @@
+*.iml
+.idea
+.classpath
+.project
+.settings/
+target
+*.log
+*~
+index/output
116 README.md
@@ -0,0 +1,116 @@
+# DBpedia Spotlight
+#### Shedding Light on the Web of Documents
+
+DBpedia Spotlight looks for ~3.5M things of ~320 types in text and tries to link them to their global unique identifiers in [DBpedia](http://dbpedia.org).
+
+#### Demonstration
+
+Go to our [demonstration](http://spotlight.dbpedia.org/demo/) page, copy+paste some text and play with the parameters to see how it works.
+
+#### Call our web service
+
+You can use our demonstration [Web Service](http://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Web-service) directly from your application.
+
+ curl http://spotlight.dbpedia.org/rest/annotate \
+ --data-urlencode "text=President Obama called Wednesday on Congress to extend a tax break
+ for students included in last year's economic stimulus package, arguing
+ that the policy provides more generous assistance." \
+ --data "confidence=0.2" \
+ --data "support=20"
+
+#### Run your own server
+
+If you need service reliability and lower response times, you can run DBpedia Spotlight in your own [InHouse-Server](http://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/InHouse_Server).
+
+ wget http://spotlight.dbpedia.org/download/release-0.5/dbpedia-spotlight-quickstart.zip
+ unzip dbpedia-spotlight-quickstart.zip
+ cd dbpedia-spotlight-quickstart/
+ ./run.sh
+
+#### Build from source
+
+We provide a [Java/Scala API](http://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Java%2FScala%20API) for you to use our code in your application.
+More info [here](http://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Java%2FScala%20API).
+
+[![Build Status](https://secure.travis-ci.org/dbpedia-spotlight/dbpedia-spotlight.png?branch=master)](http://travis-ci.org/dbpedia-spotlight/dbpedia-spotlight)
+
+## Introduction
+
+DBpedia Spotlight is a tool for automatically annotating mentions of DBpedia resources in text, providing a solution for linking unstructured information sources to the Linked Open Data cloud through DBpedia. DBpedia Spotlight recognizes names of concepts or entities mentioned in text (e.g. "Michael Jordan") and matches these names to unique identifiers (e.g. [dbpedia:Michael_I._Jordan](http://dbpedia.org/page/Michael_I._Jordan), the machine learning professor, or [dbpedia:Michael_Jordan](http://dbpedia.org/page/Michael_Jordan), the basketball player). It can also serve as a building block for [Named Entity Recognition](http://en.wikipedia.org/wiki/Named_entity_recognition), keyphrase extraction, tagging and other information extraction tasks.
+
+Text annotation has the potential of enhancing a wide range of applications, including search, faceted browsing and navigation. By connecting text documents with DBpedia, our system enables a range of interesting use cases. For instance, the ontology can be used as background knowledge to display complementary information on web pages or to enhance information retrieval tasks. Moreover, faceted browsing over documents and customization of web feeds based on semantics become feasible. Finally, by following links from DBpedia into other data sources, the Linked Open Data cloud is pulled closer to the Web of Documents.
+
+Take a look at our [Known Uses](http://dbpedia.org/spotlight/knownuses) page for other examples of how DBpedia Spotlight can be used. If you use DBpedia Spotlight in your project, please add a link to http://spotlight.dbpedia.org. If you use it in a paper, please use the citation available at the end of this page.
+
+You can try out DBpedia Spotlight through our Web Application or Web Service endpoints. The Web Application is a user interface that lets you enter text in a form and generates an HTML version of the text annotated with links to DBpedia. The Web Service endpoints provide programmatic access to the demo, allowing you to retrieve the data in XML or JSON as well.
+## Documentation
+
+We split the documentation according to the depth at which we give explanations. Please feel free to take a look at our:
+ * [User's Manual](http://dbpedia.org/spotlight/usersmanual), if you are not interested in details of how things happen, but you would like to use the system in your website or software project.
+ * [Technical Documentation](http://dbpedia.org/spotlight/technicaldocumentation), if you want to have an overview of technical details before you go into the source code.
+ * [Source code](http://sourceforge.net/projects/dbp-spotlight/), if you really want to know every detail, our source code is open, free and loves to meet new people. ;)
+
+
+## Downloads
+
+DBpedia Spotlight looks for ~3.5M things of ~320 types in text and tries to disambiguate them to their global unique identifiers in DBpedia. Because it uses the entire Wikipedia to learn how to annotate DBpedia resources, the full dataset cannot be distributed alongside the code; it can be downloaded in varied sizes from the [download page](http://dbpedia.org/spotlight/downloads). A tiny dataset is included in the distribution for demonstration purposes only.
+After you've downloaded the files, you need to modify the configuration in server.properties with the correct path to the files. More info [here](https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Installation).
+
+## Licenses
+
+The program can be used under the terms of the [Apache License, 2.0](http://www.apache.org/licenses/LICENSE-2.0.html).
+Part of the code uses [LingPipe](http://alias-i.com/lingpipe/) under the [Royalty Free License](http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt). Therefore, this license also applies to the output of the currently deployed web service.
+
+The documentation on this website is shared under the [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License).
+
+## Citation
+
+If you use this work in your research, please cite:
+
+Pablo N. Mendes, Max Jakob, Andrés García-Silva and Christian Bizer. [DBpedia Spotlight: Shedding Light on the Web of Documents](http://www.wiwiss.fu-berlin.de/en/institute/pwo/bizer/research/publications/Mendes-Jakob-GarciaSilva-Bizer-DBpediaSpotlight-ISEM2011.pdf). *Proceedings of the 7th International Conference on Semantic Systems (I-Semantics)*. Graz, Austria, 7–9 September 2011.
+
+```bibtex
+@inproceedings{isem2011mendesetal,
+ title = {DBpedia Spotlight: Shedding Light on the Web of Documents},
+ author = {Pablo N. Mendes and Max Jakob and Andr\'{e}s Garc\'{i}a-Silva and Christian Bizer},
+ year = {2011},
+ booktitle = {Proceedings of the 7th International Conference on Semantic Systems (I-Semantics)},
+ abstract = {Interlinking text documents with Linked Open Data enables the Web of Data to be used as background knowledge within document-oriented applications such as search and faceted browsing. As a step towards interconnecting the Web of Documents with the Web of Data, we developed DBpedia Spotlight, a system for automatically annotating text documents with DBpedia URIs. DBpedia Spotlight allows users to configure the annotations to their specific needs through the DBpedia Ontology and quality measures such as prominence, topical pertinence, contextual ambiguity and disambiguation confidence. We compare our approach with the state of the art in disambiguation, and evaluate our results in light of three baselines and six publicly available annotation systems, demonstrating the competitiveness of our system. DBpedia Spotlight is shared as open source and deployed as a Web Service freely available for public use.}
+}
+```
+
+The corpus used to evaluate DBpedia Spotlight in this work is described [here](http://wiki.dbpedia.org/spotlight/evaluation).
+
+## Support and Feedback
+The best way to get help with DBpedia Spotlight is to send a message to our [mailing list](https://lists.sourceforge.net/mailman/listinfo/dbp-spotlight-users) at *dbp-spotlight-users@lists.sourceforge.net*.
+
+You can also join the #dbpedia-spotlight IRC channel on Freenode. We also keep an eye on [Tweets](http://search.twitter.com/search.atom?q=+dbpedia+spotlight).
+
+We'd love it if you gave us some feedback.
+
+
+
+## Team
+
+The DBpedia Spotlight team includes the names cited below. Individual contributions are acknowledged in the source code and publications.
+
+#### Maintainers
+[Pablo Mendes](http://www.wiwiss.fu-berlin.de/en/institute/pwo/bizer/team/MendesPablo.html) (Freie Universität Berlin), Jun 2010-present.
+
+[Max Jakob](http://www.wiwiss.fu-berlin.de/en/institute/pwo/bizer/team/JakobMax.html) (Freie Universität Berlin), Jun 2010-Sep 2011, Apr 2012-present.
+
+[Jo Daiber](http://jodaiber.de/) (Charles University in Prague), Mar 2011-present.
+
+Prof. Dr. [Chris Bizer](http://www.wiwiss.fu-berlin.de/en/institute/pwo/bizer/team/BizerChristian.html) (Freie Universität Berlin), supervisor, Jun 2010-present.
+
+#### Collaborators
+[Andrés García-Silva](http://grafias.dia.fi.upm.es/Sem4Tags/about.html) (Universidad Politécnica de Madrid), Jul-Dec 2010.
+
+[Rohana Rajapakse](http://www.linkedin.com/pub/rohana-rajapakse/3/9a1/8) (Goss Interactive Ltd.), Oct-2011.
+
+
+## Acknowledgements
+
+This work has been funded by:
+ * [Neofonie GmbH](http://www.neofonie.de/), a Berlin-based company offering leading technologies in the area of Web search, social media and mobile applications. (Jun 2010-Jun 2011)
+ * The European Commission through the project [LOD2 - Creating Knowledge out of Linked Data](http://lod2.eu/). (Jun 2010-present)
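For readers who prefer calling the hosted service from Java rather than curl, the sketch below issues the same request as the README's curl example (POSTing `text`, `confidence` and `support` to `/rest/annotate`), using only the JDK. It is a minimal illustration, not part of this commit; the `Accept: application/json` header is an assumption about how the service selects its output format, since the README only states that XML and JSON output are available.

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

public class AnnotateClient {
    public static void main(String[] args) throws Exception {
        String text = "President Obama called Wednesday on Congress to extend a tax break for students.";
        // Same endpoint and parameters as the curl example in the README above.
        String form = "text=" + URLEncoder.encode(text, "UTF-8") + "&confidence=0.2&support=20";

        HttpURLConnection conn =
                (HttpURLConnection) new URL("http://spotlight.dbpedia.org/rest/annotate").openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        // Assumption: the output format is chosen via content negotiation (JSON here).
        conn.setRequestProperty("Accept", "application/json");

        OutputStream out = conn.getOutputStream();
        out.write(form.getBytes("UTF-8"));
        out.close();

        BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
            System.out.println(line); // raw response containing the annotated resources
        }
        in.close();
    }
}
```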
3 bin/getSurfaceFormMapFromOccs.sh
@@ -0,0 +1,3 @@
+cat output/occs.uriSorted.tsv | cut -d$'\t' -f 2,3 | perl -F/\\t/ -lane 'print "$F[1]\t$F[0]";' > output/surfaceForms-fromOccs.tsv
+sort output/surfaceForms-fromOccs.tsv | uniq -c > output/surfaceForms-fromOccs.count
+grep -Pv " [123] " output/surfaceForms-fromOccs.count | sed -r "s|\s+[0-9]+\s(.+)|\1|" > output/surfaceForms-fromOccs-thresh3.tsv
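The three lines above extract (surface form, URI) pairs from the occurrence dump, count them, and keep only pairs that occurred together more than three times. For readers who find the cut/perl/grep chain hard to follow, here is a rough Java sketch of the same logic; the column layout (URI in column 2, surface form in column 3) and the file names are taken from the script, everything else is illustrative.

```java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;

public class SurfaceFormThreshold {
    public static void main(String[] args) throws Exception {
        // Count (surfaceForm TAB uri) pairs; columns follow getSurfaceFormMapFromOccs.sh.
        Map<String, Integer> counts = new HashMap<String, Integer>();
        BufferedReader in = new BufferedReader(new FileReader("output/occs.uriSorted.tsv"));
        String line;
        while ((line = in.readLine()) != null) {
            String[] fields = line.split("\t");
            if (fields.length < 3) continue;
            String pair = fields[2] + "\t" + fields[1]; // surface form first, then URI
            Integer c = counts.get(pair);
            counts.put(pair, c == null ? 1 : c + 1);
        }
        in.close();

        // Keep only pairs seen more than 3 times, mirroring the grep threshold above.
        PrintWriter out = new PrintWriter("output/surfaceForms-fromOccs-thresh3.tsv");
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            if (e.getValue() > 3) {
                out.println(e.getKey());
            }
        }
        out.close();
    }
}
```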
55 bin/index.sh
@@ -0,0 +1,55 @@
+# You are expected to run the commands in this script from inside the bin directory in your DBpedia Spotlight installation
+# Adjust the paths below if you run them from somewhere else. This script is meant more as a step-by-step guide than a fully automated run-all.
+# If this is your first time running the script, we advise you to copy/paste commands from here, closely watching the messages
+# and the final output.
+#
+# @author maxjakob, pablomendes
+
+here=`pwd`
+
+INDEX_CONFIG_FILE=../conf/indexing.properties
+
+# the indexing process merges occurrences in memory to speed up the process. the more memory the better
+export JAVA_OPTS="-Xmx14G"
+export MAVEN_OPTS="-Xmx14G"
+export SCALA_OPTS="-Xmx14G"
+
+# you have to run maven2 from the module that contains the indexing classes
+cd ../index
+# the indexing process will generate files in the directory below
+mkdir output
+
+# first step is to extract valid URIs, synonyms and surface forms from DBpedia
+mvn scala:run -DmainClass=org.dbpedia.spotlight.util.ExtractCandidateMap "-DaddArgs=$INDEX_CONFIG_FILE"
+
+# now we collect the parts of the Wikipedia dump where DBpedia resources occur and output those occurrences as Tab-Separated-Values
+mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.ExtractOccsFromWikipedia "-DaddArgs=$INDEX_CONFIG_FILE|output/occs.tsv"
+
+# (recommended) sorting the occurrences by URI will speed up context merging during indexing
+sort -t $'\t' -k2 output/occs.tsv >output/occs.uriSorted.tsv
+
+# create a lucene index out of the occurrences
+mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.IndexMergedOccurrences "-DaddArgs=$INDEX_CONFIG_FILE|output/occs.uriSorted.tsv"
+
+# (optional) make a backup copy of the index before you lose all the time you've put into this
+cp -R output/index output/index-backup
+
+# (optional) preprocess surface forms however you want: produce acronyms, abbreviations, alternative spellings, etc.
+# in the example below we scan paragraphs for uri->sf mappings that occurred together more than 3 times.
+../bin/getSurfaceFormMapFromOccs.sh
+cp output/surfaceForms.tsv output/surfaceForms-fromTitRedDis.tsv
+cat output/surfaceForms-fromTitRedDis.tsv output/surfaceForms-fromOccs.tsv > output/surfaceForms.tsv
+
+# add surface forms to index
+ mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.AddSurfaceFormsToIndex "-DaddArgs=$INDEX_CONFIG_FILE"
+# or
+ mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.CandidateIndexer "-DaddArgs=output/surfaceForms.tsv|output/candidateIndex|3|case-insensitive|overwrite"
+
+# add entity types to index
+mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.AddTypesToIndex "-DaddArgs=$INDEX_CONFIG_FILE"
+
+# (optional) reduce index size by unstoring fields (attention: you won't be able to see contents of fields anymore)
+mvn scala:run -DmainClass=org.dbpedia.spotlight.lucene.index.CompressIndex "-DaddArgs=$INDEX_CONFIG_FILE|10"
+
+# train a linker (the simplest is based on similarity thresholds)
+# mvn scala:run -DmainClass=org.dbpedia.spotlight.evaluation.EvaluateDisambiguationOnly
13 bin/package.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#rm tmp/ -rf
+deb=dbpedia-spotlight-0.4.9.deb
+cd ../rest
+mvn package
+mkdir tmp
+cd tmp
+ar -x ../$deb
+cat debian-binary control.tar.gz data.tar.gz > combined-contents
+gpg -abs -o _gpgorigin combined-contents
+ar rc $deb \
+ _gpgorigin debian-binary control.tar.gz data.tar.gz
+cp $deb ../
1 bin/run.sh
@@ -0,0 +1 @@
+mvn scala:run -DaddArgs=conf/test.properties
5 bin/stopwords.sh
@@ -0,0 +1,5 @@
+# Runs ExtractStopwords.java to get a list of top terms from the index, then cleans it up a bit
+
+#java -jar dbpedia-spotlight.jar ExtractStopwords index CONTEXT 2000 > top-df-terms.set
+cut -f 1 -d " " top-df-terms.set | sed s/CONTEXT:// | egrep -v "[0-9]+" | sort -u > stopwords.set
+
67 conf/indexing.properties
@@ -0,0 +1,67 @@
+# Wikipedia Dump
+# --------------
+org.dbpedia.spotlight.data.wikipediaDump = /media/Data/Wikipedia/enwiki-20110722-pages-articles.xml
+
+# Location for the DBpedia resources index (output)
+org.dbpedia.spotlight.index.dir = /media/Data/Wikipedia/index
+
+# DBpedia Datasets
+# ----------------
+org.dbpedia.spotlight.data.labels = /media/Data/Wikipedia/labels_en.nt
+org.dbpedia.spotlight.data.redirects = /media/Data/Wikipedia/redirects_en.nt
+org.dbpedia.spotlight.data.disambiguations = /media/Data/Wikipedia/disambiguations_en.nt
+org.dbpedia.spotlight.data.instanceTypes = /media/Data/Wikipedia/instance_types_en.nt
+org.dbpedia.spotlight.data.sortedArticlesCategories = /media/Data/Wikipedia/sorted.article_categories_en.nt
+org.dbpedia.spotlight.data.categories=/media/Data/Wikipedia/skos_categories_en.nt
+org.dbpedia.spotlight.data.concepts=/media/Data/Wikipedia/topical_concepts.nt
+
+
+# Files created from DBpedia Datasets
+# -----------------------
+org.dbpedia.spotlight.data.conceptURIs = output/conceptURIs.list
+org.dbpedia.spotlight.data.redirectsTC = output/redirects_tc.tsv
+org.dbpedia.spotlight.data.surfaceForms = output/surfaceForms.tsv
+
+# Language-specific config
+# --------------
+org.dbpedia.spotlight.language = English
+org.dbpedia.spotlight.lucene.analyzer = SnowballAnalyzer
+
+# Stop word list
+org.dbpedia.spotlight.data.stopWords.english = /media/Data/Wikipedia/stopwords.en.list
+#org.dbpedia.spotlight.data.stopWords.portuguese = /data/spotlight/3.6/pt/stopwords.pt.list
+#org.dbpedia.spotlight.data.stopWords.spanish = /data/spotlight/3.6/es/stopwords.es.list
+
+# URI patterns that should not be indexed. e.g. List_of_*
+org.dbpedia.spotlight.data.badURIs.english = /media/Data/Wikipedia/blacklistedURIPatterns.en.list
+
+# Will discard surface forms that are too long (reduces the complexity of spotting and the overall size on disk and in memory)
+org.dbpedia.spotlight.data.maxSurfaceFormLength = 50
+# Will index only words closest to resource occurrence
+org.dbpedia.spotlight.data.maxContextWindowSize = 200
+org.dbpedia.spotlight.data.minContextWindowSize = 0
+
+# Other files
+org.dbpedia.spotlight.data.priors = /home/pablo/eval/grounder/gold/g1b_spotlight.words.uris.counts
+
+# Yahoo! Boss properties
+# ----------------------
+# application ID
+org.dbpedia.spotlight.yahoo.appID =
+# number of results returned for one query (maximum: 50)
+org.dbpedia.spotlight.yahoo.maxResults = 50
+# number of iterations; each iteration returns YahooBossResults results
+org.dbpedia.spotlight.yahoo.maxIterations = 100
+## important for Yahoo! Boss query string: both language and region must be set according to
+## http://developer.yahoo.com/search/boss/boss_guide/supp_regions_lang.html
+org.dbpedia.spotlight.yahoo.language = en
+org.dbpedia.spotlight.yahoo.region = us
+
+# Topic configurations
+# -----------------------
+org.dbpedia.spotlight.topic.dictionary=/media/Data/Wikipedia/Dictionary/model.word_id.dict
+org.dbpedia.spotlight.topic.categories.dictionary=/media/Data/Wikipedia/Dictionary/cluster.topic.dict
+org.dbpedia.spotlight.topic.flattenedHierarchy=/media/Data/Wikipedia/FlattenedHierarchyByTopics
+org.dbpedia.spotlight.topic.info=/home/dirk/workspace/dbpedia-spotlight/index/src/main/resources/topic_descriptions.xml
+
+org.dbpedia.spotlight.topic.dictionary.maxsize=128000
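The indexing configuration above is a plain Java properties file (bin/index.sh passes its path to the indexing classes). As a hedged illustration only, not the project's own configuration API, the keys can be read with `java.util.Properties`; the two keys below are copied verbatim from the file.

```java
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

public class ReadIndexingConfig {
    public static void main(String[] args) throws IOException {
        Properties config = new Properties();
        // Path taken from bin/index.sh (INDEX_CONFIG_FILE=../conf/indexing.properties).
        FileInputStream in = new FileInputStream("../conf/indexing.properties");
        config.load(in);
        in.close();

        // Keys copied verbatim from conf/indexing.properties above.
        String wikipediaDump = config.getProperty("org.dbpedia.spotlight.data.wikipediaDump");
        String indexDir = config.getProperty("org.dbpedia.spotlight.index.dir");
        System.out.println("Wikipedia dump: " + wikipediaDump);
        System.out.println("Index output dir: " + indexDir);
    }
}
```

The same applies to conf/server.properties further below, which uses the identical `key = value` format.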
44 conf/indexing.properties.default
@@ -0,0 +1,44 @@
+# Maximum heap space for indexing
+# -------------------------------
+org.dbpedia.spotlight.index.heapspace = 2g
+
+
+# Wikipedia Dump
+# --------------
+org.dbpedia.spotlight.data.wikipediaDump = data/enwiki-20100312-pages-articles.xml
+org.dbpedia.spotlight.index.occurrences =
+org.dbpedia.spotlight.index.dir =
+
+
+# DBpedia Datasets
+# ----------------
+org.dbpedia.spotlight.data.labels = data/dbpedia/labels_en.nt
+org.dbpedia.spotlight.data.redirects = data/dbpedia/redirects_en.nt
+org.dbpedia.spotlight.data.disambiguations = data/dbpedia/disambiguations_en.nt
+org.dbpedia.spotlight.data.instanceTypes = data/dbpedia/instance_types_en.nt
+
+
+# Important created files
+# -----------------------
+org.dbpedia.spotlight.data.conceptURIs = data/conceptURIs.list
+org.dbpedia.spotlight.data.redirectsTC = data/redirects_tc.tsv
+org.dbpedia.spotlight.data.surfaceForms = data/surface_forms-Wikipedia-TitRedDis.tsv
+
+
+# Stop word list
+# --------------
+org.dbpedia.spotlight.data.stopWords = data/stopword.list
+
+
+# Yahoo! Boss properties
+# ----------------------
+# application ID
+org.dbpedia.spotlight.yahoo.appID = please-specify-if-you-want-to-experiment-with-WebOccurrences!
+# number of results returned for one query (maximum: 50)
+org.dbpedia.spotlight.yahoo.maxResults = 50
+# number of iterations; each iteration returns YahooBossResults results
+org.dbpedia.spotlight.yahoo.maxIterations = 100
+## important for Yahoo! Boss query string: both language and region must be set according to
+## http://developer.yahoo.com/search/boss/boss_guide/supp_regions_lang.html
+org.dbpedia.spotlight.yahoo.language = en
+org.dbpedia.spotlight.yahoo.region = us
106 conf/server.properties
@@ -0,0 +1,106 @@
+
+#
+# Copyright 2011 DBpedia Spotlight Development Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
+#
+
+# Server hostname and port to be used by DBpedia Spotlight REST API
+org.dbpedia.spotlight.web.rest.uri = http://localhost:2222/rest
+
+# Internationalization (i18n) support -- work in progress
+org.dbpedia.spotlight.default_namespace = http://dbpedia.org/resource/
+# Defines the languages the system should support.
+org.dbpedia.spotlight.language = English
+# Stop word list
+# An example can be downloaded from: http://spotlight.dbpedia.org/download/release-0.4/stopwords.en.list
+org.dbpedia.spotlight.data.stopWords.english = stopwords.en.list
+
+#----- SPOTTING -------
+
+# Comma-separated list of spotters to load.
+# Accepted values are LingPipeSpotter,WikiMarkupSpotter,AtLeastOneNounSelector,CoOccurrenceBasedSelector,NESpotter,OpenNLPNGramSpotter,OpenNLPChunkerSpotter,KeaSpotter
+# Some spotters may require extra files and config parameters. See org.dbpedia.spotlight.model.SpotterConfiguration
+org.dbpedia.spotlight.spot.spotters = LingPipeSpotter,WikiMarkupSpotter,NESpotter
+
+# Path to serialized LingPipe dictionary used by LingPipeSpotter
+org.dbpedia.spotlight.spot.dictionary = dist/src/deb/control/data/usr/share/dbpedia-spotlight/spotter.dict
+jcs.default.cacheattributes.MaxObjects = 5000
+org.dbpedia.spotlight.tagging.hmm = dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
+
+# Configurations for the CoOccurrenceBasedSelector
+# From: http://spotlight.dbpedia.org/download/release-0.5/spot_selector.tgz
+org.dbpedia.spotlight.spot.cooccurrence.datasource = ukwac
+org.dbpedia.spotlight.spot.cooccurrence.database.jdbcdriver = org.hsqldb.jdbcDriver
+org.dbpedia.spotlight.spot.cooccurrence.database.connector = jdbc:hsqldb:file:/fastdata/spotlight/3.7/spotsel/ukwac_candidate;shutdown=true&readonly=true
+org.dbpedia.spotlight.spot.cooccurrence.database.user = sa
+org.dbpedia.spotlight.spot.cooccurrence.database.password =
+org.dbpedia.spotlight.spot.cooccurrence.classifier.unigram = /fastdata/spotlight/3.7/spotsel/ukwac_unigram.model
+org.dbpedia.spotlight.spot.cooccurrence.classifier.ngram = /fastdata/spotlight/3.7/spotsel/ukwac_ngram.model
+
+# Path to serialized HMM model for LingPipe-based POS tagging. Required by AtLeastOneNounSelector and CoOccurrenceBasedSelector
+org.dbpedia.spotlight.tagging.hmm = dist/src/deb/control/data/usr/share/dbpedia-spotlight/pos-en-general-brown.HiddenMarkovModel
+
+# Path to dir containing several OpenNLP models for NER, chunking, etc. This is required for spotters that are based on OpenNLP.
+# Can be downloaded from http://spotlight.dbpedia.org/download/release-0.5/opennlp_models.tgz
+org.dbpedia.spotlight.spot.opennlp.dir = /data/spotlight/3.7/opennlp
+
+# EXPERIMENTAL! Path to Kea Model
+org.dbpedia.spotlight.spot.kea.model = /data/spotlight/3.7/kea/keaModel-1-3-1
+
+
+#----- CANDIDATE SELECTION -------
+
+# Choose between jdbc and lucene for DBpedia Resource creation. If jdbc throws an error, lucene will be used as a fallback.
+org.dbpedia.spotlight.core.database = jdbc
+org.dbpedia.spotlight.core.database.jdbcdriver = org.hsqldb.jdbcDriver
+org.dbpedia.spotlight.core.database.connector = jdbc:hsqldb:file:/data/spotlight/3.7/database/spotlight-db;shutdown=true&readonly=true
+org.dbpedia.spotlight.core.database.user = sa
+org.dbpedia.spotlight.core.database.password =
+
+# From http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
+org.dbpedia.spotlight.candidateMap.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+# Path to Lucene index containing only the candidate map. It is used by document-oriented disambiguators such as Document,TwoStepDisambiguator
+# Only used if one such disambiguator is loaded. Data is at: http://spotlight.dbpedia.org/download/release-0.5/candidate-index-full.tgz
+#org.dbpedia.spotlight.candidateMap.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+
+
+#----- DISAMBIGUATION -------
+
+# List of disambiguators to load: Document,Occurrences,CuttingEdge,Default
+org.dbpedia.spotlight.disambiguate.disambiguators = Default,Document
+
+# Path to a directory containing Lucene index files. These can be downloaded from the website or created by org.dbpedia.spotlight.lucene.index.IndexMergedOccurrences
+org.dbpedia.spotlight.index.dir = dist/src/deb/control/data/usr/share/dbpedia-spotlight/index
+# Class used to process context around DBpedia mentions (tokenize, stem, etc.)
+org.dbpedia.spotlight.lucene.analyzer = SnowballAnalyzer
+# Maximum size of the cache used by the ICFDisambiguator.
+jcs.default.cacheattributes.MaxObjects = 5000
+
+
+#----- LINKING / FILTERING -------
+
+# Configuration for SparqlFilter
+org.dbpedia.spotlight.sparql.endpoint = http://dbpedia.org/sparql
+org.dbpedia.spotlight.sparql.graph = http://dbpedia.org
+
+# Topical classification properties
+org.dbpedia.spotlight.topic.model.type=org.dbpedia.spotlight.topic.WekaSingleLabelClassifier
+org.dbpedia.spotlight.topic.model.path=/home/dirk/GSOC2012/model/model3.dat
+org.dbpedia.spotlight.topic.topics.info=/home/dirk/GSOC2012/model/category3.info
+org.dbpedia.spotlight.topic.dictionary=/home/dirk/GSOC2012/model/model3.word_id.dict
+
+org.dbpedia.spotlight.topic.dictionary.maxsize=128000
+org.dbpedia.spotlight.topic.priors=/media/Data/Wikipedia/counts
9 conf/server.properties.default
@@ -0,0 +1,9 @@
+
+org.dbpedia.spotlight.web.rest.uri = http://localhost:2222/rest
+org.dbpedia.spotlight.index.dir = data/index
+org.dbpedia.spotlight.spot.dictionary = data/default.dict
+jcs.default.cacheattributes.MaxObjects = 5000
+
+org.dbpedia.spotlight.sparql.endpoint = http://dbpedia.org/sparql
+org.dbpedia.spotlight.sparql.graph = http://dbpedia.org
+
307 core/pom.xml
@@ -0,0 +1,307 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ~ Copyright 2012 DBpedia Spotlight Development Team
+ ~
+ ~ Licensed under the Apache License, Version 2.0 (the "License");
+ ~ you may not use this file except in compliance with the License.
+ ~ You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ ~
+ ~ Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.dbpedia.spotlight</groupId>
+ <artifactId>spotlight</artifactId>
+ <version>0.6</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>core</artifactId>
+ <packaging>jar</packaging>
+ <name>DBpedia Spotlight Core</name>
+
+ <build>
+ <plugins>
+ <plugin>
+
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-install-plugin</artifactId>
+ <version>${maven.install.plugin.version}</version>
+ <executions>
+ <execution>
+ <id>install-nx-parser-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>org.semanticweb.yars</groupId>
+ <artifactId>nx-parser</artifactId>
+ <version>1.1</version>
+ <packaging>jar</packaging>
+ <file>${basedir}/../lib/nxparser-1.1.jar</file>
+ </configuration>
+ </execution>
+ <execution>
+ <id>install-lingpipe-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>com.aliasi</groupId>
+ <artifactId>lingpipe</artifactId>
+ <version>4.0.0</version>
+ <packaging>jar</packaging>
+ <file>${basedir}/../lib/lingpipe-4.0.0.jar</file>
+ </configuration>
+ </execution>
+ <execution>
+ <id>install-cloud9-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>edu.umd</groupId>
+ <artifactId>cloud9</artifactId>
+ <version>SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <file>${basedir}/../lib/cloud9.jar</file>
+ </configuration>
+ </execution>
+ <!--execution>
+ <id>install-lucene-queries-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queries</artifactId>
+ <version>3.6.0</version>
+ <packaging>jar</packaging>
+ <file>/home/pablo/workspace/lucene-3.6.0/build/contrib/queries/lucene-queries-3.6-SNAPSHOT.jar</file>
+ </configuration>
+ </execution-->
+ <execution>
+ <id>install-kea-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>edu.umd</groupId>
+ <artifactId>cloud9</artifactId>
+ <version>SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <file>${basedir}/../lib/kea-goss-5.0-weka-3.7.3-SNAPSHOT.jar</file>
+ </configuration>
+ </execution>
+ <execution>
+ <id>install-weka-jar</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>install-file</goal>
+ </goals>
+ <configuration>
+ <groupId>weka</groupId>
+ <artifactId>weka</artifactId>
+ <version>3.7.3</version>
+ <packaging>jar</packaging>
+ <file>${basedir}/../lib/weka-trunk.jar</file>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
+
+ <properties>
+ <spotlight.basedir>${project.basedir}/..</spotlight.basedir>
+ </properties>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>3.6.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers</artifactId>
+ <version>3.6.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-misc</artifactId>
+ <version>3.6.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queries</artifactId>
+ <version>3.6.0</version>
+ <type>jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-phonetic</artifactId>
+ <version>3.6.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>jcs</groupId>
+ <artifactId>jcs</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>xerces</groupId>
+ <artifactId>xerces</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>xerces</groupId>
+ <artifactId>xercesImpl</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>nekohtml</groupId>
+ <artifactId>nekohtml</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.semanticweb.yars</groupId>
+ <artifactId>nx-parser</artifactId>
+ <version>1.1</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.aliasi</groupId>
+ <artifactId>lingpipe</artifactId>
+ <version>4.0.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>edu.umd</groupId>
+ <artifactId>cloud9</artifactId>
+ <version>SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>weka</groupId>
+ <artifactId>kea-goss-weka</artifactId>
+ <version>5.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>weka</groupId>
+ <artifactId>weka</artifactId>
+ <version>3.7.3</version>
+ </dependency>
+
+ <dependency>
+ <groupId>net.liftweb</groupId>
+ <artifactId>lift-json_2.9.1</artifactId>
+ <version>2.4</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.2</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.scalaj</groupId>
+ <artifactId>scalaj-collection_2.9.1</artifactId>
+ <version>1.2</version>
+ </dependency>
+
+ <dependency>
+ <groupId>net.sf.opencsv</groupId>
+ <artifactId>opencsv</artifactId>
+ <version>2.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>hsqldb</groupId>
+ <artifactId>hsqldb</artifactId>
+ <version>1.8.0.1</version>
+ </dependency>
+
+ <!-- this dependency can be removed when experiments with mongodb are
+ indeed finished-->
+ <!--dependency>
+ <groupId>com.mongodb</groupId>
+ <artifactId>mongo</artifactId>
+ <version>1.0</version>
+ </dependency-->
+
+ <dependency>
+ <groupId>it.unimi.dsi</groupId>
+ <artifactId>fastutil</artifactId>
+ <version>6.3</version>
+ </dependency>
+
+ <dependency>
+ <groupId>opennlp</groupId>
+ <artifactId>maxent</artifactId>
+ <version>3.0.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.5.1-incubating</version>
+ </dependency>
+
+ <!--dependency>
+ <groupId>opennlp</groupId>
+ <artifactId>tools</artifactId>
+ <version>1.5.0</version>
+ </dependency-->
+
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ </dependency>
+
+ </dependencies>
+
+</project>
39 core/src/main/java/org/dbpedia/spotlight/annotate/Annotator.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.annotate;
+
+import org.dbpedia.spotlight.disambiguate.Disambiguator;
+import org.dbpedia.spotlight.exceptions.InputException;
+import org.dbpedia.spotlight.exceptions.SearchException;
+import org.dbpedia.spotlight.spot.Spotter;
+import org.dbpedia.spotlight.model.DBpediaResourceOccurrence;
+
+import java.util.List;
+
+/**
+ * Interface for annotators.
+ */
+
+public interface Annotator {
+
+ public List<DBpediaResourceOccurrence> annotate(String text) throws SearchException, InputException;
+
+ public Disambiguator disambiguator();
+
+ public Spotter spotter();
+
+}
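Since no concrete Annotator implementation appears in the truncated diff, the following is a hedged sketch of how client code might drive the interface above; the implementation is taken as a parameter, and `similarityScore()` is the accessor used elsewhere in this commit (see CustomScoresDisambiguator below).

```java
package org.dbpedia.spotlight.examples; // hypothetical package, for illustration only

import java.util.List;

import org.dbpedia.spotlight.annotate.Annotator;
import org.dbpedia.spotlight.exceptions.InputException;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.model.DBpediaResourceOccurrence;

public class AnnotatorExample {
    // Prints the DBpedia resources found in the given text, one per line.
    public static void printAnnotations(Annotator annotator, String text)
            throws SearchException, InputException {
        List<DBpediaResourceOccurrence> occurrences = annotator.annotate(text);
        for (DBpediaResourceOccurrence occ : occurrences) {
            System.out.println(occ + " (score: " + occ.similarityScore() + ")");
        }
    }
}
```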
43 core/src/main/java/org/dbpedia/spotlight/annotate/ParagraphAnnotator.java
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.annotate;
+
+import org.dbpedia.spotlight.disambiguate.Disambiguator;
+import org.dbpedia.spotlight.disambiguate.ParagraphDisambiguator;
+import org.dbpedia.spotlight.disambiguate.ParagraphDisambiguatorJ;
+import org.dbpedia.spotlight.exceptions.InputException;
+import org.dbpedia.spotlight.exceptions.SearchException;
+import org.dbpedia.spotlight.model.DBpediaResourceOccurrence;
+import org.dbpedia.spotlight.spot.Spotter;
+
+import java.util.List;
+
+/**
+ * Interface for document-centric annotators.
+ */
+
+public interface ParagraphAnnotator {
+
+ //TODO should this return AnnotatedText instead?
+ public List<DBpediaResourceOccurrence> annotate(String text) throws SearchException, InputException;
+
+ //TODO this java/scala incompatibility here has to be worked out
+ public ParagraphDisambiguatorJ disambiguator();
+
+ public Spotter spotter();
+
+}
180 core/src/main/java/org/dbpedia/spotlight/disambiguate/CustomScoresDisambiguator.java
@@ -0,0 +1,180 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.disambiguate;
+
+import com.google.common.collect.Ordering;
+import com.google.common.primitives.Doubles;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.store.FSDirectory;
+import org.dbpedia.spotlight.exceptions.InputException;
+import org.dbpedia.spotlight.exceptions.ItemNotFoundException;
+import org.dbpedia.spotlight.exceptions.SearchException;
+import org.dbpedia.spotlight.exceptions.TimeoutException;
+import org.dbpedia.spotlight.io.DataLoader;
+import org.dbpedia.spotlight.lucene.search.LuceneCandidateSearcher;
+import org.dbpedia.spotlight.model.*;
+import org.dbpedia.spotlight.lucene.LuceneManager;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Preliminary class to take a list of weights for each DBpediaResource as the only cue to decide which surrogate to choose for a given surface form occurrence.
+ * For example, this has been used to get a list of prior probabilities computed offline in a Hadoop/Pig job.
+ *
+ * @author pablomendes
+ */
+public class CustomScoresDisambiguator implements Disambiguator {
+
+ Log LOG = LogFactory.getLog(this.getClass());
+
+ Map<String,Double> scores = new HashMap<String,Double>();
+
+ CandidateSearcher surrogateSearcher;
+
+ public CustomScoresDisambiguator(CandidateSearcher surrogates, DataLoader loader) {
+ this.surrogateSearcher = surrogates;
+ if (loader!=null)
+ scores = loader.loadPriors();
+ LOG.debug(loader+": "+ scores.size()+" scores loaded.");
+ }
+
+ public List<SurfaceFormOccurrence> spotProbability(List<SurfaceFormOccurrence> sfOccurrences) {
+ return sfOccurrences; //FIXME IMPLEMENT
+ }
+
+ @Override
+ public DBpediaResourceOccurrence disambiguate(SurfaceFormOccurrence sfOccurrence) throws SearchException, ItemNotFoundException, InputException {
+ List<DBpediaResourceOccurrence> candidates = bestK(sfOccurrence, 1);
+ if (candidates.size()==0)
+ throw new ItemNotFoundException(sfOccurrence.surfaceForm()+" was not found in the index.");
+ return candidates.get(0);
+ }
+
+ public List<DBpediaResourceOccurrence> disambiguate(List<SurfaceFormOccurrence> sfOccurrences) throws SearchException, InputException {
+ List<DBpediaResourceOccurrence> disambiguated = new ArrayList<DBpediaResourceOccurrence>();
+ for (SurfaceFormOccurrence sfOcc: sfOccurrences) {
+ try {
+ disambiguated.add(disambiguate(sfOcc));
+ } catch (TimeoutException e) {
+ LOG.error("Could not disambiguate. Surface form took too long: "+sfOcc.surfaceForm()+": "+e);
+ } catch (ItemNotFoundException e) {
+ LOG.error("Could not disambiguate. Surface form not found: "+sfOcc.surfaceForm()+": "+e);
+ }
+ }
+ return disambiguated;
+ }
+
+ @Override
+ public List<DBpediaResourceOccurrence> bestK(SurfaceFormOccurrence sfOccurrence, int k) throws SearchException, ItemNotFoundException {
+ Set<DBpediaResource> candidates = surrogateSearcher.getCandidates(sfOccurrence.surfaceForm());
+
+ if (candidates.size()==0)
+ return new LinkedList<DBpediaResourceOccurrence>();
+
+ List<DBpediaResourceOccurrence> all = getScores(sfOccurrence, candidates);
+
+ Ordering descOrder = new Ordering<DBpediaResourceOccurrence>() {
+ public int compare(DBpediaResourceOccurrence left, DBpediaResourceOccurrence right) {
+ return Doubles.compare(right.similarityScore(), left.similarityScore());
+
+ }
+ };
+
+ return descOrder.sortedCopy(all).subList(0, Math.min(k, all.size()));
+ }
+
+ protected List<DBpediaResourceOccurrence> getScores(SurfaceFormOccurrence sfOccurrence, Set<DBpediaResource> candidates) {
+ List<DBpediaResourceOccurrence> occurrences = new ArrayList<DBpediaResourceOccurrence>();
+ try {
+ for(DBpediaResource r: candidates) {
+ Double score = scores.get(r);
+ if (score ==null) {
+ LOG.debug("No score found for URI: "+r);
+ score = 0.0;
+ }
+ DBpediaResourceOccurrence occ = new DBpediaResourceOccurrence(r,
+ sfOccurrence.surfaceForm(),
+ sfOccurrence.context(),
+ sfOccurrence.textOffset(),
+ score);
+ occurrences.add(occ);
+ }
+ } catch (NullPointerException e2) {
+ LOG.error("NullPointerException here. Resource: "+candidates);
+ }
+ return occurrences;
+ }
+
+
+
+
+ public static void main(String[] args) throws IOException {
+ String luceneIndexFileName = "data/apple-example/LuceneIndex-apple50_test";
+ String resourcePriorsFileName = "data/apple-example/3apples-scores.tsv";
+
+ // Lucene Manager - Controls indexing and searching
+ LuceneManager luceneManager = new LuceneManager(FSDirectory.open(new File(luceneIndexFileName)));
+
+ try {
+ new CustomScoresDisambiguator(new LuceneCandidateSearcher(luceneManager, false), new DataLoader(new DataLoader.TSVParser(), new File("data/Distinct-surfaceForm-By-uri.grouped")));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+
+ @Override
+ public String name() {
+ return this.getClass().getSimpleName();
+ }
+
+ @Override
+ public int support(DBpediaResource resource) throws SearchException {
+ // for the WikiPageContext, the training size is always 1 page per resource
+ return 1;
+ }
+
+ @Override
+ public List<Explanation> explain(DBpediaResourceOccurrence goldStandardOccurrence, int nExplanations) throws SearchException {
+ throw new SearchException("Not implemented yet.");
+ }
+
+ @Override
+ public int ambiguity(SurfaceForm sf) throws SearchException {
+ int s = 0;
+ try {
+ s = surrogateSearcher.getCandidates(sf).size();
+ } catch (ItemNotFoundException e) {
+ s = 0; // surface form not found
+ }
+ return s;
+ }
+
+ @Override
+ public int contextTermsNumber(DBpediaResource resource) throws SearchException {
+ return 0; // prior works without context
+ }
+
+ @Override
+ public double averageIdf(Text context) throws IOException {
+ throw new IOException(this.getClass()+" has no index available to calculate averageIdf");
+ }
+}
103 core/src/main/java/org/dbpedia/spotlight/disambiguate/Disambiguator.java
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.disambiguate;
+
+import org.apache.lucene.search.Explanation;
+import org.dbpedia.spotlight.exceptions.InputException;
+import org.dbpedia.spotlight.exceptions.ItemNotFoundException;
+import org.dbpedia.spotlight.exceptions.SearchException;
+import org.dbpedia.spotlight.model.*;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Interface for occurrence-based disambiguators.
+ * TODO consider renaming to OccurrenceDisambiguator
+ * @author pablomendes
+ * @author maxjakob
+ */
+public interface Disambiguator {
+
+ public List<SurfaceFormOccurrence> spotProbability(List<SurfaceFormOccurrence> sfOccurrences) throws SearchException;
+
+ /**
+ * Executes disambiguation per individual occurrence.
+ * Can be seen as a classification task: unlabeled instance in, labeled instance out.
+ *
+ * @param sfOccurrence
+ * @return
+ * @throws SearchException
+ * @throws ItemNotFoundException
+ * @throws InputException
+ */
+ public DBpediaResourceOccurrence disambiguate(SurfaceFormOccurrence sfOccurrence) throws SearchException, ItemNotFoundException, InputException; //TODO DisambiguationException
+
+ /**
+ * Executes disambiguation per paragraph (collection of occurrences).
+ * Can be seen as a classification task: unlabeled instances in, labeled instances out.
+ *
+ * @param sfOccurrences
+ * @return
+ * @throws SearchException
+ * @throws InputException
+ */
+ public List<DBpediaResourceOccurrence> disambiguate(List<SurfaceFormOccurrence> sfOccurrences) throws SearchException, InputException; //TODO DisambiguationException
+
+
+ /**
+ * Executes disambiguation per occurrence, returns a list of possible candidates.
+ * Can be seen as a ranking (rather than classification) task: query instance in, ranked list of target URIs out.
+ *
+ * @param sfOccurrence
+ * @param k
+ * @return
+ * @throws SearchException
+ * @throws ItemNotFoundException
+ * @throws InputException
+ */
+ //TODO consider moving this to CandidateSelector / CandidateSearcher interface
+ public List<DBpediaResourceOccurrence> bestK(SurfaceFormOccurrence sfOccurrence, int k) throws SearchException, ItemNotFoundException, InputException;
+
+ /**
+ * Every disambiguator has a name that describes its settings (used in evaluation to compare results)
+ * @return a short description of the Disambiguator
+ */
+ public String name();
+
+ /**
+ * Every disambiguator should know how to measure the ambiguity of a surface form.
+ * @param sf
+ * @return ambiguity of surface form (number of candidates)
+ */
+ public int ambiguity(SurfaceForm sf) throws SearchException;
+
+ /**
+ * Counts how many occurrences we indexed for a given URI. (size of training set for that URI)
+ * @param resource
+ * @return
+ * @throws SearchException
+ */
+ public int support(DBpediaResource resource) throws SearchException;
+
+ public List<Explanation> explain(DBpediaResourceOccurrence goldStandardOccurrence, int nExplanations) throws SearchException;
+
+ public int contextTermsNumber(DBpediaResource resource) throws SearchException;
+
+ public double averageIdf(Text context) throws IOException;
+
+}
84 core/src/main/java/org/dbpedia/spotlight/disambiguate/RandomDisambiguator.java
@@ -0,0 +1,84 @@
+package org.dbpedia.spotlight.disambiguate;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dbpedia.spotlight.model.*;
+
+import java.util.*;
+
+/**
+ * Randomly picks one of the candidate URIs for a surface form. Used in evaluation as a baseline.
+ * Uses an approximate Gaussian distribution.
+ *
+ * @author pablomendes
+ */
+public class RandomDisambiguator extends CustomScoresDisambiguator {
+
+ Log LOG = LogFactory.getLog(this.getClass());
+ RandomGaussian gaussian = new RandomGaussian();
+
+ public RandomDisambiguator(CandidateSearcher surrogates) {
+ super(surrogates, null);
+ }
+
+ /**
+ * Overrides the CustomScoresDisambiguator.getScores method in order to generate random scores instead of getting from a map.
+ * @param candidates
+ * @return
+ */
+ protected List<DBpediaResourceOccurrence> getScores(SurfaceFormOccurrence sfOccurrence, Set<DBpediaResource> candidates) {
+ List<DBpediaResourceOccurrence> occurrences = new ArrayList<DBpediaResourceOccurrence>();
+ for (DBpediaResource c: candidates) {
+ Double score = gaussian.getGaussian();
+ DBpediaResourceOccurrence occ = new DBpediaResourceOccurrence(c,
+ sfOccurrence.surfaceForm(),
+ sfOccurrence.context(),
+ sfOccurrence.textOffset(),
+ score);
+ occurrences.add(occ);
+ }
+ return occurrences;
+ }
+
+ /**
+ Generate pseudo-random floating point values, with an
+ approximately Gaussian (normal) distribution.
+
+ Many physical measurements have an approximately Gaussian
+ distribution; this provides a way of simulating such values.
+ http://www.javapractices.com/topic/TopicAction.do?Id=62
+ */
+ public static final class RandomGaussian {
+
+ double mMean = 100.0f;
+ double mVariance = 5.0f;
+
+ private Random fRandom = new Random();
+
+ public RandomGaussian() {}
+
+ public RandomGaussian(double aMean, double aVariance) {
+ mMean = aMean;
+ mVariance = aVariance;
+ }
+
+ private double getGaussian(){
+ return mMean + fRandom.nextGaussian() * mVariance;
+ }
+
+ private double getGaussian(double aMean, double aVariance){
+ return aMean + fRandom.nextGaussian() * aVariance;
+ }
+
+ }
+
+ public static void main(String... aArgs){
+ RandomGaussian gaussian = new RandomGaussian();
+ double MEAN = 100.0f;
+ double VARIANCE = 5.0f;
+ for (int idx = 1; idx <= 10; ++idx){
+ System.out.println(String.format("Generated %s ", gaussian.getGaussian(MEAN, VARIANCE)));
+ }
+ }
+
+}
38 core/src/main/java/org/dbpedia/spotlight/exceptions/AnnotationException.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Used by (external) annotation clients to communicate an error when trying to annotate text.
+ *
+ * @author pablomendes
+ */
+public class AnnotationException extends Exception {
+
+ public AnnotationException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public AnnotationException(String msg) {
+ super(msg);
+ }
+
+ public AnnotationException(Exception e) {
+ super(e);
+ }
+
+}
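
Editor's note: several of the exceptions that follow (AuthenticationException, InputException, OutputException, SparqlExecutionException) extend AnnotationException, so client code can handle them uniformly. A minimal sketch, assuming a hypothetical annotate() helper that is not part of this commit:

    import org.dbpedia.spotlight.exceptions.AnnotationException;
    import org.dbpedia.spotlight.exceptions.InputException;

    public class ClientSketch {
        // hypothetical client method, shown only to illustrate the exception hierarchy
        static String annotate(String text) throws AnnotationException {
            if (text == null || text.trim().isEmpty())
                throw new InputException("Text is empty or too short.");
            return text; // a real client would call the annotation service here
        }

        public static void main(String[] args) {
            try {
                annotate("");
            } catch (InputException e) {
                System.err.println("Bad input: " + e.getMessage());          // specific handling first
            } catch (AnnotationException e) {
                System.err.println("Annotation failed: " + e.getMessage());  // catch-all for the hierarchy
            }
        }
    }
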
38 core/src/main/java/org/dbpedia/spotlight/exceptions/AuthenticationException.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when the user attempts an unauthorized action.
+ *
+ * @author pablomendes
+ */
+public class AuthenticationException extends AnnotationException {
+
+ public AuthenticationException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public AuthenticationException(String msg) {
+ super(msg);
+ }
+
+ public AuthenticationException(Exception e) {
+ super(e);
+ }
+
+}
32 core/src/main/java/org/dbpedia/spotlight/exceptions/ConfigurationException.java
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Indicates that an error occurred while reading the configuration parameters used to initialize objects.
+ * @author pablomendes
+ */
+public class ConfigurationException extends Exception {
+
+ public ConfigurationException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public ConfigurationException(String msg) {
+ super(msg);
+ }
+}
37 core/src/main/java/org/dbpedia/spotlight/exceptions/DisambiguationException.java
@@ -0,0 +1,37 @@
+/*
+ * *
+ * * Copyright 2011 Pablo Mendes, Max Jakob
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when disambiguating entities fails.
+ * If the error occurs while searching the index, a {@link SearchException} is thrown instead.
+ *
+ * @author pablomendes
+ */
+public class DisambiguationException extends Exception {
+
+ public DisambiguationException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public DisambiguationException(String msg) {
+ super(msg);
+ }
+
+}
37 core/src/main/java/org/dbpedia/spotlight/exceptions/IndexException.java
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown while attempting to build or modify the index.
+ * @author pablomendes
+ */
+public class IndexException extends Exception {
+
+ public IndexException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public IndexException(String msg) {
+ super(msg);
+ }
+
+ public IndexException(Exception e) {
+ super(e);
+ }
+
+}
35 core/src/main/java/org/dbpedia/spotlight/exceptions/InitializationException.java
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Indicates that an error occurred during the initialization of a component;
+ * a possible cause is incorrect configuration.
+ * TODO consider merging with ConfigurationException
+ *
+ * @author jodaiber
+ */
+public class InitializationException extends Exception {
+
+ public InitializationException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public InitializationException(String msg) {
+ super(msg);
+ }
+}
38 core/src/main/java/org/dbpedia/spotlight/exceptions/InputException.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when the user provides unacceptable input (e.g. a text that is too short).
+ *
+ * @author maxjakob
+ */
+public class InputException extends AnnotationException {
+
+ public InputException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public InputException(String msg) {
+ super(msg);
+ }
+
+ public InputException(Exception e) {
+ super(e);
+ }
+
+}
35 core/src/main/java/org/dbpedia/spotlight/exceptions/ItemNotFoundException.java
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when a requested item cannot be found.
+ *
+ * @author pablomendes
+ */
+public class ItemNotFoundException extends Exception {
+
+ public ItemNotFoundException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public ItemNotFoundException(String msg) {
+ super(msg);
+ }
+}
38 core/src/main/java/org/dbpedia/spotlight/exceptions/OutputException.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when creating the output for the user fails.
+ *
+ * @author pablomendes
+ */
+public class OutputException extends AnnotationException {
+
+ public OutputException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public OutputException(String msg) {
+ super(msg);
+ }
+
+ public OutputException(Exception e) {
+ super(e);
+ }
+
+}
37 core/src/main/java/org/dbpedia/spotlight/exceptions/SearchException.java
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when an error in DBpedia Spotlight stems from searching or querying an index/database of DBpedia resources.
+ *
+ * @author pablomendes
+ */
+public class SearchException extends Exception {
+
+ public SearchException(Exception e) {
+ super(e);
+ }
+
+ public SearchException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public SearchException(String msg) {
+ super(msg);
+ }
+}
39 core/src/main/java/org/dbpedia/spotlight/exceptions/SparqlExecutionException.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when executing a SPARQL query fails.
+ * Should inform users appropriately, e.g. that their SPARQL syntax was wrong or that the server is down.
+ *
+ * @author pablomendes
+ */
+public class SparqlExecutionException extends AnnotationException {
+
+ public SparqlExecutionException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public SparqlExecutionException(String msg) {
+ super(msg);
+ }
+
+ public SparqlExecutionException(Exception e) {
+ super(e);
+ }
+
+}
39 core/src/main/java/org/dbpedia/spotlight/exceptions/SpottingException.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011 DBpedia Spotlight Development Team
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * Thrown when an error occurs during the spotting stage of DBpedia Spotlight.
+ *
+ * @author pablomendes
+ */
+public class SpottingException extends Exception {
+
+ public SpottingException(Exception e) {
+ super(e);
+ }
+
+ public SpottingException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public SpottingException(String msg) {
+ super(msg);
+ }
+}
34 core/src/main/java/org/dbpedia/spotlight/exceptions/TimeoutException.java
@@ -0,0 +1,34 @@
+/*
+ * *
+ * * Copyright 2011 Pablo Mendes, Max Jakob
+ * *
+ * * Licensed under the Apache License, Version 2.0 (the "License");
+ * * you may not use this file except in compliance with the License.
+ * * You may obtain a copy of the License at
+ * *
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing, software
+ * * distributed under the License is distributed on an "AS IS" BASIS,
+ * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * * See the License for the specific language governing permissions and
+ * * limitations under the License.
+ *
+ */
+
+package org.dbpedia.spotlight.exceptions;
+
+/**
+ * A SearchException thrown when a search is abandoned because it took too long.
+ * @author pablomendes
+ */
+public class TimeoutException extends SearchException {
+
+ public TimeoutException(String msg, Exception e) {
+ super(msg,e);
+ }
+
+ public TimeoutException(String msg) {
+ super(msg);
+ }
+}
198 core/src/main/java/org/dbpedia/spotlight/io/DataLoader.java
@@ -0,0 +1,198 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.io;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Loads (String, Double) pairs (e.g. URI prior scores) from files, using a pluggable LineParser
+ * such as {@link TSVParser} or {@link PigDumpParser}.
+ *
+ * @author pablomendes
+ */
+public class DataLoader {
+
+ Log LOG = LogFactory.getLog(this.getClass());
+ LineParser mParser;
+ File f;
+
+ public DataLoader(LineParser parser) {
+ this.mParser = parser;
+ }
+
+ public DataLoader(LineParser parser, File f) {
+ this.mParser = parser;
+ this.f = f;
+ }
+
+ public Map<String,Double> loadPriors() {
+ return loadPriors(f);
+ }
+
+ public Map<String,Double> loadPriors(InputStream in) throws IOException {
+
+ Map<String,Double> items = new HashMap<String,Double>();
+ Scanner scanner = new Scanner(new InputStreamReader(in, "UTF-8"));
+ int i = 0;
+ while (scanner.hasNext()) {
+ String line = scanner.nextLine();
+ mParser.add(line, items);
+ i++;
+ }
+ in.close(); // close only after all lines have been consumed
+ LOG.info("Done. Loaded "+items.size()+" items.");
+
+ return items;
+ }
+
+ public Map<String,Double> loadPriors(File f) {
+
+ LOG.info("Loading items from "+f.getPath());
+ Map<String,Double> items = new HashMap<String,Double>();
+
+ if (f.getName().length() != 0) {
+ try {
+ BufferedReader in = new BufferedReader(new FileReader(f)); //FastBufferedReader in = new FastBufferedReader(new FileReader(f));
+ String line; //MutableString line = new MutableString();
+ int i = 0;
+ while ((line = in.readLine()) != null) {
+
+ if (line==null || line.trim().equals(""))
+ continue;
+
+ mParser.add(line.toString(), items);
+ i++;
+ }
+ in.close();
+
+ } catch (IOException e) {e.printStackTrace();}
+ }
+ LOG.info("Done. Loaded "+items.size()+" items.");
+
+ return items;
+ }
+
+ abstract static class LineParser {
+ Log LOG = LogFactory.getLog(this.getClass());
+ public abstract void add(String line, Map<String,Double> items);
+ }
+
+ public static class TSVParser extends LineParser {
+ int key = 0;
+ int value = 1;
+ //public TSVParser(int key, int value) { this.key = key; this.value=value; }
+ @Override
+ public void add(String line, Map<String,Double> items) {
+ String[] elements = line.split("[\\t\\s]+");
+
+ try {
+ String uri = elements[key];
+ Double prior = new Double(elements[value]);
+ items.put(uri,prior);
+ } catch (IndexOutOfBoundsException e) {
+ LOG.error("Expecting tsv file with one (String,Double) per line. Strange line: " + line);
+ }
+ }
+ }
+
+ public static class PigDumpParser extends LineParser {
+
+ Writer mWriter;
+ long nOutputItems = 0;
+
+ public PigDumpParser() { }
+
+ public PigDumpParser(Writer writer) {
+ this.mWriter = writer;
+ }
+
+ /*
+ ((British India (band),1L,{(British_India_%28band%29)}),1L,{()})
+ ((British India Army,2L,{(British_India_Army),(British_India_Army)}),1L,{()})
+ */
+ @Override
+ public void add(String line, Map<String,Double> items) {
+ Pattern regex = Pattern.compile("\\((.+),(\\d+)L,.*");
+ Matcher matcher = regex.matcher(line.toString());
+ try {
+ StringBuilder sb = new StringBuilder();
+ if (matcher.find()) {
+ //String uri = matcher.group(1);
+ //Double prior = new Double(matcher.group(2));
+// LOG.trace(line);
+// LOG.trace(uri + "\t" + prior);
+
+ try {
+
+ if (mWriter!=null) {
+ sb.append(matcher.group(1)); // uri
+ sb.append("\t");
+ sb.append(matcher.group(2)); // prior
+ sb.append("\n");
+ mWriter.write(sb.toString());
+ nOutputItems++;
+ } else {
+ items.put(matcher.group(1),new Double(matcher.group(2)));
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ } else {
+ LOG.error("Cannot parse "+line.toString());
+ }
+ } catch(NumberFormatException e) {
+ LOG.error("Cannot parse "+line.toString()+" >>>> "+matcher.group(0)+" "+matcher.group(1));
+ e.printStackTrace();
+ } catch (IndexOutOfBoundsException e) {
+ LOG.error("Expecting PigDump file with one (String,Long) per line. Strange line: " + line);
+ }
+ }
+
+ public void close() throws IOException {
+ if (mWriter!=null)
+ mWriter.close();
+ LOG.info("Done. Output "+nOutputItems+" items.");
+ }
+ }
+
+ public static void main(String args[]) {
+ try {
+ File inputFile = new File("data/Distinct-uri-By-surfaceForm.grouped");
+ PigDumpParser parser = new PigDumpParser(new BufferedWriter(new FileWriter("data/Distinct-uri-By-surfaceForm.csv")));
+ DataLoader loader = new DataLoader(parser);
+ loader.loadPriors(inputFile);
+ parser.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+}
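
Editor's note: a short usage sketch for the TSV path (not part of this commit; the input file name is hypothetical). It loads per-URI prior scores from a tab-separated file of "uri<TAB>score" lines. The sketch is placed in the same package because LineParser is package-private.

    package org.dbpedia.spotlight.io;

    import java.io.File;
    import java.util.Map;

    public class LoadPriorsSketch {
        public static void main(String[] args) {
            // hypothetical input file: one "<uri>\t<score>" pair per line
            File priorsFile = new File("data/uri-priors.tsv");
            DataLoader loader = new DataLoader(new DataLoader.TSVParser(), priorsFile);
            Map<String, Double> priors = loader.loadPriors();
            System.out.println("Loaded " + priors.size() + " priors.");
        }
    }
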
321 core/src/main/java/org/dbpedia/spotlight/io/DatasetSplitter.java
@@ -0,0 +1,321 @@
+/**
+ * Copyright 2011 Pablo Mendes, Max Jakob
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.dbpedia.spotlight.io;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.store.FSDirectory;
+import org.dbpedia.spotlight.exceptions.SearchException;
+import org.dbpedia.spotlight.lucene.LuceneManager;
+import org.dbpedia.spotlight.lucene.search.LuceneCandidateSearcher;
+import org.dbpedia.spotlight.model.DBpediaResource;
+import org.dbpedia.spotlight.model.SurfaceForm;
+import org.semanticweb.yars.nx.Node;
+import org.semanticweb.yars.nx.parser.NxParser;
+import org.semanticweb.yars.nx.parser.ParseException;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Splits an occurrence dataset into training and test sets.
+ * See {@link BySize} and {@link BySurfaceForm} for the concrete splitting strategies.
+ *
+ * @author pablomendes
+ */
+public abstract class DatasetSplitter {
+
+ Log LOG = LogFactory.getLog(this.getClass());
+ int incrementalId = 0;
+
+ Writer mTrainingSetWriter;
+ Writer mTestSetWriter;
+
+ /**
+ * Abstract constructor. Please see {@link BySize} and {@link BySurfaceForm}.
+ * @param trainingSetFile file to which the training set will be written
+ * @param testSetFile file to which the test set will be written
+ * @throws IOException
+ */
+ public DatasetSplitter(File trainingSetFile, File testSetFile) throws IOException {
+ this.mTrainingSetWriter = new BufferedWriter(new FileWriter(trainingSetFile));
+ this.mTestSetWriter = new BufferedWriter(new FileWriter(testSetFile));
+ }
+
+ public abstract boolean shouldKeepTheseOccurrences(List<String> items);
+
+ public abstract void split(List<String> items) throws IOException;
+
+ //TODO Max: question: does this assume sorting by URI?
+ public void run(InputStream stream) throws IOException {
+ String currentItem = "";
+ List<String> items = new ArrayList<String>();
+ Scanner scanner = new Scanner(new InputStreamReader(stream, "UTF-8"));
+ int nItemsKept = 0;
+ while (scanner.hasNext()) {
+ String line = scanner.nextLine();
+ incrementalId++;
+
+ if (line==null || line.trim().equals(""))
+ continue;
+
+ String[] fields = line.split("\t");
+ String uri;
+ if (fields.length >= 5) {
+ uri = fields[0];
+ }
+ else {
+ uri = fields[1];
+ }
+// String surfaceForm = fields[1];
+// String context = fields[2];
+// String offset = fields[3];
+// String type = fields[4];
+
+ //Tuple5<String,String,String,String,String> t = new Tuple5<String,String,String,String,String>(surfaceForm, uri, context, offset, type);
+
+ if ( !uri.equals(currentItem)){
+
+ if (shouldKeepTheseOccurrences(items)) {
+ nItemsKept++;
+ LOG.trace("End of current item: "+currentItem+" / size: "+items.size()+" - saving!");
+ split(items);
+ } // else ignore
+ //reset current item
+ currentItem = uri;
+ items = new ArrayList<String>();
+ }
+ items.add(line.toString());
+
+ if (incrementalId % 50000 == 0)
+ LOG.info("Processed "+incrementalId+" occurrences. Kept occurrences for "+nItemsKept+" URIs.");
+ }
+ scanner.close();
+ LOG.info("Processed "+incrementalId+" occurrences. Kept occurrences for "+nItemsKept+" URIs");
+ }
+
+// public void run(File f) {
+// LOG.info("Loading occurrences from "+f.getPath());
+// String currentItem = "";
+// //Set<Tuple5> items = new HashSet<Tuple5>();
+// List<String> items = new ArrayList<String>();
+//
+// if (f.getName().length() != 0) {
+// try {
+// FastBufferedReader in = new FastBufferedReader(new FileReader(f));
+// MutableString line = new MutableString();
+// int i = 0;
+// while ((line = in.readLine(line)) != null) {
+// incrementalId++;
+//
+// if (line==null || line.trim().equals(""))
+// continue;
+//
+// String[] fields = line.toString().split("\t");
+//// String surfaceForm = fields[0];
+// String uri = fields[1];
+//// String context = fields[2];
+//// String offset = fields[3];
+//// String type = fields[4];
+//
+// //Tuple5<String,String,String,String,String> t = new Tuple5<String,String,String,String,String>(surfaceForm, uri, context, offset, type);
+//
+// if ( !uri.equals(currentItem)){
+// if (i >= mMinNumberOfExamples) {
+// uniformSplit(items);
+// } // else ignore
+// //reset current item
+// currentItem = uri;
+// items = new ArrayList<String>();
+// }
+// items.add(line.toString());
+// i++;
+// }
+// in.close();
+//
+// } catch (IOException e) {e.printStackTrace();}
+// }
+// LOG.info("Done. Loaded "+items.size()+" items.");
+//
+// }
+
+
+
+ public void write(int id, String item, Writer writer) throws IOException {
+ StringBuffer sb = new StringBuffer();
+ sb.append(id);
+ sb.append("\t");
+ sb.append(item);
+ sb.append("\n");
+ writer.write(sb.toString());
+ }
+
+ public static class BySize extends DatasetSplitter {
+
+ int mMinNumberOfExamples = 1;
+ double mPercentSplit = 0.5;
+
+ public BySize(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit) throws IOException {
+ super(trainingSetFile, testSetFile);
+ this.mMinNumberOfExamples = minNumberOfExamples;
+ this.mPercentSplit = percentSplit;
+ }
+
+ @Override
+ public boolean shouldKeepTheseOccurrences(List<String> items) {
+ return items.size() >= mMinNumberOfExamples;
+ }
+
+ @Override
+ public void split(List<String> items) throws IOException {
+ int i = incrementalId-items.size(); // set
+ int n = (new Double(items.size() * mPercentSplit)).intValue();
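+ // e.g. with 10 occurrences and mPercentSplit = 0.5: n = 5 and items.size()/n = 2,
+ // so every 2nd occurrence (i % 2 == 0) goes to the test set and the rest to training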
+ for (String item: items) {
+ if((n>0) && // only when there are enough items to divide between training and testing
+ (i % (items.size() / n) == 0)){ // e.g. for a 10% split, every 10th entry goes to the test set
+ LOG.trace("Writing to test: "+i+" "+items.size()+"/"+ n );
+ write(i, item, mTestSetWriter);
+ } else {
+ // For a 10% split, this writes to training 90% of the time, and also
+ // whenever there are not enough examples to split between training and testing.
+ // That should ensure that every sense appears in the training set.
+ LOG.trace("Writing to training: "+i);
+ write(i, item, mTrainingSetWriter);
+ }
+
+ i++;
+ }
+ }
+
+ }
+
+ public static class BySurfaceForm extends BySize {
+
+ Set<String> mValidSurfaceForms = new HashSet<String>();
+
+ public BySurfaceForm(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit, Set<String> validSurfaceForms) throws IOException {
+ super(trainingSetFile, testSetFile, minNumberOfExamples, percentSplit);
+ mValidSurfaceForms = validSurfaceForms;
+ LOG.info("Assuming "+validSurfaceForms.size()+" valid surface forms to acquire occurrence samples.");
+ }
+
+ @Override
+ public boolean shouldKeepTheseOccurrences(List<String> items) {
+ boolean shouldKeep = false;
+ for (String item: items) {
+ StringBuffer sf = new StringBuffer();
+ try {
+ String[] fields = item.split("\t");
+ if (fields.length >= 5) {
+ sf = sf.append(fields[2]);
+ }
+ else {
+ sf = sf.append(fields[1]);
+ }
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ LOG.debug("Error parsing line: "+item);
+ }
+ for (String validSf: mValidSurfaceForms) {
+ //if (sf.toString().toLowerCase().contains(validSf.toLowerCase())) { // relaxed
+ if (sf.toString().toLowerCase().equals(validSf.toLowerCase())) { // strict
+ shouldKeep = true;
+ LOG.trace("Kept:"+sf+" because it matches "+validSf);
+ break;
+ }
+ }
+ }
+ return shouldKeep;
+ }
+
+ }
+
+
+ /**
+ * TODO created by Max: this function allows a single call to create "confusable-with" sets.
+ * For a given type, goes through the data set that keeps the types for each resource.
+ * If the type matches, looks in the surrogate index for this URI (the opposite direction from the usual lookup)
+ * for all surface forms that can relate to this URI.
+ * Returns all surface forms found this way.
+ */
+ public static Set<String> getConfusableSurfaceForms(String targetType, File instancesFile, LuceneCandidateSearcher surrogateSearcher) throws IOException, ParseException {
+ System.err.println("Getting all surface forms for "+targetType+"s...");
+ Set<String> surfaceForms = new HashSet<String>();
+ if (!targetType.startsWith("http://dbpedia.org/ontology/"))
+ targetType = "http://dbpedia.org/ontology/"+ targetType;
+ NxParser parser = new NxParser(new FileInputStream(instancesFile));
+ while (parser.hasNext()) {
+ Node[] triple = parser.next();
+ if (triple[2].toString().equals(targetType)) {
+ String targetUri = triple[0].toString().replace("http://dbpedia.org/resource/", "");
+ try {
+ Set<SurfaceForm> surfaceFormsForURI = surrogateSearcher.getSurfaceForms(new DBpediaResource(targetUri));
+ for (SurfaceForm sf : surfaceFormsForURI) {
+ surfaceForms.add(sf.name());
+ }
+ }
+ catch (SearchException e) {
+ System.err.println("URI "+targetUri+" not found in surrogate index. Skipping.");
+ }
+ }
+ }
+
+ return surfaceForms;
+ }
+
+
+
+ //TODO Make this guy parameterizable from command line.
+ public static void main(String[] args) throws IOException, ParseException {
+ /**
+ * Splits the dataset into training and test sets.
+ * percentageSplit indicates how much to reserve for testing;
+ * minSize is the minimum number of occurrences a URI must have to be included in training/testing.
+ */
+ int minSize = 2;
+ double percentageSplit = 0.5;
+ String targetType = "Actor"; //"Person"; //Place //Organisation
+ /*
+ Here I'm using wikipediaOccurrences.ambiguous.tsv.gz.
+ Be careful here: do not use the "withtype" file, because it is a join with the types,
+ so for URIs that have multiple types the same entry is repeated multiple times.
+ */
+
+ System.err.println("Making confusable with "+targetType+" data sets.");
+
+ File inputFile = new File("data/WikipediaOccurrences-IDs-clean_enwiki-20100312.uriSorted.tsv");
+ File trainingFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTraining."+(new Double((1-percentageSplit)*100)).intValue()+"."+targetType+".amb.tsv");
+ File testFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTest."+(new Double(percentageSplit*100)).intValue()+"."+targetType+".amb.tsv");