package org.dbpedia.spotlight.db
import java.io.InputStream
import java.net.URLDecoder

import collection.immutable.ListSet
import scala.Predef._
import scala.collection.mutable.ListBuffer

import org.dbpedia.extraction.util.WikiUtil
import org.dbpedia.spotlight.exceptions.NotADBpediaResourceException
import org.dbpedia.spotlight.log.SpotlightLog
import org.dbpedia.spotlight.model.SpotlightConfiguration
import org.semanticweb.yars.nx.parser.NxParser
/**
 * Maps Wikipedia/DBpedia URLs and resource names to the resource name at the end of
 * their redirect chain, rejecting disambiguation pages.
 *
 * Both triple streams are consumed eagerly while the instance is constructed: the
 * redirects stream fills `linkMap` (subject -> object) and the disambiguations stream
 * fills `disambiguationsSet` (subjects only). For every stored node the namespace
 * prefix is removed and the remainder is URL-decoded as UTF-8.
 *
 * Parts of this are taken from
 * org.dbpedia.spotlight.util.ExtractCandidateMap
 *
 * @param namespace             prefix removed from triple nodes before they are stored
 * @param redirectsTriples      stream of redirect triples, read with NxParser
 * @param disambiguationTriples stream of disambiguation triples, read with NxParser
 *
 * @author Joachim Daiber
 * @author maxjakob
 * @author pablomendes
 */
class WikipediaToDBpediaClosure(
  val namespace: String,
  val redirectsTriples: InputStream,
  val disambiguationTriples: InputStream
) {

  /** Builds the closure using `SpotlightConfiguration.DEFAULT_NAMESPACE` as namespace. */
  def this(redirectsTriples: InputStream, disambiguationTriples: InputStream) =
    this(SpotlightConfiguration.DEFAULT_NAMESPACE, redirectsTriples, disambiguationTriples)

  /** URL-decodes `uri` using UTF-8. */
  private def decodeURL(uri: String): String = URLDecoder.decode(uri, "utf-8")

  /** Strips the namespace from a triple node's string form and URL-decodes the rest. */
  private def resourceName(node: Any): String =
    decodeURL(node.toString.replace(namespace, ""))

  SpotlightLog.info(this.getClass, "Loading redirects...")
  /** Redirect source name -> redirect target name. */
  var linkMap: Map[String, String] = Map.empty
  val redParser = new NxParser(redirectsTriples)
  while (redParser.hasNext) {
    val triple = redParser.next()
    // Subject is decoded before object, matching the order of the triple.
    val subj = resourceName(triple(0))
    val obj = resourceName(triple(2))
    linkMap = linkMap.updated(subj, obj)
  }
  SpotlightLog.info(this.getClass, "Done.")

  SpotlightLog.info(this.getClass, "Loading disambiguations...")
  /** Names of all resources that are disambiguation pages. */
  var disambiguationsSet: Set[String] = Set.empty
  val disParser = new NxParser(disambiguationTriples)
  while (disParser.hasNext) {
    val triple = disParser.next()
    disambiguationsSet = disambiguationsSet + resourceName(triple(0))
  }
  SpotlightLog.info(this.getClass, "Done.")

  /** Matches a language-specific Wikipedia article URL; groups: language, title. */
  val WikiURL = """http://([a-z]+)[.]wikipedia[.]org/wiki/(.*)$""".r
  /** Matches a language-specific DBpedia resource URL; groups: language, title. */
  val DBpediaURL = """http://([a-z]+)[.]dbpedia[.]org/resource/(.*)$""".r
  /** Matches a DBpedia resource URL without language subdomain; group: title. */
  val DBpediaENURL = """http://dbpedia[.]org/resource/(.*)$""".r

  /**
   * Takes only the part of the URI before the last anchor. A URL-encoded anchor
   * ("%23") takes precedence over a literal "#"; without any anchor the input is
   * returned unchanged.
   */
  private def cutOffBeforeAnchor(url: String): String = {
    val encodedAnchor = url.lastIndexOf("%23")
    if (encodedAnchor >= 0) url.substring(0, encodedAnchor)
    else {
      val plainAnchor = url.lastIndexOf("#")
      if (plainAnchor >= 0) url.substring(0, plainAnchor) else url
    }
  }

  /** Removes every leading "/" from `url`. */
  private def removeLeadingSlashes(url: String): String = url.dropWhile(_ == '/')

  /**
   * Use only the part before the anchor, ensure the URL encoding is correct.
   *
   * @param url the full DBpedia or Wikipedia URL
   * @return the decoded resource name; Wikipedia titles are additionally passed
   *         through `WikiUtil.wikiEncode`
   * @throws NotADBpediaResourceException if `url` matches none of the known patterns
   */
  private def decodedNameFromURL(url: String): String = url match {
    case WikiURL(_, title) =>
      WikiUtil.wikiEncode(decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title))))
    case DBpediaURL(_, title) =>
      decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title)))
    case DBpediaENURL(title) =>
      decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title)))
    case _ =>
      // Log before throwing: anything placed after the throw would never run.
      SpotlightLog.error(this.getClass, "Invalid Wikipedia URL %s", url)
      throw new NotADBpediaResourceException("Invalid Wikipedia or DBpedia URL: " + url)
  }

  /**
   * Get the end of the redirect chain for the URL or name (last part of the URL).
   *
   * @param url either full Wiki or DBpedia URL or only the name
   * @return the name at the end of the redirect chain
   * @throws NotADBpediaResourceException if the resolved name is a disambiguation
   *                                      page (or null), or if a full URL is not a
   *                                      recognised Wikipedia/DBpedia URL
   */
  def wikipediaToDBpediaURI(url: String): String = {
    val uri =
      if (url.startsWith("http:")) getEndOfChainURI(decodedNameFromURL(url))
      else getEndOfChainURI(decodeURL(url))

    if (uri == null || disambiguationsSet.contains(uri))
      throw new NotADBpediaResourceException("Resource is a disambiguation page.")
    else
      uri
  }

  /**
   * Follows redirects in `linkMap` starting at `uri`.
   *
   * @param uri the resource name to start from
   * @return the last name reached; `uri` itself when it has no redirect
   */
  def getEndOfChainURI(uri: String): String = getEndOfChainURI(uri, Set(uri))

  /**
   * Follows redirects while remembering visited names, so that a redirect cycle
   * stops at the last name reached before a repeat instead of looping forever.
   */
  @scala.annotation.tailrec
  private def getEndOfChainURI(uri: String, alreadyTraversed: Set[String]): String =
    linkMap.get(uri) match {
      case Some(target) =>
        if (alreadyTraversed.contains(target)) uri
        else getEndOfChainURI(target, alreadyTraversed + target)
      case None => uri
    }
}