Skip to content
This repository has been archived by the owner on Oct 20, 2018. It is now read-only.
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Cannot retrieve contributors at this time
package org.dbpedia.spotlight.db
import java.io.InputStream
import java.net.URLDecoder

import collection.immutable.ListSet
import scala.Predef._
import scala.collection.mutable.ListBuffer

import org.dbpedia.extraction.util.WikiUtil
import org.dbpedia.spotlight.exceptions.NotADBpediaResourceException
import org.dbpedia.spotlight.log.SpotlightLog
import org.dbpedia.spotlight.model.SpotlightConfiguration
import org.semanticweb.yars.nx.parser.NxParser
/**
 * Parts of this are taken from
 * org.dbpedia.spotlight.util.ExtractCandidateMap
 *
 * @author Joachim Daiber
 * @author maxjakob
 * @author pablomendes
 */
// Resolves Wikipedia/DBpedia URLs or resource names to the end of their redirect chain,
// using redirect and disambiguation triples that are loaded when the instance is constructed.
class WikipediaToDBpediaClosure (
// Namespace prefix; it is removed from subject/object URIs while the triples are loaded.
val namespace: String,
// Stream of redirect triples, parsed with NxParser; each (subject, object) pair is stored in linkMap.
val redirectsTriples: InputStream,
// Stream of disambiguation triples, parsed with NxParser; each subject is stored in disambiguationsSet.
val disambiguationTriples: InputStream
) {
/**
 * Creates a closure that uses [[SpotlightConfiguration.DEFAULT_NAMESPACE]] as the namespace.
 *
 * @param redirectsTriples      stream of redirect triples
 * @param disambiguationTriples stream of disambiguation triples
 */
def this(redirectsTriples: InputStream, disambiguationTriples: InputStream) {
  this(SpotlightConfiguration.DEFAULT_NAMESPACE, redirectsTriples, disambiguationTriples)
}
/**
 * Percent-decodes a string using UTF-8.
 *
 * @param uri the encoded URI or resource name
 * @return the decoded text
 */
private def decodeURL(uri: String): String = URLDecoder.decode(uri, "utf-8")

SpotlightLog.info(this.getClass, "Loading redirects...")
// Redirect map: decoded subject name -> decoded object name of every redirect triple,
// with the namespace prefix removed from both sides.
var linkMap = Map[String, String]()
val redParser = new NxParser(redirectsTriples)
while (redParser.hasNext) {
  // Each parsed statement is indexed as (0) subject, (1) predicate, (2) object.
  val triple = redParser.next()
  val subj = decodeURL(triple(0).toString.replace(namespace, ""))
  val obj = decodeURL(triple(2).toString.replace(namespace, ""))
  linkMap = linkMap.updated(subj, obj)
}
SpotlightLog.info(this.getClass, "Done.")

SpotlightLog.info(this.getClass, "Loading disambiguations...")
// Decoded names (namespace prefix removed) of every subject in the disambiguation triples.
var disambiguationsSet = Set[String]()
val disParser = new NxParser(disambiguationTriples)
while (disParser.hasNext) {
  // Only the subject (index 0) of each statement is of interest here.
  val triple = disParser.next()
  val subj = decodeURL(triple(0).toString.replace(namespace, ""))
  disambiguationsSet = disambiguationsSet + subj
}
SpotlightLog.info(this.getClass, "Done.")
/** Wikipedia article URL with a language sub-domain; groups: (language, title). */
val WikiURL: scala.util.matching.Regex =
  """http://([a-z]+)[.]wikipedia[.]org/wiki/(.*)$""".r

/** DBpedia resource URL with a language sub-domain; groups: (language, title). */
val DBpediaURL: scala.util.matching.Regex =
  """http://([a-z]+)[.]dbpedia[.]org/resource/(.*)$""".r

/** DBpedia resource URL without a language sub-domain; group: (title). */
val DBpediaENURL: scala.util.matching.Regex =
  """http://dbpedia[.]org/resource/(.*)$""".r
/**
 * Drops the anchor part of a URL or page title.
 *
 * The percent-encoded anchor marker ("%23") is checked before the literal "#". In either case
 * everything from the last occurrence of the marker onwards is removed.
 *
 * @param url URL or title that may carry an anchor
 * @return the text before the last anchor marker, or the input unchanged if it has none
 */
private def cutOffBeforeAnchor(url: String): String =
  if (url.contains("%23")) // Take only the part of the URI before the last anchor (#)
    url.substring(0, url.lastIndexOf("%23"))
  else if (url.contains("#"))
    url.substring(0, url.lastIndexOf("#"))
  else
    url
/**
 * Strips every leading '/' from the given text.
 *
 * @param url URL fragment or title
 * @return the input without its leading slashes (empty if it consisted of slashes only)
 */
private def removeLeadingSlashes(url: String): String = url.dropWhile(_ == '/')
/**
 * Use only the part before the anchor, ensure the URL encoding is correct.
 *
 * Titles taken from Wikipedia URLs are additionally passed through `WikiUtil.wikiEncode`.
 *
 * @param url the full DBpedia or Wikipedia URL
 * @return the decoded resource name taken from the URL
 * @throws NotADBpediaResourceException if the URL matches none of the known URL patterns
 */
private def decodedNameFromURL(url: String): String = {
  // Shared clean-up: drop the anchor, then leading slashes, then percent-decode.
  def cleanTitle(title: String): String =
    decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title)))

  url match {
    case WikiURL(_, title) => WikiUtil.wikiEncode(cleanTitle(title))
    case DBpediaURL(_, title) => cleanTitle(title)
    case DBpediaENURL(title) => cleanTitle(title)
    case _ =>
      // Log before throwing: the log call used to sit after the throw and was unreachable.
      SpotlightLog.error(this.getClass, "Invalid Wikipedia URL %s", url)
      throw new NotADBpediaResourceException("Invalid Wikipedia or DBpedia URL: " + url)
  }
}
/**
 * Get the end of the redirect chain for the URL or name (last part of the URL).
 *
 * @param url either full Wiki or DBpedia URL or only the name
 * @return
 */
// Resolves a URL or resource name and rejects disambiguation pages.
def wikipediaToDBpediaURI(url: String): String = {
// Inputs starting with "http:" take a different branch from bare names.
// NOTE(review): neither branch has a body in this copy of the file, so `uri` gets no usable
// value — presumably the redirect-chain lookup belongs here; restore it from upstream.
val uri = if(url.startsWith("http:")) {
} else {
// Names found in disambiguationsSet, and null results, are rejected with an exception.
// NOTE(review): the closing brace of the else branch and the value returned on success are
// not present in this copy — confirm against upstream before relying on this method.
if (disambiguationsSet.contains(uri) || uri == null)
throw new NotADBpediaResourceException("Resource is a disambiguation page.")
/**
 * Follows the redirects recorded in linkMap, starting at the given name.
 *
 * @param uri name to start from
 * @return the last name reached; the input itself if it has no redirect
 */
def getEndOfChainURI(uri: String): String = getEndOfChainURI(uri, Set(uri))

/**
 * Redirect traversal with cycle protection.
 *
 * @param uri              name currently being looked at
 * @param alreadyTraversed names visited so far, including the starting name
 * @return `uri` if it has no redirect or its redirect target was already visited,
 *         otherwise the end of the chain reached through that target
 */
private def getEndOfChainURI(uri: String, alreadyTraversed: Set[String]): String =
  // A target that was already visited is treated like "no redirect", which stops cycles.
  linkMap.get(uri).filterNot(alreadyTraversed.contains) match {
    case Some(target) => getEndOfChainURI(target, alreadyTraversed + target)
    case None => uri
  }