This repository has been archived by the owner on Oct 20, 2018. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
dbpedia-spotlight/index/src/main/scala/org/dbpedia/spotlight/db/WikipediaToDBpediaClosure.scala
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
117 lines (99 sloc)
3.99 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.dbpedia.spotlight.db | |
import org.semanticweb.yars.nx.parser.NxParser | |
import java.io.InputStream | |
import org.dbpedia.spotlight.log.SpotlightLog | |
import collection.immutable.ListSet | |
import scala.Predef._ | |
import org.dbpedia.spotlight.exceptions.NotADBpediaResourceException | |
import java.net.URLDecoder | |
import org.dbpedia.spotlight.model.SpotlightConfiguration | |
import org.dbpedia.extraction.util.WikiUtil | |
import scala.collection.mutable.ListBuffer | |
/** | |
* Parts of this are taken from | |
* org.dbpedia.spotlight.util.ExtractCandidateMap | |
* | |
* @author Joachim Daiber | |
* @author maxjakob | |
* @author pablomendes | |
*/ | |
class WikipediaToDBpediaClosure ( | |
val namespace: String, | |
val redirectsTriples: InputStream, | |
val disambiguationTriples: InputStream | |
) { | |
def this(redirectsTriples: InputStream, disambiguationTriples: InputStream) { | |
this(SpotlightConfiguration.DEFAULT_NAMESPACE, redirectsTriples, disambiguationTriples) | |
} | |
private def decodeURL(uri: String) = URLDecoder.decode(uri,"utf-8") | |
SpotlightLog.info(this.getClass, "Loading redirects...") | |
var linkMap = Map[String, String]() | |
val redParser = new NxParser(redirectsTriples) | |
while (redParser.hasNext) { | |
val triple = redParser.next | |
val subj = decodeURL(triple(0).toString.replace(namespace, "")) | |
val obj = decodeURL(triple(2).toString.replace(namespace, "")) | |
linkMap = linkMap.updated(subj, obj) | |
} | |
SpotlightLog.info(this.getClass, "Done.") | |
SpotlightLog.info(this.getClass, "Loading disambiguations...") | |
var disambiguationsSet = Set[String]() | |
val disParser = new NxParser(disambiguationTriples) | |
while (disParser.hasNext) { | |
val triple = disParser.next | |
val subj = decodeURL(triple(0).toString.replace(namespace, "")) | |
disambiguationsSet = disambiguationsSet + subj | |
} | |
SpotlightLog.info(this.getClass, "Done.") | |
val WikiURL = """http://([a-z]+)[.]wikipedia[.]org/wiki/(.*)$""".r | |
val DBpediaURL = """http://([a-z]+)[.]dbpedia[.]org/resource/(.*)$""".r | |
val DBpediaENURL = """http://dbpedia[.]org/resource/(.*)$""".r | |
private def cutOffBeforeAnchor(url: String): String = { | |
if(url.contains("%23")) //Take only the part of the URI before the last anchor (#) | |
url.take(url.lastIndexOf("%23")) | |
else if(url.contains("#")) | |
url.take(url.lastIndexOf("#")) | |
else | |
url | |
} | |
private def removeLeadingSlashes(url: String): String = { | |
url match { | |
case t: String if url.startsWith("/") => removeLeadingSlashes(t.tail) | |
case t: String => url | |
} | |
} | |
/** | |
* Use only the part before the anchor, ensure the URL encoding is correct. | |
* | |
* @param url the full DBpedia or Wikipedia URL | |
* @return | |
*/ | |
private def decodedNameFromURL(url: String): String = url match { | |
case WikiURL(language, title) => WikiUtil.wikiEncode(decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title)))) | |
case DBpediaURL(language, title) => decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title))) | |
case DBpediaENURL(title) => decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title))) | |
case _ => throw new NotADBpediaResourceException("Resource is a disambiguation page."); SpotlightLog.error(this.getClass, "Invalid Wikipedia URL %s", url); null | |
} | |
/** | |
* Get the end of the redirect chain for the URL or name (last part of the URL). | |
* | |
* @param url either full Wiki or DBpedia URL or only the name | |
* @return | |
*/ | |
def wikipediaToDBpediaURI(url: String): String = { | |
val uri = if(url.startsWith("http:")) { | |
getEndOfChainURI(decodedNameFromURL(url)) | |
} else { | |
getEndOfChainURI(decodeURL(url)) | |
} | |
if (disambiguationsSet.contains(uri) || uri == null) | |
throw new NotADBpediaResourceException("Resource is a disambiguation page.") | |
else | |
uri | |
} | |
def getEndOfChainURI(uri: String): String = getEndOfChainURI(uri, Set(uri)) | |
private def getEndOfChainURI(uri: String, alreadyTraversed:Set[String]): String = linkMap.get(uri) match { | |
case Some(s: String) => if (alreadyTraversed.contains(s)) uri else getEndOfChainURI(s, alreadyTraversed + s) | |
case None => uri | |
} | |
} |