This repository has been archived by the owner on Oct 20, 2018. It is now read-only.
/
WikipediaToDBpediaClosure.scala
117 lines (99 loc) · 3.99 KB
/
WikipediaToDBpediaClosure.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package org.dbpedia.spotlight.db
import org.semanticweb.yars.nx.parser.NxParser
import java.io.InputStream
import org.dbpedia.spotlight.log.SpotlightLog
import collection.immutable.ListSet
import scala.Predef._
import org.dbpedia.spotlight.exceptions.NotADBpediaResourceException
import java.net.URLDecoder
import org.dbpedia.spotlight.model.SpotlightConfiguration
import org.dbpedia.extraction.util.WikiUtil
import scala.collection.mutable.ListBuffer
/**
 * Maps Wikipedia/DBpedia URLs (or bare page names) to the name at the end of
 * their redirect chain, rejecting names that are listed as disambiguation pages.
 *
 * Both triple streams are fully consumed while the instance is constructed.
 * The streams are not closed here; they remain owned by the caller.
 *
 * Parts of this are taken from
 * org.dbpedia.spotlight.util.ExtractCandidateMap
 *
 * @param namespace             prefix that is stripped from the subject/object of every triple
 * @param redirectsTriples      N-Triples stream whose subject redirects to its object
 * @param disambiguationTriples N-Triples stream whose subjects are disambiguation pages
 *
 * @author Joachim Daiber
 * @author maxjakob
 * @author pablomendes
 */
class WikipediaToDBpediaClosure(
  val namespace: String,
  val redirectsTriples: InputStream,
  val disambiguationTriples: InputStream
) {

  /** Builds the closure using `SpotlightConfiguration.DEFAULT_NAMESPACE` as the namespace. */
  def this(redirectsTriples: InputStream, disambiguationTriples: InputStream) =
    this(SpotlightConfiguration.DEFAULT_NAMESPACE, redirectsTriples, disambiguationTriples)

  // NOTE(review): URLDecoder implements form decoding, so a literal '+' is turned
  // into a space (e.g. "C++"). Kept as-is because the maps below are built with
  // the same function; confirm whether callers rely on this.
  private def decodeURL(uri: String) = URLDecoder.decode(uri, "utf-8")

  /** Removes the namespace prefix from a triple node and URL-decodes the remainder. */
  private def localName(node: String): String = decodeURL(node.replace(namespace, ""))

  SpotlightLog.info(this.getClass, "Loading redirects...")
  val redParser = new NxParser(redirectsTriples)

  /** Redirect source name -> redirect target name; a later triple wins over an earlier one. */
  var linkMap: Map[String, String] = {
    val redirects = Map.newBuilder[String, String]
    while (redParser.hasNext) {
      val triple = redParser.next()
      redirects += (localName(triple(0).toString) -> localName(triple(2).toString))
    }
    redirects.result()
  }
  SpotlightLog.info(this.getClass, "Done.")

  SpotlightLog.info(this.getClass, "Loading disambiguations...")
  val disParser = new NxParser(disambiguationTriples)

  /** Names of all subjects found in the disambiguation triples. */
  var disambiguationsSet: Set[String] = {
    val disambiguations = Set.newBuilder[String]
    while (disParser.hasNext) {
      val triple = disParser.next()
      disambiguations += localName(triple(0).toString)
    }
    disambiguations.result()
  }
  SpotlightLog.info(this.getClass, "Done.")

  // Both http and https are accepted; the capture groups are unchanged.
  val WikiURL = """https?://([a-z]+)[.]wikipedia[.]org/wiki/(.*)$""".r
  val DBpediaURL = """https?://([a-z]+)[.]dbpedia[.]org/resource/(.*)$""".r
  val DBpediaENURL = """https?://dbpedia[.]org/resource/(.*)$""".r

  /**
   * Drops the anchor from a page title. An encoded anchor ("%23") takes
   * precedence over a literal "#"; in both cases the cut is made at the
   * last occurrence.
   */
  private def cutOffBeforeAnchor(url: String): String = {
    val encodedAnchor = url.lastIndexOf("%23")
    if (encodedAnchor >= 0) {
      url.take(encodedAnchor)
    } else {
      val plainAnchor = url.lastIndexOf("#")
      if (plainAnchor >= 0) url.take(plainAnchor) else url
    }
  }

  /** Strips every leading '/' from the given string. */
  private def removeLeadingSlashes(url: String): String = url.dropWhile(_ == '/')

  /** Anchor-free, slash-trimmed and URL-decoded form of a captured page title. */
  private def cleanTitle(title: String): String =
    decodeURL(removeLeadingSlashes(cutOffBeforeAnchor(title)))

  /**
   * Use only the part before the anchor, ensure the URL encoding is correct.
   *
   * @param url the full DBpedia or Wikipedia URL
   * @return the decoded page name (wiki-encoded for Wikipedia URLs)
   * @throws NotADBpediaResourceException if the URL matches none of the known patterns
   */
  private def decodedNameFromURL(url: String): String = url match {
    case WikiURL(_, title)    => WikiUtil.wikiEncode(cleanTitle(title))
    case DBpediaURL(_, title) => cleanTitle(title)
    case DBpediaENURL(title)  => cleanTitle(title)
    case _ =>
      // Log before throwing: anything placed after the throw would never run.
      SpotlightLog.error(this.getClass, "Invalid Wikipedia URL %s", url)
      throw new NotADBpediaResourceException("Not a Wikipedia or DBpedia URL: " + url)
  }

  /**
   * Get the end of the redirect chain for the URL or name (last part of the URL).
   *
   * @param url either full Wiki or DBpedia URL or only the name
   * @return the name at the end of the redirect chain
   * @throws NotADBpediaResourceException if the URL is not recognised or the
   *                                      resolved name is a disambiguation page
   */
  def wikipediaToDBpediaURI(url: String): String = {
    val isFullUrl = url.startsWith("http:") || url.startsWith("https:")
    val name = if (isFullUrl) decodedNameFromURL(url) else decodeURL(url)
    val uri = getEndOfChainURI(name)

    if (uri == null || disambiguationsSet.contains(uri))
      throw new NotADBpediaResourceException("Resource is a disambiguation page.")
    else
      uri
  }

  /**
   * Follows redirects starting at `uri` until a name without a redirect is reached.
   *
   * @param uri the name to start from
   * @return the last name reached; on a redirect cycle, the name reached just
   *         before the cycle would repeat
   */
  def getEndOfChainURI(uri: String): String = getEndOfChainURI(uri, Set(uri))

  // `alreadyTraversed` holds every name visited so far, which stops redirect cycles.
  @scala.annotation.tailrec
  private def getEndOfChainURI(uri: String, alreadyTraversed: Set[String]): String =
    linkMap.get(uri) match {
      case Some(target) =>
        if (alreadyTraversed.contains(target)) uri
        else getEndOfChainURI(target, alreadyTraversed + target)
      case None => uri
    }
}