/
MissingAbstractsExtractor.scala
402 lines (335 loc) · 13.3 KB
/
MissingAbstractsExtractor.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
package org.dbpedia.extraction.mappings
import scala.collection.mutable
import scala.xml.XML
import scala.io.Source
import scala.language.reflectiveCalls
import java.io._
import java.net.{URLEncoder, URL}
import java.util.logging.{Logger, Level}
import org.dbpedia.extraction.destinations.{DBpediaDatasets,Quad,QuadBuilder}
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}
import org.dbpedia.util.text.ParseExceptionIgnorer
/**
* Extracts abstracts for pages whose abstracts have not been extracted yet. A page is a candidate for
* extraction if its subject URI is not contained in the list of existing abstracts.
*
* From now on we use MobileFrontend for MW <2.21 and TextExtracts for MW > 2.22.
* The patched MW instance is no longer needed, except for minor customizations in LocalSettings.php.
* TODO: we need to adapt the TextExtracts extension to accept custom wikicode syntax.
* TextExtracts now uses the article entry and extracts the abstract. The rationale for
* the new extension is that we will not need to load all articles into MySQL, just the templates.
* At the moment, setting up the patched MW takes longer than loading all articles into MySQL :)
* so even this way it's way better and cleaner ;)
* We leave the old code commented out since we might re-use it soon.
*/
class MissingAbstractsExtractor(
context : {
def ontology : Ontology
def language : Language
}
)
extends PageNodeExtractor
{
//TODO make this configurable
/** URL of the MediaWiki API endpoint that the abstract requests are sent to */
protected def apiUrl: String = "http://localhost/mediawiki/api.php"
/** maximum number of request attempts per page before retrievePage gives up */
private val maxRetries = 3
/** timeout for connection to web server, milliseconds */
private val connectMs = 2000
/** timeout for result from web server, milliseconds */
private val readMs = 8000
/** sleep between retries, milliseconds, multiplied by the system load average per available processor */
private val sleepFactorMs = 4000
/** wiki code of the extraction language, sent to the API as the uselang parameter */
private val language = context.language.wikiCode
// Named after this class (the original used AbstractExtractor here) so that log records are attributed
// to the extractor that actually produced them, matching the logger of the companion object.
private val logger = Logger.getLogger(classOf[MissingAbstractsExtractor].getName)
//private val apiParametersFormat = "uselang="+language+"&format=xml&action=parse&prop=text&title=%s&text=%s"
/** request body template for the API call; %s is replaced by the escaped page title */
private val apiParametersFormat = "uselang="+language+"&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s"
// lazy so testing does not need ontology
private lazy val shortProperty = context.ontology.properties("rdfs:comment")
// lazy so testing does not need ontology
private lazy val longProperty = context.ontology.properties("abstract")
// Quad factories for the long and the short abstract; lazy because they depend on the lazy properties above
private lazy val longQuad = QuadBuilder(context.language, DBpediaDatasets.MissingLongAbstracts, longProperty, null) _
private lazy val shortQuad = QuadBuilder(context.language, DBpediaDatasets.MissingShortAbstracts, shortProperty, null) _
/** the datasets this extractor writes quads to */
override val datasets = Set(DBpediaDatasets.MissingLongAbstracts, DBpediaDatasets.MissingShortAbstracts)
// Used by retrievePage to scale the pause between retries with the current system load
private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean()
private val availableProcessors = osBean.getAvailableProcessors()
/**
 * Produces the long and short abstract quads for a page that has no abstract yet.
 *
 * Nothing is extracted when the subject already has an abstract, when the page is outside the
 * Main namespace, when it is a redirect or disambiguation page, or when the retrieved abstract is blank.
 *
 * @param pageNode the parsed page
 * @param subjectUri the URI used as subject of the generated quads
 * @param pageContext the page context (not used by this extractor)
 * @return the long abstract quad, followed by the short abstract quad unless the short text is empty
 */
override def extract(pageNode : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
{
  // The checks short-circuit in this order: known abstract, namespace, redirect / disambiguation.
  val skipPage =
    MissingAbstractsExtractor.existingAbstracts(subjectUri) ||
    pageNode.title.namespace != Namespace.Main ||
    pageNode.isRedirect ||
    pageNode.isDisambiguation

  if (skipPage) {
    Seq.empty
  }
  else {
    println(s"Detected missing abstract for '$subjectUri'")

    //Reproduce wiki text for abstract
    //val abstractWikiText = getAbstractWikiText(pageNode)
    // if(abstractWikiText == "") return Seq.empty

    // Fetch the rendered abstract and apply the title fix-up
    val rawText = retrievePage(pageNode.title /*, abstractWikiText*/)
    val abstractText = postProcess(pageNode.title, rawText)

    if (abstractText.trim.isEmpty) {
      logger.info(s"Empty abstract for subject $subjectUri")
      Seq.empty
    }
    else {
      val abbreviated = short(abstractText)

      // Both statements are built before deciding which ones to emit
      val longStatement = longQuad(subjectUri, abstractText, pageNode.sourceUri)
      val shortStatement = shortQuad(subjectUri, abbreviated, pageNode.sourceUri)

      if (abbreviated.isEmpty) Seq(longStatement) else Seq(longStatement, shortStatement)
    }
  }
}
/**
 * Retrieves the abstract of a Wikipedia page from the MediaWiki API at `apiUrl`.
 *
 * The request is attempted up to `maxRetries` times. After a failed attempt the method sleeps
 * before trying again; the pause grows with the system load so that a busy web server is not
 * flooded with further requests.
 *
 * @param pageTitle The title of the page; its encoded form (with namespace) is sent to the API
 * @return The abstract text read from the API response
 * @throws Exception if none of the attempts succeeded
 */
def retrievePage(pageTitle : WikiTitle/*, pageWikiText : String*/) : String =
{
  // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
  // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
  // TODO: there may be other characters that need to be escaped.
  // NOTE(review): the companion object of this class declares its own CHARACTERS_TO_ESCAPE, which is
  // not used here - confirm whether the list of AbstractExtractor or the local one is intended.
  val titleParam = AbstractExtractor.CHARACTERS_TO_ESCAPE.foldLeft(pageTitle.encodedWithNamespace) {
    case (escaped, (search, replacement)) => escaped.replace(search, replacement)
  }

  // Fill parameters
  val parameters = apiParametersFormat.format(titleParam/*, URLEncoder.encode(pageWikiText, "UTF-8")*/)

  val url = new URL(apiUrl)

  // A plain loop instead of 'return' inside a for-comprehension, which would be a non-local return.
  var result: Option[String] = None
  var counter = 1
  while (result.isEmpty && counter <= maxRetries)
  {
    try
    {
      // Send data
      val conn = url.openConnection
      conn.setDoOutput(true)
      conn.setConnectTimeout(connectMs)
      conn.setReadTimeout(readMs)
      // Explicit charset: the answer is read as UTF-8, so the request must not depend on the platform default.
      val writer = new OutputStreamWriter(conn.getOutputStream, "UTF-8")
      try {
        writer.write(parameters)
        writer.flush()
      }
      finally {
        writer.close()
      }

      // Read answer; the stream is closed even if parsing the answer fails
      val answerStream = conn.getInputStream
      val text = try {
        readInAbstract(answerStream)
      }
      finally {
        answerStream.close()
      }
      result = Some(text)
    }
    catch
    {
      case ex: Exception => {

        // The web server may still be trying to render the page. If we send new requests
        // at once, there will be more and more tasks running in the web server and the
        // system eventually becomes overloaded. So we wait a moment. The higher the load,
        // the longer we wait.

        var loadFactor = Double.NaN
        var sleepMs = sleepFactorMs

        // if the load average is not available, a negative value is returned
        val load = osBean.getSystemLoadAverage()
        if (load >= 0) {
          loadFactor = load / availableProcessors
          sleepMs = (loadFactor * sleepFactorMs).toInt
        }

        if (counter < maxRetries) {
          logger.log(Level.INFO, "Error retrieving abstract of " + pageTitle + ". Retrying after " + sleepMs + " ms. Load factor: " + loadFactor, ex)
          Thread.sleep(sleepMs)
        }
        else {
          // Last attempt failed: only the message prefix distinguishes timeouts from other errors
          val prefix = ex match {
            case _ : java.net.SocketTimeoutException => "Timeout error"
            case _ => "Error"
          }
          logger.log(Level.INFO,
            prefix + " retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " +
            loadFactor, ex)
        }
      }
    }
    counter += 1
  }

  result.getOrElse(throw new Exception("Could not retrieve abstract for page: " + pageTitle))
}
/**
 * Shortens the given text to its leading sentences.
 *
 * Texts shorter than `max` are returned unchanged. Otherwise the text is split into sentences
 * (a sentence ends with a dot followed by whitespace) and as many leading sentences as fit
 * into `max` characters are returned. If even the first sentence is longer than `max`, that
 * sentence is returned as a whole, so the result may exceed `max` in this case.
 * Results built from sentences carry no leading or trailing whitespace.
 *
 * TODO: probably doesn't work for most non-European languages.
 * TODO: analyse ActiveAbstractExtractor, I think this works quite well there,
 * because it takes the first two or three sentences
 *
 * @param text the text to shorten
 * @param max max length
 * @return result string
 */
def short(text : String, max : Int = 500) : String =
{
  if (text.size < max) {
    text
  }
  else {
    // The lookbehind keeps the dot and the whitespace attached to the sentence they terminate.
    val sentences = text.split("""(?<=\.\s)""")

    // Length of the text up to and including each sentence
    val runningLengths = sentences.scanLeft(0)(_ + _.length).tail

    val fitting = sentences.zip(runningLengths)
      .takeWhile { case (_, total) => total <= max }
      .map { case (sentence, _) => sentence }

    if (fitting.isEmpty) {
      // Not even the first sentence fits: return it anyway, trimmed like every other result.
      sentences.headOption.map(_.trim).getOrElse("")
    }
    else {
      fitting.mkString.trim
    }
  }
}
/**
 * Get the parsed and cleaned abstract text from the MediaWiki instance input stream.
 * The answer has the form
 * <api> <query> <pages> <page> <extract> ABSTRACT_TEXT </extract> </page> </pages> </query> </api>
 * /// <api> <parse> <text> ABSTRACT_TEXT </text> </parse> </api>
 *
 * The stream is read completely as UTF-8 and is closed by this method.
 *
 * @param inputStream the stream delivering the XML answer of the API
 * @return the trimmed text of the extract element(s), passed through decodeHtml
 */
private def readInAbstract(inputStream : InputStream) : String =
{
  // for XML format
  val source = Source.fromInputStream(inputStream, "UTF-8")
  // Closing the source releases the underlying stream, also when reading fails.
  val xmlAnswer = try {
    source.getLines().mkString("")
  }
  finally {
    source.close()
  }
  //val text = (XML.loadString(xmlAnswer) \ "parse" \ "text").text.trim
  val text = (XML.loadString(xmlAnswer) \ "query" \ "pages" \ "page" \ "extract").text.trim
  decodeHtml(text)
}
/**
 * Prepends the page title to abstracts that start with a lowercase letter and do not mention the title.
 *
 * @param pageTitle title of the page the abstract belongs to
 * @param text the abstract text
 * @return the text, prefixed with the title (without a trailing parenthesised suffix) and a space
 *         if the fix-up applies; the unchanged text otherwise
 */
private def postProcess(pageTitle: WikiTitle, text: String): String =
{
  // An empty text never counts as starting with a lowercase letter.
  val lowercaseStart = text.nonEmpty && {
    val initial = text.substring(0,1)
    initial != initial.toUpperCase(context.language.locale)
  }

  //HACK
  if (!lowercaseStart) {
    text
  }
  else {
    // Drop a trailing " (...)" suffix from the title before looking for it in the text
    val bareTitle = pageTitle.decoded.replaceFirst(" \\(.+\\)$", "")
    if (text.toLowerCase.contains(bareTitle.toLowerCase)) {
      text
    }
    else {
      // happens mainly for Japanese names (abstract starts with template)
      bareTitle + " " + text
    }
  }
}
//private val destinationNamespacesToRender = List(Namespace.Main, Namespace.Template)
/*
private def renderNode(node : Node) = node match
{
case InternalLinkNode(destination, _, _, _) => destinationNamespacesToRender contains destination.namespace
case ParserFunctionNode(_, _, _) => false
case _ => true
}
*/
/**
* Get the wiki text that contains the abstract text.
*/
/*
def getAbstractWikiText(pageNode : PageNode) : String =
{
// From first TextNode
val start = pageNode.children.indexWhere{
case TextNode(text, _) => text.trim != ""
case InternalLinkNode(destination, _, _, _) => destination.namespace == Namespace.Main
case _ => false
}
// To first SectionNode (exclusive)
var end = pageNode.children.indexWhere{
case sectionNode : SectionNode => true
case _ => false
}
// If there is no SectionNode, To last non-empty TextNode (inclusive)
if(end == -1)
{
val reverseLastTextIndex = pageNode.children.reverse.indexWhere{
case TextNode(text, _) => text.trim != ""
case _ => false
}
if(reverseLastTextIndex != -1)
{
end = pageNode.children.length - reverseLastTextIndex
}
}
// No result if there is no TextNode or no text before a SectionNode
if(start == -1 || end == -1 || start >= end)
{
return ""
}
// Re-generate wiki text for found range of nodes
val text = pageNode.children.slice(start, end)
.filter(renderNode)
.map(_.toWikiText)
.mkString("").trim
// decode HTML entities - the result is plain text
decodeHtml(text)
}
*/
/**
 * Runs the given text through an HtmlCoder configured with XmlCodes.NONE.
 * NOTE(review): presumably this decodes HTML entities into plain text - confirm against HtmlCoder.
 *
 * @param text the text to process
 * @return the result of HtmlCoder.code for the text
 */
def decodeHtml(text: String): String = {
  val htmlCoder = new HtmlCoder(XmlCodes.NONE)
  // The error handler has to be installed before the text is processed.
  htmlCoder.setErrorHandler(ParseExceptionIgnorer.INSTANCE)
  htmlCoder.code(text)
}
}
/**
 * Companion of the extractor: holds the characters to escape in request parameters and the
 * set of subjects whose abstracts already exist.
 */
object MissingAbstractsExtractor {
  private val logger = Logger.getLogger(classOf[MissingAbstractsExtractor].getName)

  /**
   * List of all characters which are reserved in a query component according to RFC 2396
   * with their escape sequences as determined by the JavaScript function encodeURIComponent.
   */
  val CHARACTERS_TO_ESCAPE = List(
    (";", "%3B"),
    ("/", "%2F"),
    ("?", "%3F"),
    (":", "%3A"),
    ("@", "%40"),
    ("&", "%26"),
    ("=", "%3D"),
    ("+", "%2B"),
    (",", "%2C"),
    ("$", "%24")
  )

  /**
   * The values of the second tab-separated column of the file "existing-abstracts.tsv", which is
   * resolved against the working directory. The first line of the file is treated as a header and
   * skipped; lines with fewer than two columns are ignored. The file is read once, on first access.
   *
   * Throws a FileNotFoundException if the file does not exist.
   */
  lazy val existingAbstracts: mutable.HashSet[String] = {
    val file = new File("existing-abstracts.tsv")
    logger.info(s"Starting to read list of existing abstracts from file '${file.getAbsolutePath}'")

    val reader = try {
      new BufferedReader(new FileReader(file))
    }
    catch {
      case e: FileNotFoundException =>
        logger.severe(s"Unable to find file '${file.getAbsolutePath}'. " +
          s"Please generate it and put it in the given location.")
        throw e
    }

    val set: mutable.HashSet[String] = mutable.HashSet()
    var skipped = 0
    try {
      Iterator.continually(reader.readLine())
        .takeWhile(_ != null)
        .drop(1) // header line
        .foreach { line =>
          val parts = line.split("\t")
          // Blank or malformed lines have no second column; skip them instead of failing the whole load.
          if (parts.length > 1) {
            set.add(parts(1))
          }
          else {
            skipped += 1
          }
        }
    }
    finally {
      // Release the file handle even if reading fails.
      reader.close()
    }

    if (skipped > 0) {
      logger.warning(s"Ignored $skipped line(s) without a second column in '${file.getAbsolutePath}'")
    }
    logger.info(s"Done reading existing abstract names: ${set.size} abstracts already existing")
    set
  }
}